In [102]:
import re
import collections
import shutil

num_movie_scripts = 2318
vocabulary_size = 10000
fraction_dev = 50

path_for_x_train = 'X_train.txt'
path_for_y_train = 'y_train.txt'
path_for_x_dev = 'X_dev.txt'
path_for_y_dev = 'y_dev.txt'


_PAD = b"_PAD"
_GO = b"_GO"
_EOS = b"_EOS"
_UNK = b"_UNK"
_START_VOCAB = [_PAD, _GO, _EOS, _UNK]

PAD_ID = 0
GO_ID = 1
EOS_ID = 2
UNK_ID = 3

_WORD_SPLIT = re.compile(b"([.,!?\":;)(])")
_DIGIT_RE = re.compile(br"\d")

#FROM DATA UTILS
# Build the dictionary with word-IDs from self-made dictionary and replace rare words with UNK token.
def build_dataset(words, vocabulary_size):
    count = [['_UNK', -1]]
    count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # dictionary['UNK']
            unk_count = unk_count + 1
        data.append(index)
    count[0][1] = unk_count
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reverse_dictionary

def create_vocabulary(dictionary, vocabulary_path):
    f = open(vocabulary_path, 'w')
    
    for key in dictionary:
        f.write(dictionary[key] + '\n')
    f.close()

def initialize_vocabulary(vocabulary_path):
  # finds vocabulary file
  if gfile.Exists(vocabulary_path):
    rev_vocab = []
    with gfile.GFile(vocabulary_path, mode="rb") as f:
      rev_vocab.extend(f.readlines())
    rev_vocab = [line.strip() for line in rev_vocab]
    vocab = dict([(x, y) for (y, x) in enumerate(rev_vocab)])
    return vocab, rev_vocab
  else:
    raise ValueError("Vocabulary file %s not found.", vocabulary_path)


def generate_encoded_files2(x_train_file, y_train_file, x_dev_file, y_dev_file, tokenized_sentences, dictionary):
    """Sentence A is in x_train, Sentence B in y_train"""
    encoded_holder = []
    unk_id = dictionary['_UNK']
    for sentence in tokenized_sentences:
        encoded_holder.append(encode_sentence(sentence, dictionary, unk_id))

    f1 = open(x_train_file, 'w')
    f2 = open(y_train_file, 'w')
    fraction = int(len(encoded_holder) / fraction_dev)
    if (len(encoded_holder) % 2 == 0):
        end = len(encoded_holder)
    else:
        end = len(encoded_holder)-1

    for i in xrange(0,fraction,2):
        f1.write(str(encoded_holder[i]) + '\n')
        f2.write(str(encoded_holder[i+1]) + '\n')

    f1.close()
    f2.close()

    d1 = open(x_dev_file, 'w')
    d2 = open(y_dev_file, 'w')

    for i in xrange(fraction, end, 2):
        d1.write(str(encoded_holder[i]) + '\n')
        d2.write(str(encoded_holder[i+1]) + '\n')    

    d1.close()
    d2.close()


def generate_encoded_files(x_train_file, y_train_file, x_dev_file, y_dev_file, tokenized_sentences, dictionary):
    """Sentence A is in x_train and y_train, Sentence B in X_train and y_train"""
    encoded_holder = []
    f1 = open(x_train_file, 'w')

    last_line = tokenized_sentences.pop()
    first_line = tokenized_sentences.pop(0)
    dev_counter = int(len(tokenized_sentences) - len(tokenized_sentences)/fraction_dev)
    
    print last_line
    print first_line
    print dev_counter
    

    unk_id = dictionary['_UNK']
    first_line_encoded = encode_sentence(first_line, dictionary, unk_id)
    f1.write(first_line_encoded + '\n')

    # Creates data for X_train
    for x in xrange(dev_counter):
        encoded_sentence = encode_sentence(tokenized_sentences[x], dictionary, unk_id)
        encoded_holder.append(encoded_sentence)
        f1.write(encoded_sentence + '\n') # Write sentence to file
    f1.close()

    d1 = open(x_dev_file, 'w')
    # Creates data for x_dev_file
    for x in xrange(dev_counter, len(tokenized_sentences)):
        encoded_sentence = encode_sentence(tokenized_sentences[x], dictionary, unk_id)
        encoded_holder.append(encoded_sentence)
        d1.write(encoded_sentence + '\n') # Write sentence to file

    d1.close()

    # Creates data for y_train
    f2 = open(y_train_file, 'w')

    for x in xrange(dev_counter + 1):
        f2.write(encoded_holder[x] + '\n') # Write sentence to file

    f2.close()

    # Creates data for y_dev
    d2 = open(y_dev_file, 'w')
    for x in xrange(dev_counter + 1, len(tokenized_sentences)):
        d2.write(encoded_holder[x] + '\n') # Write sentence to file

    last_line_encoded = encode_sentence(last_line, dictionary, unk_id)
    d2.write(last_line_encoded + '\n')
    d2.close()

def basic_tokenizer(sentence):
  """Very basic tokenizer: split the sentence into a list of tokens"""
  words = []
  for space_separated_fragment in sentence.strip().split():
    words.extend(re.split(_WORD_SPLIT, space_separated_fragment))
  return [w for w in words if w]


def encode_sentence(sentence, dictionary, unk_id):
    # Extract first word (and don't add any space)
    if not sentence:
        return ""
    first_word = sentence.pop(0)
    if first_word in dictionary:
        encoded_sentence = str(dictionary[first_word])
    else:
        encoded_sentence = str(unk_id)

    # Loop rest of the words (and add space in front)
    for word in sentence:
        if word in dictionary:
            encoded_word = dictionary[word]
        else:
            encoded_word = unk_id
        encoded_sentence += " " + str(encoded_word)
    return encoded_sentence


def sentence_to_token_ids(sentence, vocabulary):
  """Convert a string to list of integers representing token-ids.

  For example, a sentence "I have a dog" may become tokenized into
  ["I", "have", "a", "dog"] and with vocabulary {"I": 1, "have": 2,
  "a": 4, "dog": 7"} this function will return [1, 2, 4, 7].

  Returns:
    a list of integers, the token-ids for the sentence.
  """
  words = basic_tokenizer(sentence)
  return [vocabulary.get(w, UNK_ID) for w in words]


def read_data(num_movie_scripts):
    data_tokens = []
    # Append each line in file to the set
    for i in range(0, num_movie_scripts):
        path = '../data/'+str(i)+'raw.txt'
        print 'Reading ', path, '...'
        lines = [line.rstrip('\n') for line in open(path)]
        data_tokens_temp = []
        for line in lines:
            # Tokenize each sentence
            data_tokens_temp.extend(re.findall(r'\S+', line))
        data_tokens.extend(data_tokens_temp)

    return data_tokens


# Reads data and puts every sentence in a TWO DIMENSIONAL array as tokens
# data_tokens[0] = ['This', 'is', 'a', 'sentence']
def read_sentences(num_movie_scripts):
    data_tokens = []
    # Append each line in file to the set
    for i in range(0, num_movie_scripts):
        path = '../data/'+str(i)+'raw.txt'
        print 'Reading ', path, '...'
        lines = [line.rstrip('\n') for line in open(path)]
        data_tokens_temp = []
        for line in lines:
            # Tokenize each sentence
            data_tokens_temp.append(re.findall(r'\S+', line))
        data_tokens.extend(data_tokens_temp)
    return data_tokens






In [3]:
#tokenized_data = read_data(2318)
tokenized_data = read_data(100)

Reading  ../data/0raw.txt ...
Reading  ../data/1raw.txt ...
Reading  ../data/2raw.txt ...
Reading  ../data/3raw.txt ...
Reading  ../data/4raw.txt ...
Reading  ../data/5raw.txt ...
Reading  ../data/6raw.txt ...
Reading  ../data/7raw.txt ...
Reading  ../data/8raw.txt ...
Reading  ../data/9raw.txt ...
Reading  ../data/10raw.txt ...
Reading  ../data/11raw.txt ...
Reading  ../data/12raw.txt ...
Reading  ../data/13raw.txt ...
Reading  ../data/14raw.txt ...
Reading  ../data/15raw.txt ...
Reading  ../data/16raw.txt ...
Reading  ../data/17raw.txt ...
Reading  ../data/18raw.txt ...
Reading  ../data/19raw.txt ...
Reading  ../data/20raw.txt ...
Reading  ../data/21raw.txt ...
Reading  ../data/22raw.txt ...
Reading  ../data/23raw.txt ...
Reading  ../data/24raw.txt ...
Reading  ../data/25raw.txt ...
Reading  ../data/26raw.txt ...
Reading  ../data/27raw.txt ...
Reading  ../data/28raw.txt ...
Reading  ../data/29raw.txt ...
Reading  ../data/30raw.txt ...
Reading  ../data/31raw.txt ...
Reading  ../data/3

In [4]:
len(tokenized_data)

630129

In [8]:
tokenized_data[:10]

['hello', 'hi', 'hey', 'yo', 'hey', 'hi', 'hey', 'you', 'hi', 'hey']

In [9]:
counter  = collections.Counter(tokenized_data)

In [10]:
print counter.most_common(10)

[('.', 63113), (',', 28430), ('you', 18459), ('the', 18184), ('i', 12468), ('!', 12030), ('to', 11752), ('a', 10492), ('...', 7415), ('of', 6917)]


In [12]:
len(counter)

24171

In [16]:
li = [(x,y) for x, y in counter.iteritems()]

In [18]:
words_10 = filter(lambda (x,y): y <=10,li)
words_100 = filter(lambda (x,y): 10 <y<100,li)
words_1000 =filter(lambda (x,y): 100 <y <=1000,li)
words_10000 = filter(lambda (x,y): 1000<y <=10000,li)

In [19]:
print len(words_10)

124920


In [20]:
print len(words_100)

24057


In [21]:
print len(words_1000)

5623


In [25]:
print words_1000[:100]

[('woods', 634), ('hanging', 631), ('woody', 222), ('screaming', 503), ('wooden', 120), ('wednesday', 238), ('shows', 629), ('kid?', 324), ('dna', 260), ('inevitable', 103), ('cocksucker', 115), ('travel', 606), ('fit', 821), ('bringing', 596), ('fin', 128), ('hurting', 265), ('effects', 176), ('size', 762), ('silent', 433), ('disturbed', 121), ('breed', 105), ('knight', 285), ('hiya', 110), ('old?', 104), ('draw', 584), ('affairs', 184), ('tech', 161), ('plate', 370), ('iate', 137), ('job?', 362), ('nicely', 143), ('patch', 225), ("don'tyou", 105), ('lots', 857), ('nature', 640), ('lot?', 103), ('lookin', 584), ('fry', 196), ('spit', 325), ('doubts', 112), ('spin', 230), ('hong', 221), ('corporate', 222), ('hah', 123), ('hal', 277), ('ham', 174), ('hat', 680), ('crowd', 492), ('crown', 296), ('bottom', 817), ('marshall', 128), ('honeymoon', 217), ('shoots', 141), ('raped', 199), ("else's", 213), ('passenger', 105), ('disgrace', 123), ('deputy', 108), ('corps', 160), ('whoever', 555), 

In [22]:
print len(words_10000)

977


In [24]:
print words_10000[:100]

[('kids', 3935), ('excuse', 5847), ('wrong', 6459), ('fix', 1163), ('needed', 1287), ('master', 1714), ('feeling', 2205), ('saying', 3340), ('congratulations', 1033), ('minute', 4332), ('ground', 1189), ('turned', 1542), ('single', 1226), ('being', 6365), ('company', 1857), ('learn', 1761), ('understand?', 1186), ('touch', 2581), ('30', 1972), ('lady', 2514), ('parents', 1957), ('couple', 2713), ('makes', 3327), ("'", 1916), ('tonight', 4531), ('stuff', 3619), ('become', 2015), ('problem', 4496), ('worth', 1499), ('another', 6995), ('thanks', 9046), ('test', 1092), ('gun', 3238), ('guy', 9440), ('brain', 1198), ('drop', 2217), ('tomorrow', 4341), ('caught', 1387), ('father', 7088), ('taking', 3350), ('food', 2152), ('death', 3298), ("isn't", 7461), ('iike', 2174), ('poor', 1897), ('ass', 5735), ('ask', 6748), ("haven't", 4653), ('evening', 1678), ("i'ii", 1031), ('<i>', 2207), ('american', 1508), ("wouldn't", 5399), ('telling', 2455), ('jump', 1138), ('pretty', 5091), ('meet', 5275), (

In [31]:
sorted_words = sorted(words_10000,key=lambda (x,y): -y)

In [32]:
sorted_words

[('big', 9907),
 ("won't", 9818),
 ('told', 9792),
 ('old', 9788),
 ('fucking', 9765),
 ('does', 9723),
 ('guys', 9691),
 ('always', 9636),
 ("we'll", 9633),
 ('after', 9557),
 ("i'd", 9512),
 ('things', 9510),
 ('nice', 9496),
 ('guy', 9440),
 ('believe', 9383),
 ("you've", 9350),
 ('long', 9194),
 ('thanks', 9046),
 ('leave', 9036),
 ('years', 9021),
 ("you'll", 8942),
 ('everything', 8880),
 ('hi', 8814),
 ('three', 8724),
 ('feel', 8543),
 ('gotta', 8541),
 ('stay', 8478),
 ("doesn't", 8458),
 ('listen', 8450),
 ('place', 8333),
 ('hear', 8322),
 ('money', 8321),
 ('fine', 8293),
 ('kill', 8192),
 ('every', 8167),
 ('move', 8149),
 ('bad', 8146),
 ('name', 8128),
 ('wanna', 8125),
 ('through', 8021),
 ('made', 7997),
 ('dead', 7957),
 ('hell', 7827),
 ('ok', 7733),
 ('world', 7712),
 ('coming', 7699),
 ('hold', 7647),
 ('kind', 7608),
 ('left', 7608),
 ('here?', 7591),
 ('baby', 7588),
 ('lot', 7472),
 ("isn't", 7461),
 ('okay?', 7448),
 ('boy', 7427),
 ('girl', 7356),
 ('remember'

Characters to Remove
'
<i>
haven't
bastard
fuck
cocksucker

All punctuations
All words followed by punctuation
Comma, Full stop, ellipsis, ...., :,;
All numbers 

In [None]:
def make_files(num_movie_scripts, vocabulary_size, fraction_dev=50, path_for_x_train = 'X_train.txt', path_for_y_train = 'y_train.txt', path_for_x_dev = 'X_dev.txt', path_for_y_dev = 'y_dev.txt'):
    # Generate dictionary for dataset
    print '------------------------------------------------'
    print ' Generating dictionary based on ', str(num_movie_scripts), ' scripts'
    print '------------------------------------------------'

    
    data, count, dictionary, reverse_dictionary = build_dataset(tokenized_data, vocabulary_size)
    create_vocabulary(reverse_dictionary, 'vocabulary_for_movies.txt')


    # Generate an encoded file using the freated dictionary
    print '------------------------------------------------'
    print ' Creating encoded file using created dictionary'
    print ' (Saved in  ', path_for_x_train, ')'
    print '------------------------------------------------'
    tokenized_sentences = read_sentences(num_movie_scripts)
    generate_encoded_files(path_for_x_train, path_for_y_train, path_for_x_dev, path_for_y_dev, tokenized_sentences, dictionary)





In [16]:
punctuations = {".":1,",":1,"<":1,">":1,"'":1,";":1,":":1,"{":1,"}":1,"[":1,"]":1,"-":1,"_":1,"+":1,"=":1,"*":1,"&":1,"`":1,"~":1,"@":1,"#":1,"$":1,"%":1,"^":1,"(":1,")":1,"/":1,"?":1,"1":1,"..":1,"...":1,"....":1}
punctuations2 = {".":1,",":1,"<":1,">":1,"'":1,";":1,":":1,"{":1,"}":1,"[":1,"]":1,"-":1,"_":1,"+":1,"=":1,"*":1,"&":1,"`":1,"~":1,"@":1,"#":1,"$":1,"%":1,"^":1,"(":1,")":1,"/":1,"1":1,"..":1,"...":1,"....":1}
def remove_punctuation(word):    
    if word in punctuations:
        return ""
    return word


In [17]:
count =0
for word in counter.keys():
    if any(char in punctuations2 for char in word):
        print word
    else:
        count+=1

tom's
'about
madman's
<perhaps
captain's
otsu's
'new
cheyenne's
cuttin'you
secret's
kananga's
maj.
blain's
us$2
baby's
convoy's
leader's
don'tyou
%%%
heller's
'nother
.
bond's
else's
tank's
razor's
they'ii
when's
when'd
bittrich's
on,
'puke
squire's
could've
alex's
can't
kin's
Hihih,
nurhachi's
hippy's
arthur's
baxter's
911
partner's
o'hallorhan
197
191
19,
would've?
video's
chief's
r.
victoria's
world...
paul's
first,
effect's
'leader
312
310
'so
'eye
bowen's
o'neill
ma'am?
key's
'killed
senator's
goin'in
association's
show's
intel's
simon's
''em
'for
'fox
<i>according
peterson's
kuato's
wife's
colonel's
'
turok's
'was
'war
'moron
'way
'when
joe's
bondoc's
these'll
robin's
result's
'course
'expelled
>
richard's
'like
attorney's
intentlon>
governor's
cousin's
'used
janero's
clnclantflt's
'comin
nina's
'walls
ifit's
'politics
leto's
comeback's
'silence
<i>you
o'toole
'spider
montgomery's
children's
capone's
jap's
1600
1603
'technology
creed's
i'llmarryboth
dues,
marion's
'success
o'cloc

In [18]:
print count

22241


In [19]:
len(counter)

24171

In [20]:
count =0
for word in counter.keys():
    if any(char in ["'"] for char in word):
        print word    
        count+=1

tom's
'about
madman's
captain's
otsu's
'new
cheyenne's
cuttin'you
secret's
kananga's
blain's
baby's
convoy's
leader's
don'tyou
heller's
'nother
bond's
else's
tank's
razor's
they'ii
when's
when'd
bittrich's
'puke
squire's
could've
alex's
can't
kin's
nurhachi's
hippy's
arthur's
baxter's
partner's
o'hallorhan
would've?
video's
chief's
victoria's
paul's
effect's
'leader
'so
'eye
bowen's
o'neill
ma'am?
key's
'killed
senator's
goin'in
association's
show's
intel's
simon's
''em
'for
'fox
peterson's
kuato's
wife's
colonel's
'
turok's
'was
'war
'moron
'way
'when
joe's
bondoc's
these'll
robin's
result's
'course
'expelled
richard's
'like
attorney's
governor's
cousin's
'used
janero's
clnclantflt's
'comin
nina's
'walls
ifit's
'politics
leto's
comeback's
'silence
o'toole
'spider
montgomery's
children's
capone's
jap's
'technology
creed's
i'llmarryboth
marion's
'success
o'clock
let's
cimmerian's
code's
elias'll
hadn't
'love
ekonomo's
hauser's
didn't
bird's
state's
zagon's
geek's
gray's
creep's
theywon'

In [21]:
print count

1483


In [22]:
count =0
for word in counter.keys():
    if word[0] == "'":
        print word    
        count+=1

'about
'new
'nother
'puke
'leader
'so
'eye
'killed
''em
'for
'fox
'
'was
'war
'moron
'way
'when
'course
'expelled
'like
'used
'comin
'walls
'politics
'silence
'spider
'technology
'success
'love
'tis
'til
'at?
'part
'along?
'low
'will
'sick
'cause
'cambodia
'gun
'guy
'dink
'bones
'as
'sanchez
'murderer
'condition
'all
'holes
'high
'cackle
'hit
'his
'him
'trouble
'here?
'pussy
'about?
'how
'story
'86
'88
'mind
'legions
'get
'each
'drug
'business
'lame
'bitch
'you're
'days
'outta
'thing
'for?
'me?
'pride
'regiment
'room
'platoon
'law
'bird
'hospitality
'army
'real
'again?
'cold
'73
'72
'dizzy
'hurt
'compass
'pie
'jokes
'bag
'greased
'no
'eight
'blow
'us
'up
'captain
'boats
'partner
'village
'loser
'jobs
'beautiful
'say
'outside
'very
'hearts
'now
'california
'09er
'run
'shit
'asshole
'driver
'truck
'pretzels?
'yet
'tonight?
'in
'em
'blade
'march
'cook
'too
'twas
'feet
'hell
'killers
'wins
'behind
'apollo
'cuz
'bye
'ready
'on?
'hello
'till
'me
'my
'bummer
'worth
'year
'any
'and
'nothin
'27

In [23]:
print count

411


In [26]:
unique_words = sorted(counter.keys())

In [76]:
punctuations = {"!":1,".":1,"<":1,">":1,";":1,":":1,"{":1,"}":1,"[":1,"]":1,"-":1,"_":1,"+":1,"=":1,"*":1,"&":1,"`":1,"~":1,"@":1,"#":1,"%":1,"^":1,"(":1,")":1,"/":1,"..":1,"...":1,"....":1}
apos = "'"
dollar = "$"
qm ="?"
digits =""

def clean_up_data(word):
    if "<i>" in word:
        word = word.replace("<i>","")        

    for spl in punctuations.keys():
        if word and spl in word:
            word = word.replace(spl,"")
    
    if word and "$" in word:
        word = "<amount>"
    
    if word and word.isdigit():
        if ("19" in word or "18" in word) and len(word) <=5:
            word="<year>"
        word = "<number>"
    
    
    if "?" in word:
        if not word[-1] == "?":
            print "QM = %s"%word            
    
    
    #Check if it is time
    for c in word:
        if c.isdigit() and not word.isdigit():              
            if "am" in word or "pm" in word:
                word = "<time>"
    
    
    #Check if is reference to 1st/2nd/3rd/4th
    for c in word:
        if c.isdigit() and not word.isdigit():
            if "st" in word or "nd" in word or "rd" in word or "th" in word:
                word = "<number nth>"
    
    
    for c in word:
        if c.isdigit() and "'" in word:
            word = "<apos number>"
    
    for c in word:
        if c.isdigit() and ("ft" in word or "lbs" in word or "psi" in word or word[-1]=="s" or word[-1] =="m"):
            word = "<number units>"
    
    #Check if it is year
    for c in word:
        if c.isdigit() and not word.isdigit():
            word = "<rand_number>"
    
    return word

In [92]:
clean_words = []
for word in tokenized_data:
    new_word = clean_up_data(word)
    if new_word:
        clean_words.append(new_word)       
    

In [93]:
len(clean_words)

517765

In [84]:
sorted(clean_words)[22500:23439]

['vulgarity',
 'vulnerable',
 'w',
 'w?',
 'waaroms',
 'wabash',
 'wach',
 'waddy',
 'wafer',
 'wage',
 'wager',
 'wages',
 'wagging',
 'wagon',
 'wagonload',
 'wagons',
 'wagstaff',
 'wagstaff?',
 'wagstaffhas',
 'wagstiff',
 'wahad',
 'wai',
 'wailing',
 'waistlines',
 'wait',
 "wait'll",
 'wait?',
 'waited',
 'waiter',
 'waitin',
 'waiting',
 'waits',
 'wakaba',
 'wake',
 'wakes',
 'waking',
 'walk',
 'walk?',
 'walked',
 'walker',
 'walkie',
 'walkin',
 'walking',
 'walks',
 'wall',
 'wall?',
 'wallace',
 "wallace's",
 'wallace?',
 'walled',
 'wallens',
 'waller',
 'wallet',
 "wallet's",
 'wallet?',
 'wallets',
 'wallop',
 'wallpaper',
 'walls',
 'walls?',
 'wally',
 'walt',
 'walter',
 'walter?',
 'walther',
 'waltz',
 'waltzin',
 'wampum',
 'wan',
 "wan's",
 'wan?',
 'wand?',
 'wanda',
 'wander',
 'wandered',
 'wandering',
 'wang',
 'wang?',
 'wankers',
 'wanna',
 'wannabe?',
 'want',
 'want?',
 'wanted',
 'wanting',
 'wants',
 'wantyou',
 'war',
 "war's",
 'war?',
 'ward',
 'war

In [94]:
counter_clean = collections.Counter(clean_words)

In [95]:
counter_clean.most_common(10)

[('you', 18464),
 ('the', 18186),
 ('i', 12544),
 ('to', 11752),
 ('a', 10523),
 ('of', 6918),
 ('it', 6710),
 ('and', 6502),
 ('in', 5793),
 ('is', 5496)]

In [96]:
li2 = [(x,y) for x, y in counter_clean.iteritems()]

In [97]:
words2_10 = filter(lambda (x,y): y <=10,li2)
words2_100 = filter(lambda (x,y): 10 <y<100,li2)
words2_1000 =filter(lambda (x,y): 100 <y <=1000,li2)
words2_10000 = filter(lambda (x,y): 1000<y <=10000,li2)

In [98]:
print len(words2_10)
print len(words2_100)
print len(words2_1000)
print len(words2_10000)

19944
2931
480
77


In [101]:
len(counter_clean)

23440

In [103]:
tokenized_data = read_data(num_movie_scripts)
data, count, dictionary, reverse_dictionary = build_dataset(tokenized_data, vocabulary_size)
create_vocabulary(reverse_dictionary, 'vocabulary_for_movies.txt')

Reading  ../data/0raw.txt ...
Reading  ../data/1raw.txt ...
Reading  ../data/2raw.txt ...
Reading  ../data/3raw.txt ...
Reading  ../data/4raw.txt ...
Reading  ../data/5raw.txt ...
Reading  ../data/6raw.txt ...
Reading  ../data/7raw.txt ...
Reading  ../data/8raw.txt ...
Reading  ../data/9raw.txt ...
Reading  ../data/10raw.txt ...
Reading  ../data/11raw.txt ...
Reading  ../data/12raw.txt ...
Reading  ../data/13raw.txt ...
Reading  ../data/14raw.txt ...
Reading  ../data/15raw.txt ...
Reading  ../data/16raw.txt ...
Reading  ../data/17raw.txt ...
Reading  ../data/18raw.txt ...
Reading  ../data/19raw.txt ...
Reading  ../data/20raw.txt ...
Reading  ../data/21raw.txt ...
Reading  ../data/22raw.txt ...
Reading  ../data/23raw.txt ...
Reading  ../data/24raw.txt ...
Reading  ../data/25raw.txt ...
Reading  ../data/26raw.txt ...
Reading  ../data/27raw.txt ...
Reading  ../data/28raw.txt ...
Reading  ../data/29raw.txt ...
Reading  ../data/30raw.txt ...
Reading  ../data/31raw.txt ...
Reading  ../data/3

In [104]:
tokenized_sentences = read_sentences(num_movie_scripts)

Reading  ../data/0raw.txt ...
Reading  ../data/1raw.txt ...
Reading  ../data/2raw.txt ...
Reading  ../data/3raw.txt ...
Reading  ../data/4raw.txt ...
Reading  ../data/5raw.txt ...
Reading  ../data/6raw.txt ...
Reading  ../data/7raw.txt ...
Reading  ../data/8raw.txt ...
Reading  ../data/9raw.txt ...
Reading  ../data/10raw.txt ...
Reading  ../data/11raw.txt ...
Reading  ../data/12raw.txt ...
Reading  ../data/13raw.txt ...
Reading  ../data/14raw.txt ...
Reading  ../data/15raw.txt ...
Reading  ../data/16raw.txt ...
Reading  ../data/17raw.txt ...
Reading  ../data/18raw.txt ...
Reading  ../data/19raw.txt ...
Reading  ../data/20raw.txt ...
Reading  ../data/21raw.txt ...
Reading  ../data/22raw.txt ...
Reading  ../data/23raw.txt ...
Reading  ../data/24raw.txt ...
Reading  ../data/25raw.txt ...
Reading  ../data/26raw.txt ...
Reading  ../data/27raw.txt ...
Reading  ../data/28raw.txt ...
Reading  ../data/29raw.txt ...
Reading  ../data/30raw.txt ...
Reading  ../data/31raw.txt ...
Reading  ../data/3

In [122]:
cnter = collections.Counter(tokenized_data)

print len(cnter)



155830


In [123]:
tc = cnter.most_common(100000)

In [131]:
c =0
for value in cnter.values():
    if value <100:
        c+=1

In [132]:
c

148977

In [125]:
tc[-1]

('miramonti', 1)

In [105]:
len(tokenized_sentences)

2667881

In [107]:
tokenized_sentences[1000000:1000020]

[['calm', 'down', '.'],
 ['what?'],
 ['who', 'car', 'is', 'that', 'coming', 'up', 'creeping?'],
 ['drive', 'by', '!', 'drive', 'by', '!'],
 ['get', 'off', 'my', 'ass', '!'],
 ['man', ',', "let's", 'go', 'in', 'the', 'house', '.'],
 ['slippin', "'."],
 ['wish', 'you', "weren't", 'slippin', "'today", '.'],
 ['shit', '!'],
 ['who', 'was', 'that?'],
 ['big', 'worm', '.'],
 ['hey', ',', 'um', ',', 'somebody', ',', 'uh', ',', 'page', 'smokey?'],
 ["don't", 'play', 'dumb', ',', 'nigger', '.'],
 ['you', 'know', 'who', 'this', 'is', '.'],
 ['you', 'got', 'my', 'money?'],
 ['man', ',', 'you', "don't", 'have', 'to', 'holler', ',', 'man', '.'],
 ['i',
  "ain't",
  'got',
  'it',
  'yet',
  ',',
  'but',
  "i'm",
  'going',
  'to',
  'get',
  'it',
  '...'],
 ['what', 'happened?'],
 ['man', ',', 'he', 'just', 'hung', 'up', '.'],
 ['i', 'think', 'we', 'better', 'stay', 'in', 'the', 'house', '.']]

In [108]:
generate_encoded_files(path_for_x_train, path_for_y_train, path_for_x_dev, path_for_y_dev, tokenized_sentences, dictionary)

['i', 'run', 'out', '.']
['hello']
2614522


In [110]:
for sentence in tokenized_sentences[:10000]:
    for word in sentence:
        if word and word[0] == "'" and not len(word)==1:
            print sentence
            break

['with', 'the', '\'"', 'today', 'today', '\'"?']
['you', 'sing', '\'"', 'kiyomoto', '\'"?']
['and', 'my', 'teacher', 'praised', 'my', 'rendition', 'of', '\'"', 'ochudo', "'"]
['know', 'why', 'they', 'call', 'this', '\'"', 'dimple', '\'"?']
["weather's", 'fine', '\'"', ',', '\'"', 'great', 'for', 'golf', "'"]
['the', 'meaning', 'of', '\'"', 'great', 'for', 'golf', '\'"?']
['it', 'rains', 'the', 'weather', 'is', 'certainly', 'not', '\'"', 'fine', "'"]
['great', 'for', 'golf', '\'"', ',', 'did', 'it', 'rain', 'all', 'around', 'you?']
['opposite', 'approach', '\'"?']
['you', 'do', ',', "i'll", 'use', 'the', '\'"', 'opposite', 'opposite', '\'"', 'approach']
['again', 'sounds', 'like', '\'"', 'opposite', "'"]
["'n", 'drink', 'is', 'possible', 't', "'however", ',', 'n', 'minute', 'or', 'twenty', 'is', '.']
['i', ',', 'but', 'why', 'you', "doesn't", "'t", "'s", 'try?']
['is', 'japanese', 'for', "'hello", "'."]
['is', "'will", 'catch', "'in", 't', 'japanese?']
[',', "'and", "'?"]
["'t", "'t", '

In [112]:
words = set()
for word in tokenized_data:
    if "'" in word and len(word) <10:
        words.add(word)
        
print len(words)

11797


In [116]:
wordsli = [x for x in words if not "'s" in x]

In [118]:
wordsli2 = [ x for x in words if "'"==x[0]]

In [119]:
wordsli2

["'survivor",
 "'that`s",
 "'morons",
 "'daddy",
 "'sorr",
 "'sort",
 "'key",
 "'unsafe",
 "'parties",
 "'ken",
 "'lazy",
 "'i'ii",
 "'jokes",
 "'smack",
 "'about",
 "'periods",
 "'susie",
 "'round[",
 "'germans",
 "'squozen",
 "'sell",
 "'piccolo",
 "'worked",
 "'pleasure",
 "'joke?",
 "'beggin",
 "'aneurysm",
 "'twist",
 "'shite",
 "'geting",
 "'notion",
 "'circled",
 "'pure",
 "'presents",
 "'means",
 "'that`d",
 "'circles",
 "'odour's",
 "'kate",
 "'garbage",
 "'broke",
 "'fun",
 "'switchin",
 "'finger",
 "'her?",
 "'rabbit's",
 "'mbug",
 "'hero",
 "'here",
 "'herd",
 "'net",
 "'new",
 "'feats",
 "'damn",
 "'ned",
 "'raised",
 "'es",
 "'bad",
 "'bag",
 "'nephew",
 '\'maru"',
 "'bar",
 "'crabs",
 "'america",
 "'ex",
 "'am?",
 "'greased",
 "'grateful",
 "'nicked",
 "'worm",
 "'work",
 "'word",
 "'kick",
 "'mcqueen",
 "'corpse",
 "'driven",
 "'sneak",
 "'en",
 "'that[",
 "'chelsea?",
 "'crack",
 "'trust",
 "'shivah",
 "'trousers",
 "'arties",
 "'office",
 "'start",
 "'toilet",
 "'left

In [138]:
import matplotlib.pyplot as plt
%matplotlib inline

In [149]:
vals = sorted(cnter.values())
n = len(vals)

In [176]:
vocabulary_size = 7000
tokenized_data = read_data(num_movie_scripts)
data, count, dictionary, reverse_dictionary = build_dataset(tokenized_data, vocabulary_size)
create_vocabulary(reverse_dictionary, 'vocabulary_for_movies.txt')

Reading  ../data/0raw.txt ...
Reading  ../data/1raw.txt ...
Reading  ../data/2raw.txt ...
Reading  ../data/3raw.txt ...
Reading  ../data/4raw.txt ...
Reading  ../data/5raw.txt ...
Reading  ../data/6raw.txt ...
Reading  ../data/7raw.txt ...
Reading  ../data/8raw.txt ...
Reading  ../data/9raw.txt ...
Reading  ../data/10raw.txt ...
Reading  ../data/11raw.txt ...
Reading  ../data/12raw.txt ...
Reading  ../data/13raw.txt ...
Reading  ../data/14raw.txt ...
Reading  ../data/15raw.txt ...
Reading  ../data/16raw.txt ...
Reading  ../data/17raw.txt ...
Reading  ../data/18raw.txt ...
Reading  ../data/19raw.txt ...
Reading  ../data/20raw.txt ...
Reading  ../data/21raw.txt ...
Reading  ../data/22raw.txt ...
Reading  ../data/23raw.txt ...
Reading  ../data/24raw.txt ...
Reading  ../data/25raw.txt ...
Reading  ../data/26raw.txt ...
Reading  ../data/27raw.txt ...
Reading  ../data/28raw.txt ...
Reading  ../data/29raw.txt ...
Reading  ../data/30raw.txt ...
Reading  ../data/31raw.txt ...
Reading  ../data/3

In [177]:
tokenized_sentences = read_sentences(num_movie_scripts)

Reading  ../data/0raw.txt ...
Reading  ../data/1raw.txt ...
Reading  ../data/2raw.txt ...
Reading  ../data/3raw.txt ...
Reading  ../data/4raw.txt ...
Reading  ../data/5raw.txt ...
Reading  ../data/6raw.txt ...
Reading  ../data/7raw.txt ...
Reading  ../data/8raw.txt ...
Reading  ../data/9raw.txt ...
Reading  ../data/10raw.txt ...
Reading  ../data/11raw.txt ...
Reading  ../data/12raw.txt ...
Reading  ../data/13raw.txt ...
Reading  ../data/14raw.txt ...
Reading  ../data/15raw.txt ...
Reading  ../data/16raw.txt ...
Reading  ../data/17raw.txt ...
Reading  ../data/18raw.txt ...
Reading  ../data/19raw.txt ...
Reading  ../data/20raw.txt ...
Reading  ../data/21raw.txt ...
Reading  ../data/22raw.txt ...
Reading  ../data/23raw.txt ...
Reading  ../data/24raw.txt ...
Reading  ../data/25raw.txt ...
Reading  ../data/26raw.txt ...
Reading  ../data/27raw.txt ...
Reading  ../data/28raw.txt ...
Reading  ../data/29raw.txt ...
Reading  ../data/30raw.txt ...
Reading  ../data/31raw.txt ...
Reading  ../data/3

In [168]:
new_Counter = collections.Counter(tokenized_data)

In [169]:
len(new_Counter)

148974

In [178]:
generate_encoded_files(path_for_x_train, path_for_y_train, path_for_x_dev, path_for_y_dev, tokenized_sentences, dictionary)

['i', 'run', 'out']
['hello']
2614522


In [171]:
len(dictionary)

10000

In [174]:
c =0
for value in new_Counter.values():
    if value <100:
        c+=1

In [175]:
c

142243