In [6]:
import re
import collections
import shutil

num_movie_scripts = 2318
vocabulary_size = 10000
fraction_dev = 50

path_for_x_train = 'X_train.txt'
path_for_y_train = 'y_train.txt'
path_for_x_dev = 'X_dev.txt'
path_for_y_dev = 'y_dev.txt'


_PAD = b"_PAD"
_GO = b"_GO"
_EOS = b"_EOS"
_UNK = b"_UNK"
_START_VOCAB = [_PAD, _GO, _EOS, _UNK]

PAD_ID = 0
GO_ID = 1
EOS_ID = 2
UNK_ID = 3

_WORD_SPLIT = re.compile(b"([.,!?\":;)(])")
_DIGIT_RE = re.compile(br"\d")

#FROM DATA UTILS
# Build the dictionary with word-IDs from self-made dictionary and replace rare words with UNK token.
def build_dataset(words, vocabulary_size):
    count = [['_UNK', -1]]
    count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # dictionary['UNK']
            unk_count = unk_count + 1
        data.append(index)
    count[0][1] = unk_count
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reverse_dictionary

def create_vocabulary(dictionary, vocabulary_path):
    f = open(vocabulary_path, 'w')
    
    for key in dictionary:
        f.write(dictionary[key] + '\n')
    f.close()

def initialize_vocabulary(vocabulary_path):
  # finds vocabulary file
  if gfile.Exists(vocabulary_path):
    rev_vocab = []
    with gfile.GFile(vocabulary_path, mode="rb") as f:
      rev_vocab.extend(f.readlines())
    rev_vocab = [line.strip() for line in rev_vocab]
    vocab = dict([(x, y) for (y, x) in enumerate(rev_vocab)])
    return vocab, rev_vocab
  else:
    raise ValueError("Vocabulary file %s not found.", vocabulary_path)


def generate_encoded_files2(x_train_file, y_train_file, x_dev_file, y_dev_file, tokenized_sentences, dictionary):
    """Sentence A is in x_train, Sentence B in y_train"""
    encoded_holder = []
    unk_id = dictionary['_UNK']
    for sentence in tokenized_sentences:
        encoded_holder.append(encode_sentence(sentence, dictionary, unk_id))

    f1 = open(x_train_file, 'w')
    f2 = open(y_train_file, 'w')
    fraction = int(len(encoded_holder) / fraction_dev)
    if (len(encoded_holder) % 2 == 0):
        end = len(encoded_holder)
    else:
        end = len(encoded_holder)-1

    for i in xrange(0,fraction,2):
        f1.write(str(encoded_holder[i]) + '\n')
        f2.write(str(encoded_holder[i+1]) + '\n')

    f1.close()
    f2.close()

    d1 = open(x_dev_file, 'w')
    d2 = open(y_dev_file, 'w')

    for i in xrange(fraction, end, 2):
        d1.write(str(encoded_holder[i]) + '\n')
        d2.write(str(encoded_holder[i+1]) + '\n')    

    d1.close()
    d2.close()


def generate_encoded_files(x_train_file, y_train_file, x_dev_file, y_dev_file, tokenized_sentences, dictionary):
    """Sentence A is in x_train and y_train, Sentence B in X_train and y_train"""
    encoded_holder = []
    f1 = open(x_train_file, 'w')

    last_line = tokenized_sentences.pop()
    first_line = tokenized_sentences.pop(0)
    dev_counter = int(len(tokenized_sentences) - len(tokenized_sentences)/fraction_dev)

    unk_id = dictionary['_UNK']
    first_line_encoded = encode_sentence(first_line, dictionary, unk_id)
    f1.write(first_line_encoded + '\n')

    # Creates data for X_train
    for x in xrange(dev_counter):
        encoded_sentence = encode_sentence(tokenized_sentences[x], dictionary, unk_id)
        encoded_holder.append(encoded_sentence)
        f1.write(encoded_sentence + '\n') # Write sentence to file
    f1.close()

    d1 = open(x_dev_file, 'w')
    # Creates data for x_dev_file
    for x in xrange(dev_counter, len(tokenized_sentences)):
        encoded_sentence = encode_sentence(tokenized_sentences[x], dictionary, unk_id)
        encoded_holder.append(encoded_sentence)
        d1.write(encoded_sentence + '\n') # Write sentence to file

    d1.close()

    # Creates data for y_train
    f2 = open(y_train_file, 'w')

    for x in xrange(dev_counter + 1):
        f2.write(encoded_holder[x] + '\n') # Write sentence to file

    f2.close()

    # Creates data for y_dev
    d2 = open(y_dev_file, 'w')
    for x in xrange(dev_counter + 1, len(tokenized_sentences)):
        d2.write(encoded_holder[x] + '\n') # Write sentence to file

    last_line_encoded = encode_sentence(last_line, dictionary, unk_id)
    d2.write(last_line_encoded + '\n')
    d2.close()

def basic_tokenizer(sentence):
  """Very basic tokenizer: split the sentence into a list of tokens"""
  words = []
  for space_separated_fragment in sentence.strip().split():
    words.extend(re.split(_WORD_SPLIT, space_separated_fragment))
  return [w for w in words if w]


def encode_sentence(sentence, dictionary, unk_id):
    # Extract first word (and don't add any space)
    if not sentence:
        return ""
    first_word = sentence.pop(0)
    if first_word in dictionary:
        encoded_sentence = str(dictionary[first_word])
    else:
        encoded_sentence = str(unk_id)

    # Loop rest of the words (and add space in front)
    for word in sentence:
        if word in dictionary:
            encoded_word = dictionary[word]
        else:
            encoded_word = unk_id
        encoded_sentence += " " + str(encoded_word)
    return encoded_sentence


def sentence_to_token_ids(sentence, vocabulary):
  """Convert a string to list of integers representing token-ids.

  For example, a sentence "I have a dog" may become tokenized into
  ["I", "have", "a", "dog"] and with vocabulary {"I": 1, "have": 2,
  "a": 4, "dog": 7"} this function will return [1, 2, 4, 7].

  Returns:
    a list of integers, the token-ids for the sentence.
  """
  words = basic_tokenizer(sentence)
  return [vocabulary.get(w, UNK_ID) for w in words]


def read_data(num_movie_scripts):
    data_tokens = []
    # Append each line in file to the set
    for i in range(0, num_movie_scripts):
        path = '../data/'+str(i)+'raw.txt'
        print 'Reading ', path, '...'
        lines = [line.rstrip('\n') for line in open(path)]
        data_tokens_temp = []
        for line in lines:
            # Tokenize each sentence
            data_tokens_temp.extend(re.findall(r'\S+', line))
        data_tokens.extend(data_tokens_temp)

    return data_tokens


# Reads data and puts every sentence in a TWO DIMENSIONAL array as tokens
# data_tokens[0] = ['This', 'is', 'a', 'sentence']
def read_sentences(num_movie_scripts):
    data_tokens = []
    # Append each line in file to the set
    for i in range(0, num_movie_scripts):
        path = '../data/'+str(i)+'raw.txt'
        print 'Reading ', path, '...'
        lines = [line.rstrip('\n') for line in open(path)]
        data_tokens_temp = []
        for line in lines:
            # Tokenize each sentence
            data_tokens_temp.append(re.findall(r'\S+', line))
        data_tokens.extend(data_tokens_temp)
    return data_tokens






In [7]:
tokenized_data = read_data(2318)

Reading  ../data/0raw.txt ...
Reading  ../data/1raw.txt ...
Reading  ../data/2raw.txt ...
Reading  ../data/3raw.txt ...
Reading  ../data/4raw.txt ...
Reading  ../data/5raw.txt ...
Reading  ../data/6raw.txt ...
Reading  ../data/7raw.txt ...
Reading  ../data/8raw.txt ...
Reading  ../data/9raw.txt ...
Reading  ../data/10raw.txt ...
Reading  ../data/11raw.txt ...
Reading  ../data/12raw.txt ...
Reading  ../data/13raw.txt ...
Reading  ../data/14raw.txt ...
Reading  ../data/15raw.txt ...
Reading  ../data/16raw.txt ...
Reading  ../data/17raw.txt ...
Reading  ../data/18raw.txt ...
Reading  ../data/19raw.txt ...
Reading  ../data/20raw.txt ...
Reading  ../data/21raw.txt ...
Reading  ../data/22raw.txt ...
Reading  ../data/23raw.txt ...
Reading  ../data/24raw.txt ...
Reading  ../data/25raw.txt ...
Reading  ../data/26raw.txt ...
Reading  ../data/27raw.txt ...
Reading  ../data/28raw.txt ...
Reading  ../data/29raw.txt ...
Reading  ../data/30raw.txt ...
Reading  ../data/31raw.txt ...
Reading  ../data/3

In [8]:
len(tokenized_data)

17632363

In [9]:
tokenized_data[:5]

['hello', 'hi', 'hey', 'yo', 'hey']

In [11]:
counter  = collections.Counter(tokenized_data)

[('.', 1726654), (',', 863833), ('you', 538325), ('the', 455405), ('i', 415128), ('to', 327376), ('a', 299244), ('!', 279330), ('...', 236629), ('and', 195023), ('it', 184577), ('of', 169440), ('that', 146788), ('in', 145439), ('is', 144287), ('me', 133347), ('this', 115723), ('what', 111736), ('on', 106665), ('my', 105234)]


In [12]:
print counter.most_common(200)

[('.', 1726654), (',', 863833), ('you', 538325), ('the', 455405), ('i', 415128), ('to', 327376), ('a', 299244), ('!', 279330), ('...', 236629), ('and', 195023), ('it', 184577), ('of', 169440), ('that', 146788), ('in', 145439), ('is', 144287), ('me', 133347), ('this', 115723), ('what', 111736), ('on', 106665), ('my', 105234), ('your', 101482), ('we', 99799), ('for', 97905), ('no', 96067), ("i'm", 96031), ('have', 90956), ("don't", 89834), ('do', 89679), ('are', 86522), ('be', 84472), ("it's", 83794), ('not', 79963), ('know', 78814), ('was', 78178), ('he', 75157), ('all', 74972), ('with', 73256), ('get', 70578), ('just', 66941), ('but', 64221), ('go', 62492), ('so', 61679), ('here', 61497), ('like', 57219), ('out', 55748), ('up', 55001), ("you're", 53314), ('come', 53181), ('can', 52620), ('right', 49932), ('got', 49191), ('oh', 48045), ('about', 46942), ('they', 46716), ('one', 46636), ('at', 46243), ('if', 46233), ('him', 46097), ('there', 45648), ('now', 45429), ("that's", 45005), ('y

In [13]:
len(counter)

155830

In [16]:
li = [(x,y) for x, y in counter.iteritems()]

In [18]:
words_10 = filter(lambda (x,y): y <=10,li)
words_100 = filter(lambda (x,y): 10 <y<100,li)
words_1000 =filter(lambda (x,y): 100 <y <=1000,li)
words_10000 = filter(lambda (x,y): 1000<y <=10000,li)

In [19]:
print len(words_10)

124920


In [20]:
print len(words_100)

24057


In [21]:
print len(words_1000)

5623


In [25]:
print words_1000[:100]

[('woods', 634), ('hanging', 631), ('woody', 222), ('screaming', 503), ('wooden', 120), ('wednesday', 238), ('shows', 629), ('kid?', 324), ('dna', 260), ('inevitable', 103), ('cocksucker', 115), ('travel', 606), ('fit', 821), ('bringing', 596), ('fin', 128), ('hurting', 265), ('effects', 176), ('size', 762), ('silent', 433), ('disturbed', 121), ('breed', 105), ('knight', 285), ('hiya', 110), ('old?', 104), ('draw', 584), ('affairs', 184), ('tech', 161), ('plate', 370), ('iate', 137), ('job?', 362), ('nicely', 143), ('patch', 225), ("don'tyou", 105), ('lots', 857), ('nature', 640), ('lot?', 103), ('lookin', 584), ('fry', 196), ('spit', 325), ('doubts', 112), ('spin', 230), ('hong', 221), ('corporate', 222), ('hah', 123), ('hal', 277), ('ham', 174), ('hat', 680), ('crowd', 492), ('crown', 296), ('bottom', 817), ('marshall', 128), ('honeymoon', 217), ('shoots', 141), ('raped', 199), ("else's", 213), ('passenger', 105), ('disgrace', 123), ('deputy', 108), ('corps', 160), ('whoever', 555), 

In [22]:
print len(words_10000)

977


In [24]:
print words_10000[:100]

[('kids', 3935), ('excuse', 5847), ('wrong', 6459), ('fix', 1163), ('needed', 1287), ('master', 1714), ('feeling', 2205), ('saying', 3340), ('congratulations', 1033), ('minute', 4332), ('ground', 1189), ('turned', 1542), ('single', 1226), ('being', 6365), ('company', 1857), ('learn', 1761), ('understand?', 1186), ('touch', 2581), ('30', 1972), ('lady', 2514), ('parents', 1957), ('couple', 2713), ('makes', 3327), ("'", 1916), ('tonight', 4531), ('stuff', 3619), ('become', 2015), ('problem', 4496), ('worth', 1499), ('another', 6995), ('thanks', 9046), ('test', 1092), ('gun', 3238), ('guy', 9440), ('brain', 1198), ('drop', 2217), ('tomorrow', 4341), ('caught', 1387), ('father', 7088), ('taking', 3350), ('food', 2152), ('death', 3298), ("isn't", 7461), ('iike', 2174), ('poor', 1897), ('ass', 5735), ('ask', 6748), ("haven't", 4653), ('evening', 1678), ("i'ii", 1031), ('<i>', 2207), ('american', 1508), ("wouldn't", 5399), ('telling', 2455), ('jump', 1138), ('pretty', 5091), ('meet', 5275), (

In [31]:
sorted_words = sorted(words_10000,key=lambda (x,y): -y)

In [32]:
sorted_words

[('big', 9907),
 ("won't", 9818),
 ('told', 9792),
 ('old', 9788),
 ('fucking', 9765),
 ('does', 9723),
 ('guys', 9691),
 ('always', 9636),
 ("we'll", 9633),
 ('after', 9557),
 ("i'd", 9512),
 ('things', 9510),
 ('nice', 9496),
 ('guy', 9440),
 ('believe', 9383),
 ("you've", 9350),
 ('long', 9194),
 ('thanks', 9046),
 ('leave', 9036),
 ('years', 9021),
 ("you'll", 8942),
 ('everything', 8880),
 ('hi', 8814),
 ('three', 8724),
 ('feel', 8543),
 ('gotta', 8541),
 ('stay', 8478),
 ("doesn't", 8458),
 ('listen', 8450),
 ('place', 8333),
 ('hear', 8322),
 ('money', 8321),
 ('fine', 8293),
 ('kill', 8192),
 ('every', 8167),
 ('move', 8149),
 ('bad', 8146),
 ('name', 8128),
 ('wanna', 8125),
 ('through', 8021),
 ('made', 7997),
 ('dead', 7957),
 ('hell', 7827),
 ('ok', 7733),
 ('world', 7712),
 ('coming', 7699),
 ('hold', 7647),
 ('kind', 7608),
 ('left', 7608),
 ('here?', 7591),
 ('baby', 7588),
 ('lot', 7472),
 ("isn't", 7461),
 ('okay?', 7448),
 ('boy', 7427),
 ('girl', 7356),
 ('remember'

Characters to Remove
'
<i>
haven't
bastard
fuck
cocksucker

All punctuations
All words followed by punctuation
Comma, Full stop, ellipsis, ...., :,;
All numbers 

In [None]:
def make_files(num_movie_scripts, vocabulary_size, fraction_dev=50, path_for_x_train = 'X_train.txt', path_for_y_train = 'y_train.txt', path_for_x_dev = 'X_dev.txt', path_for_y_dev = 'y_dev.txt'):
    # Generate dictionary for dataset
    print '------------------------------------------------'
    print ' Generating dictionary based on ', str(num_movie_scripts), ' scripts'
    print '------------------------------------------------'

    
    data, count, dictionary, reverse_dictionary = build_dataset(tokenized_data, vocabulary_size)
    create_vocabulary(reverse_dictionary, 'vocabulary_for_movies.txt')


    # Generate an encoded file using the freated dictionary
    print '------------------------------------------------'
    print ' Creating encoded file using created dictionary'
    print ' (Saved in  ', path_for_x_train, ')'
    print '------------------------------------------------'
    tokenized_sentences = read_sentences(num_movie_scripts)
    generate_encoded_files(path_for_x_train, path_for_y_train, path_for_x_dev, path_for_y_dev, tokenized_sentences, dictionary)



