In [None]:
import time
import numpy as np
import pandas as pd
import os
import pickle as pkl
from matplotlib import pyplot as plt
from collections import defaultdict
import nltk
import re
import itertools
import unittest
import RegexTester
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import sent_tokenize
from nltk.tokenize import word_tokenize

from collections import Counter

%matplotlib inline

from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.optimizers import SGD

from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Merge, Dropout, LSTM, GRU, Bidirectional, TimeDistributed
from keras.models import Model, load_model

from keras import backend as K
from keras.engine.topology import Layer, InputSpec
from keras import initializers, regularizers, optimizers
from keras.callbacks import History, CSVLogger

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

# Raw Data - will not use

In [None]:
# Read the three raw genre CSV files (no header row, ISO-8859-1 encoded) with
# identical options, then stack them into a single frame.
data1, data2, data3 = (
    pd.read_csv(csv_name, header=None, encoding = "ISO-8859-1")
    for csv_name in ('genres.csv', 'genres2.csv', 'genres3.csv')
)

data = pd.concat([data1, data2, data3])

In [None]:
# Renumber the rows 0..n-1 after the concat; drop=True discards the old index
# instead of keeping it as a column.
data = data.reset_index(drop=True)
# Cell output: number of distinct values of column 0 within each group of column 1.
data.groupby(1)[0].nunique()

# Preprocessing

In [None]:
#Lemmatizing setup
# Download the NLTK resources used below: the averaged-perceptron POS tagger
# (needed by nltk.pos_tag) and WordNet (needed by WordNetLemmatizer).
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
# Single lemmatizer instance shared by the tokenize() functions defined later.
lemmatizer = WordNetLemmatizer()

def nltk2wn_tag(nltk_tag):
    """Translate an NLTK POS tag string into the matching WordNet POS constant.

    The tag is classified by its leading letter: 'J' -> adjective, 'V' -> verb,
    'N' -> noun, 'R' -> adverb. Any other tag falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wn_pos in prefix_to_pos:
        if nltk_tag.startswith(prefix):
            return wn_pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN


In [None]:
#Tokenize w/lemmatization AFTER removing stopwords - TOKENIZER 1
def tokenize(plots, lemmatize = False, stopwords = None):
    """Normalize one plot string and split it into word / sentence-ender tokens.

    Steps, in order: lowercase; replace numbers with ``<number>``; collapse
    runs of ``!``, ``?``, ``.`` to the last mark; turn hyphens into spaces;
    drop underscores; strip the period after single-letter abbreviations;
    remove all punctuation except ``. ? ! < >``; split on whitespace; detach
    sentence enders into their own tokens; drop stopwords; optionally
    lemmatize what is left.

    NOTE: a later cell defines another function that is also named
    ``tokenize`` (Tokenizer 2). Whichever definition cell ran last is the one
    the name refers to.

    Args:
        plots: Text of a single plot (str).
        lemmatize: When True, POS-tag the surviving tokens with
            ``nltk.pos_tag`` and lemmatize each one with the notebook-level
            ``lemmatizer``, mapping tags through ``nltk2wn_tag``.
        stopwords: Optional container of tokens to drop. When omitted, the
            notebook-global ``stop`` is used; it must exist before the call
            (it is not defined in this cell).

    Returns:
        list[str]: The tokens in their original order.
    """
    if stopwords is None:
        # Only touch the global when the caller did not supply a list.
        stopwords = stop

    text = plots.lower() #lowercase
    text = re.sub(r"[-+]?[.\d]*[\d]+[:,/.\d]*", "<number>", text) #generic tag for numbers
    text = re.sub(r"([!?.]){2,}", r"\1", text) #Convert multiple punctuations to the last punctuation mark
    text = text.replace('-', ' ') #separating hyphenated words
    text = text.replace('_', '') #remove underscores
    text = re.sub(r'(?<!\w)([a-zA-Z])\.', r'\1', text) #remove periods from abbreviations
    # Raw string: the previous non-raw pattern relied on invalid escape
    # sequences (\<, \!, ...) that Python now warns about. Same character set.
    text = re.sub(r'[^\w\s.<>?!]', '', text) #remove punctuation besides sentence completers and <> for generic number

    # Detach sentence enders first, THEN filter stopwords. Filtering the
    # whitespace-split words directly let stopwords glued to punctuation
    # ("him.", "what?") slip through.
    pieces = itertools.chain.from_iterable(
        re.split(r'([^\w<>])', word) for word in text.split()
    )
    output = [piece for piece in pieces if piece and piece not in stopwords]

    if lemmatize:
        tagged = nltk.pos_tag(output)
        return [lemmatizer.lemmatize(word, pos = nltk2wn_tag(tag)) for word, tag in tagged]
    return output



In [None]:
#Tokenize w/lemmatization BEFORE removing stopwords - TOKENIZER 2
def tokenize(plots, lemmatize = False, stopwords = None):
    """Normalize one plot string into tokens, lemmatizing before stopword removal.

    Steps, in order: lowercase; replace numbers with ``<number>``; collapse
    runs of ``!``, ``?``, ``.`` to the last mark; turn hyphens into spaces;
    drop underscores; strip the period after single-letter abbreviations;
    remove all punctuation except ``. ? ! < >``; split on whitespace; detach
    sentence enders into their own tokens; optionally lemmatize; drop
    stopwords.

    NOTE: this definition rebinds the name ``tokenize``. Once this cell has
    run, every later ``tokenize(...)`` call uses this version rather than the
    Tokenizer 1 function defined in the earlier cell.

    Args:
        plots: Text of a single plot (str).
        lemmatize: When True, POS-tag all tokens with ``nltk.pos_tag`` and
            lemmatize each one with the notebook-level ``lemmatizer``,
            mapping tags through ``nltk2wn_tag``, before stopwords are
            removed.
        stopwords: Optional container of tokens to drop. When omitted, the
            notebook-global ``stop`` is used; it must exist before the call
            (it is not defined in this cell).

    Returns:
        list[str]: The tokens in their original order.
    """
    if stopwords is None:
        # Only touch the global when the caller did not supply a list.
        stopwords = stop

    text = plots.lower() #lowercase
    text = re.sub(r"[-+]?[.\d]*[\d]+[:,/.\d]*", "<number>", text) #generic tag for numbers
    text = re.sub(r"([!?.]){2,}", r"\1", text) #Convert multiple punctuations to the last punctuation mark
    text = text.replace('-', ' ') #separating hyphenated words
    # Kept in step with Tokenizer 1: "_" is matched by \w, so without this
    # step underscores survive the punctuation filter below.
    text = text.replace('_', '') #remove underscores
    text = re.sub(r'(?<!\w)([a-zA-Z])\.', r'\1', text) #remove periods from abbreviations
    # Raw string: the previous non-raw pattern relied on invalid escape
    # sequences (\<, \!, ...) that Python now warns about. Same character set.
    text = re.sub(r'[^\w\s.<>?!]', '', text) #remove punctuation besides sentence completers and <> for generic number

    # Detach sentence enders BEFORE tagging and filtering. Previously words
    # such as "dogs." or "him." reached the tagger, lemmatizer and stopword
    # check with the punctuation still attached, so they were never
    # lemmatized or removed.
    tokens = [
        piece
        for word in text.split()
        for piece in re.split(r'([^\w<>])', word)
        if piece
    ]
    if lemmatize:
        tokens = [
            lemmatizer.lemmatize(word, pos = nltk2wn_tag(tag))
            for word, tag in nltk.pos_tag(tokens)
        ]
    return [token for token in tokens if token not in stopwords]

In [None]:
def run_tests(test_module, test_names, reload=True):
    """Run the named unittest cases from ``test_module`` with verbose output.

    Args:
        test_module: Imported module that holds the test cases.
        test_names: List of names (resolved relative to ``test_module``) to
            load, e.g. TestCase class names.
        reload: When True, re-import ``test_module`` first so edits made to
            the file since the last import are picked up.
    """
    import unittest
    if reload:
        import importlib
        importlib.reload(test_module)
    verbose_runner = unittest.TextTestRunner(verbosity=2)
    suite = unittest.TestLoader().loadTestsFromNames(test_names, test_module)
    verbose_runner.run(suite)

In [None]:
# Run every regex test case from RegexTester in its own run_tests() call so
# each case gets a separate verbose report (the module is reloaded each time).
for regex_test_case in (
    "NumberRegex",
    "RepeatedPunctuationRegex",
    "HyphenRegex",
    "AbbreviationRegex",
    "PunctuationRemovalRegex",
):
    run_tests(RegexTester, [regex_test_case])

In [None]:
#Test Tokenizer 1
# NOTE(review): `tokenize` is whichever definition cell ran last. On a
# top-to-bottom run that is the Tokenizer 2 cell, so this cell only exercises
# Tokenizer 1 when it is run straight after the Tokenizer 1 definition.
# NOTE(review): `full_data` is not assigned in any cell shown above this one;
# the only visible assignment is the pickle load further down. Confirm where
# it is supposed to come from on a fresh kernel.
# Hand-written sample covering punctuation, numbers, abbreviations and repeated marks.
print(tokenize("Hello, MYself dear $$20-30 hello? 2,00.0 A.J.A what??"))
print("\n")
# Raw text of the first plot, for comparison with its tokenized forms below.
print(full_data["plots"][0])
print("\n")
# First plot tokenized without lemmatization...
print(tokenize(full_data["plots"][0]))
print("\n")
# ...and with lemmatization.
print(tokenize(full_data["plots"][0], lemmatize = True))

In [None]:
#TOKENIZER 2
# Lemmatized tokens of the first plot, using whichever `tokenize` definition
# is currently bound (run the Tokenizer 2 definition cell first).
print(tokenize(full_data["plots"][0], lemmatize = True))

# Apply Tokenizer to Plots

In [None]:
# Tokenize (with lemmatization) every entry of the "plots" column into a new
# "plots_processed" column, timing the whole pass with wall-clock time.
start = time.time()
# NOTE(review): the trailing comment says Tokenizer 1, but the call uses whichever
# `tokenize` definition was executed last - confirm the intended one is bound.
full_data["plots_processed"] = full_data["plots"].apply(lambda row: tokenize(row, lemmatize=True)) #Tokenizer 1
end = time.time()
print("Total Time to tokenize plots:", end - start, "seconds")

In [None]:
# Cell output: preview of the first rows of the tokenized plots.
full_data["plots_processed"].head()

# Load Word Embeddings

In [None]:
def loadEmbed(file):
    """Load whitespace-separated word embeddings from a UTF-8 text file.

    Each non-blank line is parsed as ``<token> <v1> <v2> ...``: the first
    field is the token and the remaining fields are converted to floats.
    Progress is printed every 100,000 lines, and the total load time is
    printed at the end.

    Args:
        file: Path of the embeddings text file.

    Returns:
        dict: Maps each token to a 1-D NumPy array of its vector components.
        If a token appears more than once, the last occurrence wins.

    Raises:
        ValueError: If a vector component cannot be parsed as a float.
    """
    start = time.time()
    print("Loading Embeddings")
    model = {}
    status_every = 100000
    # `with` closes the handle even if parsing fails part-way; the previous
    # bare open() never closed the file.
    with open(file, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if i%status_every == 0:
                print('Processing line {:,}'.format(i))
            splitLine = line.split()
            if not splitLine:
                # Blank line (e.g. a trailing newline): nothing to parse, and
                # indexing splitLine[0] would raise IndexError.
                continue
            word = splitLine[0]
            embedding = np.array([float(val) for val in splitLine[1:]])
            model[word] = embedding
    print("Done.",'{:,}'.format(len(model))," words loaded!")
    end = time.time()
    print("Total Time to load embeddings:", end - start, "seconds")
    return model

In [None]:
# The GloVe vectors are too large to push to git; download them separately
# from https://github.com/stanfordnlp/GloVe and unpack them into glove_dir.
glove_dir = './glove.6B/'
glove_filename = 'glove.6B.300d.txt'
glove_fullpath = '{}{}'.format(glove_dir, glove_filename)
glove_dd = loadEmbed(glove_fullpath)

**The GloVe-loading code from assignment 2 can also be used here, since we are using the same embeddings.**

# CHECKPOINT - CREATE PICKLE OBJECTS

In [None]:
def create_pkl_file(obj, filename):
    """Pickle ``obj`` into ``filename``, overwriting the file if it already exists."""
    with open(filename, mode='wb') as pickle_out:
        pkl.dump(obj, pickle_out)


# Checkpoint: persist the embedding dict, its vocabulary (the dict's keys, in
# insertion order) and the frame holding the processed plots, so later
# sessions can reload them.
embedding_vocab = list(glove_dd)
create_pkl_file(glove_dd, 'glove_embeddings.pickle')
create_pkl_file(embedding_vocab, 'embedding_vocab.pickle')
create_pkl_file(full_data, 'full_data_w_processed_plots_lemmatized.pickle')

In [None]:
# Reload the `full_data` checkpoint written by the checkpoint cell above.
# The file is opened in a `with` block so the handle is closed after loading
# (the previous bare open() was never closed).
# NOTE: unpickling can execute arbitrary code - only load pickle files that
# you created yourself.
with open("full_data_w_processed_plots_lemmatized.pickle", "rb") as pickle_file:
    full_data = pkl.load(pickle_file)

In [None]:
def embed_plot(plot):
    """Look up the embedding of every token in ``plot`` and stack them as float32.

    Tokens that are missing from the notebook-level ``glove_dd`` mapping use
    the vector stored under the key "unk" instead.
    """
    unknown_vector = glove_dd.get("unk")
    vectors = [glove_dd.get(token, unknown_vector) for token in plot]
    return np.array(vectors).astype(np.float32)

In [None]:
# Cell output: sanity check - the embedded form of the first processed plot.
embed_plot(full_data["plots_processed"][0])