## Word Embeddings

In [55]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from nltk import word_tokenize     
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer 
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multiclass import OneVsRestClassifier
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from sklearn.neural_network import MLPClassifier
from gensim.models import KeyedVectors

In [56]:
# English stop words from NLTK, stored as a set so the `not in stop_words`
# membership test in tokenize() is a hash lookup rather than a list scan.
stop_words = set(stopwords.words('english'))

In [57]:
def load_data(subset=None, path="./dataset_20000.csv"):
    """Read the dataset CSV into a DataFrame, optionally keeping only the first rows.

    Parameters
    ----------
    subset : int or None
        When given, only the first ``subset`` rows are returned (via
        ``DataFrame.head``). ``None`` returns every row.
    path : str
        Location of the CSV file. Defaults to the path the notebook has
        always used, so existing calls behave exactly as before.

    Returns
    -------
    pandas.DataFrame
        The loaded (and possibly truncated) data.
    """
    data = pd.read_csv(path)
    if subset is None:
        return data
    return data.head(subset)

In [58]:
def stem_tokens(tokens, stemmer):
    """Return a new list holding ``stemmer.stem(token)`` for every token, in order.

    ``stemmer`` is any object exposing a ``stem(str)`` method.
    """
    return [stemmer.stem(token) for token in tokens]

In [59]:
# Work on the first 5000 rows of the CSV only.
dataset = load_data(subset = 5000)

In [60]:
# Single shared Porter stemmer instance; tokenize() passes it to stem_tokens().
stemmer = PorterStemmer()

In [61]:
class LemmaTokenizer(object):
    """Callable tokenizer: splits a document with NLTK and lemmatizes each token."""

    def __init__(self):
        # One WordNet lemmatizer is created per tokenizer and reused for every call.
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Return the list of lemmatized tokens of ``doc``."""
        lemmatize = self.wnl.lemmatize
        raw_tokens = word_tokenize(doc)
        return [lemmatize(token) for token in raw_tokens]

In [62]:
def tokenize(text):
    """Lower-case ``text``, tokenize it, drop stop words and stem what is left.

    Relies on the notebook-level ``stop_words`` set and ``stemmer`` instance.
    Returns the list of stemmed tokens.
    """
    lowered_tokens = nltk.word_tokenize(text.lower())
    kept = [token for token in lowered_tokens if token not in stop_words]
    return stem_tokens(kept, stemmer)

In [63]:
# Input text is the "summary" column; every remaining column is kept as the target frame.
x = dataset["summary"]
y = dataset.drop(["summary"], axis=1)

In [64]:
def sent_vectorizer(sent, model=None):
    """Average the embedding vectors of the known tokens in ``sent``.

    Parameters
    ----------
    sent : iterable
        Tokens to look up. Tokens missing from the embedding table are skipped.
    model : mapping-like, optional
        Object supporting ``model[token]`` that raises ``KeyError`` for unknown
        tokens. Defaults to the notebook-level ``word_model``.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors that were found. If no token is
        known, a zero vector of length ``model.vector_size`` is returned when
        the model exposes that attribute, otherwise an empty array.
    """
    if model is None:
        # Fall back to the embedding table loaded at notebook level.
        model = word_model

    total = None
    known = 0
    for token in sent:
        try:
            vector = model[token]
        except KeyError:
            # Out-of-vocabulary token: it contributes nothing to the average.
            # Only KeyError is caught so real problems (e.g. the model not
            # being loaded) are no longer silently swallowed.
            continue
        # np.add allocates a new array, so the model's own vectors are never mutated.
        total = vector if total is None else np.add(total, vector)
        known += 1

    if known == 0:
        # Avoid dividing by zero; prefer a fixed-length zero vector so rows
        # stay rectangular for downstream estimators.
        size = getattr(model, "vector_size", None)
        if size is not None:
            return np.zeros(size)
        return np.asarray([])

    return np.asarray(total) / known

In [65]:
# Load the word vectors from a text-format (binary=False) word2vec file.
word_model = KeyedVectors.load_word2vec_format('/home/ubuntu/MovieGenrePrediction/embeddings.txt', binary=False)

# Hold out 20% of the samples for evaluation. No random_state is passed, so
# the split is not reproducible between runs.
# NOTE(review): `mapped` is not defined in any cell visible here; presumably
# it is a tokenized version of `x` — confirm before re-running.
x_train, x_test, y_train, y_test = train_test_split(mapped, y, test_size=0.2)

# Replace every sample by the average of its word vectors (see sent_vectorizer).
transformed_x_test = x_test.map(sent_vectorizer)

transformed_x_train = x_train.map(sent_vectorizer)

In [66]:
def build_model(estimator):
    """Fit a one-vs-rest wrapper around ``estimator`` and report on the test split.

    Uses the notebook-level ``transformed_x_train`` / ``y_train`` for fitting
    and ``transformed_x_test`` / ``y_test`` for evaluation. Returns the text
    produced by ``classification_report``.
    """
    train_rows = extract_vector_array(transformed_x_train.values)
    fitted = OneVsRestClassifier(estimator=estimator).fit(train_rows, y_train)

    test_rows = extract_vector_array(transformed_x_test.values)
    predicted = fitted.predict(test_rows)
    return classification_report(y_test, predicted)

In [67]:
def extract_vector_array(test):
    """Copy an iterable of vectors into a plain list of lists.

    Each inner iterable becomes its own new list, with elements kept in order.
    """
    return [list(vector) for vector in test]

In [68]:
# Baseline: one-vs-rest logistic regression (default settings) on the averaged word vectors.
report = build_model(LogisticRegression())

print(report)

             precision    recall  f1-score   support

          0       0.66      0.59      0.62       430
          1       0.59      0.31      0.41       325
          2       0.56      0.15      0.23       189
          3       0.47      0.09      0.15       157
          4       0.70      0.17      0.27       135
          5       0.64      0.15      0.24       124
          6       0.49      0.15      0.23       124
          7       0.55      0.07      0.12        85
          8       0.81      0.20      0.33        83
          9       0.68      0.22      0.33        69
         10       0.33      0.01      0.02        79
         11       0.00      0.00      0.00        73
         12       0.67      0.04      0.08        50
         13       0.00      0.00      0.00        48

avg / total       0.56      0.25      0.32      1971



In [69]:
def build_neural_net(hidden_layers, activation='relu', early_stopping=False):
    """Fit an ``MLPClassifier`` on the training vectors and report on the test split.

    ``hidden_layers`` is passed through as ``hidden_layer_sizes``;
    ``activation`` and ``early_stopping`` are forwarded unchanged. Uses the
    notebook-level train/test data and returns the text produced by
    ``classification_report``.
    """
    network = MLPClassifier(
        hidden_layer_sizes=hidden_layers,
        early_stopping=early_stopping,
        activation=activation,
    )
    train_rows = extract_vector_array(transformed_x_train.values)
    fitted = network.fit(train_rows, y_train)

    test_rows = extract_vector_array(transformed_x_test.values)
    predicted = fitted.predict(test_rows)
    return classification_report(y_test, predicted)

In [70]:
# Multi-layer perceptron with two hidden layers of 14 units each, ReLU activation, no early stopping.
report = build_neural_net(hidden_layers=(14,14), activation='relu', early_stopping=False)

  'precision', 'predicted', average, warn_for)


In [71]:
# Show the classification report returned by build_neural_net in the previous cell.
print(report)

             precision    recall  f1-score   support

          0       0.63      0.63      0.63       430
          1       0.63      0.35      0.45       325
          2       0.53      0.33      0.41       189
          3       0.49      0.18      0.26       157
          4       0.57      0.27      0.36       135
          5       0.62      0.40      0.49       124
          6       0.54      0.26      0.35       124
          7       0.61      0.16      0.26        85
          8       0.60      0.33      0.42        83
          9       0.73      0.32      0.44        69
         10       0.53      0.10      0.17        79
         11       0.00      0.00      0.00        73
         12       0.00      0.00      0.00        50
         13       0.50      0.02      0.04        48

avg / total       0.55      0.34      0.40      1971

