## Neural Network

In [83]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from nltk import word_tokenize     
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer 
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.multioutput import ClassifierChain

In [84]:
def load_data(subset = None, path = "./dataset_20000.csv"):
    """Read the dataset CSV into a DataFrame.

    Parameters
    ----------
    subset : int or None, default None
        When given, only ``DataFrame.head(subset)`` is returned; ``None``
        returns every row.
    path : str, default "./dataset_20000.csv"
        Location of the CSV file. The default is the path that used to be
        hard-coded, so existing ``load_data()`` / ``load_data(subset=n)``
        calls behave exactly as before.

    Returns
    -------
    pandas.DataFrame
    """
    data = pd.read_csv(path)
    if subset is not None:
        return data.head(subset)
    return data

In [64]:
def stem_tokens(tokens, stemmer):
    """Return a new list holding ``stemmer.stem(token)`` for each token, in order.

    ``tokens`` may be any iterable; ``stemmer`` only needs a ``stem`` method.
    """
    return [stemmer.stem(word) for word in tokens]

In [94]:
# Work on the first 5000 rows only: load_data() returns data.head(subset)
# when a subset size is given.
dataset = load_data(subset = 5000)

In [95]:
# Notebook-level Porter stemmer; tokenize() hands it to stem_tokens().
stemmer = PorterStemmer()

In [96]:
class LemmaTokenizer(object):
    """Callable that tokenizes a document and lemmatizes every token."""

    def __init__(self):
        # One WordNet lemmatizer per instance, reused across calls.
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Return the list of lemmatized ``word_tokenize(doc)`` tokens."""
        lemmatize = self.wnl.lemmatize
        return list(map(lemmatize, word_tokenize(doc)))

In [97]:
def tokenize(text):
    """Tokenize ``text`` with ``nltk.word_tokenize`` and stem each token.

    Stemming is delegated to ``stem_tokens`` using the notebook-level
    ``stemmer``; the stemmed tokens are returned as a list.
    """
    return stem_tokens(nltk.word_tokenize(text), stemmer)

### Neural Network Classifiers

In [None]:
# Model input: the "summary" column of the dataset.
x = dataset.loc[:, "summary"]
# Targets: every remaining column once "summary" is dropped.
y = dataset.drop(["summary"], axis="columns")

In [137]:
def build_model(hidden_layers, activatation='relu', early_stopping=False, use_tfidf=False,
                activation=None, random_state=None):
    """Train an MLP on bag-of-words features of the summaries and report metrics.

    Reads the notebook-level ``x`` (summary text) and ``y`` (label columns),
    holds out 20% of the rows, fits the text features and the classifier on
    the remaining 80%, and evaluates on the held-out rows.

    Parameters
    ----------
    hidden_layers
        Forwarded unchanged to ``MLPClassifier(hidden_layer_sizes=...)``.
    activatation : str, default 'relu'
        Hidden-layer activation forwarded to ``MLPClassifier``. The misspelled
        name is kept so existing positional/keyword calls keep working.
    early_stopping : bool, default False
        Forwarded to ``MLPClassifier(early_stopping=...)``.
    use_tfidf : bool, default False
        When true, the token counts are re-weighted with ``TfidfTransformer``.
    activation : str or None, default None
        Correctly spelled alias; when given it takes precedence over
        ``activatation``.
    random_state : int or None, default None
        Seed forwarded to both ``train_test_split`` and ``MLPClassifier`` so a
        run can be reproduced. ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    str
        The text produced by ``classification_report`` for the held-out rows.
    """
    # The body used to read an undefined name `activation` while the
    # parameter was spelled `activatation`; resolve both spellings here.
    if activation is None:
        activation = activatation

    # Split the raw text first so the vocabulary (and IDF weights) are learned
    # from the training rows only and no test information leaks into training.
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, random_state=random_state)

    cnt_vect = CountVectorizer(stop_words='english', tokenizer=tokenize)
    features_train = cnt_vect.fit_transform(x_train)
    features_test = cnt_vect.transform(x_test)

    if use_tfidf:
        tf_transform = TfidfTransformer()
        # fit() alone returns the transformer itself, not the weighted matrix;
        # fit_transform()/transform() return the features the model needs.
        features_train = tf_transform.fit_transform(features_train)
        features_test = tf_transform.transform(features_test)

    classifier = MLPClassifier(hidden_layer_sizes=hidden_layers, early_stopping=early_stopping,
                               activation=activation, max_iter=1000, random_state=random_state)

    model = classifier.fit(features_train, y_train)
    predictions = model.predict(features_test)
    target_names = y.columns.tolist()
    return classification_report(y_test, predictions, target_names=target_names)

In [133]:
# NOTE(review): `report` is not assigned in any visible cell; presumably it is
# the string returned by build_model(...) - confirm and assign it before this
# cell so the notebook survives Restart & Run All.
print(report)

                 precision    recall  f1-score   support

          Drama       0.62      0.56      0.59       413
         Comedy       0.54      0.45      0.49       321
       Thriller       0.65      0.43      0.52       180
   Romance Film       0.57      0.27      0.37       171
         Action       0.51      0.19      0.28       135
         Horror       0.91      0.40      0.56       132
  Crime Fiction       0.66      0.31      0.42       135
      Adventure       0.62      0.22      0.33       104
Science Fiction       0.73      0.30      0.43        63
      Animation       0.78      0.12      0.21        59
Romantic comedy       0.71      0.15      0.25        80
        Mystery       0.40      0.08      0.13        77
 Crime Thriller       0.33      0.08      0.12        53
        Fantasy       0.62      0.09      0.16        54

    avg / total       0.62      0.35      0.43      1977



In [134]:
# jaccard_similarity_score was deprecated in scikit-learn 0.21 and removed in
# 0.23. For multilabel targets its replacement is jaccard_score with
# average='samples' (per-row Jaccard index, averaged over rows). Fall back to
# the old function only on installations that predate jaccard_score.
# NOTE(review): rows whose true and predicted label sets are both empty may be
# scored differently by the two functions - confirm if that case matters here.
# NOTE(review): in the visible cells `y_test` and `predictions` exist only as
# locals of build_model(); they must be defined at notebook level before this
# cell can run on a fresh kernel.
try:
    from sklearn.metrics import jaccard_score
except ImportError:  # scikit-learn < 0.21
    from sklearn.metrics import jaccard_similarity_score
    score = jaccard_similarity_score(y_test, predictions)
else:
    score = jaccard_score(y_test, predictions, average='samples')
print(score)