## Classical Models

In [10]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from nltk import word_tokenize     
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer 
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.multioutput import ClassifierChain

In [11]:
def load_data(subset=None, path="./dataset_20000.csv"):
    """Read the dataset CSV into a DataFrame.

    Parameters
    ----------
    subset : int or None, optional
        When not ``None``, only ``data.head(subset)`` is returned, i.e. the
        first ``subset`` rows. ``None`` (the default) returns every row.
    path : str, optional
        Location of the CSV file. Defaults to the file this notebook was
        written against, so existing ``load_data(...)`` calls are unaffected.

    Returns
    -------
    pandas.DataFrame
        The loaded (and possibly truncated) table.
    """
    data = pd.read_csv(path)
    if subset is None:
        return data
    return data.head(subset)

In [12]:
def stem_tokens(tokens, stemmer):
    """Return a new list holding ``stemmer.stem(token)`` for each token, in order."""
    return [stemmer.stem(token) for token in tokens]

In [13]:
# Work with the first 1000 rows of the CSV only.
dataset = load_data(subset=1000)

In [14]:
# Shared Porter stemmer instance; the `tokenize` function reads this global.
stemmer = PorterStemmer()

In [15]:
class LemmaTokenizer(object):
    """Callable that tokenizes a document with NLTK and lemmatizes every token."""

    def __init__(self):
        # One WordNet lemmatizer is created per tokenizer and reused for all calls.
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Return the list of lemmatized tokens of ``doc``, in document order."""
        lemmatize = self.wnl.lemmatize
        return list(map(lemmatize, word_tokenize(doc)))

In [16]:
def tokenize(text):
    """Split ``text`` with NLTK's word tokenizer and stem each token.

    Uses the module-level ``stemmer`` and returns the list produced by
    ``stem_tokens``.
    """
    return stem_tokens(nltk.word_tokenize(text), stemmer)

### Neural Network Classifiers

In [None]:
x = dataset['summary']
y = dataset.drop(["summary"], axis=1)

# Bag-of-words counts over stemmed tokens (see `tokenize`).
cnt_vect = CountVectorizer(stop_words='english', tokenizer=tokenize)
# NOTE(review): created but never applied in this cell, so the network is
# trained on raw counts rather than TF-IDF weights -- confirm that is intended.
tf_transform = TfidfTransformer()

# NOTE(review): the vocabulary is fitted on all rows before the train/test
# split, so test documents influence the feature space.
r = cnt_vect.fit_transform(x)

# toarray() gives a plain ndarray. todense() returns numpy.matrix, which
# recent scikit-learn releases refuse as estimator input.
dense = r.toarray()

x = dense

# Fixed seeds make the split, the weight initialisation and therefore the
# reported scores reproducible across runs.
classifier = ClassifierChain(
    MLPClassifier(
        hidden_layer_sizes=(30,),
        activation='logistic',
        max_iter=1000,
        random_state=42,
    )
)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

model = classifier.fit(x_train, y_train)
predictions = model.predict(x_test)
# The label columns are exactly the columns of `y`.
target_names = y.columns.tolist()
report = classification_report(y_test, predictions, target_names=target_names)

In [18]:
# Number of features = size of the fitted term -> column-index mapping.
# `vocabulary_` is used because `get_feature_names()` no longer exists in
# recent scikit-learn releases.
len(cnt_vect.vocabulary_)

4543

In [9]:
# Print the text report built by `classification_report` in the training cell.
# NOTE(review): the stored output below is numbered In [9] and totals only 34
# label occurrences, so it presumably comes from an earlier run on a smaller
# subset -- re-run to refresh.
print(report)

                 precision    recall  f1-score   support

          Drama       0.67      0.31      0.42        13
         Comedy       0.00      0.00      0.00         4
       Thriller       0.00      0.00      0.00         3
   Romance Film       0.00      0.00      0.00         1
         Action       0.00      0.00      0.00         3
         Horror       0.00      0.00      0.00         2
  Crime Fiction       0.00      0.00      0.00         2
      Adventure       0.00      0.00      0.00         1
Science Fiction       0.00      0.00      0.00         2
      Animation       0.00      0.00      0.00         0
Romantic comedy       0.00      0.00      0.00         0
        Mystery       0.00      0.00      0.00         1
 Crime Thriller       0.00      0.00      0.00         2
        Fantasy       0.00      0.00      0.00         0

    avg / total       0.25      0.12      0.16        34



In [None]:
# `jaccard_similarity_score` was deprecated in scikit-learn 0.21 and removed in
# 0.23. Fall back to a thin wrapper around its replacement so this cell, and
# the call that follows it, run on both old and new releases.
try:
    from sklearn.metrics import jaccard_similarity_score
except ImportError:
    from sklearn.metrics import jaccard_score

    def jaccard_similarity_score(y_true, y_pred):
        """Sample-averaged Jaccard score for multilabel indicator inputs."""
        # NOTE(review): rows with neither true nor predicted labels may be
        # scored differently from the removed function -- verify if that matters.
        return jaccard_score(y_true, y_pred, average='samples')

In [None]:
# Jaccard similarity between the held-out labels and the predictions of the
# most recently fitted model (`y_test` / `predictions` are notebook globals).
score = jaccard_similarity_score(y_test, predictions)

In [None]:
# Show the Jaccard score computed in the previous cell.
print(score)

### KNeighbors Classifier 

In [None]:
# One independent k-nearest-neighbours classifier per label column.
classifier = BinaryRelevance(KNeighborsClassifier())

x = dataset['summary']
y = dataset.drop(["summary"], axis=1)

cnt_vect = CountVectorizer(stop_words='english')
tf_transform = TfidfTransformer()

# Raw term counts first, then TF-IDF re-weighting; the result stays sparse.
# NOTE(review): both transformers are fitted on all rows before the split, so
# the IDF weights also see the test documents.
r = cnt_vect.fit_transform(x)
r = tf_transform.fit_transform(r)
x = r

# A fixed seed keeps the train/test partition, and hence the report, reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

model = classifier.fit(x_train, y_train)
predictions = model.predict(x_test)
# The label columns are exactly the columns of `y`.
target_names = y.columns.tolist()
report = classification_report(y_test, predictions, target_names=target_names)

In [None]:
# Print the text report built by `classification_report` in the KNN training cell.
print(report)

In [35]:
# Display the training feature matrix (repr shows shape, dtype and sparsity).
# NOTE(review): the stored output below has 3350 rows, which does not match
# subset=1000 with test_size=0.33 (670 training rows); it is presumably left
# over from a run on a larger subset -- re-run to refresh.
x_train

<3350x45175 sparse matrix of type '<class 'numpy.float64'>'
	with 407214 stored elements in Compressed Sparse Row format>

In [17]:
# Rebuild the count features, this time tokenizing with the WordNet-based
# LemmaTokenizer instead of the stemming `tokenize` function.
y = dataset.drop("summary", axis=1)
x = dataset['summary']

tf_transform = TfidfTransformer()
cnt_vect = CountVectorizer(tokenizer=LemmaTokenizer(), stop_words='english')

# Sparse document-term count matrix for every summary.
r = cnt_vect.fit_transform(x)

In [18]:
# Preview the first 100 terms in feature-index order instead of dumping the
# whole vocabulary into the notebook. `vocabulary_` (term -> column index) is
# used because `get_feature_names()` no longer exists in recent scikit-learn
# releases.
sorted(cnt_vect.vocabulary_, key=cnt_vect.vocabulary_.get)[:100]

['!',
 '#',
 '$',
 '%',
 '&',
 "'",
 "''",
 "'150",
 "'50s",
 "'52",
 "'60s",
 "'70s",
 "'72",
 "'76",
 "'78",
 "'80s",
 "'86",
 "'88",
 "'96",
 "'a",
 "'admiral",
 "'adverts",
 "'all",
 "'amen",
 "'an",
 "'ancient",
 "'and",
 "'animals",
 "'are",
 "'around",
 "'at",
 "'awakens",
 "'baby",
 "'bad",
 "'balkan",
 "'bananas",
 "'bare",
 "'bazza",
 "'believe",
 "'better",
 "'big",
 "'bilge",
 "'bill",
 "'billy",
 "'bird",
 "'black",
 "'blade",
 "'bleeds",
 "'blood",
 "'bloodstone",
 "'body",
 "'booky",
 "'boy",
 "'brain",
 "'break",
 "'bros",
 "'bug",
 "'bye",
 "'cabin",
 "'caesarion",
 "'calling",
 "'camp",
 "'can't-get-right",
 "'cannon",
 "'cause",
 "'celebrity",
 "'channel",
 "'charlie",
 "'chat",
 "'chatted",
 "'child",
 "'chris",
 "'citizen",
 "'clean",
 "'cloverleaf",
 "'cobra",
 "'coincidental",
 "'collecting",
 "'communist",
 "'condi",
 "'control",
 "'conveniently",
 "'cool",
 "'coping",
 "'coveney",
 "'curly",
 "'d",
 "'daddy",
 "'dancing",
 "'danielle",
 "'david",
 "'death",
 "'