## Classical Models

In [14]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from nltk import word_tokenize     
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer 
from sklearn.multiclass import OneVsRestClassifier

In [15]:
def load_data(subset = None):
    data = pd.read_csv("./dataset_20000.csv")
    if subset is not None:
        return data.head(subset)
    return data

In [16]:
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

In [17]:
dataset = load_data(subset = 5000)

In [18]:
stemmer = PorterStemmer()

In [19]:
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]

In [20]:
def tokenize(text):
    tokens = nltk.word_tokenize(text)
    stems = stem_tokens(tokens, stemmer)
    return stems

### Naiive Bayes Classifier

In [45]:
classifier = BinaryRelevance(GaussianNB())

x = dataset['summary']
y = dataset.drop(["summary"], axis=1)

cnt_vect = CountVectorizer(stop_words='english')
tf_transform = TfidfTransformer()

r = cnt_vect.fit_transform(x)
r = tf_transform.fit_transform(r)
x = r

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

model = classifier.fit(x_train, y_train)
predictions = model.predict(x_test)
target_names = dataset.drop(["summary"], axis=1).columns.tolist()
report = classification_report(y_test, predictions, target_names=target_names)

In [46]:
print(report)

                 precision    recall  f1-score   support

          Drama       0.52      0.46      0.49       622
         Comedy       0.43      0.38      0.40       479
       Thriller       0.21      0.11      0.15       277
   Romance Film       0.20      0.09      0.13       220
         Action       0.26      0.13      0.17       210
         Horror       0.27      0.12      0.16       199
  Crime Fiction       0.23      0.08      0.12       193
      Adventure       0.19      0.10      0.13       143
Science Fiction       0.26      0.13      0.18       105
      Animation       0.46      0.12      0.19       102
Romantic comedy       0.24      0.06      0.10       100
        Mystery       0.14      0.03      0.06        88
 Crime Thriller       0.04      0.01      0.02        89
        Fantasy       0.20      0.06      0.10        78

    avg / total       0.33      0.22      0.25      2905



In [47]:
from sklearn.metrics import jaccard_similarity_score

In [50]:
score = jaccard_similarity_score(y_test, predictions)

In [51]:
print(score)

0.2212239834239834


### KNeighbors Classifier 

In [26]:
classifier = BinaryRelevance(KNeighborsClassifier())

x = dataset['summary']
y = dataset.drop(["summary"], axis=1)

cnt_vect = CountVectorizer(stop_words='english')
tf_transform = TfidfTransformer()

r = cnt_vect.fit_transform(x)
r = tf_transform.fit_transform(r)
x = r

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33)

#model = classifier.fit(x_train, y_train)
#predictions = model.predict(x_test)
#target_names = dataset.drop(["summary"], axis=1).columns.tolist()
#report = classification_report(y_test, predictions, target_names=target_names)

In [None]:
print(report)

In [35]:
x_train

<3350x45175 sparse matrix of type '<class 'numpy.float64'>'
	with 407214 stored elements in Compressed Sparse Row format>

In [17]:
x = dataset['summary']
y = dataset.drop(["summary"], axis=1)

cnt_vect = CountVectorizer(stop_words='english', tokenizer=LemmaTokenizer())
tf_transform = TfidfTransformer()

r = cnt_vect.fit_transform(x)

### Pipelines

In [21]:
x = dataset['summary']
y = dataset.drop(["summary"], axis=1)

cnt_vect = CountVectorizer(stop_words='english')
tf_transform = TfidfTransformer()

pipeline = Pipeline([
    ('cnt', cnt_vect), 
    ('tfidf', tf_transform ),
    ('clf', OneVsRestClassifier(MultinomialNB(
        fit_prior=True, class_prior=None))),
])

In [22]:
from sklearn.model_selection import GridSearchCV

In [30]:
pipeline.fit(x, y)

AttributeError: lower not found