In [6]:
import pandas as pd
import text_processing as text
from sklearn import metrics
from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score
from sklearn.model_selection import StratifiedKFold, cross_val_predict, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

import imblearn.pipeline as balance_pipe
from imblearn.over_sampling import ADASYN, SMOTE, RandomOverSampler


df = pd.read_csv("movie_train.csv",index_col=0,)

df.reset_index(drop=False,inplace=True)
df.rename(mapper={'index':'ID'},axis=1,inplace=True)

print(df.shape)

df.head()




(10682, 7)


Unnamed: 0,ID,Release Year,Title,Plot,Director,Cast,Genre
0,10281,1984,Silent Madness,A computer error leads to the accidental relea...,Simon Nuchtern,"Belinda Montgomery, Viveca Lindfors",horror
1,7341,1960,Desire in the Dust,"Lonnie Wilson (Ken Scott), the son of a sharec...",Robert L. Lippert,"Raymond Burr, Martha Hyer, Joan Bennett",drama
2,10587,1986,On the Edge,"A gaunt, bushy-bearded, 44-year-old Wes Holman...",Rob Nilsson,"Bruce Dern, Pam Grier",drama
3,25495,1988,Ram-Avtar,Ram and Avtar are both childhood best friends....,Sunil Hingorani,"Sunny Deol, Anil Kapoor, Sridevi",drama
4,16607,2013,Machete Kills,Machete Cortez (Danny Trejo) and Sartana River...,Robert Rodriguez,"Danny Trejo, Michelle Rodriguez, Sofía Vergara...",action


Begin Modeling

In [33]:
X = df['Plot']
y = df['Genre']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# skf = StratifiedKFold(n_splits=3)
# for train_index, test_index in skf.split(X, y):
#     X_train, X_test = X[train_index], X[test_index]
#     y_train, y_test = y[train_index], y[test_index]

In [34]:
text_clf = Pipeline([
     ('vect', CountVectorizer()),
     ('tfidf', TfidfTransformer()),
     ('clf', SGDClassifier(loss='hinge',penalty='l2',
                           alpha=1e-3, random_state=42,
                           n_jobs=-1)),
 ])

In [35]:
text_clf.fit(X_train, y_train)
predicted = text_clf.predict(X_test)
print(np.mean(predicted == y_test))

0.5778471138845553


In [19]:
parameters = {
     'vect__ngram_range': [(1, 1), (1, 2)],
     'tfidf__use_idf': (True, False),
     'clf__alpha': (1e-2, 1e-3),
 }

In [20]:
# Create grid search 
scorer = metrics.make_scorer(metrics.f1_score, average = 'weighted')
gs_clf = GridSearchCV(text_clf, parameters, cv=5, n_jobs=-1,scoring=scorer)

In [21]:
gs_clf = gs_clf.fit(X_train, y_train)

In [22]:
y_preds = gs_clf.predict(X_test)
print(metrics.accuracy_score(y_test,y_preds))
metrics.f1_score(y_test,y_preds,average='weighted')
pd.DataFrame(metrics.classification_report(y_test,y_preds,output_dict=True))

0.5853354134165366


Unnamed: 0,action,adventure,comedy,crime,drama,horror,romance,thriller,western,accuracy,macro avg,weighted avg
precision,0.613793,0.657143,0.614416,0.555556,0.54474,0.643411,0.436364,0.444444,0.783439,0.585335,0.588145,0.577703
recall,0.364754,0.232323,0.634002,0.053763,0.789658,0.68595,0.116505,0.042781,0.842466,0.585335,0.418023,0.585335
f1-score,0.457584,0.343284,0.624056,0.098039,0.644723,0.664,0.183908,0.078049,0.811881,0.585335,0.433947,0.546228
support,244.0,99.0,847.0,93.0,1141.0,242.0,206.0,187.0,146.0,0.585335,3205.0,3205.0


GridSearch

In [28]:
text_clf = Pipeline([
    
     ('vect', CountVectorizer()),
     ('tfidf', TfidfTransformer()),
     ('clf', SGDClassifier(loss='hinge',penalty='l2',
                           alpha=1e-3, random_state=42,
                           n_jobs=-1)),
 ])

parameters = {
     'vect__ngram_range': [(1, 1), (1, 2)],
     'tfidf__use_idf': (True, False),
     'clf__alpha': (1e-2, 1e-3),
 }

gs_clf = GridSearchCV(text_clf, parameters, cv=5, n_jobs=-1,scoring=scorer)

gs_clf = gs_clf.fit(X_train, y_train)

y_preds = gs_clf.predict(X_test)

In [29]:
y_preds = gs_clf.predict(X_test)
print(metrics.accuracy_score(y_test,y_preds))
metrics.f1_score(y_test,y_preds,average='weighted')
pd.DataFrame(metrics.classification_report(y_test,y_preds,output_dict=True))

0.5853354134165366


Unnamed: 0,action,adventure,comedy,crime,drama,horror,romance,thriller,western,accuracy,macro avg,weighted avg
precision,0.613793,0.657143,0.614416,0.555556,0.54474,0.643411,0.436364,0.444444,0.783439,0.585335,0.588145,0.577703
recall,0.364754,0.232323,0.634002,0.053763,0.789658,0.68595,0.116505,0.042781,0.842466,0.585335,0.418023,0.585335
f1-score,0.457584,0.343284,0.624056,0.098039,0.644723,0.664,0.183908,0.078049,0.811881,0.585335,0.433947,0.546228
support,244.0,99.0,847.0,93.0,1141.0,242.0,206.0,187.0,146.0,0.585335,3205.0,3205.0


In [23]:
import re, nltk, spacy, gensim

In [24]:
# def doc_to_words(docs):
#     for doc in docs:
#         yield(gensim.utils.simple_preprocess(str(doc), deacc=True))
        
# def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
#     """https://spacy.io/api/annotation"""
#     texts_out = []
#     nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
#     for sent in texts:
#         doc = nlp(" ".join(sent)) 
#         texts_out.append(" ".join([token.lemma_ if token.lemma_ not in ['-PRON-'] else '' for token in doc if token.pos_ in allowed_postags]))
#     return texts_out

# def pre_process(documents):
#     return lemmatization(doc_to_words(documents))

In [27]:
df['Processed_Plot'] = text.pre_process(df['Plot'])

In [None]:
## Trying the same thing again with a pre-processed corpus

X = df['Processed_Plot']
y = df['Genre']

for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
text_clf = Pipeline([
    
     ('vect', CountVectorizer()),
     ('tfidf', TfidfTransformer()),
     ('clf', SGDClassifier(loss='hinge',penalty='l2',
                           alpha=1e-3, random_state=42,
                           n_jobs=-1)),
 ])

parameters = {
     'vect__ngram_range': [(1, 1), (1, 2)],
     'tfidf__use_idf': (True, False),
     'clf__alpha': (1e-2, 1e-3),
 }

gs_clf = GridSearchCV(text_clf, parameters, cv=5, n_jobs=-1,scoring=scorer)

gs_clf = gs_clf.fit(X_train, y_train)

y_preds = gs_clf.predict(X_test)

In [30]:
print(metrics.f1_score(y_test,y_preds,average='weighted'))
gs_clf.best_params_

0.5462277097245963


{'clf__alpha': 0.001, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}

In [38]:
from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import ADASYN, SMOTE, RandomOverSampler

In [40]:
sm_pipeline = make_pipeline(CountVectorizer(),
                            TfidfTransformer(),
                            SMOTE(n_jobs=-1,random_state=42),
                            SGDClassifier(n_jobs=-1,verbose=-1,random_state=42))

sm_pipeline.named_steps

In [42]:
sm_params = {
    'countvectorizer__ngram_range':[(1,1),(1,2)],
    'countvectorizer__max_df':[1,.9],
    'countvectorizer__min_df':[1,.9],
    'tfidftransformer__use_idf':[True,False],
    'tfidftransformer__smooth_idf':[True,False],
    'sgdclassifier__alpha':[.0001,.001],
    'sgdclassifier__loss':['hinge','log']
}

In [45]:
sm_search = GridSearchCV(sm_pipeline,sm_params,cv=5,n_jobs=-1,scoring=scorer,verbose=3)

In [44]:
def pipeline_cv(splits, X, Y, pipeline, average_method):
    
    kfold = StratifiedKFold(n_splits=splits, shuffle=True, random_state=777)
    accuracy = []
    precision = []
    recall = []
    f1 = []
    for train, test in kfold.split(X, Y):
        lr_fit = pipeline.fit(X[train], Y[train])
        prediction = lr_fit.predict(X[test])
        scores = lr_fit.score(X[test],Y[test])
        
        accuracy.append(scores * 100)
        precision.append(precision_score(Y[test], prediction, average=average_method)*100)
        #print('              negative    neutral     positive')
        #print('precision:',precision_score(Y[test], prediction, average=None))
        recall.append(recall_score(Y[test], prediction, average=average_method)*100)
        #print('recall:   ',recall_score(Y[test], prediction, average=None))
        f1.append(f1_score(Y[test], prediction, average=average_method)*100)
        #print('f1 score: ',f1_score(Y[test], prediction, average=None))
        #print('-'*50)
        
        display(pd.DataFrame(metrics.classification_report(Y[test],prediction,output_dict=True)))
        

    print("accuracy: %.2f%% (+/- %.2f%%)" % (np.mean(accuracy), np.std(accuracy)))
    print("precision: %.2f%% (+/- %.2f%%)" % (np.mean(precision), np.std(precision)))
    print("recall: %.2f%% (+/- %.2f%%)" % (np.mean(recall), np.std(recall)))
    print("f1 score: %.2f%% (+/- %.2f%%)" % (np.mean(f1), np.std(f1)))

In [128]:
pipeline_cv(3,X,y,sm_search,'weighted')

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.


Convergence after 8 epochs took 3.37 seconds
Convergence after 9 epochs took 3.65 seconds


[Parallel(n_jobs=-1)]: Done   2 out of   9 | elapsed:    3.7s remaining:   12.8s


Convergence after 9 epochs took 3.93 seconds
Convergence after 10 epochs took 4.23 seconds
Convergence after 10 epochs took 4.23 seconds
Convergence after 11 epochs took 4.31 seconds
Convergence after 11 epochs took 4.37 seconds
Convergence after 12 epochs took 4.40 seconds
Convergence after 13 epochs took 4.42 seconds


[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:    4.5s finished


precision: [0.45308311 0.47663551 0.62628337 0.39285714 0.69858871 0.61460957
 0.41111111 0.37142857 0.69642857]
recall:    [0.6101083  0.46363636 0.67180617 0.3        0.55131265 0.87142857
 0.51388889 0.22807018 0.89142857]
f1 score:  [0.52       0.47004608 0.64824655 0.34020619 0.6162739  0.72082718
 0.45679012 0.2826087  0.78195489]
--------------------------------------------------


Unnamed: 0,action,adventure,comedy,crime,drama,horror,romance,thriller,western,accuracy,macro avg,weighted avg
precision,0.453083,0.476636,0.626283,0.392857,0.698589,0.61461,0.411111,0.371429,0.696429,0.595058,0.526781,0.59966
recall,0.610108,0.463636,0.671806,0.3,0.551313,0.871429,0.513889,0.22807,0.891429,0.595058,0.566853,0.595058
f1-score,0.52,0.470046,0.648247,0.340206,0.616274,0.720827,0.45679,0.282609,0.781955,0.595058,0.537439,0.589218
support,277.0,110.0,908.0,110.0,1257.0,280.0,216.0,228.0,175.0,0.595058,3561.0,3561.0


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.


Convergence after 8 epochs took 3.95 seconds
Convergence after 9 epochs took 4.44 seconds
Convergence after 9 epochs took 4.55 seconds


[Parallel(n_jobs=-1)]: Done   2 out of   9 | elapsed:    4.5s remaining:   15.7s


Convergence after 10 epochs took 4.92 seconds
Convergence after 11 epochs took 5.07 seconds
Convergence after 11 epochs took 5.12 seconds
Convergence after 11 epochs took 5.11 seconds
Convergence after 12 epochs took 5.21 seconds
Convergence after 13 epochs took 5.28 seconds


[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:    5.3s finished


precision: [0.46991404 0.47272727 0.64912281 0.37179487 0.69223206 0.63002681
 0.3465704  0.32544379 0.68949772]
recall:    [0.5942029  0.46846847 0.69273128 0.26605505 0.56006364 0.83928571
 0.44239631 0.24122807 0.86285714]
f1 score:  [0.5248     0.47058824 0.67021843 0.31016043 0.61917326 0.71975498
 0.38866397 0.27707809 0.76649746]
--------------------------------------------------


Unnamed: 0,action,adventure,comedy,crime,drama,horror,romance,thriller,western,accuracy,macro avg,weighted avg
precision,0.469914,0.472727,0.649123,0.371795,0.692232,0.630027,0.34657,0.325444,0.689498,0.593934,0.51637,0.597784
recall,0.594203,0.468468,0.692731,0.266055,0.560064,0.839286,0.442396,0.241228,0.862857,0.593934,0.551921,0.593934
f1-score,0.5248,0.470588,0.670218,0.31016,0.619173,0.719755,0.388664,0.277078,0.766497,0.593934,0.527437,0.589983
support,276.0,111.0,908.0,109.0,1257.0,280.0,217.0,228.0,175.0,0.593934,3561.0,3561.0


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.


Convergence after 8 epochs took 3.16 seconds
Convergence after 9 epochs took 3.83 seconds
Convergence after 9 epochs took 3.97 seconds


[Parallel(n_jobs=-1)]: Done   2 out of   9 | elapsed:    3.9s remaining:   13.5s


Convergence after 10 epochs took 4.17 seconds
Convergence after 10 epochs took 4.19 seconds
Convergence after 11 epochs took 4.36 seconds
Convergence after 12 epochs took 4.42 seconds
Convergence after 12 epochs took 4.46 seconds
Convergence after 13 epochs took 4.51 seconds


[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:    4.5s finished


precision: [0.45504087 0.45714286 0.60322256 0.4        0.69034995 0.60552764
 0.35099338 0.31818182 0.75242718]
recall:    [0.60288809 0.43636364 0.65969163 0.25688073 0.5183121  0.86071429
 0.49074074 0.24454148 0.88571429]
f1 score:  [0.51863354 0.44651163 0.63019463 0.31284916 0.59208731 0.71091445
 0.40926641 0.27654321 0.81364829]
--------------------------------------------------


Unnamed: 0,action,adventure,comedy,crime,drama,horror,romance,thriller,western,accuracy,macro avg,weighted avg
precision,0.455041,0.457143,0.603223,0.4,0.69035,0.605528,0.350993,0.318182,0.752427,0.576124,0.514765,0.585573
recall,0.602888,0.436364,0.659692,0.256881,0.518312,0.860714,0.490741,0.244541,0.885714,0.576124,0.55065,0.576124
f1-score,0.518634,0.446512,0.630195,0.312849,0.592087,0.710914,0.409266,0.276543,0.813648,0.576124,0.523405,0.571891
support,277.0,110.0,908.0,109.0,1256.0,280.0,216.0,229.0,175.0,0.576124,3560.0,3560.0


accuracy: 58.84% (+/- 0.87%)
precision: 59.43% (+/- 0.62%)
recall: 58.84% (+/- 0.87%)
f1 score: 58.37% (+/- 0.84%)


In [47]:
from sklearn.ensemble import RandomForestClassifier

In [49]:
rf_sm_pipeline = make_pipeline(CountVectorizer(),
                            TfidfTransformer(),
                            SMOTE(n_jobs=-1,random_state=42),
                            RandomForestClassifier(random_state=42))

rf_sm_pipeline.named_steps

{'countvectorizer': CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                 dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                 lowercase=True, max_df=1.0, max_features=None, min_df=1,
                 ngram_range=(1, 1), preprocessor=None, stop_words=None,
                 strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                 tokenizer=None, vocabulary=None),
 'tfidftransformer': TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True),
 'smote': SMOTE(k_neighbors=5, n_jobs=-1, random_state=42, sampling_strategy='auto'),
 'randomforestclassifier': RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                        criterion='gini', max_depth=None, max_features='auto',
                        max_leaf_nodes=None, max_samples=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_sample

In [54]:
rf_sm_params = {
    'countvectorizer__ngram_range':[(1,2),(2,3)],
    'countvectorizer__max_df':[.8,.9],
    'countvectorizer__min_df':[1,.9],
    'tfidftransformer__use_idf':[True,False],
    'tfidftransformer__smooth_idf':[True,False],
    'randomforestclassifier__criterion':['gini','entropy'],
    'randomforestclassifier__max_depth':[None,10]
}

rf_sm_search = GridSearchCV(rf_sm_pipeline,rf_sm_params,cv=5,n_jobs=-1,scoring=scorer,verbose=3)

In [None]:
pipeline_cv(3,X,y,rf_sm_search,'weighted')

Fitting 5 folds for each of 128 candidates, totalling 640 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
