In [3]:
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
#import text_processing as text
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold, cross_val_predict, cross_val_score
from sklearn.pipeline import Pipeline

## Load the training data set

In [6]:
# Load the training set. The CSV's first column is read as the index and then
# turned back into an ordinary column, renamed from the default 'index' to 'ID'.
df = (
    pd.read_csv("movie_train.csv", index_col=0)
    .reset_index(drop=False)
    .rename(columns={'index': 'ID'})
)

print(df.shape)
df.head()

(10682, 7)


Unnamed: 0,ID,Release Year,Title,Plot,Director,Cast,Genre
0,10281,1984,Silent Madness,A computer error leads to the accidental relea...,Simon Nuchtern,"Belinda Montgomery, Viveca Lindfors",horror
1,7341,1960,Desire in the Dust,"Lonnie Wilson (Ken Scott), the son of a sharec...",Robert L. Lippert,"Raymond Burr, Martha Hyer, Joan Bennett",drama
2,10587,1986,On the Edge,"A gaunt, bushy-bearded, 44-year-old Wes Holman...",Rob Nilsson,"Bruce Dern, Pam Grier",drama
3,25495,1988,Ram-Avtar,Ram and Avtar are both childhood best friends....,Sunil Hingorani,"Sunny Deol, Anil Kapoor, Sridevi",drama
4,16607,2013,Machete Kills,Machete Cortez (Danny Trejo) and Sartana River...,Robert Rodriguez,"Danny Trejo, Michelle Rodriguez, Sofía Vergara...",action


In [7]:
# Model input is the plot text column; the label to predict is the genre column.
X, y = df['Plot'], df['Genre']

In [8]:
# Scorer shared by every hyper-parameter search below: F1 with
# average='weighted', wrapped so it can be passed as `scoring=`.
scorer = metrics.make_scorer(score_func=metrics.f1_score, average='weighted')

In [6]:
# Baseline pipeline: token counts -> TF-IDF weighting -> SMOTE oversampling ->
# linear classifier trained with SGD (default loss). `make_pipeline` is the
# imblearn variant imported at the top, which accepts a sampler step.
sm_pipeline = make_pipeline(
    CountVectorizer(),
    TfidfTransformer(),
    # `n_jobs` is deliberately not passed to SMOTE: imbalanced-learn deprecated
    # that argument in 0.10 and dropped it in later releases, where passing it
    # raises a TypeError. It only affected the speed of the neighbour search,
    # not the resampled data.
    SMOTE(random_state=42),
    SGDClassifier(n_jobs=-1, verbose=0, random_state=42),
)

In [8]:
# Search space for the baseline pipeline. Keys use the
# `<step name>__<parameter>` form, with step names as generated by make_pipeline.
# 2 * 2 * 3 * 1 * 1 * 2 * 2 = 48 candidate settings.
sm_params = {
    'countvectorizer__ngram_range': [(1, 2), (1, 3)],
    'countvectorizer__max_df': [0.75, 0.8],
    'countvectorizer__min_df': [2, 3, 4],
    'tfidftransformer__use_idf': [True],
    'tfidftransformer__smooth_idf': [True],
    'sgdclassifier__alpha': [0.0001, 0.001],
    'sgdclassifier__loss': ['squared_hinge', 'modified_huber'],
}

# Exhaustive search over the grid, 5-fold CV per candidate, ranked by the
# weighted-F1 scorer defined earlier.
sm_search = GridSearchCV(
    estimator=sm_pipeline,
    param_grid=sm_params,
    scoring=scorer,
    n_jobs=-1,
    cv=5,
    verbose=0,
)

In [75]:
def pipeline_cv(splits, X, Y, pipeline, random_state=777):
    """Cross-validate ``pipeline`` and return its fold-averaged classification report.

    For each stratified, shuffled fold the estimator is fitted on the training
    part and a ``classification_report`` is computed on the held-out part. The
    per-fold reports are then averaged cell by cell.

    Parameters
    ----------
    splits : int
        Number of folds handed to ``StratifiedKFold``.
    X, Y : pandas objects
        Samples and labels; both are indexed positionally with ``.iloc``.
    pipeline : estimator
        Any object with ``fit(X, y)`` returning a fitted estimator that exposes
        ``predict(X)`` (a pipeline or a search object both work).
    random_state : int, default 777
        Seed for the fold shuffling. The default reproduces the previously
        hard-coded value.

    Returns
    -------
    pandas.DataFrame
        Rows are the report metrics (sorted by name), columns are the report
        entries (one per class plus the aggregate entries). The scalar
        ``accuracy`` entry is broadcast down every row by the DataFrame
        constructor, so it also shows up in the ``support`` row.

    Notes
    -----
    ``pipeline`` is fitted in place and is NOT cloned: after the call it is
    left fitted on the training part of the last fold only.
    """
    kfold = StratifiedKFold(n_splits=splits, shuffle=True, random_state=random_state)

    reports = []
    for train_idx, test_idx in kfold.split(X, Y):
        fitted = pipeline.fit(X.iloc[train_idx], Y.iloc[train_idx])
        predicted = fitted.predict(X.iloc[test_idx])

        fold_report = metrics.classification_report(
            Y.iloc[test_idx], predicted, output_dict=True
        )
        reports.append(pd.DataFrame(fold_report))

    # Every fold contributes one row per metric name; grouping on the index
    # and averaging collapses them into a single row per metric.
    return pd.concat(reports).groupby(level=0).mean()


In [15]:
# Nested evaluation: each of the 5 outer folds re-runs the whole grid search
# on its training part and is scored on its held-out part.
pipeline_cv(splits=5, X=X, Y=y, pipeline=sm_search)

Unnamed: 0,action,adventure,comedy,crime,drama,horror,romance,thriller,western,accuracy,macro avg,weighted avg
f1-score,0.49799,0.503093,0.640436,0.316614,0.538737,0.691392,0.432284,0.286914,0.782693,0.555606,0.521128,0.554967
precision,0.439015,0.430871,0.626878,0.275486,0.716705,0.606792,0.356742,0.284662,0.678006,0.555606,0.490573,0.589712
recall,0.575904,0.607372,0.65566,0.374872,0.43183,0.804762,0.551628,0.290511,0.925714,0.555606,0.579806,0.555606
support,166.0,66.2,544.8,65.6,754.0,168.0,129.8,137.0,105.0,0.555606,2136.4,2136.4


### Customize the vectorizer

In [18]:
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 
class LemmaTokenizer:
    """Callable tokenizer: split a document with ``word_tokenize`` and lemmatize each token."""

    def __init__(self):
        # A single lemmatizer is created once and reused for every document.
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Return the list of lemmatized tokens of ``doc``, in document order."""
        lemmas = []
        for token in word_tokenize(doc):
            lemmas.append(self.wnl.lemmatize(token))
        return lemmas

In [20]:
# Same steps as the baseline pipeline, but the vectorizer tokenizes with
# LemmaTokenizer instead of its built-in token pattern.
sm_pipeline_customVect = make_pipeline(
    CountVectorizer(tokenizer=LemmaTokenizer()),
    TfidfTransformer(),
    # `n_jobs` is deliberately not passed to SMOTE: imbalanced-learn deprecated
    # that argument in 0.10 and dropped it in later releases, where passing it
    # raises a TypeError. It only affected the speed of the neighbour search,
    # not the resampled data.
    SMOTE(random_state=42),
    SGDClassifier(n_jobs=-1, verbose=0, random_state=42),
)

In [21]:
# 5-fold evaluation of the lemmatizing pipeline with its default
# hyper-parameters (no search involved here).
pipeline_cv(splits=5, X=X, Y=y, pipeline=sm_pipeline_customVect)

Unnamed: 0,action,adventure,comedy,crime,drama,horror,romance,thriller,western,accuracy,macro avg,weighted avg
f1-score,0.508273,0.492725,0.640416,0.342903,0.541284,0.695955,0.427871,0.280562,0.798318,0.55907,0.525367,0.557589
precision,0.449549,0.422445,0.624285,0.290811,0.711246,0.607127,0.355346,0.290636,0.704056,0.55907,0.495056,0.589748
recall,0.585542,0.592266,0.658962,0.420886,0.437666,0.817857,0.539296,0.271533,0.921905,0.55907,0.582879,0.55907
support,166.0,66.2,544.8,65.6,754.0,168.0,129.8,137.0,105.0,0.55907,2136.4,2136.4


In [60]:
# Search space for the lemmatizing pipeline. Array-valued entries are sampled
# from, not enumerated: 5 values of max_df in [0.5, 0.7] and np.linspace's
# default 50 values of alpha in [5e-5, 2e-4].
params = {
    'countvectorizer__ngram_range': [(1, 2), (1, 3)],
    'countvectorizer__max_df': np.linspace(.5, .7, 5),
    'countvectorizer__min_df': [1, 2, 3, 4],
    'tfidftransformer__use_idf': [True],
    'tfidftransformer__smooth_idf': [True],
    'sgdclassifier__alpha': np.linspace(.00005, .0002),
    'sgdclassifier__loss': ['squared_hinge'],
}

# Randomized search with the default number of sampled candidates.
# `random_state` is fixed so that the sampled candidates, and therefore the
# reported scores and best parameters, are reproducible across runs; without
# it every execution draws a different set of candidates.
search = RandomizedSearchCV(
    sm_pipeline_customVect,
    params,
    cv=5,
    n_jobs=-1,
    scoring=scorer,
    verbose=0,
    random_state=42,
)

In [61]:
# Nested evaluation of the randomized search: it is re-run on the training
# part of each of the 5 outer folds and scored on the held-out part.
pipeline_cv(splits=5, X=X, Y=y, pipeline=search)



Unnamed: 0,action,adventure,comedy,crime,drama,horror,romance,thriller,western,accuracy,macro avg,weighted avg
f1-score,0.540564,0.527805,0.66505,0.356278,0.65274,0.73526,0.462746,0.297006,0.835034,0.620949,0.563609,0.615287
precision,0.520742,0.53984,0.656511,0.432761,0.655152,0.691477,0.444339,0.384575,0.799301,0.620949,0.569411,0.614454
recall,0.562651,0.51972,0.674381,0.304802,0.650928,0.786905,0.486953,0.242336,0.87619,0.620949,0.567207,0.620949
support,166.0,66.2,544.8,65.6,754.0,168.0,129.8,137.0,105.0,0.620949,2136.4,2136.4


In [64]:
# Persist the best pipeline found by the randomized search.
# NOTE(review): `search` was last fitted inside pipeline_cv, so this estimator
# was trained on the training part of the final outer fold only, not on the
# full data set. Refit on all of X, y before saving if a production model is
# wanted.
# Create the target folder first so the dump also works on a fresh checkout.
os.makedirs('saved_models', exist_ok=True)
joblib.dump(search.best_estimator_, 'saved_models/weighted_f1_62')

['saved_models/weighted_f1_62']

In [65]:
# Hyper-parameters selected by the most recent fit of `search`. pipeline_cv
# refits the search on every outer fold, so these come from its last fold.
search.best_params_

{'tfidftransformer__use_idf': True,
 'tfidftransformer__smooth_idf': True,
 'sgdclassifier__loss': 'squared_hinge',
 'sgdclassifier__alpha': 0.00014489795918367347,
 'countvectorizer__ngram_range': (1, 3),
 'countvectorizer__min_df': 4,
 'countvectorizer__max_df': 0.7}

In [70]:
# Narrowed grid around the randomized-search result: n-gram range, loss and
# alpha are pinned to single values, leaving only max_df and min_df to vary
# (4 * 3 = 12 candidate settings).
params = {
    'countvectorizer__ngram_range': [(1, 3)],
    'countvectorizer__max_df': [0.6, 0.65, 0.7, 0.75],
    'countvectorizer__min_df': [3, 4, 5],
    'tfidftransformer__use_idf': [True],
    'tfidftransformer__smooth_idf': [True],
    'sgdclassifier__alpha': [0.00015],
    'sgdclassifier__loss': ['squared_hinge'],
}

# Exhaustive 5-fold search over the narrowed grid, ranked by the weighted-F1 scorer.
search = GridSearchCV(
    estimator=sm_pipeline_customVect,
    param_grid=params,
    scoring=scorer,
    n_jobs=-1,
    cv=5,
    verbose=0,
)

In [71]:
# Nested evaluation of the narrowed grid search: it is re-run on the training
# part of each of the 5 outer folds and scored on the held-out part.
pipeline_cv(splits=5, X=X, Y=y, pipeline=search)

Unnamed: 0,action,adventure,comedy,crime,drama,horror,romance,thriller,western,accuracy,macro avg,weighted avg
f1-score,0.550227,0.519338,0.667796,0.367788,0.648884,0.736027,0.458761,0.292311,0.834191,0.621136,0.563925,0.614946
precision,0.532867,0.539677,0.654729,0.459081,0.650166,0.693006,0.444376,0.383776,0.797249,0.621136,0.57277,0.613958
recall,0.56988,0.504659,0.682091,0.307879,0.648276,0.786905,0.476088,0.236496,0.87619,0.621136,0.565385,0.621136
support,166.0,66.2,544.8,65.6,754.0,168.0,129.8,137.0,105.0,0.621136,2136.4,2136.4


In [74]:
# Persist the best pipeline found by the narrowed grid search.
# NOTE(review): `search` was last fitted inside pipeline_cv, so this estimator
# was trained on the training part of the final outer fold only, not on the
# full data set. Refit on all of X, y before saving if a production model is
# wanted.
# Create the target folder first so the dump also works on a fresh checkout.
os.makedirs('saved_models', exist_ok=True)
joblib.dump(search.best_estimator_, 'saved_models/weighted_f1_621')

['saved_models/weighted_f1_621']