In [3]:
import pandas as pd
import text_processing as text
from sklearn import metrics
from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_val_predict, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import ADASYN, SMOTE, RandomOverSampler

## Load the training data set

In [6]:
# Load the training set. The CSV's first column is read as the index and then
# moved back into a regular column, which is renamed from 'index' to 'ID'
# (reset_index only produces the name 'index' when the index is unnamed).
df = (
    pd.read_csv("movie_train.csv", index_col=0)
    .reset_index(drop=False)
    .rename(mapper={'index': 'ID'}, axis=1)
)

print(df.shape)
df.head()

(10682, 7)


Unnamed: 0,ID,Release Year,Title,Plot,Director,Cast,Genre
0,10281,1984,Silent Madness,A computer error leads to the accidental relea...,Simon Nuchtern,"Belinda Montgomery, Viveca Lindfors",horror
1,7341,1960,Desire in the Dust,"Lonnie Wilson (Ken Scott), the son of a sharec...",Robert L. Lippert,"Raymond Burr, Martha Hyer, Joan Bennett",drama
2,10587,1986,On the Edge,"A gaunt, bushy-bearded, 44-year-old Wes Holman...",Rob Nilsson,"Bruce Dern, Pam Grier",drama
3,25495,1988,Ram-Avtar,Ram and Avtar are both childhood best friends....,Sunil Hingorani,"Sunny Deol, Anil Kapoor, Sridevi",drama
4,16607,2013,Machete Kills,Machete Cortez (Danny Trejo) and Sartana River...,Robert Rodriguez,"Danny Trejo, Michelle Rodriguez, Sofía Vergara...",action


In [7]:
# Features are the plot text; the target is the genre label.
X, y = df['Plot'], df['Genre']

In [8]:
# Scorer used for model selection: F1 computed with average='weighted'.
scorer = metrics.make_scorer(
    score_func=metrics.f1_score,
    average='weighted',
)

In [6]:
# Baseline pipeline: token counts -> TF-IDF -> SMOTE -> SGD classifier.
# make_pipeline names each step after its lowercased class name, which the
# parameter grid relies on.
sm_pipeline = make_pipeline(
    CountVectorizer(),
    TfidfTransformer(),
    SMOTE(n_jobs=-1, random_state=42),
    SGDClassifier(n_jobs=-1, verbose=0, random_state=42),
)

In [7]:
# Show the auto-generated step names; they are the prefixes of the
# `<step>__<param>` keys used in the grid-search parameter dictionary.
sm_pipeline.named_steps

{'countvectorizer': CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                 dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                 lowercase=True, max_df=1.0, max_features=None, min_df=1,
                 ngram_range=(1, 1), preprocessor=None, stop_words=None,
                 strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                 tokenizer=None, vocabulary=None),
 'tfidftransformer': TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True),
 'smote': SMOTE(k_neighbors=5, n_jobs=-1, random_state=42, sampling_strategy='auto'),
 'sgdclassifier': SGDClassifier(alpha=0.0001, average=False, class_weight=None,
               early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
               l1_ratio=0.15, learning_rate='optimal', loss='hinge',
               max_iter=1000, n_iter_no_change=5, n_jobs=-1, penalty='l2',
               power_t=0.5, random_state=42, shuffle=True, tol=0.001

In [8]:
# Grid for the search: keys are `<pipeline step>__<parameter>`, values are the
# candidate settings to try for that parameter.
sm_params = dict(
    countvectorizer__ngram_range=[(1, 2)],
    countvectorizer__max_df=[.75, .8],
    countvectorizer__min_df=[1, 2],
    tfidftransformer__use_idf=[True],
    tfidftransformer__smooth_idf=[True],
    sgdclassifier__alpha=[.0001, .001],
    sgdclassifier__loss=['squared_hinge', 'modified_huber'],
)

In [9]:
# Exhaustive search over sm_params, 5-fold CV, ranked by the weighted-F1 scorer.
sm_search = GridSearchCV(
    estimator=sm_pipeline,
    param_grid=sm_params,
    cv=5,
    n_jobs=-1,
    scoring=scorer,
    verbose=0,
)

In [13]:
def pipeline_cv(splits, X, Y, pipeline, random_state=777):
    """Cross-validate ``pipeline`` and average its per-fold classification reports.

    The data is split with a shuffled ``StratifiedKFold``. For every fold the
    estimator is fitted on the training part, used to predict the held-out
    part, and a ``classification_report`` is collected. The reports are then
    averaged cell by cell.

    Parameters
    ----------
    splits : int
        Number of folds, passed to ``StratifiedKFold`` as ``n_splits``.
    X, Y : pandas objects
        Features and labels. Both are indexed positionally with ``.iloc``.
    pipeline : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. ``fit`` is called on
        the object that is passed in, once per fold, so it is left in the
        state produced by the last fold.
    random_state : int, optional
        Seed for the fold shuffling. Defaults to 777, the value that was
        previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        One row per report metric and one column per report entry, holding
        the mean over all folds. Scalar report entries (such as an overall
        accuracy) are broadcast to every row by the DataFrame constructor, so
        such a column carries the same value in all rows.
    """
    kfold = StratifiedKFold(n_splits=splits, shuffle=True, random_state=random_state)

    reports = []
    for train_idx, test_idx in kfold.split(X, Y):
        # Keep using whatever fit() returns, exactly as before.
        fitted = pipeline.fit(X.iloc[train_idx], Y.iloc[train_idx])
        predicted = fitted.predict(X.iloc[test_idx])

        fold_report = metrics.classification_report(
            Y.iloc[test_idx], predicted, output_dict=True
        )
        reports.append(pd.DataFrame(fold_report))

    # Stack the fold reports and average the rows that share a metric label.
    stacked = pd.concat(reports)
    return stacked.groupby(stacked.index).mean()


In [15]:
# Nested cross-validation: each of the 5 outer folds calls fit() on the grid
# search, which runs its own cv=5 search on that fold's training split.
pipeline_cv(5,X,y,sm_search)

Unnamed: 0,action,adventure,comedy,crime,drama,horror,romance,thriller,western,accuracy,macro avg,weighted avg
f1-score,0.49799,0.503093,0.640436,0.316614,0.538737,0.691392,0.432284,0.286914,0.782693,0.555606,0.521128,0.554967
precision,0.439015,0.430871,0.626878,0.275486,0.716705,0.606792,0.356742,0.284662,0.678006,0.555606,0.490573,0.589712
recall,0.575904,0.607372,0.65566,0.374872,0.43183,0.804762,0.551628,0.290511,0.925714,0.555606,0.579806,0.555606
support,166.0,66.2,544.8,65.6,754.0,168.0,129.8,137.0,105.0,0.555606,2136.4,2136.4


### Customize the vectorizer

In [9]:
from text_processing import *

In [10]:
# Same pipeline as the baseline, with TextNormalizer in place of CountVectorizer
# as the first step.
sm_pipeline_customVect = make_pipeline(
    TextNormalizer(),
    TfidfTransformer(),
    SMOTE(n_jobs=-1, random_state=42),
    SGDClassifier(n_jobs=-1, verbose=0, random_state=42),
)

In [11]:
vect = TextNormalizer()