In [1]:
import joblib
import pandas as pd
import text_processing as text
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold, cross_val_predict, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
import numpy as np
import matplotlib.pyplot as plt

# Load the training set. The CSV's first column is read as the index, then
# reset_index turns it back into a regular column (named 'index' when the
# index has no name), which is finally renamed to 'ID'.
df = (
    pd.read_csv("movie_train.csv", index_col=0)
    .reset_index(drop=False)
    .rename(columns={'index': 'ID'})
)

print(df.shape)

df.head()





(10682, 7)


Unnamed: 0,ID,Release Year,Title,Plot,Director,Cast,Genre
0,10281,1984,Silent Madness,A computer error leads to the accidental relea...,Simon Nuchtern,"Belinda Montgomery, Viveca Lindfors",horror
1,7341,1960,Desire in the Dust,"Lonnie Wilson (Ken Scott), the son of a sharec...",Robert L. Lippert,"Raymond Burr, Martha Hyer, Joan Bennett",drama
2,10587,1986,On the Edge,"A gaunt, bushy-bearded, 44-year-old Wes Holman...",Rob Nilsson,"Bruce Dern, Pam Grier",drama
3,25495,1988,Ram-Avtar,Ram and Avtar are both childhood best friends....,Sunil Hingorani,"Sunny Deol, Anil Kapoor, Sridevi",drama
4,16607,2013,Machete Kills,Machete Cortez (Danny Trejo) and Sartana River...,Robert Rodriguez,"Danny Trejo, Michelle Rodriguez, Sofía Vergara...",action


In [2]:
# Model inputs and target: X is the raw plot text, y the genre label.
X, y = df['Plot'], df['Genre']
# Pre-processed version of the plots, produced by the local text_processing module.
X_tokenized = text.pre_process(X)

In [15]:
# Load the two previously fitted models from disk.
#   model_1: trained on a stratified K-fold split with k=3; df['Plot'] was fed
#            into the pipeline without preprocessing.
#   model_2: trained on SMOTE-sampled, pre-processed text data.
# NOTE: joblib.load unpickles the file, so only load model files you trust.
model_1_loaded, model_2_loaded = (
    joblib.load(model_path) for model_path in ("model_1.pkl", "model_2.pkl")
)

In [17]:
# Evaluate model 1. According to the note in the model-loading cell, it was
# trained on the raw plot text, so predict from df['Plot'] directly instead of
# the shared name X, which the pre-processing cell further down rebinds (the
# result would otherwise depend on the order in which cells were executed).
# NOTE(review): df is read from movie_train.csv; if the model was fit on these
# same rows, the scores below are in-sample. Confirm before reporting them.
y_preds = model_1_loaded.predict(df['Plot'])

print(f"accuracy:    {metrics.accuracy_score(y, y_preds)}")
# The weighted F1 used to be computed and silently discarded (it was neither
# printed nor the cell's last expression); show it explicitly.
print(f"weighted F1: {metrics.f1_score(y, y_preds, average='weighted')}")

# Per-class precision / recall / F1 / support, rendered as a table.
pd.DataFrame(metrics.classification_report(y, y_preds, output_dict=True))

0.8462834675154466


Unnamed: 0,action,adventure,comedy,crime,drama,horror,romance,thriller,western,accuracy,macro avg,weighted avg
precision,0.898212,0.966245,0.852801,0.981982,0.781201,0.89455,0.919214,0.97397,0.931166,0.846283,0.911038,0.857481
recall,0.786747,0.691843,0.854993,0.664634,0.928117,0.89881,0.64869,0.655474,0.927619,0.846283,0.784103,0.846283
f1-score,0.838793,0.806338,0.853896,0.792727,0.848345,0.896675,0.760614,0.783595,0.929389,0.846283,0.834486,0.84431
support,830.0,331.0,2724.0,328.0,3770.0,840.0,649.0,685.0,525.0,0.846283,10682.0,10682.0


In [None]:
# Pre-process from the raw column rather than from X itself, so re-running this
# cell never pre-processes text that has already been pre-processed.
X = text.pre_process(df['Plot'])

In [16]:
# Evaluate model 2, which (per the note in the model-loading cell) was trained
# on pre-processed text. X_tokenized is used instead of X because X holds
# pre-processed text only after the separate pre-processing cell has been run,
# whereas X_tokenized is always derived from df['Plot'] in the features cell.
# NOTE(review): df is read from movie_train.csv; if the model was fit on these
# same rows, the scores below are in-sample. Confirm before reporting them.
y_preds = model_2_loaded.predict(X_tokenized)

print(f"accuracy:    {metrics.accuracy_score(y, y_preds)}")
# The weighted F1 used to be computed and silently discarded (it was neither
# printed nor the cell's last expression); show it explicitly.
print(f"weighted F1: {metrics.f1_score(y, y_preds, average='weighted')}")

# Per-class precision / recall / F1 / support, rendered as a table.
pd.DataFrame(metrics.classification_report(y, y_preds, output_dict=True))

0.7475191911627036


Unnamed: 0,action,adventure,comedy,crime,drama,horror,romance,thriller,western,accuracy,macro avg,weighted avg
precision,0.821975,0.813084,0.602517,0.757098,0.865336,0.910345,0.686854,0.850775,0.839733,0.747519,0.794191,0.780506
recall,0.712048,0.78852,0.896476,0.731707,0.632361,0.785714,0.716487,0.640876,0.958095,0.747519,0.762476,0.747519
f1-score,0.763073,0.800613,0.720673,0.744186,0.730728,0.84345,0.701357,0.731057,0.895018,0.747519,0.770017,0.748431
support,830.0,331.0,2724.0,328.0,3770.0,840.0,649.0,685.0,525.0,0.747519,10682.0,10682.0
