In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [47]:
import pandas as pd
import numpy as np
from collections import Counter

from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
import seaborn as sns
sns.set()

from scipy import sparse as sp

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

import metrics

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

In [6]:
data = pd.read_excel("Training sheet.xlsx")

In [7]:
X = data[['production_year', 'movie_sequel', 'creative_type', 'source', 'production_method',
          'genre', 'language', 'board_rating_reason', 'movie_board_rating_display_name',
          'movie_release_pattern_display_name']]

y = data["Category"]

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=20)

X_train.reset_index(inplace=True)
X_test.reset_index(inplace=True)


# checking if all the classes are included in test data
from collections import Counter
print(Counter(y_train))
print(Counter(y_test))

Counter({2: 207, 3: 203, 4: 170, 1: 151, 5: 114, 6: 68, 7: 53, 8: 34, 9: 16})
Counter({3: 42, 2: 40, 4: 36, 5: 18, 1: 17, 6: 12, 7: 8, 8: 4, 9: 3})


The seed looks good, we have all classes in test and the sampling proportions looks decent.

In [9]:
# preprocessing the categorical columns of train dataset

# year could be encoded as a quantitative field
# le = LabelEncoder()
# X_train["year"] = pd.Series(le.fit_transform(X_train["production_year"]))

# encoding other categorical variables by one hot encoding
creative_type_cols = pd.get_dummies(X_train['creative_type'])
source_cols = pd.get_dummies(X_train['source'])
production_method_cols = pd.get_dummies(X_train['production_method'])
genre_cols = pd.get_dummies(X_train['genre'])
language_cols = pd.get_dummies(X_train['language'])
rating_cols = pd.get_dummies(X_train['movie_board_rating_display_name'])
release_pattern_display_name = pd.get_dummies(X_train['movie_release_pattern_display_name'])


# doing the same preprocessing for the test set
# X_test["year"] = le.transform(X_test['production_year'])

# encoding other categorical variables by one hot encoding
creative_type_cols_test = pd.get_dummies(X_test['creative_type'])
source_cols_test = pd.get_dummies(X_test['source'])
production_method_cols_test = pd.get_dummies(X_test['production_method'])
genre_cols_test = pd.get_dummies(X_test['genre'])
language_cols_test = pd.get_dummies(X_test['language'])
rating_cols_test = pd.get_dummies(X_test['movie_board_rating_display_name'])
release_pattern_display_name_test = pd.get_dummies(X_test['movie_release_pattern_display_name'])

In [10]:
X_train.head()

Unnamed: 0,index,production_year,movie_sequel,creative_type,source,production_method,genre,language,board_rating_reason,movie_board_rating_display_name,movie_release_pattern_display_name
0,832,2009,0,Factual,Based on Real Life Events,Live Action,Documentary,English,For some language,R,Limited
1,532,2007,0,Contemporary Fiction,Original Screenplay,Live Action,Drama,English,for comic sexual content and some violence,PG-13,Wide
2,1174,2009,0,Factual,Based on Real Life Events,Live Action,Documentary,English,International - to be excluded,Not Rated,Limited
3,751,2009,0,Contemporary Fiction,Original Screenplay,Live Action,Romantic Comedy,English,for language and some suggestive material (rer...,PG,Wide
4,726,2010,0,Science Fiction,Original Screenplay,Live Action,Thriller/Suspense,English,"for some disturbing sequences, and language.",PG-13,Wide


In [11]:
X_test.head()

Unnamed: 0,index,production_year,movie_sequel,creative_type,source,production_method,genre,language,board_rating_reason,movie_board_rating_display_name,movie_release_pattern_display_name
0,700,2010,0,Fantasy,Original Screenplay,Live Action,Horror,English,for violence and terror including disturbing i...,R,Wide
1,359,2010,0,Contemporary Fiction,Spin-Off,Live Action,Comedy,English,for strong sexual content and drug use through...,R,Wide
2,666,2009,0,Fantasy,Original Screenplay,Live Action,Thriller/Suspense,English,"For sexuality, bloody violence, language and b...",R,Wide
3,337,2011,0,Contemporary Fiction,Remake,Live Action,Action,English,"for violence, pervasive language and brief dru...",R,Wide
4,543,2009,0,Factual,Based on Real Life Events,Live Action,Concert/Performance,English,General,G,Wide


In [12]:
X_train_transformed = pd.concat([creative_type_cols, source_cols, 
                     production_method_cols, genre_cols, 
                     language_cols, rating_cols, release_pattern_display_name], axis=1)

X_test_transformed = pd.concat([creative_type_cols_test, source_cols_test, production_method_cols_test,
                    genre_cols_test, language_cols_test, rating_cols_test, release_pattern_display_name_test], axis=1)

In [13]:
X_train_transformed.head()

Unnamed: 0,Contemporary Fiction,Dramatization,Factual,Fantasy,Historical Fiction,Kids Fiction,Multiple Creative Types,Science Fiction,Super Hero,Based on Comic/Graphic Novel,...,PG,PG-13,R,Exclusive,Expands Wide,IMAX,Limited,Oscar Qualifying Run,Special Engagement,Wide
0,0,0,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,1


In [14]:
X_test_transformed.head()

Unnamed: 0,Contemporary Fiction,Dramatization,Factual,Fantasy,Historical Fiction,Kids Fiction,Science Fiction,Super Hero,Based on Comic/Graphic Novel,Based on Factual Book/Article,...,NC-17,Not Rated,PG,PG-13,R,Exclusive,Expands Wide,IMAX,Limited,Wide
0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


Certain levels in the test have to be ignored -  because we hadn't seen them in train data. So lets take all the columns of train data. Check if they're present in test data. 

If a column in X_train is not present in the train sample, we add it with all zero values to test.

In [15]:
train_features = X_train_transformed.columns
test_features = X_test_transformed.columns

# features in train which are not present in test
features_ = [feature for feature in train_features if feature not in test_features]

In [16]:
# Setting the one hot encoding of the absent feature to  zero in test data
for feature in features_:
    X_test_transformed[feature] = 0
    
# New levels which have come for columns in test have to be removed now (to avoid the not seen error) - 
# so, taking only the columns which were present in train data

X_test_transformed = X_test_transformed[train_features]

__Lets start with a baseline classification model__

## Multiclass Logistic Regression 

In [17]:
lr_clf = LogisticRegression(multi_class="ovr")
lr_clf.fit(X_train_transformed, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [18]:
y_pred = lr_clf.predict(X_test_transformed)

In [20]:
# Using APHR Bingo as the classification accuracy metric, let's evaluate the classifier

y_pred_train_lr = lr_clf.predict(X_train_transformed)
train_acc = metrics.accuracy(y_train, y_pred_train_lr)
count_bingo, aphr_bingo, percent_bingo = metrics.aphr_bingo(y_train, y_pred_train_lr)
train_aphr_oneaway = metrics.aphr_oneaway(y_train, y_pred_train_lr)
print("Train Accuracy: ", train_acc)
print("Train Bingo: ", percent_bingo)
print("Train Oneaway: ", train_aphr_oneaway)
print("==================================")

y_pred_lr = lr_clf.predict(X_test_transformed)
test_acc = metrics.accuracy(y_test, y_pred_lr)
count_bingo, aphr_bingo, percent_bingo = metrics.aphr_bingo(y_test, y_pred_lr)
test_aphr_oneaway = metrics.aphr_oneaway(y_test, y_pred_lr)

print("Test Accuracy: ", test_acc)
print("Test bingo: ", percent_bingo)
print("Test Oneaway: ", test_aphr_oneaway)

Train Accuracy:  0.352
Train Bingo:  0.334
Train Oneaway:  0.6909
Test Accuracy:  0.256
Test bingo:  0.211
Test Oneaway:  0.7167


Let us vectorize the text field and combine these with the other features as Sparse Matrices

In [21]:
vectorizer = TfidfVectorizer()

# board rating train vectorized
rating_vec = vectorizer.fit_transform(X_train["board_rating_reason"])

text_feature_cols = vectorizer.vocabulary_
rev_dictionary = {v:k for k,v in vectorizer.vocabulary_.items()}
cols_text_features = [v for k,v in rev_dictionary.items()]
X_train_transformed2 = sp.hstack([X_train_transformed, rating_vec])


# vectorizing test data
rating_vec_test = vectorizer.transform(X_test["board_rating_reason"])
X_test_transformed2 = sp.hstack([X_test_transformed, rating_vec_test])

In [22]:
lr_clf = LogisticRegression(multi_class="ovr", C=0.7)
lr_clf.fit(X_train_transformed2, y_train)

LogisticRegression(C=0.7, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [23]:
y_pred_train_lr2 = lr_clf.predict(X_train_transformed2)
train_acc = metrics.accuracy(y_train, y_pred_train_lr2)
count_bingo, aphr_bingo, percent_bingo = metrics.aphr_bingo(y_train, y_pred_train_lr2)
train_aphr_oneaway = metrics.aphr_oneaway(y_train, y_pred_train_lr2)
print("Train Accuracy: ", train_acc)
print("Train Bingo: ", percent_bingo)
print("Train Oneaway: ", train_aphr_oneaway)
print("==================================")


y_pred_lr2 = lr_clf.predict(X_test_transformed2)
test_acc = metrics.accuracy(y_test, y_pred_lr2)
count_bingo, aphr_bingo, percent_bingo = metrics.aphr_bingo(y_test, y_pred_lr2)
test_aphr_oneaway = metrics.aphr_oneaway(y_test, y_pred_lr2)
print("Test Accuracy: ", test_acc)
print("Test Bingo: ", percent_bingo)
print("Test Oneaway: ", test_aphr_oneaway)

Train Accuracy:  0.452
Train Bingo:  0.424
Train Oneaway:  0.7165
Test Accuracy:  0.261
Test Bingo:  0.229
Test Oneaway:  0.7222


_There is a lot of bias error, even with regularization._

__The Bingo Percentage seems to have increased after including text features.
Now let's try other classifiers.__

## SVM

In [24]:
# linear kernel seems to give the best bingo percentage

clf_svm = SVC(C=0.5, kernel='linear')
clf_svm.fit(X_train_transformed, y_train) 

SVC(C=0.5, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [26]:
y_pred_train_svm = clf_svm.predict(X_train_transformed)
train_acc = metrics.accuracy(y_train, y_pred_train_svm)
count_bingo, aphr_bingo, percent_bingo = metrics.aphr_bingo(y_train, y_pred_train_svm)
train_aphr_oneaway = metrics.aphr_oneaway(y_train, y_pred_train_svm)
print("Train Accuracy: ", train_acc)
print("Train Bingo: ", percent_bingo)
print("Train Oneaway: ", train_aphr_oneaway)
print("==================================")


y_pred_svm = clf_svm.predict(X_test_transformed)
test_acc = metrics.accuracy(y_test, y_pred_svm)
count_bingo, aphr_bingo, percent_bingo = metrics.aphr_bingo(y_test, y_pred_svm)
test_aphr_oneaway = metrics.aphr_oneaway(y_test, y_pred_svm)
print("Test Accuracy: ", test_acc)
print("Test Bingo: ", percent_bingo)
print("Test Oneaway: ", test_aphr_oneaway)

Train Accuracy:  0.343
Train Bingo:  0.302
Train Oneaway:  0.6781
Test Accuracy:  0.278
Test Bingo:  0.249
Test Oneaway:  0.7278


__Note: The optimizations on the hyperparameters C and kernel has been made looking at how much they optimize the 
oneaway score.__

### SVM with text features included

In [27]:
clf_svm = SVC(C=1, kernel='linear')
clf_svm.fit(X_train_transformed2, y_train) 

y_pred_train_svm2 = clf_svm.predict(X_train_transformed2)
train_acc = metrics.accuracy(y_train, y_pred_train_svm2)
count_bingo, aphr_bingo, percent_bingo = metrics.aphr_bingo(y_train, y_pred_train_svm2)
train_aphr_oneaway = metrics.aphr_oneaway(y_train, y_pred_train_svm2)
print("Train Accuracy: ", train_acc)
print("Train Bingo: ", percent_bingo)
print("Train Oneaway: ", train_aphr_oneaway)
print("==================================")


y_pred_svm2 = clf_svm.predict(X_test_transformed2)
test_acc = metrics.accuracy(y_test, y_pred_svm2)
count_bingo, aphr_bingo, percent_bingo = metrics.aphr_bingo(y_test, y_pred_svm2)
test_aphr_oneaway = metrics.aphr_oneaway(y_test, y_pred_svm2)
print("Test Accuracy: ", test_acc)
print("Test Bingo: ", percent_bingo)
print("Test Oneaway: ", test_aphr_oneaway)

Train Accuracy:  0.522
Train Bingo:  0.525
Train Oneaway:  0.7569
Test Accuracy:  0.283
Test Bingo:  0.263
Test Oneaway:  0.7444


Let's try an ensemble model

## Random Forests

In [28]:
clf_rf = RandomForestClassifier(n_estimators=30, max_depth=8)
clf_rf.fit(X_train_transformed, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=8, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [29]:
y_pred_train_rf = clf_rf.predict(X_train_transformed)
train_acc = metrics.accuracy(y_train, y_pred_train_rf)
count_bingo, aphr_bingo, percent_bingo = metrics.aphr_bingo(y_train, y_pred_train_rf)
train_aphr_oneaway = metrics.aphr_oneaway(y_train, y_pred_train_rf)
print("Train Accuracy: ", train_acc)
print("Train Bingo: ", percent_bingo)
print("Train Oneaway: ", train_aphr_oneaway)
print("==================================")


y_pred_rf = clf_rf.predict(X_test_transformed)
test_acc = metrics.accuracy(y_test, y_pred_rf)
count_bingo, aphr_bingo, percent_bingo = metrics.aphr_bingo(y_test, y_pred_rf)
test_aphr_oneaway = metrics.aphr_oneaway(y_test, y_pred_rf)
print("Test Accuracy: ", test_acc)
print("Test Bingo: ", percent_bingo)
print("Test Oneaway: ", test_aphr_oneaway)

Train Accuracy:  0.494
Train Bingo:  0.465
Train Oneaway:  0.7382
Test Accuracy:  0.278
Test Bingo:  0.208
Test Oneaway:  0.7222


In [30]:
clf_rf = RandomForestClassifier(n_estimators=35, max_depth=6)
clf_rf.fit(X_train_transformed2, y_train) 

y_pred_train_rf2 = clf_svm.predict(X_train_transformed2)
train_acc = metrics.accuracy(y_train, y_pred_train_rf2)
count_bingo, aphr_bingo, percent_bingo = metrics.aphr_bingo(y_train, y_pred_train_rf2)
train_aphr_oneaway = metrics.aphr_oneaway(y_train, y_pred_train_rf2)
print("Train Accuracy: ", train_acc)
print("Train Bingo: ", percent_bingo)
print("Train Oneaway: ", train_aphr_oneaway)
print("==================================")


y_pred_rf2 = clf_rf.predict(X_test_transformed2)
test_acc = metrics.accuracy(y_test, y_pred_rf2)
count_bingo, aphr_bingo, percent_bingo = metrics.aphr_bingo(y_test, y_pred_rf2)
test_aphr_oneaway = metrics.aphr_oneaway(y_test, y_pred_rf2)
print("Test Accuracy: ", test_acc)
print("Test Bingo: ", percent_bingo)
print("Test Oneaway: ", test_aphr_oneaway)

Train Accuracy:  0.522
Train Bingo:  0.525
Train Oneaway:  0.7569
Test Accuracy:  0.267
Test Bingo:  0.201
Test Oneaway:  0.7556


__The max_depth and n_estimator parameters have been tuned to optimize the oneaway score__

## Boosted Trees: XGBoost

In [31]:
clf_xgb = xgb.XGBClassifier(max_depth=3, nthread=-1)
clf_xgb.fit(X_train_transformed, y_train)


y_pred_train_xgb = clf_xgb.predict(X_train_transformed)
train_acc = metrics.accuracy(y_train, y_pred_train_xgb)
count_bingo, aphr_bingo, percent_bingo = metrics.aphr_bingo(y_train, y_pred_train_xgb)
train_aphr_oneaway = metrics.aphr_oneaway(y_train, y_pred_train_xgb)
print("Train Accuracy: ", train_acc)
print("Train Bingo: ", percent_bingo)
print("Train Oneaway: ", train_aphr_oneaway)
print("==================================")


y_pred_xgb = clf_xgb.predict(X_test_transformed)
test_acc = metrics.accuracy(y_test, y_pred_xgb)
count_bingo, aphr_bingo, percent_bingo = metrics.aphr_bingo(y_test, y_pred_xgb)
test_aphr_oneaway = metrics.aphr_oneaway(y_test, y_pred_xgb)
print("Test Accuracy: ", test_acc)
print("Test Bingo: ", percent_bingo)
print("Test Oneaway: ", test_aphr_oneaway)

Train Accuracy:  0.411
Train Bingo:  0.409
Train Oneaway:  0.7156
Test Accuracy:  0.261
Test Bingo:  0.254
Test Oneaway:  0.7333


In [32]:
clf_xgb = xgb.XGBClassifier(max_depth=5, nthread=-1)
clf_xgb.fit(X_train_transformed2, y_train)


y_pred_train_xgb2 = clf_xgb.predict(X_train_transformed2)
train_acc = metrics.accuracy(y_train, y_pred_train_xgb2)
count_bingo, aphr_bingo, percent_bingo = metrics.aphr_bingo(y_train, y_pred_train_xgb2)
train_aphr_oneaway = metrics.aphr_oneaway(y_train, y_pred_train_xgb2)
print("Train Accuracy: ", train_acc)
print("Train Bingo: ", percent_bingo)
print("Train Oneaway: ", train_aphr_oneaway)
print("==================================")


y_pred_xgb2 = clf_xgb.predict(X_test_transformed2)
test_acc = metrics.accuracy(y_test, y_pred_xgb2)
count_bingo, aphr_bingo, percent_bingo = metrics.aphr_bingo(y_test, y_pred_xgb2)
test_aphr_oneaway = metrics.aphr_oneaway(y_test, y_pred_xgb2)
print("Test Accuracy: ", test_acc)
print("Test Bingo: ", percent_bingo)
print("Test Oneaway: ", test_aphr_oneaway)

Train Accuracy:  0.871
Train Bingo:  0.894
Train Oneaway:  0.939
Test Accuracy:  0.272
Test Bingo:  0.252
Test Oneaway:  0.7111


__The max_depth parameter is tuned to optimize the oneaway scores__

## Fusion

Let's build a classifier that uses voting of classification choices from the classifiers above.

In [44]:
# construct a dataframe of all the test data predictions
pred_fusion = pd.concat([pd.Series(list(y_pred_lr2), name="lr"), 
                         pd.Series(list(y_pred_svm2), name="svm"), 
                         pd.Series(list(y_pred_rf2), name="rf"), 
                         pd.Series(list(y_pred_xgb2), name="xgb")], axis=1)

In [62]:
# Pick the class which has been voted the maximum count - otherwise pick the one which Random Forest chose 
# (since RF had maximum one away score)

def vote(candidates):
    # default vote to Random Forest candidate
    default_vote = candidates[2]
    
    most_common = Counter(candidates).most_common()
    if(len(most_common) == 1):
        default_vote = most_common[0][0]
    
    return default_vote
    
pred_fusion["prediction"] = pred_fusion.apply(lambda row: vote([row["lr"], row["svm"], row["rf"], row["xgb"]]), axis=1)

pred_fusion.head(10)

Unnamed: 0,lr,svm,rf,xgb,prediction
0,4,4,4,4,4
1,2,2,3,3,3
2,4,4,4,4,4
3,3,3,3,3,3
4,4,4,4,4,4
5,1,1,1,1,1
6,4,2,4,5,4
7,3,3,3,3,3
8,3,3,3,4,3
9,3,2,3,2,3


__Let's see if this improves the one away score__

In [63]:
fusion_test_acc = metrics.accuracy(y_test, pred_fusion["prediction"])
count_bingo, aphr_bingo, percent_bingo = metrics.aphr_bingo(y_test, pred_fusion["prediction"])
test_aphr_oneaway = metrics.aphr_oneaway(y_test, pred_fusion["prediction"])
print("Test Accuracy: ", test_acc)
print("Test Bingo: ", percent_bingo)
print("Test Oneaway: ", test_aphr_oneaway)

Test Accuracy:  0.272
Test Bingo:  0.201
Test Oneaway:  0.7556
