In [2]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [17]:
import pandas as pd
import numpy as np
from collections import Counter

from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
import seaborn as sns
sns.set()

from scipy import sparse as sp

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

import metrics

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

In [4]:
train = pd.read_excel("Training sheet.xlsx")
test = pd.read_excel("Scoring Sheet.xlsx")

In [5]:
X = train[['movie_sequel', 'creative_type', 'source', 'production_method',
          'genre', 'language', 'board_rating_reason', 'movie_board_rating_display_name',
          'movie_release_pattern_display_name']]
y = train["Category"]

In [6]:
# encoding other categorical variables by one hot encoding
creative_type_cols = pd.get_dummies(train['creative_type'])
source_cols = pd.get_dummies(train['source'])
production_method_cols = pd.get_dummies(train['production_method'])
genre_cols = pd.get_dummies(train['genre'])
language_cols = pd.get_dummies(train['language'])
rating_cols = pd.get_dummies(train['movie_board_rating_display_name'])
release_pattern_display_name = pd.get_dummies(train['movie_release_pattern_display_name'])

# encoding other categorical variables by one hot encoding
creative_type_cols_test = pd.get_dummies(test['creative_type'])
source_cols_test = pd.get_dummies(test['source'])
production_method_cols_test = pd.get_dummies(test['production_method'])
genre_cols_test = pd.get_dummies(test['genre'])
language_cols_test = pd.get_dummies(test['language'])
rating_cols_test = pd.get_dummies(test['movie_board_rating_display_name'])
release_pattern_display_name_test = pd.get_dummies(test['movie_release_pattern_display_name'])

In [7]:
train_transformed = pd.concat([creative_type_cols, source_cols, 
                               production_method_cols, genre_cols, 
                               language_cols, rating_cols, release_pattern_display_name], axis=1)

test_transformed = pd.concat([creative_type_cols_test, source_cols_test, 
                              production_method_cols_test, genre_cols_test, 
                              language_cols_test, rating_cols_test, release_pattern_display_name_test], axis=1)

In [8]:
train_features = train_transformed.columns
test_features = test_transformed.columns

# features in train which are not present in test
features_ = [feature for feature in train_features if feature not in test_features]

# Setting the one hot encoding of the absent feature to  zero in test data
for feature in features_:
    test_transformed[feature] = 0
    
# New levels which have come for columns in test have to be removed now (to avoid the not seen error) - 
# so, taking only the columns which were present in train data

test_transformed = test_transformed[train_features]

In [9]:
vectorizer = TfidfVectorizer()

# board rating train vectorized
rating_vec = vectorizer.fit_transform(train["board_rating_reason"])

text_feature_cols = vectorizer.vocabulary_
rev_dictionary = {v:k for k,v in vectorizer.vocabulary_.items()}
cols_text_features = [v for k,v in rev_dictionary.items()]
train_transformed2 = sp.hstack([train_transformed, rating_vec])


# # # vectorizing test data
rating_vec_test = vectorizer.transform(test["board_rating_reason"])
test_transformed2 = sp.hstack([test_transformed, rating_vec_test])

### Choosing two classifiers which provided the best scores during testing

## Random Forest Classifier

Random Forest Classifier seems to have done the best on the classification procedure.
Using it for final prediction.

In [10]:
clf_rf = RandomForestClassifier(n_estimators=30, max_depth=8)
clf_rf.fit(train_transformed, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=8, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [11]:
predictions1 = pd.Series(clf_rf.predict(test_transformed), name="Predictions")
predictions1.to_csv("prediction1.csv", index=False)

## Fusion Classifier

In [13]:
clf_lr = LogisticRegression(multi_class="ovr", C=0.7)
clf_lr.fit(train_transformed, y)
pred_lr = clf_lr.predict(test_transformed)

clf_svm = SVC(C=0.5, kernel='linear')
clf_svm.fit(train_transformed, y)
pred_svm = clf_svm.predict(test_transformed)

clf_rf = RandomForestClassifier(n_estimators=30, max_depth=8)
clf_rf.fit(train_transformed, y)
pred_rf = clf_rf.predict(test_transformed)

clf_xgb = xgb.XGBClassifier(max_depth=5, nthread=-1)
clf_xgb.fit(train_transformed, y)
pred_xgb = clf_xgb.predict(test_transformed)

In [14]:
# construct a dataframe of all the test data predictions
pred_fusion = pd.concat([pd.Series(list(pred_lr), name="lr"), 
                         pd.Series(list(pred_svm), name="svm"), 
                         pd.Series(list(pred_rf), name="rf"), 
                         pd.Series(list(pred_xgb), name="xgb")], axis=1)

In [15]:
pred_fusion.head()

Unnamed: 0,lr,svm,rf,xgb
0,6,6,6,9
1,7,6,7,7
2,3,3,3,3
3,4,4,4,8
4,9,9,9,9


In [18]:
def vote(candidates):
    # default vote to Random Forest candidate
    default_vote = candidates[2]
    
    most_common = Counter(candidates).most_common()
    if(len(most_common) == 1):
        default_vote = most_common[0][0]
    
    return default_vote
    
pred_fusion["prediction"] = pred_fusion.apply(lambda row: vote([row["lr"], row["svm"], row["rf"], row["xgb"]]), axis=1)
pred_fusion["prediction"].to_csv("prediction2.csv", index=False)

__Two submissions:__ 
    
    Note: The files are kept in the same folder
    
    1) predictions1.csv
    2) predictions2.csv