In [47]:
import os 
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [48]:
# os.chdir("Desktop")

In [49]:
#Load the preprocessed training set

df= pd.read_csv("hotstar_preprocessed_training_set.csv")

In [50]:
#Re-index the training set 

df.index = df["Unnamed: 0"]

In [51]:
#Drop the redundant index column

df.drop(["Unnamed: 0"],1,inplace=True)

In [52]:
X= df

In [53]:
#From the original non-preprocessed dataset, retrieve the target variable column

original_df= pd.read_json("train_data.json")
original_df= original_df.T
original_df=original_df[:10000]

In [54]:
#Check for class imbalances

original_df["segment"].value_counts()

neg    9250
pos     750
Name: segment, dtype: int64

In [55]:
#Converting the target variable to continuous values
original_df["segment"]= original_df["segment"].apply(lambda x:1 if x =="pos" else 0)

In [56]:
y= original_df["segment"]

In [57]:
#Fill all null values in the feature set X with 0's

X.fillna(0,inplace=True)

In [58]:
#Break the data into training and validation sets

from sklearn.model_selection import train_test_split as tts
X_train, X_val, y_train, y_val= tts(X,y,test_size=0.3,random_state= 42)

In [102]:
#Since the dataset is highly imbalanced, we will use SMOTE to oversample the minority class

from imblearn.over_sampling import SMOTE

In [103]:
sm= SMOTE(random_state=42)

In [104]:
X_train_res, y_train_res = sm.fit_sample(X_train, y_train.ravel())

In [105]:
# Now lets redo all the above models with SMOTE- oversampling of the minority class data
#This is done to remove the class imbalance in the training dataset

In [129]:
logreg.fit(X_train_res,y_train_res)
smote_logreg= logreg.predict(X_val)
print ("Accuracy Score for Logistic Regression is:" + str(accuracy_score(y_val,smote_logreg)))

Accuracy Score for Logistic Regression is:0.718


In [131]:
dtc.fit(X_train_res,y_train_res)
smote_dtc= dtc.predict(X_val)
print ("Accuracy Score for Decision Tree is:" + str(accuracy_score(y_val,smote_dtc)))

Accuracy Score for Decision Tree is:0.8216666666666667


In [133]:
rfc.fit(X_train_res,y_train_res)
smote_rfc= rfc.predict(X_val)
print ("Accuracy Score for the Random Forest Classifier is:" + str(accuracy_score(y_val,smote_rfc)))

Accuracy Score for the Random Forest Classifier is:0.885


In [134]:
svc.fit(X_train_res,y_train_res)
smote_svc= svc.predict(X_val)
print ("Accuracy Score for the Support Vector Classifier is:" + str(accuracy_score(y_val,smote_svc)))

Accuracy Score for the Support Vector Classifier is:0.911


In [135]:
#Now lets use some boosting algorithms
ada.fit(X_train_res,y_train_res)
smote_ada= ada.predict(X_val)
print ("Accuracy Score for the AdaBoost Classifier is:" + str(accuracy_score(y_val,smote_ada)))

Accuracy Score for the AdaBoost Classifier is:0.7956666666666666


In [136]:
from sklearn.ensemble import GradientBoostingClassifier
gb= GradientBoostingClassifier()

In [138]:
gb.fit(X_train_res,y_train_res)
smote_gb= gb.predict(X_val)
print ("Accuracy Score for the Gradient Boosting Classifier is:" + str(accuracy_score(y_val,smote_gb)))

Accuracy Score for the Gradient Boosting Classifier is:0.8133333333333334


In [139]:
from sklearn.ensemble import BaggingClassifier

In [140]:
bagging = BaggingClassifier()

In [141]:
bagging.fit(X_train_res,y_train_res)
smote_bagging= bagging.predict(X_val)
roc_auc_score(y_val,smote_bagging)
print ("Accuracy Score for the Bagging Classifier is:" + str(accuracy_score(y_val,smote_bagging)))

Accuracy Score for the Bagging Classifier is:0.867


In [142]:
from sklearn.ensemble import VotingClassifier

In [143]:
vc = VotingClassifier(estimators=[("svc", svc), ('logreg', logreg),("dtc",dtc)])

In [144]:
vc.fit(X_train_res,y_train_res)
smote_vc= vc.predict(X_val)
roc_auc_score(y_val,smote_vc)
print ("Accuracy Score for the Voting Classfier Classifier is:" + str(accuracy_score(y_val,smote_vc)))

Accuracy Score for the Voting Classfier Classifier is:0.872


In [147]:
#In this particular case I would go with the Voting Classifier ensemble method as my final model
#SMOTE- oversampling - improved model performance 