In [1]:
import os 
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [4]:
os.chdir("Desktop")

In [5]:
#Load the preprocessed training set

df= pd.read_csv("hotstar_preprocessed_training_set.csv")

In [6]:
#Re-index the training set 

df.index = df["Unnamed: 0"]

In [7]:
#Drop the redundant index column

df.drop(["Unnamed: 0"],1,inplace=True)

In [8]:
X= df

In [9]:
#From the original non-preprocessed dataset, retrieve the target variable column

original_df= pd.read_json("train_data.json")
original_df= original_df.T
original_df=original_df[:10000]

In [10]:
#Check for class imbalances

original_df["segment"].value_counts()

neg    9250
pos     750
Name: segment, dtype: int64

In [11]:
#Converting the target variable to continuous values
original_df["segment"]= original_df["segment"].apply(lambda x:1 if x =="pos" else 0)

In [12]:
y= original_df["segment"]

In [13]:
#Fill all null values in the feature set X with 0's

X.fillna(0,inplace=True)

In [14]:
#Break the data into training and validation sets

from sklearn.model_selection import train_test_split as tts
X_train, X_val, y_train, y_val= tts(X,y,test_size=0.3,random_state= 42)

In [15]:
#Since the dataset is highly imbalanced, we will use SMOTE to oversample the minority class

from imblearn.over_sampling import SMOTE

In [16]:
sm= SMOTE(random_state=42)

In [17]:
X_train_res, y_train_res = sm.fit_sample(X_train, y_train.ravel())

In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

In [23]:
#Instantiate the classifier
logreg= LogisticRegression()

In [24]:
logreg.fit(X_train_res,y_train_res)
smote_logreg= logreg.predict(X_val)
print ("Accuracy Score for Logistic Regression is:" + str(accuracy_score(y_val,smote_logreg)))

Accuracy Score for Logistic Regression is:0.718


In [30]:
confusion_matrix(y_val,smote_logreg)

array([[2009,  744],
       [ 102,  145]])

In [34]:
#Let us know use a Decision Tree classifier which is good at capturing non linear relationships in the data
from sklearn.tree import DecisionTreeClassifier
dtc= DecisionTreeClassifier()

In [35]:
dtc.fit(X_train_res,y_train_res)
smote_dtc= dtc.predict(X_val)
print ("Accuracy Score for Decision Tree is:" + str(accuracy_score(y_val,smote_dtc)))

Accuracy Score for Decision Tree is:0.8223333333333334


In [36]:
confusion_matrix(y_val,smote_dtc)

array([[2393,  360],
       [ 173,   74]])

In [None]:
#We can see that the accuracy score for this algorithm is higher than that of logistic regression

In [37]:
#Let us try a random forest classifier next
from sklearn.ensemble import RandomForestClassifier
rfc= RandomForestClassifier()

In [38]:
rfc.fit(X_train_res,y_train_res)
smote_rfc= rfc.predict(X_val)
print ("Accuracy Score for the Random Forest Classifier is:" + str(accuracy_score(y_val,smote_rfc)))

Accuracy Score for the Random Forest Classifier is:0.888


In [39]:
confusion_matrix(y_val,smote_rfc)

array([[2632,  121],
       [ 215,   32]])

In [40]:
#Support vector machines
from sklearn.svm import SVC
svc= SVC()

In [41]:
svc.fit(X_train_res,y_train_res)
smote_svc= svc.predict(X_val)
print ("Accuracy Score for the Support Vector Classifier is:" + str(accuracy_score(y_val,smote_svc)))

Accuracy Score for the Support Vector Classifier is:0.911


In [42]:
confusion_matrix(y_val,smote_svc)

array([[2732,   21],
       [ 246,    1]])

In [None]:
#Interestingly this model despite having the best overall accuracy score has the highest number of false negatives

In [None]:
#It looks as though our svc model may be overfitted to the training data.
#I will attempt a grid search CV to see if this improves the overall score

In [45]:
from sklearn.model_selection import GridSearchCV

In [47]:
C = [0.001, 0.01, 0.1, 1, 10]
param_grid = {'C': C}
grid_search = GridSearchCV(svc, param_grid, cv=5)
grid_search.fit(X, y)
grid_search.best_params_

{'C': 0.001}

In [49]:
svc= SVC(C= 0.001)
svc.fit(X_train,y_train)

SVC(C=0.001, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [50]:
smote_svc= svc.predict(X_val)
print ("The regularized accuracy score for the Support Vector Classifier is:" + str(accuracy_score(y_val,smote_svc)))

The regularized accuracy score for the Support Vector Classifier is:0.9176666666666666


In [57]:
confusion_matrix(y_val,smote_svc)

array([[2753,    0],
       [ 247,    0]])

In [55]:
#Let us try a boosting model on our dataset
from sklearn.ensemble import AdaBoostClassifier
ada= AdaBoostClassifier()

In [56]:
#Now lets use some boosting algorithms
ada.fit(X_train_res,y_train_res)
smote_ada= ada.predict(X_val)
print ("Accuracy Score for the AdaBoost Classifier is:" + str(accuracy_score(y_val,smote_ada)))

Accuracy Score for the AdaBoost Classifier is:0.7956666666666666


In [58]:
confusion_matrix(y_val,smote_ada)

array([[2283,  470],
       [ 143,  104]])

In [None]:
#Adaboost classifier gives us a large number of False Positives

In [59]:
from sklearn.ensemble import GradientBoostingClassifier
gb= GradientBoostingClassifier()

In [60]:
gb.fit(X_train_res,y_train_res)
smote_gb= gb.predict(X_val)
print ("Accuracy Score for the Gradient Boosting Classifier is:" + str(accuracy_score(y_val,smote_gb)))

Accuracy Score for the Gradient Boosting Classifier is:0.8133333333333334


In [61]:
confusion_matrix(y_val,smote_gb)

array([[2334,  419],
       [ 141,  106]])

In [62]:
from sklearn.ensemble import BaggingClassifier

In [63]:
bagging = BaggingClassifier()

In [65]:
bagging.fit(X_train_res,y_train_res)
smote_bagging= bagging.predict(X_val)
print ("Accuracy Score for the Bagging Classifier is:" + str(accuracy_score(y_val,smote_bagging)))

Accuracy Score for the Bagging Classifier is:0.868


In [66]:
confusion_matrix(y_val,smote_bagging)

array([[2555,  198],
       [ 198,   49]])

In [67]:
from sklearn.ensemble import VotingClassifier

In [68]:
vc = VotingClassifier(estimators=[("svc", svc), ('logreg', logreg),("dtc",dtc)])

In [70]:
vc.fit(X_train_res,y_train_res)
smote_vc= vc.predict(X_val)
print ("Accuracy Score for the Voting Classfier Classifier is:" + str(accuracy_score(y_val,smote_vc)))

Accuracy Score for the Voting Classfier Classifier is:0.8696666666666667


In [71]:
confusion_matrix(y_val,smote_vc)

array([[2562,  191],
       [ 200,   47]])

In [147]:
#In this particular case I would go with Random forest algorithm as my final model
#The boosting algorithms had very high false positive rates
#SVM by itself has a high accuracy rate but prone to overfitting to the training set
#Also SVM also has a high false negative rate