In [1]:
!pip install vecstack

Collecting vecstack
  Downloading https://files.pythonhosted.org/packages/d0/a1/b9a1e9e9e5a12078da1ab9788c7885e4c745358f7e57d5f94d9db6a4e898/vecstack-0.4.0.tar.gz
Building wheels for collected packages: vecstack
  Building wheel for vecstack (setup.py) ... [?25l[?25hdone
  Created wheel for vecstack: filename=vecstack-0.4.0-cp36-none-any.whl size=19879 sha256=0d48aed40fecd5c92acb1621caa0120ed8a921d966612818a7c5b3e6a6ff256a
  Stored in directory: /root/.cache/pip/wheels/5f/bb/4e/f6488433d53bc0684673d6845e5bf11a25240577c8151c140e
Successfully built vecstack
Installing collected packages: vecstack
Successfully installed vecstack-0.4.0


In [0]:
from vecstack import stacking
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix

from imblearn.over_sampling import SMOTE 
from sklearn.svm import SVC
from collections import Counter

import warnings
warnings.filterwarnings("ignore")

In [5]:
# To upload our datasets from our working directory we need to mount our drive contents to the colab environment. 
# For the code to do so you can search “mount” in code snippets or use the code given below. 
# Our entire drive contents are now mounted on colab at the location “/gdrive”.

from google.colab import drive
drive.mount('/gdrive')
# Change current working directory to gdrive
%cd /gdrive

trainfile = r'/gdrive/My Drive/Courses/CIS508/RevisedHomesiteTrain.csv'
train_data = pd.read_csv(trainfile)

testfile = r'/gdrive/My Drive/Courses/CIS508/RevisedHomesiteTest.csv'
test_data = pd.read_csv(testfile)

print(train_data.shape)
print(train_data.head()) 

print(test_data.shape)
print(test_data.head()) 

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
/gdrive
(65000, 596)
   CoverageField11A  ...  QuoteConversion_Flag
0                 2  ...                     0
1                 5  ...                     0
2                 4  ...                     0
3                15  ...                     0
4                 4  ...                     0

[5 rows x 596 columns]
(173836, 596)
   CoverageField11A  CoverageField11B  ...  GeographicField64_TX  GeographicField64
0                13                22  ...                     0                 IL
1                 4                 5  ...                     0                 NJ
2                 3                 3  ...                     0                 NJ
3                 5                 9  ...                     0                 TX
4                12                21  ...                     0                 CA

[5 rows x 596 columns]


In [8]:
# Copy Train data excluding target
trainData_Copy = train_data.drop('QuoteNumber', axis=1).iloc[:, :-1].copy()
testData_Copy = test_data.drop('QuoteNumber', axis = 1).iloc[:, :-1].copy()

# Separate Train data and test data
X_train = trainData_Copy
X_test = testData_Copy
y_train = train_data["QuoteConversion_Flag"]

# Select just Target Column
y_train = train_data.iloc[:, -1]
y_test = test_data.iloc[:, -1]

print(X_train.shape)
print(X_test.head()) 

print(y_train.shape)
print(y_test.head()) 

(65000, 594)
   CoverageField11A  ...  GeographicField64_TX
0                13  ...                     0
1                 4  ...                     0
2                 3  ...                     0
3                 5  ...                     0
4                12  ...                     0

[5 rows x 594 columns]
(65000,)
0    IL
1    NJ
2    NJ
3    TX
4    CA
Name: GeographicField64, dtype: object


In [0]:
# Create Validation Dataset from Training Dataset
from sklearn.model_selection import train_test_split
X_train1, X_val, y_train1, y_val = train_test_split(X_train, y_train, test_size = 0.2)

In [15]:
# Default Decision Tree and Obtain Respective Accuracy
clf = DecisionTreeClassifier()
clf.fit(X_train1, y_train1)
clf_predict = clf.predict(X_val)
print("accuracy Score (training) for Decision TreE:{0:6f}".format(clf.score(X_val, y_val)))
print("Confusion Matrix for Decision Tree")
print(confusion_matrix(y_val,clf_predict))

accuracy Score (training) for Decision TreE:0.880308
Confusion Matrix for Decision Tree
[[9782  743]
 [ 813 1662]]


In [0]:
pred_proba = clf.predict_proba(X_test)
pd.concat([test_data['QuoteNumber'], pd.DataFrame(pred_proba[:, 1], columns = ['QuoteConversion_Flag'])], axis = 1).to_csv('/results_decisiontree.csv', index = None)
from google.colab import files
files.download('/results_decisiontree.csv')

In [0]:
# Default Random Forest and Obtain Respective Accuracy
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
rfc_predict = rfc.predict(X_test)
print("Accuracy Score (training) for RandomForest:{0:6f}".format(mlp.score(X_val, y_val)))

pred_proba = rfc.predict_proba(X_test)
pd.concat([test_data['QuoteNumber'], pd.DataFrame(pred_proba[:, 1], columns = ['QuoteConversion_Flag'])], axis = 1).to_csv('/results_randomforest.csv', index = None)

Accuracy Score (training) for RandomForest:0.992308


In [0]:
files.download('/results_randomforest.csv')

In [0]:
# Default Gradient Boosting and Obtain Respective Accuracy 
search_grid = {'n_estimators':[5, 10, 20, 30, 50], 'learning_rate':[0.01, .1]}
abc = GradientBoostingClassifier()
abc.fit(X_train, y_train)
abc_predict = abc.predict(X_test)
print("Accuracy Score (training) for Boosting:{0:6f}".format(abc.score(X_val, y_val)))

pred_proba = abc.predict_proba(X_test)
pd.concat([test_data['QuoteNumber'], pd.DataFrame(pred_proba[:, 1], columns = ['QuoteConversion_Flag'])], axis = 1).to_csv('/results_gradientboosting.csv', index = None)

Accuracy Score (training) for Boosting:0.924462


In [0]:
files.download('/results_gradientboosting.csv')

In [24]:
# Default Neural Network and Obtain Respective Accuracy 
mlp = MLPClassifier()
mlp.fit(X_train, y_train)
mlp_predict = mlp.predict(X_test)
print("Accuracy Score (training) for NeuralNetwork:{0:6f}".format(mlp.score(X_val, y_val)))

pred_proba = mlp.predict_proba(X_test)
pd.concat([test_data['QuoteNumber'], pd.DataFrame(pred_proba[:, 1], columns = ['QuoteConversion_Flag'])], axis = 1).to_csv('/results_neuralnetwork.csv', index = None)

Accuracy Score (training) for NeuralNetwork:0.563462


In [0]:
files.download('/results_neuralnetwork.csv')

In [30]:
# Default K-Nearest Neighbor and Obtain Respective Accuracy 
knc = KNeighborsClassifier()
knc.fit(X_train, y_train)
knc_predict = knc.predict(X_test)
print("Accuracy Score (training) for KNearestNeighbor:{0:6f}".format(knc.score(X_val, y_val)))

pred_proba = knc.predict_proba(X_test)
pd.concat([test_data['QuoteNumber'], pd.DataFrame(pred_proba[:, 1], columns = ['QuoteConversion_Flag'])], axis = 1).to_csv('/results_knearestneighbor.csv', index = None)

Accuracy Score (training) for KNearestNeighbor:0.835538


In [0]:
files.download('/results_knearestneighbor.csv')

In [0]:
# Stacking Models
print("Ensemble Methods Predictions using GradientBoosting, RandomForest and Decision Tree Classifier\n")

models = [GradientBoostingClassifier(**grid_parm_abc), RandomForestClassifier(**grid_parm_rfc), DecisionTreeClassifier(**grid_parm)]
      
S_Train, S_Test = stacking(models,                   
                           X_res, y_res, X_test,   
                           regression = False, 
                           mode = 'oof_pred_bag', 
                           needs_proba = False,
                           save_dir = None, 
                           metric = accuracy_score, 
                           n_folds = 4, 
                           stratified = True,
                           shuffle = True,  
                           random_state = 0,    
                           verbose = 2)

Ensemble Methods Predictions using GradientBoosting, RandomForest and Decision Tree Classifier

task:         [classification]
n_classes:    [2]
metric:       [accuracy_score]
mode:         [oof_pred_bag]
n_models:     [3]

model  0:     [GradientBoostingClassifier]
    fold  0:  [0.93981797]
    fold  1:  [0.94084187]
    fold  2:  [0.93882737]
    fold  3:  [0.94159587]
    ----
    MEAN:     [0.94027077] + [0.00104529]
    FULL:     [0.94027077]

model  1:     [RandomForestClassifier]
    fold  0:  [0.93871824]
    fold  1:  [0.94076602]
    fold  2:  [0.94068568]
    fold  3:  [0.94053398]
    ----
    MEAN:     [0.94017598] + [0.00084574]
    FULL:     [0.94017596]

model  2:     [DecisionTreeClassifier]
    fold  0:  [0.93348502]


In [0]:
# Stacking - Construct a Gradient Boosting Model
model = GradientBoostingClassifier()
    
model = model.fit(S_Train, y_res)
y_pred = model.predict(S_Test)

pred_proba = model.predict_proba(S_Test)
pd.concat([test_data['QuoteNumber'], pd.DataFrame(pred_proba[:, 1], columns = ['QuoteConversion_Flag'])], axis = 1).to_csv('/results_gradientboosting_stacking.csv', index = None)

In [0]:
files.download('/results_gradientboosting_stacking.csv')