In [45]:
import pandas as pd
from scipy import stats
from statsmodels.stats import weightstats
import numpy as np

from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, cross_val_predict, GridSearchCV, train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, confusion_matrix

# Data Retrieval

In [28]:
orgin_stat_sig = pd.read_csv('orig_stat_sig.csv')
feature_eng = pd.read_csv('feature_engineered_cols.csv')

print(orgin_stat_sig.head())

train_1, test_1 = train_test_split(orgin_stat_sig, train_size=0.8)

train_1_X = train_1.loc[:, 'Daily Time Spent on Site': 'Age']
train_1_y = train_1['Clicked on Ad']

print(feature_eng.head())

train_2, test_2 = train_test_split(feature_eng, train_size=0.8)

train_2_X = train_2.loc[:, 'Daily Time Spent on Site': 'Income/Length']
train_2_y = train_2['Clicked on Ad']

   Daily Time Spent on Site  Area Income  Daily Internet Usage       Age  \
0                  0.249267     0.509691              1.734030 -0.114905   
1                  0.961132     1.002530              0.313805 -0.570425   
2                  0.282083     0.356949              1.287589 -1.139826   
3                  0.577432    -0.014456              1.501580 -0.798185   
4                  0.212664     1.408868              1.038731 -0.114905   

   Clicked on Ad  
0            0.0  
1            0.0  
2            0.0  
3            0.0  
4            0.0  
   Daily Time Spent on Site       Age  Area Income  Daily Internet Usage  \
0                  0.249267 -0.114905     0.509691              1.734030   
1                  0.961132 -0.570425     1.002530              0.313805   
2                  0.282083 -1.139826     0.356949              1.287589   
3                  0.577432 -0.798185    -0.014456              1.501580   
4                  0.212664 -0.114905     1.40886

# Model Preparation

In [29]:
svc = SVC(random_state=42)
svc2 = SVC(random_state=42)

param_grid = [
    {
        'C': [0.1, 0.5, 1, 2.5, 5],
        'kernel': ['poly', 'rbf', 'sigmoid'],
        'degree': [3, 6, 9],
        'gamma': ['auto', 'scale']
    }
]

cv_len = 5

## Testing Just Statistically Significant Features First

In [30]:
svc_grid_search = GridSearchCV(svc, param_grid, cv=cv_len, scoring=['accuracy', 'f1'], refit='f1')

In [31]:
svc_grid_search.fit(train_1_X, train_1_y)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                           probability=False, random_state=42, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid=[{'C': [0.1, 0.5, 1, 2.5, 5], 'degree': [3, 6, 9],
                          'gamma': ['auto', 'scale'],
                          'kernel': ['poly', 'rbf', 'sigmoid']}],
             pre_dispatch='2*n_jobs', refit='f1', return_train_score=False,
             scoring=['accuracy', 'f1'], verbose=0)

In [32]:
svc_grid_search.best_estimator_

SVC(C=0.1, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=42, shrinking=True, tol=0.001,
    verbose=False)

In [33]:
svc_grid_search.best_params_

{'C': 0.1, 'degree': 3, 'gamma': 'auto', 'kernel': 'rbf'}

In [34]:
svc_grid_search.best_score_

0.968624993605227

## Testing Just New Features + Statistically Significant Features 

In [35]:
svc_grid_search_2 = GridSearchCV(svc2, param_grid, cv=cv_len, scoring=['accuracy', 'f1'], refit='f1')

In [36]:
svc_grid_search_2.fit(train_2_X, train_2_y)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                           probability=False, random_state=42, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid=[{'C': [0.1, 0.5, 1, 2.5, 5], 'degree': [3, 6, 9],
                          'gamma': ['auto', 'scale'],
                          'kernel': ['poly', 'rbf', 'sigmoid']}],
             pre_dispatch='2*n_jobs', refit='f1', return_train_score=False,
             scoring=['accuracy', 'f1'], verbose=0)

In [37]:
svc_grid_search_2.best_estimator_

SVC(C=0.5, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=42, shrinking=True, tol=0.001,
    verbose=False)

In [38]:
svc_grid_search_2.best_params_

{'C': 0.5, 'degree': 3, 'gamma': 'auto', 'kernel': 'rbf'}

In [39]:
svc_grid_search_2.best_score_

0.9698092330901374

# Statistical Signficance Test

In [59]:
cv_len = 300

def scores_info(scores):
#     print(f'scores: {scores}')
    print(f'mean: {scores.mean()}')
    print(f'std: {scores.std()}')
    print('')

model1 = svc_grid_search.best_estimator_
model2 = svc_grid_search_2.best_estimator_

scores1 = cross_val_score(model1, train_1_X, train_1_y, cv=cv_len)
scores2 = cross_val_score(model2, train_2_X, train_2_y, cv=cv_len)

scores_info(scores1)
scores_info(scores2)

ALPHA = 0.05

statistic, p = stats.ttest_ind(scores1, scores2, equal_var=False) 

print('')
print(statistic)
print(p)

if p < ALPHA:
    print('statistically significant')
else:
    print('fail to reject, not stat sig')

mean: 0.965
std: 0.12086493839543928

mean: 0.9663888888888887
std: 0.12020398043983506


-0.14088840193882027
0.8880055507939072
fail to reject, not stat sig


# Cross Val Predict

In [52]:
def score_comparison_info(y_preds, y_trues):
    conf_matrix = confusion_matrix(y_preds, y_trues)
    f1 = f1_score(y_preds, y_trues)
    precision = precision_score(y_preds, y_trues)
    recall = recall_score(y_preds, y_trues)
    area_under_curve = roc_auc_score(y_preds, y_trues)
    
    return conf_matrix, precision, recall, f1, area_under_curve

### Original Statistically Significant Features

In [62]:
preds = cross_val_predict(model1, train_1_X, train_1_y, cv=cv_len, method='decision_function')
scores_array = []

for i in preds:
    if i > 0:
        scores_array.append(1)
    else:
        scores_array.append(0)  

conf_matrix, precision, recall, f1, area_under_curve = score_comparison_info(scores_array, train_1_y) 
print(f'{conf_matrix},\n precision: {precision},\n recall: {recall},\n f1: {f1},\n Area under Curve: {area_under_curve}')


[[405  18]
 [  7 370]],
 precision: 0.9536082474226805,
 recall: 0.9814323607427056,
 f1: 0.9673202614379086,
 Area under Curve: 0.9694395846266719


### Original Statistically Significant Features + New Features

In [63]:
preds = cross_val_predict(model2, train_2_X, train_2_y, cv=cv_len, method='decision_function')
scores_array = []

for i in preds:
    if i > 0:
        scores_array.append(1)
    else:
        scores_array.append(0)  

conf_matrix, precision, recall, f1, area_under_curve = score_comparison_info(scores_array, train_2_y) 
print(f'{conf_matrix},\n precision: {precision},\n recall: {recall},\n f1: {f1},\n Area under Curve: {area_under_curve}')

[[389  20]
 [  8 383]],
 precision: 0.9503722084367245,
 recall: 0.979539641943734,
 f1: 0.9647355163727959,
 Area under Curve: 0.9653199432212558
