In [91]:
import pandas as pd
from scipy import stats
from statsmodels.stats import weightstats

from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score, cross_val_predict, GridSearchCV, train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, confusion_matrix

# Data Retrieval

In [75]:
orgin_stat_sig = pd.read_csv('orig_stat_sig.csv')
feature_eng = pd.read_csv('feature_engineered_cols.csv')

orgin_stat_sig.head()

train_1, test_1 = train_test_split(orgin_stat_sig, train_size=0.8)

train_1_X = train_1.loc[:, 'Daily Time Spent on Site': 'Age']
train_1_y = train_1['Clicked on Ad']

feature_eng.head()

train_2, test_2 = train_test_split(feature_eng, train_size=0.8)

train_2_X = train_2.loc[:, 'Daily Time Spent on Site': 'Income/Length']
train_2_y = train_2['Clicked on Ad']

# Model Preparation

In [76]:
svc = LinearSVC(random_state=42, max_iter=5000)
svc2 = LinearSVC(random_state=42, max_iter=5000)

param_grid = [
    {
        'C': [0.1, 0.5, 1, 2.5, 5]
    }
]

cv_len = 5

## Testing Just Statistically Significant Features First

In [77]:
svc_grid_search = GridSearchCV(svc, param_grid, cv=cv_len, scoring=['accuracy', 'f1'], refit='f1')

In [78]:
svc_grid_search.fit(train_1_X, train_1_y)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=LinearSVC(C=1.0, class_weight=None, dual=True,
                                 fit_intercept=True, intercept_scaling=1,
                                 loss='squared_hinge', max_iter=5000,
                                 multi_class='ovr', penalty='l2',
                                 random_state=42, tol=0.0001, verbose=0),
             iid='warn', n_jobs=None, param_grid=[{'C': [0.1, 0.5, 1, 2.5, 5]}],
             pre_dispatch='2*n_jobs', refit='f1', return_train_score=False,
             scoring=['accuracy', 'f1'], verbose=0)

In [79]:
svc_grid_search.best_estimator_

LinearSVC(C=0.1, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=5000,
          multi_class='ovr', penalty='l2', random_state=42, tol=0.0001,
          verbose=0)

In [80]:
svc_grid_search.best_params_

{'C': 0.1}

In [81]:
svc_grid_search.best_score_

0.9630114533809045

## Testing Just New Features + Statistically Significant Features 

In [82]:
svc_grid_search_2 = GridSearchCV(svc2, param_grid, cv=cv_len, scoring=['accuracy', 'f1'], refit='f1')

In [83]:
svc_grid_search_2.fit(train_2_X, train_2_y)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=LinearSVC(C=1.0, class_weight=None, dual=True,
                                 fit_intercept=True, intercept_scaling=1,
                                 loss='squared_hinge', max_iter=5000,
                                 multi_class='ovr', penalty='l2',
                                 random_state=42, tol=0.0001, verbose=0),
             iid='warn', n_jobs=None, param_grid=[{'C': [0.1, 0.5, 1, 2.5, 5]}],
             pre_dispatch='2*n_jobs', refit='f1', return_train_score=False,
             scoring=['accuracy', 'f1'], verbose=0)

In [84]:
svc_grid_search_2.best_estimator_

LinearSVC(C=0.5, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=5000,
          multi_class='ovr', penalty='l2', random_state=42, tol=0.0001,
          verbose=0)

In [85]:
svc_grid_search_2.best_params_

{'C': 0.5}

In [86]:
svc_grid_search_2.best_score_

0.9683496732026144

# Statistical Signficance Test

In [127]:
cv_len = 300

def scores_info(scores):
#     print(f'scores: {scores}')
    print(f'mean: {scores.mean()}')
    print(f'std: {scores.std()}\n')

model1 = svc_grid_search.best_estimator_
model2 = svc_grid_search_2.best_estimator_

scores1 = cross_val_score(model1, train_1_X, train_1_y, cv=cv_len)
scores2 = cross_val_score(model2, train_2_X, train_2_y, cv=cv_len)

ALPHA = 0.05

scores_info(scores1)
scores_info(scores2)

statistic, p = stats.ttest_ind(scores1, scores2, equal_var=False) 

print(statistic)
print(p)

if p < ALPHA:
    print('statistically significant')
else:
    print('fail to reject, not stat sig')

mean: 0.9675
std: 0.10794481614849939

mean: 0.9616666666666667
std: 0.12157530816557924

0.6204147676208136
0.5352244224712258
fail to reject, not stat sig


# Cross Val Predict

In [128]:
def score_comparison_info(y_preds, y_trues):
    conf_matrix = confusion_matrix(y_preds, y_trues)
    f1 = f1_score(y_preds, y_trues)
    precision = precision_score(y_preds, y_trues)
    recall = recall_score(y_preds, y_trues)
    area_under_curve = roc_auc_score(y_preds, y_trues)
    
    return conf_matrix, precision, recall, f1, area_under_curve

### Original Statistically Significant Features

In [129]:
preds = cross_val_predict(model1, train_1_X, train_1_y, cv=cv_len, method='decision_function')
scores_array = []

for i in preds:
    if i > 0:
        scores_array.append(1)
    else:
        scores_array.append(0)  

conf_matrix, precision, recall, f1, area_under_curve = score_comparison_info(scores_array, train_1_y) 
print(f'{conf_matrix},\n precision: {precision},\n recall: {recall},\n f1: {f1},\n Area under Curve: {area_under_curve}')

[[392  20]
 [  9 379]],
 precision: 0.949874686716792,
 recall: 0.9768041237113402,
 f1: 0.9631512071156291,
 Area under Curve: 0.9641302171954761


### Original Statistically Significant Features + New Features

In [130]:
preds = cross_val_predict(model2, train_2_X, train_2_y, cv=cv_len, method='decision_function')
scores_array = []

for i in preds:
    if i > 0:
        scores_array.append(1)
    else:
        scores_array.append(0)  

conf_matrix, precision, recall, f1, area_under_curve = score_comparison_info(scores_array, train_2_y) 
print(f'{conf_matrix},\n precision: {precision},\n recall: {recall},\n f1: {f1},\n Area under Curve: {area_under_curve}')

[[388  17]
 [ 13 382]],
 precision: 0.9573934837092731,
 recall: 0.9670886075949368,
 f1: 0.9622166246851387,
 Area under Curve: 0.9625566494764807
