In [67]:
import pandas as pd
from scipy import stats
from statsmodels.stats import weightstats
import warnings

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, cross_val_predict, GridSearchCV, train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, confusion_matrix

warnings.filterwarnings('ignore')

# Data Retrieval

In [91]:
orgin_stat_sig = pd.read_csv('orig_stat_sig.csv')
feature_eng = pd.read_csv('feature_engineered_cols.csv')

orgin_stat_sig.head()

train_1, test_1 = train_test_split(orgin_stat_sig, train_size=0.8)

train_1_X = train_1.loc[:, 'Daily Time Spent on Site': 'Age']
train_1_y = train_1['Clicked on Ad']

feature_eng.head()

train_2, test_2 = train_test_split(feature_eng, train_size=0.8)

train_2_X = train_2.loc[:, 'Daily Time Spent on Site': 'Income/Length']
train_2_y = train_2['Clicked on Ad']

# Data Preparation

In [48]:
lr = LogisticRegression(random_state=42, solver='liblinear')
lr2 = LogisticRegression(random_state=42, solver='liblinear')

param_grid = [
    {
        'C': [0.1, 0.5, 1, 2.5, 5],
        'n_jobs': [-1]
    }
]

cv_len = 5

## Testing Just Statistically Significant Features First

In [49]:
lr_grid_search = GridSearchCV(lr, param_grid, cv=cv_len, scoring=['accuracy', 'f1'], refit='f1')

In [50]:
lr_grid_search.fit(train_1_X, train_1_y)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=42, solver='liblinear',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=None,
             param_grid=[{'C': [0.1, 0.5, 1, 2.5, 5], 'n_jobs': [-1]}],
             pre_dispatch='2*n_jobs', refit='f1', return_train_score=False,
             scoring=['accuracy', 'f1'], verbose=0)

In [51]:
lr_grid_search.best_estimator_

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=-1, penalty='l2', random_state=42,
                   solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [52]:
lr_grid_search.best_params_

{'C': 1, 'n_jobs': -1}

In [53]:
lr_grid_search.best_score_

0.9656989681054011

## Testing Just New Features + Statistically Significant Features 

In [54]:
lr_grid_search_2 = GridSearchCV(lr2, param_grid, cv=cv_len, scoring=['accuracy', 'f1'], refit='f1')

In [55]:
lr_grid_search_2.fit(train_2_X, train_2_y)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=42, solver='liblinear',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=None,
             param_grid=[{'C': [0.1, 0.5, 1, 2.5, 5], 'n_jobs': [-1]}],
             pre_dispatch='2*n_jobs', refit='f1', return_train_score=False,
             scoring=['accuracy', 'f1'], verbose=0)

In [56]:
lr_grid_search_2.best_estimator_

LogisticRegression(C=0.5, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=-1, penalty='l2', random_state=42,
                   solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [57]:
lr_grid_search_2.best_params_

{'C': 0.5, 'n_jobs': -1}

In [58]:
lr_grid_search_2.best_score_

0.9726917044739304

# Statistical Signficance Test

In [86]:
cv_len = 300

def scores_info(scores):
#     print(f'scores: {scores}')
    print(f'mean: {scores.mean()}')
    print(f'std: {scores.std()}')
    print('')

model1 = lr_grid_search.best_estimator_
model2 = lr_grid_search_2.best_estimator_

scores1 = cross_val_score(model1, train_1_X, train_1_y, cv=cv_len)
scores2 = cross_val_score(model2, train_2_X, train_2_y, cv=cv_len)

scores_info(scores1)
scores_info(scores2)

ALPHA = 0.05

statistic, p = stats.ttest_ind(scores1, scores2, equal_var=False) 

print('')
print(statistic)
print(p)

if p < ALPHA:
    print('statistically significant')
else:
    print('fail to reject, not stat sig')

mean: 0.965
std: 0.12426450284239127

mean: 0.9694444444444446
std: 0.11279966946803863


-0.45792554069856006
0.6471738290748057
fail to reject, not stat sig


# Cross Val Predict

In [87]:
def score_comparison_info(y_preds, y_trues):
    conf_matrix = confusion_matrix(y_preds, y_trues)
    f1 = f1_score(y_preds, y_trues)
    precision = precision_score(y_preds, y_trues)
    recall = recall_score(y_preds, y_trues)
    area_under_curve = roc_auc_score(y_preds, y_trues)
    
    return conf_matrix, precision, recall, f1, area_under_curve

### Original Statistically Significant Features

In [88]:
preds = cross_val_predict(model1, train_1_X, train_1_y, cv=cv_len, method='decision_function')
scores_array = []

for i in preds:
    if i > 0:
        scores_array.append(1)
    else:
        scores_array.append(0)  

conf_matrix, precision, recall, f1, area_under_curve = score_comparison_info(scores_array, train_1_y) 
print(f'{conf_matrix},\n precision: {precision},\n recall: {recall},\n f1: {f1},\n Area under Curve: {area_under_curve}')

[[393  18]
 [ 10 379]],
 precision: 0.9546599496221663,
 recall: 0.974293059125964,
 f1: 0.9643765903307888,
 Area under Curve: 0.9652487193440039


### Original Statistically Significant Features + New Features

In [89]:
preds = cross_val_predict(model2, train_2_X, train_2_y, cv=cv_len, method='decision_function')
scores_array = []

for i in preds:
    if i > 0:
        scores_array.append(1)
    else:
        scores_array.append(0)  

conf_matrix, precision, recall, f1, area_under_curve = score_comparison_info(scores_array, train_2_y) 
print(f'{conf_matrix},\n precision: {precision},\n recall: {recall},\n f1: {f1},\n Area under Curve: {area_under_curve}')

[[386  16]
 [  7 391]],
 precision: 0.9606879606879607,
 recall: 0.9824120603015075,
 f1: 0.9714285714285714,
 Area under Curve: 0.9713055326383159


# Running on the Test Dataset

<p>Since the best model is the Logistic Regression model found through GridSearchCV that takes in the originally statistically signficant features along with the new features, I will run the test data through this model and see how well it performs on new data.

In [99]:
test_2_X = test_2.loc[:, 'Daily Time Spent on Site': 'Income/Length']
test_2_y = test_2['Clicked on Ad']

In [100]:
test_y_preds = model2.predict(test_2_X)

In [101]:
conf_matrix, precision, recall, f1, area_under_curve = score_comparison_info(test_y_preds, test_2_y)

In [102]:
print(f'{conf_matrix},\n precision: {precision},\n recall: {recall},\n f1: {f1},\n Area under Curve: {area_under_curve}')

[[104   6]
 [  1  89]],
 precision: 0.9368421052631579,
 recall: 0.9888888888888889,
 f1: 0.9621621621621621,
 Area under Curve: 0.9671717171717171


In [103]:
correct = 0

for pred, true in zip(test_y_preds, test_2_y):
    if pred == true:
        correct += 1
    
print(correct / len(test_2_y)) # Accuracy score.

0.965
