In [64]:
import pandas as pd
from scipy import stats
from statsmodels.stats import weightstats

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict, GridSearchCV, train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, confusion_matrix

# Data Retrieval

In [43]:
orgin_stat_sig = pd.read_csv('orig_stat_sig.csv')
feature_eng = pd.read_csv('feature_engineered_cols.csv')

print(orgin_stat_sig.head())
print(feature_eng.head())


train_1, test_1 = train_test_split(orgin_stat_sig, train_size=0.8)
train_2, test_2 = train_test_split(feature_eng, train_size=0.8)

train_1_X = train_1.loc[:, 'Daily Time Spent on Site': 'Age']
train_1_y = train_1['Clicked on Ad']

train_2_X = train_2.loc[:, 'Daily Time Spent on Site': 'Income/Length']
train_2_y = train_2['Clicked on Ad']

   Daily Time Spent on Site  Area Income  Daily Internet Usage       Age  \
0                  0.249267     0.509691              1.734030 -0.114905   
1                  0.961132     1.002530              0.313805 -0.570425   
2                  0.282083     0.356949              1.287589 -1.139826   
3                  0.577432    -0.014456              1.501580 -0.798185   
4                  0.212664     1.408868              1.038731 -0.114905   

   Clicked on Ad  
0            0.0  
1            0.0  
2            0.0  
3            0.0  
4            0.0  
   Daily Time Spent on Site       Age  Area Income  Daily Internet Usage  \
0                  0.249267 -0.114905     0.509691              1.734030   
1                  0.961132 -0.570425     1.002530              0.313805   
2                  0.282083 -1.139826     0.356949              1.287589   
3                  0.577432 -0.798185    -0.014456              1.501580   
4                  0.212664 -0.114905     1.40886

# Model Preparation

In [44]:
rfc = RandomForestClassifier(random_state=42)
rfc2 = RandomForestClassifier(random_state=42)

param_grid = [
    {
        'max_features': ['sqrt', 'log2'],
        'n_estimators': [100, 200, 250, 500],
        'min_impurity_decrease': [0.0, 1e-7] 
    }
]

cv_len = 5

## Testing Just Statistically Significant Features First

In [45]:
rfc_grid_search = GridSearchCV(rfc, param_grid, cv=cv_len, scoring=['accuracy', 'f1'], refit='f1')

In [46]:
rfc_grid_search.fit(train_1_X, train_1_y)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False, random_state=42,
                                              verbose=0, warm_start=False),
             iid='warn', n_jobs=None,
             param_grid=[{

In [47]:
rfc_grid_search.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='sqrt', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=250,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [48]:
rfc_grid_search.best_params_

{'max_features': 'sqrt', 'min_impurity_decrease': 0.0, 'n_estimators': 250}

In [49]:
rfc_grid_search.best_score_

0.9605055144537558

## Testing Just New Features + Statistically Significant Features 

In [50]:
rfc_grid_search_2 = GridSearchCV(rfc, param_grid, cv=cv_len, scoring=['accuracy', 'f1'], refit='f1')

In [51]:
rfc_grid_search_2.fit(train_2_X, train_2_y)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False, random_state=42,
                                              verbose=0, warm_start=False),
             iid='warn', n_jobs=None,
             param_grid=[{

In [52]:
rfc_grid_search_2.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='sqrt', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [53]:
rfc_grid_search_2.best_params_

{'max_features': 'sqrt', 'min_impurity_decrease': 0.0, 'n_estimators': 500}

In [54]:
rfc_grid_search_2.best_score_

0.9620897702728083

# Statistical Signficance Test

In [72]:
cv_len = 300

def scores_info(scores):
#     print(f'scores: {scores}')
    print(f'mean: {scores.mean()}')
    print(f'std: {scores.std()}')
    print('')
    
model1 = rfc_grid_search.best_estimator_
model2 = rfc_grid_search_2.best_estimator_

scores1 = cross_val_score(model1, train_1_X, train_1_y, cv=cv_len)
scores2 = cross_val_score(model2, train_2_X, train_2_y, cv=cv_len)

scores_info(scores1)
scores_info(scores2)

ALPHA = 0.05

statistic, p = stats.ttest_ind(scores1, scores2, equal_var=False) 

print('')
print(statistic)
print(p)

if p < ALPHA:
    print('statistically significant')
else:
    print('fail to reject, not stat sig')

mean: 0.9633333333333334
std: 0.12378296418418096

mean: 0.9583333333333334
std: 0.13819269959814168


0.46601905019522927
0.6413735997063059
fail to reject, not stat sig


# Cross Val Predict

In [85]:
def score_comparison_info(y_preds, y_trues):
    conf_matrix = confusion_matrix(y_preds, y_trues)
    f1 = f1_score(y_preds, y_trues)
    precision = precision_score(y_preds, y_trues)
    recall = recall_score(y_preds, y_trues)
    area_under_curve = roc_auc_score(y_preds, y_trues)
    
    return conf_matrix, precision, recall, f1, area_under_curve

### Original Statistically Significant Features

In [86]:
preds = cross_val_predict(model1, train_1_X, train_1_y, cv=cv_len, method='predict_proba')

In [87]:
final_scores = []

for pred in preds:
    final_scores.append(pred.argmax())

In [88]:
conf_matrix, precision, recall, f1, area_under_curve = score_comparison_info(final_scores, train_1_y)

In [89]:
print(f'{conf_matrix},\n precision: {precision},\n recall: {recall},\n f1: {f1},\n Area under Curve: {area_under_curve}')

[[392  16]
 [ 14 378]],
 precision: 0.9593908629441624,
 recall: 0.9642857142857143,
 f1: 0.9618320610687023,
 Area under Curve: 0.9625350140056024


### Original Statistically Significant Features + New Features

In [90]:
preds = cross_val_predict(model2, train_2_X, train_2_y, cv=cv_len, method='predict_proba')

final_scores = []

for pred in preds:
    final_scores.append(pred.argmax())

In [92]:
conf_matrix, precision, recall, f1, area_under_curve = score_comparison_info(final_scores, train_2_y)
print(f'{conf_matrix},\n precision: {precision},\n recall: {recall},\n f1: {f1},\n Area under Curve: {area_under_curve}')

[[390  17]
 [ 13 380]],
 precision: 0.9571788413098237,
 recall: 0.9669211195928753,
 f1: 0.9620253164556962,
 Area under Curve: 0.9625760389119168
