In [1]:
import pandas as pd
from sklearn.metrics import roc_auc_score, r2_score

In [12]:
from sklearn.model_selection import cross_val_score

In [33]:
from sklearn.model_selection import GridSearchCV

In [2]:
import pickle

In [4]:
filename = '../features_juli.pkl'
with open(filename,'rb') as file:
    train = pickle.load(file)
    

In [74]:
filename = '../test_features_juli.pkl'
with open(filename,'rb') as file:
    test = pickle.load(file)

In [18]:
from sklearn.ensemble import RandomForestClassifier  
classifier = RandomForestClassifier(n_estimators=300, random_state=0)  


In [5]:
train.columns

Index(['visited site', 'generic listing', 'conversion', 'viewed product',
       'person', 'tiempo_promedio_por_dia', 'label', 'modelo_mas_visto',
       'condition_mode', 'storage_mode', 'campaing_mode', 'cant_model_vistos'],
      dtype='object')

In [92]:
X_train = train[['visited site', 'generic listing', 'conversion', 'viewed product',
       'tiempo_promedio_por_dia', 'modelo_mas_visto',
       'condition_mode', 'storage_mode', 'campaing_mode', 'cant_model_vistos']]

In [93]:
y_train = train.label

In [94]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test =  train_test_split(X_train, y_train, test_size=0.3, random_state=100)

In [95]:
all_accuracies = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=5)  

In [96]:
all_accuracies

array([0.94849154, 0.9477557 , 0.9455482 , 0.9477557 , 0.94736842])

In [97]:
all_accuracies.mean()

0.9473839123194299

In [98]:
all_accuracies.std()

0.0009872568652135598

In [167]:
param_grid = { 
    'n_estimators': [200,300, 400],
    'max_depth' : [4,5,6],
    'criterion' :['gini', 'entropy']
}

In [100]:
from sklearn.metrics import roc_auc_score, make_scorer

In [101]:
def ROC_AUC(y_true, y_pred):
    roc_auc = roc_auc_score(y_true, y_pred)
    print('ROC_AUC: %2.9f', roc_auc)
    return roc_auc

In [102]:
def R2(y_true, y_pred):
    r2= r2_score(y_true, y_pred)
    print('R": %2.9f', r2)
    return r2

In [103]:
def two_score(y_true, y_pred):
    R2(y_true, y_pred)
    score = ROC_AUC(y_true, y_pred)
    return score

In [104]:
two_scorer = make_scorer(two_score, greater_is_better=True)

In [152]:
gd_sr = GridSearchCV(estimator=classifier,  
                     param_grid=param_grid,
                     scoring='roc_auc',
                     cv=3,
                     n_jobs=-1, verbose=1)

In [153]:
X_train.columns

Index(['visited site', 'generic listing', 'conversion', 'viewed product',
       'tiempo_promedio_por_dia', 'modelo_mas_visto', 'condition_mode',
       'storage_mode', 'campaing_mode', 'cant_model_vistos'],
      dtype='object')

In [154]:
gd_sr.fit(X_train,y_train)

Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  7.8min
[Parallel(n_jobs=-1)]: Done  54 out of  54 | elapsed:  9.5min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'n_estimators': [200, 300, 400], 'max_depth': [4, 5, 6], 'criterion': ['gini', 'entropy']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=1)

In [155]:
best_parameters = gd_sr.best_params_  
print(best_parameters)   

{'criterion': 'entropy', 'max_depth': 6, 'n_estimators': 200}


In [156]:
best_result = gd_sr.best_score_  
print(best_result) 

0.6611694461063582


In [158]:
filename = '../features_juli_with_support.pkl'
with open(filename,'rb') as file:
    train_with_support = pickle.load(file)
    

In [157]:
filename = '../test_features_juli_with_support.pkl'
with open(filename,'rb') as file:
    test_with_support = pickle.load(file)

In [160]:
train_with_support.columns

Index(['visited site', 'generic listing', 'conversion', 'viewed product',
       'person', 'tiempo_promedio_por_dia', 'label', 'modelo_mas_visto',
       'condition_mode', 'storage_mode', 'campaing_mode', 'cant_model_vistos',
       'event_support'],
      dtype='object')

In [163]:
X_train_support =  train_with_support[['visited site', 'generic listing', 'conversion', 'viewed product'
       , 'tiempo_promedio_por_dia', 'modelo_mas_visto',
       'condition_mode', 'storage_mode', 'campaing_mode', 'cant_model_vistos',
       'event_support']]

In [165]:
y_train_support = train_with_support.label

In [166]:
from sklearn.model_selection import train_test_split

X_train_support, X_test_support, y_train_support, y_test_support =  train_test_split(X_train_support, y_train_support, test_size=0.3, random_state=100)

In [168]:
gd_sr_with_support = GridSearchCV(estimator=classifier,  
                     param_grid=param_grid,
                     scoring='roc_auc',
                     cv=3,
                     n_jobs=-1, verbose=1)

In [169]:
gd_sr_with_support.fit(X_train_support, y_train_support)

Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  8.4min
[Parallel(n_jobs=-1)]: Done  54 out of  54 | elapsed: 10.2min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'n_estimators': [200, 300, 400], 'max_depth': [4, 5, 6], 'criterion': ['gini', 'entropy']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=1)

In [174]:
gd_sr_with_support.best_params_

{'criterion': 'entropy', 'max_depth': 6, 'n_estimators': 300}

In [175]:
gd_sr_with_support.best_score_  

0.6929098699113517

In [176]:
X_train_support.columns

Index(['visited site', 'generic listing', 'conversion', 'viewed product',
       'tiempo_promedio_por_dia', 'modelo_mas_visto', 'condition_mode',
       'storage_mode', 'campaing_mode', 'cant_model_vistos', 'event_support'],
      dtype='object')

In [178]:
X_train_support.tiempo_promedio_por_dia = X_train_support.tiempo_promedio_por_dia / 3600

In [181]:
gd_sr_with_support.fit(X_train_support, y_train_support)

Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  7.7min
[Parallel(n_jobs=-1)]: Done  54 out of  54 | elapsed:  9.3min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'n_estimators': [200, 300, 400], 'max_depth': [4, 5, 6], 'criterion': ['gini', 'entropy']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=1)

In [182]:
gd_sr_with_support.best_params_

{'criterion': 'entropy', 'max_depth': 6, 'n_estimators': 300}

In [183]:
gd_sr_with_support.best_score_  

0.6929489695828365

In [187]:
param_grid2 = { 
    'max_depth' : [3,4,5,6,7,8,9,10,11],
}

In [186]:
classifier = RandomForestClassifier(n_estimators=300,criterion='entropy', random_state=0)  

In [188]:
gd_sr2 = GridSearchCV(estimator=classifier,  
                     param_grid=param_grid2,
                     scoring='roc_auc',
                     cv=3,
                     n_jobs=-1, verbose=2)

In [189]:
gd_sr2.fit(X_train_support, y_train_support)

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:  6.2min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'max_depth': [3, 4, 5, 6, 7, 8, 9, 10, 11]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=2)

In [190]:
gd_sr2.best_score_

0.7056085463849652

In [191]:
gd_sr2.best_params_

{'max_depth': 10}

In [197]:
param_grid3 = { 
    'max_depth' : [10,20,30,40],
}

In [198]:
gd_sr3 = GridSearchCV(estimator=classifier,  
                     param_grid=param_grid3,
                     scoring='roc_auc',
                     cv=3,
                     n_jobs=-1, verbose=2)

In [199]:
gd_sr3.fit(X_train_support,y_train_support)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:  7.5min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'max_depth': [10, 20, 30, 40]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn', scoring='roc_auc', verbose=2)

In [200]:
gd_sr3.best_score_

0.7056085463849652

In [201]:
gd_sr3.best_params_

{'max_depth': 10}

In [207]:
param_grid4 = { 
    'n_estimators' : [1, 2, 4, 8, 16, 32, 64, 100, 200]
}


In [208]:
gd_sr4 = GridSearchCV(estimator=classifier,  
                     param_grid=param_grid4,
                     scoring='roc_auc',
                     cv=3,
                     n_jobs=-1, verbose=2)

In [209]:
gd_sr4.fit(X_train_support, y_train_support)

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:  2.3min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'n_estimators': [1, 2, 4, 8, 16, 32, 64, 100, 200]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=2)

In [212]:
gd_sr4.best_score_

0.6872756599360859

In [213]:
gd_sr4.best_params_

{'n_estimators': 200}

In [214]:
model = RandomForestClassifier(n_estimators=300, max_depth=10, criterion='entropy')

In [215]:
model.fit(X_train_support, y_train_support)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [216]:
pred = model.predict(X_test_support)

In [217]:
roc_auc_score(y_test_support, pred)

0.4997289972899729

In [219]:
test_with_support.columns

Index(['visited site', 'generic listing', 'conversion', 'viewed product',
       'person', 'tiempo_promedio_por_dia', 'modelo_mas_visto',
       'condition_mode', 'storage_mode', 'campaing_mode', 'cant_model_vistos',
       'event_support'],
      dtype='object')

In [220]:
xtest = test_with_support[['visited site', 'generic listing', 'conversion', 'viewed product',
       'tiempo_promedio_por_dia', 'modelo_mas_visto',
       'condition_mode', 'storage_mode', 'campaing_mode', 'cant_model_vistos',
       'event_support']]

In [221]:
sum(pd.isnull(xtest['visited site']))

299

In [222]:
values = {'visited site': xtest['visited site'].mean(),'generic listing': xtest['generic listing'].mean(), 'conversion': xtest['conversion'].mean(),'viewed product': xtest['viewed product'].mean()}
xtest.fillna(value=values,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


Unnamed: 0,visited site,generic listing,conversion,viewed product,tiempo_promedio_por_dia,modelo_mas_visto,condition_mode,storage_mode,campaing_mode,cant_model_vistos,event_support
0,34.0,6.336274,1.656646,372.00000,17451.590909,10,4.0,32.0,-1,8,0.363903
1,1.0,6.336274,1.656646,3.00000,347.000000,10,5.0,16.0,-1,2,0.420021
2,6.0,14.000000,1.000000,153.00000,11277.000000,6,3.0,16.0,-1,6,0.062016
3,13.0,17.000000,1.656646,339.00000,6850.666667,6,4.0,32.0,-1,6,0.260913
4,5.0,8.000000,1.656646,28.00000,4015.000000,10,3.0,32.0,-1,5,0.138376
5,3.0,3.000000,1.000000,41.00000,34837.000000,10,4.0,16.0,-1,2,0.059208
6,1.0,6.336274,1.656646,1.00000,43.000000,6,3.0,8.0,-1,2,0.692601
7,1.0,6.336274,1.656646,1.00000,2.000000,6,3.0,32.0,-1,2,0.692601
8,1.0,6.336274,1.000000,33.34542,22960.000000,10,3.0,16.0,-1,2,0.108064
9,1.0,6.336274,1.656646,6.00000,875.000000,10,3.0,32.0,-1,3,0.473048


In [223]:
xtest.tiempo_promedio_por_dia = xtest.tiempo_promedio_por_dia / 3600

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [224]:
sum(pd.isnull(xtest[ 'generic listing']))

0

In [225]:
sum(pd.isnull(xtest[ 'conversion']))

0

In [226]:
sum(pd.isnull(xtest['viewed product']))

0

In [81]:
sum(pd.isnull(xtest[ 'tiempo_promedio_por_dia']))

0

In [83]:
sum(pd.isnull(xtest['modelo_mas_visto']))

0

In [84]:
sum(pd.isnull(xtest['condition_mode']))

0

In [85]:
sum(pd.isnull(xtest['storage_mode']))

0

In [None]:
'condition_mode', 'storage_mode', 'campaing_mode', 'cant_model_vistos'

In [227]:
xtest.shape

(19415, 11)

In [234]:
pred = model.predict_proba(xtest)[:,1]

In [240]:
sum(pred<0)

0

In [236]:
df = pd.DataFrame()

In [237]:
df['person'] = test_with_support.person

In [244]:
df['label'] = pred

In [246]:
del df['pred']

In [247]:
df.columns

Index(['person', 'label'], dtype='object')

In [243]:
sum(df.pred  > 1)

0

In [248]:
df.to_csv('pred_random_forest_juli.csv', index=False)