In [1]:
# General use imports
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import random
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_curve, roc_auc_score, accuracy_score, auc
from sklearn import metrics
from sklearn.externals import joblib
from sklearn.model_selection import cross_val_score

In [2]:
# load in the feature data
train = pd.read_csv('features.csv')
test = pd.read_csv('features_test.csv')

In [3]:
train.isnull().any()

id                        False
has_purchased             False
total_actions             False
email_open                False
total_email_open          False
has_email_open            False
form_submit               False
total_form_submit         False
has_form_submit           False
email_click_thru          False
total_email_click_thru    False
has_email_click_thru      False
cust_sup                  False
total_cust_sup            False
has_cust_sup              False
page_view                 False
total_page_view           False
has_page_view             False
web_view                  False
total_web_view            False
has_web_view              False
days_as_user              False
dtype: bool

In [4]:
train['Type'] = 'Train'
test['Type'] = 'Test'
df = pd.concat([train, test], axis = 0)

In [5]:
df.drop(['email_open', 'form_submit', 'email_click_thru', 'cust_sup', 'page_view', 'web_view'], axis = 1, inplace = True)
df.describe()

Unnamed: 0,has_purchased,total_actions,total_email_open,has_email_open,total_form_submit,has_form_submit,total_email_click_thru,has_email_click_thru,total_cust_sup,has_cust_sup,total_page_view,has_page_view,total_web_view,has_web_view,days_as_user
count,474378.0,474378.0,474378.0,474378.0,474378.0,474378.0,474378.0,474378.0,474378.0,474378.0,474378.0,474378.0,474378.0,474378.0,474378.0
mean,0.219215,12.050382,7.889527,0.814108,0.431791,0.20744,0.691288,0.230848,0.219215,0.219215,0.989532,0.146086,0.0,0.0,189.707295
std,0.413715,38.735216,19.949517,0.38902,2.431587,0.405474,3.847939,0.421376,0.413715,0.413715,8.474765,0.353193,0.0,0.0,233.77595
min,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,0.0,4.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,55.0
75%,0.0,9.0,7.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,385.0
max,1.0,2718.0,630.0,1.0,430.0,1.0,463.0,1.0,1.0,1.0,596.0,1.0,0.0,0.0,670.0


In [6]:
ID_col = ['id']
target_col = ['has_purchased']
category_cols = ['has_email_open', 'has_form_submit', 'has_email_click_thru', 'has_cust_sup', 'has_page_view', 'has_web_view']
num_cols = ['total_actions', 'total_form_submit', 'total_email_click_thru', 'total_cust_sup', 'total_page_view',
            'total_web_view', 'days_as_user']
other_col = ['Type']

In [7]:
num_category_cols = num_cols + category_cols

In [8]:
for var in category_cols:
    number =LabelEncoder()
    df[var] = number.fit_transform(df[var].astype('str'))

df['has_purchased'] = number.fit_transform(df['has_purchased'].astype('str'))

train = df[df['Type'] == 'Train']
test = df[df['Type'] == 'Test']

train['is_train'] = np.random.uniform(0, 1, len(train)) <= 0.75
Train, Validate = train[train['is_train'] == True], train[train['is_train'] == False]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [9]:
features = list(set(list(df.columns)) - set(ID_col) - set(target_col) - set(other_col))

In [10]:
x_train = Train[list(features)].values
y_train = Train['has_purchased'].values
x_validate = Validate[list(features)].values
y_validate = Validate['has_purchased'].values
x_test = test[list(features)].values

In [11]:
random.seed(100)
rf = RandomForestClassifier(n_estimators = 1000)
forest = rf.fit(x_train, y_train)

In [12]:
importances = rf.feature_importances_

In [13]:
indices = np.argsort(importances)[::-1]
for f in range(x_train.shape[1]):
    print("%2d) %-*s %f" %(f + 1, 30, features[f], importances[indices[f]]))

 1) has_email_click_thru           0.397127
 2) total_email_open               0.373130
 3) total_form_submit              0.078799
 4) total_web_view                 0.074480
 5) has_cust_sup                   0.027010
 6) total_actions                  0.017364
 7) has_web_view                   0.011650
 8) has_email_open                 0.010445
 9) has_form_submit                0.004036
10) total_email_click_thru         0.002516
11) days_as_user                   0.002256
12) has_page_view                  0.001189
13) total_page_view                0.000000
14) total_cust_sup                 0.000000


In [20]:
scores = cross_val_score(rf, x_train, y_train, verbose = 5)
print(scores.mean())

[CV]  ................................................................
[CV] ................................. , score=1.000000, total=  53.7s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   53.7s remaining:    0.0s


[CV] ................................. , score=1.000000, total=  53.0s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.8min remaining:    0.0s


[CV] ................................. , score=1.000000, total=  54.2s
1.0


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  2.7min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  2.7min finished


In [21]:
status = rf.predict_proba(x_validate)
fpr, tpr, _ = roc_curve(y_validate, status[:, 1])
roc_auc = auc(fpr, tpr)
print roc_auc

1.0


In [13]:
final_status = rf.predict_proba(x_test)
test['has_purchased'] = final_status[:, 1]
test.sort_values('has_purchased', ascending = False, inplace = True)
test_results = test.head(1000)
test_results.to_csv('test_results.csv', columns = ['id'])

MemoryError: 