In [None]:
!pip install fastai==0.7.0

In [2]:
from fastai.imports import *
from fastai.structured import *

from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from IPython.display import display

from sklearn import metrics

  from numpy.core.umath_tests import inner1d


In [3]:
df = pd.read_csv('train.csv')
df.shape

(283582, 9)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 283582 entries, 0 to 283581
Data columns (total 9 columns):
id                       283582 non-null int64
week                     283582 non-null int64
center_id                283582 non-null int64
meal_id                  283582 non-null int64
checkout_price           283582 non-null float64
base_price               283581 non-null float64
emailer_for_promotion    283581 non-null float64
homepage_featured        283581 non-null float64
num_orders               283581 non-null float64
dtypes: float64(5), int64(4)
memory usage: 19.5 MB


In [11]:
df.isnull().sum()

id                       0
week                     0
center_id                0
meal_id                  0
checkout_price           0
base_price               1
emailer_for_promotion    1
homepage_featured        1
num_orders               1
dtype: int64

### Not many NaNs hence dropping them. 

In [14]:
df = df.dropna()

In [15]:
df.isnull().sum()

id                       0
week                     0
center_id                0
meal_id                  0
checkout_price           0
base_price               0
emailer_for_promotion    0
homepage_featured        0
num_orders               0
dtype: int64

In [17]:
df.shape

(283581, 9)

In [16]:
# Eval criteria - RMSLE
df.num_orders = np.log(df.num_orders)

In [19]:
y = df['num_orders'] 
df = df.drop('num_orders', axis=1)  

In [20]:
def split_vals(a,n): return a[:n].copy(), a[n:].copy()

n_valid = 56717 
n_trn = len(df)-n_valid
raw_train, raw_valid = split_vals(df, n_trn)
X_train, X_valid = split_vals(df, n_trn)
y_train, y_valid = split_vals(y, n_trn)

X_train.shape, y_train.shape, X_valid.shape

((226864, 8), (226864,), (56717, 8))

In [21]:
def rmse(x,y): return math.sqrt(((x-y)**2).mean())

def print_score(m):
    res = [rmse(m.predict(X_train), y_train), rmse(m.predict(X_valid), y_valid),
                m.score(X_train, y_train), m.score(X_valid, y_valid)]
    if hasattr(m, 'oob_score_'): res.append(m.oob_score_)
    print(res)

In [22]:
m = RandomForestRegressor(n_jobs=-1)
%time m.fit(X_train, y_train)
print_score(m)

CPU times: user 16.3 s, sys: 0 ns, total: 16.3 s
Wall time: 8.54 s
[0.23030120203980875, 0.7003531214963991, 0.9650049104641912, 0.668203018409453]


In [24]:
m = RandomForestRegressor(n_estimators=40, n_jobs=-1)
m.fit(X_train, y_train)
print_score(m)

[0.20123160429300235, 0.6779186775965529, 0.9732818072467286, 0.6891194922847229]


In [25]:
m = RandomForestRegressor(n_estimators=40, min_samples_leaf=5, n_jobs=-1, oob_score=True)
m.fit(X_train, y_train)
print_score(m)

[0.3825713144924427, 0.6463731120576837, 0.9034306060547904, 0.7173787205696499, 0.816758732253906]


In [26]:
m = RandomForestRegressor(n_estimators=40, min_samples_leaf=3, n_jobs=-1, oob_score=True)
m.fit(X_train, y_train)
print_score(m)

[0.31924941955946645, 0.6499755255866042, 0.932752687578844, 0.7142196911233456, 0.816646796648304]


In [30]:
m = RandomForestRegressor(n_estimators=40, min_samples_leaf=5, max_features=0.6, n_jobs=-1, oob_score=True)
%time m.fit(X_train, y_train)
print_score(m)

CPU times: user 29.9 s, sys: 16 ms, total: 29.9 s
Wall time: 16.1 s
[0.4274301133915026, 0.6584973670984335, 0.8794561851841062, 0.7066768236654051, 0.7925401729207227]


In [32]:
m = RandomForestRegressor(n_estimators=60, min_samples_leaf=5, n_jobs=-1, oob_score=True, random_state=7)
m.fit(X_train, y_train)
print_score(m)

[0.3817052793986679, 0.6438020487212379, 0.9038673237104548, 0.7196226011770788, 0.8190248429389673]


In [34]:
fi = rf_feat_importance(m, X_train); fi[:10]

Unnamed: 0,cols,imp
3,meal_id,0.310987
4,checkout_price,0.286632
2,center_id,0.145838
5,base_price,0.088944
6,emailer_for_promotion,0.049451
1,week,0.048036
7,homepage_featured,0.043218
0,id,0.026894


In [35]:
to_keep = fi[fi.imp>0.005].cols; len(to_keep)

8

In [38]:
m = RandomForestRegressor(n_estimators=80, max_features=0.5, n_jobs=-1, oob_score=True)
%time m.fit(X_train, y_train)
print_score(m)

CPU times: user 1min 17s, sys: 512 ms, total: 1min 18s
Wall time: 42 s
[0.2037501116367254, 0.6583490751045563, 0.9726088409148499, 0.7068089201112813, 0.8016376056393602]


>Lowest validation RMSE: 0.6463731120576837 with the following model configuration:
`RandomForestRegressor(n_estimators=40, min_samples_leaf=5, n_jobs=-1, oob_score=True)`

## Begin H2O's AUtoML wizadry

In [None]:
!pip install -f http://h2o-release.s3.amazonaws.com/h2o/latest_stable_Py.html h2o

In [None]:
import h2o
from h2o.automl import H2OAutoML

h2o.init()

In [41]:
X_train['num_orders'] = y_train
X_valid['num_orders'] = y_valid

In [42]:
X_train_h2o = h2o.H2OFrame(X_train)
X_valid_h2o = h2o.H2OFrame(X_valid)

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [43]:
features = X_train.columns.values.tolist()
target = "num_orders"

In [None]:
X_train_h2o.drop('id')

In [None]:
X_valid_h2o.drop('id')

In [49]:
aml = H2OAutoML(max_models=15, seed = 1, project_name = "genpact_hackathon", sort_metric='RMSE', stopping_metric='RMSE')
aml.train(x = features, y = target, training_frame = X_train_h2o, leaderboard_frame = X_valid_h2o)

AutoML progress: |████████████████████████████████████████████████████████| 100%


In [50]:
lb = aml.leaderboard
lb.head(rows=lb.nrows)

model_id,mean_residual_deviance,rmse,mse,mae,rmsle
XRT_1_AutoML_20181216_112141,0.483515,0.695352,0.483515,0.540289,0.130134
StackedEnsemble_AllModels_AutoML_20181216_112141,0.486412,0.697433,0.486412,0.540494,0.13005
StackedEnsemble_BestOfFamily_AutoML_20181216_112141,0.486412,0.697433,0.486412,0.540494,0.13005
DRF_1_AutoML_20181216_112141,0.488741,0.6991,0.488741,0.543096,0.130561
GLM_grid_1_AutoML_20181216_112141_model_1,1.11512,1.05599,1.11512,0.864884,0.192679




In [51]:
test_data = pd.read_csv('test.csv')
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32573 entries, 0 to 32572
Data columns (total 8 columns):
id                       32573 non-null int64
week                     32573 non-null int64
center_id                32573 non-null int64
meal_id                  32573 non-null int64
checkout_price           32573 non-null float64
base_price               32573 non-null float64
emailer_for_promotion    32573 non-null int64
homepage_featured        32573 non-null int64
dtypes: float64(2), int64(6)
memory usage: 2.0 MB


In [52]:
test_data.isnull().sum()

id                       0
week                     0
center_id                0
meal_id                  0
checkout_price           0
base_price               0
emailer_for_promotion    0
homepage_featured        0
dtype: int64

In [53]:
X_test_h2o = h2o.H2OFrame(test_data)

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [59]:
preds = aml.predict(X_test_h2o)

drf prediction progress: |████████████████████████████████████████████████| 100%


In [60]:
preds

predict
5.73455
5.86405
5.63785
4.93335
5.13174
5.2487
5.70645
4.59866
3.98451
4.02392




In [70]:
temp=test_data['id']
results=(h2o.as_list(preds["predict"])).iloc[:,0]
final=pd.concat([temp, results], axis=1)
final.rename(columns={'predict': 'num_orders'}, inplace=True)
file_name="h2oautoml_final_av_test.csv"
final.to_csv(file_name, sep=',',index=False) #,encoding='utf-8')

In [71]:
m = RandomForestRegressor(n_estimators=40, min_samples_leaf=3, n_jobs=-1, oob_score=True)
m.fit(df, y)
preds = m.predict(test_data)

In [74]:
new_submission = pd.read_csv('sample_submission.csv')

In [75]:
new_submission['num_orders'] = preds

In [77]:
new_submission.to_csv('model_rf_submission.csv',index=False)