In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from sklearn.model_selection import  train_test_split
import time
import lightgbm as lgbm
from sklearn.metrics import mean_squared_error

In [2]:
center_df = pd.read_csv("../fulfilment_center_info.csv", dtype={'center_id':int})
meal_df = pd.read_csv("../meal_info.csv", dtype={'meal_id':int})
train = pd.read_csv("../train.csv", dtype={'week':int, 'meal_id':int, 'center_id':int})

In [3]:
train = pd.merge(train, meal_df, on='meal_id', how='inner')
train = pd.merge(train, center_df, on='center_id', how='inner')

In [4]:
test = pd.read_csv("../test_QoiMO9B.csv", dtype={'week':int, 'meal_id':int, 'center_id':int})
test = pd.merge(test, meal_df, on='meal_id', how='inner')
test = pd.merge(test, center_df, on='center_id', how='inner')

In [5]:
train['train_or_test'] = 'train'
test['train_or_test'] = 'test'
df = pd.concat([train,test], sort=False)

In [6]:
# One hot encoder for categorical features
def ohe(df, colname):
    return pd.concat([df,pd.get_dummies(df[colname])], axis=1)

# Assign year based on week number: Note: Approx
def assign_approx_month(x, month=True):
    if x <= 52:
        year = 2014 
    elif 52 < x < 104:
        year = 2015
    else:
        year = 2016
    
    atime = time.asctime(time.strptime('{} {} 1'.format(year, x%52), '%Y %W %w'))
    if month:
        return atime.split(" ")[1]

# Assign year from week number: EDA shows yearly trends slowly decreasing
def assign_approx_year(x):
    if x <= 52:
        x = 1
    elif 52 < x <= 104:
        x = 2
    else:
        x = 3
    return x

# Important to add combinations of statistical features using historical data
def add_group_features(df, group_by, target, funcs=None):
    if funcs is None:
        funcs = [np.mean, np.median, np.max, np.min, np.std]
    table = df.groupby(group_by)[target].agg(funcs)
    for col in table.columns.tolist():
        table.fillna(table[col].mean(), inplace=True)
    return table

def add_lags_column(df, group_by, target_col, lags=[2,4,6]):
    lag_table = df.groupby(group_by)
    for i in lags:
        df['_'.join([target_col, 'lag', str(i)])] = \
                lag_table[target_col].shift(i).values + np.random.normal(scale=1.6, size=(len(df),))
    return df

def add_ewm_feats(df, group_by, target_col, alpha=[0.9], shift=[2,4,6]):
    gpby = df.groupby(group_by)
    for a in alpha:
        for s in shift:
            df['_'.join([target_col, 'lag', str(s), 'ewm', str(a)])] = \
                gpby[target_col].shift(s).ewm(alpha=a).mean().values
    return df

def add_rmean_feats(df, group_by, target_col, windows=[2,4,6], min_periods=0, 
                             shift=1, win_type=None):
    gpby = df.groupby(group_by)
    for w in windows:
        df['_'.join([target_col, 'rmean', str(w)])] = \
            gpby[target_col].shift(shift).rolling(window=w, 
                                                  min_periods=min_periods,
                                                  win_type=win_type).mean().values +\
            np.random.normal(scale=1.6, size=(len(df),))
    return df

In [7]:
week_of_month = {}
for k in range(1,160):
    week_of_month[k] = str((k%4))+"_wom" if k%4 > 0 else "4_wom"
# Add week of month approx
df['wom'] = df['week'].map(week_of_month)

# Add month begin/end identifiers as features
df['is_month_start'] = df['wom'].apply(lambda x: 1 if x == "1_wom" else 0)
df['is_month_end'] = df['wom'].apply(lambda x: 1 if x == "4_wom" else 0)
df['year'] = df['week'].apply(lambda x: assign_approx_year(x))
df['month'] = df['week'].apply(lambda x: assign_approx_month(x))
df['offer'] = 0
df.loc[(df['emailer_for_promotion']==1) & (df['homepage_featured']==1), 'offer'] = 1
df['diff'] = df['base_price'] - df['checkout_price']
#df.loc[df['diff'] < 0, 'diff'] *= -1

In [8]:
masked_series2 = ((df['train_or_test'] != 'test') & (df['year']==3) & (df['month'].isin(['Jul', 'Aug', 'Sep', 'Oct'])))
df.loc[(masked_series2), 'train_or_test'] = 'hold_out'
print('Train shape: {}'.format(df.loc[df.train_or_test=='train',:].shape))
print('hold out shape: {}'.format(df.loc[df.train_or_test=='hold_out',:].shape))
print('Test shape: {}'.format(df.loc[df.train_or_test=='test',:].shape))

Train shape: (407243, 23)
hold out shape: (49305, 23)
Test shape: (32573, 23)


In [9]:
for col in ['cuisine', 'category', 'region_code', 'center_type', 'city_code', 'month', 'wom', 'year']:
    df[col] = df[col].astype(str)
    df = ohe(df, col)

In [10]:
df.drop(['cuisine', 'category', 'region_code', 'center_type', 'city_code', 'month', 'wom', 'year'], axis=1, inplace=True)

In [11]:
df['train_or_test'].value_counts()

train       407243
hold_out     49305
test         32573
Name: train_or_test, dtype: int64

In [12]:
train = df[df['train_or_test'].isin(['train', 'hold_out'])]
train['num_orders'] = np.log1p(train.num_orders.values)
center_meal_stat = add_group_features(train,['center_id', 'meal_id'], target='num_orders')
train = pd.merge(train, center_meal_stat, how='left', on=['center_id', 'meal_id'])
val = train[train['train_or_test']=='hold_out']
train = train[train['train_or_test']=='train']
test = df[df['train_or_test']=='test']
test = pd.merge(test, center_meal_stat, how='left', on=['center_id', 'meal_id'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [13]:
val.head()

Unnamed: 0,id,week,center_id,meal_id,checkout_price,base_price,emailer_for_promotion,homepage_featured,num_orders,op_area,...,3_wom,4_wom,1,2,3,mean,median,amax,amin,std
130,1197045,131,55,1885,149.44,149.44,0,0,5.09375,2.0,...,1,0,0,0,1,5.313284,5.375278,6.663133,3.713572,0.53191
131,1322745,132,55,1885,151.35,152.35,0,0,5.09375,2.0,...,0,1,0,0,1,5.313284,5.375278,6.663133,3.713572,0.53191
132,1375869,133,55,1885,148.41,148.41,0,0,4.812184,2.0,...,0,0,0,0,1,5.313284,5.375278,6.663133,3.713572,0.53191
133,1112450,134,55,1885,151.35,150.35,0,0,4.418841,2.0,...,0,0,0,0,1,5.313284,5.375278,6.663133,3.713572,0.53191
134,1455692,135,55,1885,150.41,149.41,0,0,5.379897,2.0,...,1,0,0,0,1,5.313284,5.375278,6.663133,3.713572,0.53191


In [14]:
avoid_cols = ['id', 'num_orders', 'train_or_test', 'id', 'year', 'center_id', 'meal_id', 'week']
cols = [col for col in train.columns if col not in avoid_cols]
target = 'num_orders'

In [15]:
lgb_params = {'task':'train', 'boosting_type':'gbdt', 'objective':'regression', 
              'metric': {'rmse'}, 'num_leaves': 10, 'learning_rate': 0.1, 
              'feature_fraction': 0.8, 'max_depth': 5, 'verbose': 0, 
              'num_boost_round':10000, 'nthread':-1, 'early_stopping_rounds':50}

In [16]:
lgbtrain = lgbm.Dataset(data=train[cols], label=train[target].values, 
                       feature_name=cols)
lgbval = lgbm.Dataset(data=val[cols], label=val[target].values, 
                     reference=lgbtrain, feature_name=cols)

In [17]:
def lgb_validation(params, lgbtrain, lgbval, X_val, Y_val):
    t0 = time.time()
    evals_result = {}
    model = lgbm.train(params, lgbtrain, num_boost_round=params['num_boost_round'], 
                      valid_sets=[lgbtrain, lgbval], 
                      early_stopping_rounds=params['early_stopping_rounds'], verbose_eval=500)
    print(model.best_iteration)
    print('Total time taken to build the model: ', (time.time()-t0)/60, 'minutes!!')
    pred_Y_val = model.predict(X_val, num_iteration=model.best_iteration)
    print('RMSE for validation data is:{}'.format(np.sqrt(mean_squared_error(pred_Y_val, Y_val))))
    # pred_Y_val = np.expm1(pred_Y_val)
    # Y_val = np.expm1(Y_val)
    val_df = pd.DataFrame(columns=['true_Y_val','pred_Y_val'])
    val_df['pred_Y_val'] = pred_Y_val
    val_df['true_Y_val'] = Y_val
    print(val_df.shape)
    print(val_df.sample(5))
    
    return model, val_df

In [18]:
model, val_df = lgb_validation(lgb_params, lgbtrain, lgbval, val.loc[:,cols].values, val[target].values)



Training until validation scores don't improve for 50 rounds.
[500]	training's rmse: 0.471231	valid_1's rmse: 0.508245
Early stopping, best iteration is:
[548]	training's rmse: 0.469992	valid_1's rmse: 0.507939
548
('Total time taken to build the model: ', 0.17257219950358074, 'minutes!!')
RMSE for validation data is:0.507939159138
(49305, 2)
       true_Y_val  pred_Y_val
9785     5.743003    5.420583
17000    6.070738    6.354754
48319    5.902633    5.694752
45161    5.313206    5.527860
7258     5.693732    6.427960


In [20]:
train_full = df[df['train_or_test'].isin(['train','hold_out'])]
train_full['num_orders'] = np.log1p(train_full.num_orders.values)
center_meal_stat = add_group_features(train_full,['center_id', 'meal_id'], target='num_orders')
train_full = pd.merge(train_full, center_meal_stat, how='left', on=['center_id', 'meal_id'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [21]:
lgb_params = {'task':'train', 'boosting_type':'gbdt', 'objective':'regression', 
              'metric': {'rmse'}, 'num_leaves': 10, 'learning_rate': 0.1, 
              'feature_fraction': 0.8, 'max_depth': 5, 'verbose': 0, 
              'num_boost_round':548, 'nthread':-1}
lgbtrain = lgbm.Dataset(data=train_full[cols], label=train_full[target].values, 
                       feature_name=cols)
model = lgbm.train(lgb_params, lgbtrain)

In [22]:
y_pred = model.predict(test[cols])
y_pred = np.expm1(y_pred)
submission = test[['id']]
submission['num_orders'] = y_pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [24]:
submission['num_orders'].head()

0    173.217371
1    170.932290
2    168.665027
3    170.601390
4    166.814995
Name: num_orders, dtype: float64

In [25]:
submission.to_csv("lgbm_jun_sep_valid_0.495_locl_cv.csv", index=False)

In [26]:
ntrees = 250
early_stop = 10
verbose_eval = 50
params = {
    'objective': 'reg:linear',
    'eval_metric': ['rmse'],
    'eta': 0.2,
    'max_depth': 3,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 0.005,
    'silent': True,
    'random_state': 42424,
    'tree_method': 'approx',
}
dtrain = xgb.DMatrix(train[cols], train[target].values)
dval = xgb.DMatrix(val[cols], val[target].values)
watchlist = [(dtrain, 'train'), (dval, 'valid')]
    
xgb_model = xgb.train(params, dtrain, ntrees, watchlist, maximize=False,
                      verbose_eval=verbose_eval,
                      early_stopping_rounds=early_stop)

[16:16:37] Tree method is selected to be 'approx'
[0]	train-rmse:3.66398	valid-rmse:3.61947
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 10 rounds.
[50]	train-rmse:0.505903	valid-rmse:0.528124
[100]	train-rmse:0.494725	valid-rmse:0.519407
[150]	train-rmse:0.489298	valid-rmse:0.515381
Stopping. Best iteration:
[161]	train-rmse:0.487967	valid-rmse:0.514244



In [None]:
ntrees = 
params = {
    'objective': 'reg:linear',
    'eval_metric': ['rmse'],
    'eta': 0.02,
    'max_depth': 3,
    'min_child_weight': 3,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 0.005,
    'silent': True,
    'random_state': 42424,
    'tree_method': 'approx',
    'max_delta_step': 1
}
dtrain = xgb.DMatrix(train_full[cols], train_full[target].values)
xgb_model = xgb.train(params, dtrain, ntrees, maximize=False)

In [27]:
y_pred2 = xgb_model.predict(xgb.DMatrix(test[cols]))

In [28]:
y_pred2 = np.expm1(y_pred2)

In [29]:
submission2 = test[['id']]
submission2['num_orders'] =y_pred2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [30]:
submission2['num_orders'].mean()

240.49716

In [31]:
submission['num_orders'] = (0.7*submission['num_orders']) + (0.3*submission2['num_orders'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [32]:
submission['num_orders'] = submission['num_orders'].round()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [33]:
submission.to_csv("lgb_xgb_0.6_0.4_rounded_sub_v7.csv", index=False, header=True)

In [34]:
submission.head()

Unnamed: 0,id,num_orders
0,1028232,175.0
1,1262649,174.0
2,1453211,173.0
3,1262599,174.0
4,1495848,171.0


In [1]:
### A weighted avg was finally used