In [None]:
# Familiar imports
import numpy as np
import pandas as pd

# For ordinal encoding categorical variables, splitting data
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split

# For training random forest model
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

import datetime
import locale

In [None]:
# Read the competition's sales_train.csv into a DataFrame
train = pd.read_csv(
    filepath_or_buffer="../input/competitive-data-science-predict-future-sales/sales_train.csv"
)

# Show the first rows as a quick sanity check
train.head(n=5)

In [None]:
# Parse the 'dd.mm.yyyy' strings once and derive all three calendar parts from
# that single result, instead of re-splitting the whole column for every part.
_train_dates = pd.to_datetime(train['date'], format='%d.%m.%Y')
train['day'] = _train_dates.dt.day.astype(int)
train['month'] = _train_dates.dt.month.astype(int)
train['year'] = _train_dates.dt.year.astype(int)

In [None]:
# Day of week (Monday=0 ... Sunday=6), computed in one vectorised pass rather
# than calling datetime.strptime once per row.
train['weekday'] = pd.to_datetime(train['date'], format='%d.%m.%Y').dt.weekday.astype(int)

In [None]:
#train['sum_amount'] = train.item_price * train.item_cnt_day

In [None]:
# Target: the item_cnt_day column
y = train.loc[:, 'item_cnt_day']
# Features: every remaining column except the raw date string and the price
X = train.drop(columns=['date', 'item_cnt_day', 'item_price'])

In [None]:
# First rows of the feature matrix
X.head(n=5)

In [None]:
# First values of the target
y.head(n=5)

In [None]:
# Read the competition's test.csv (the rows a prediction is required for)
test = pd.read_csv(
    filepath_or_buffer="../input/competitive-data-science-predict-future-sales/test.csv"
)

In [None]:
def cross_join(df_a, df_b, common_key=None):
    """Combine two DataFrames, as a Cartesian product unless a key is given.

    Parameters
    ----------
    df_a, df_b : pandas.DataFrame
        Left and right inputs. Neither frame is modified.
    common_key : label or list of labels, optional
        When provided, the frames are outer-merged on this key instead of
        being cross-joined.

    Returns
    -------
    pandas.DataFrame
        With ``common_key``: the outer merge of the inputs on that key.
        Without it: for non-empty inputs, every row of ``df_a`` paired with
        every row of ``df_b``. Columns present in both inputs are kept
        separately with pandas' default ``_x`` / ``_y`` suffixes.
    """
    if common_key is not None:
        return pd.merge(df_a, df_b, on=common_key, how='outer')

    # Pick a helper column name that neither input already uses, so an existing
    # 'tmp' column in the caller's data is never overwritten or dropped.
    join_key = 'tmp'
    while join_key in df_a.columns or join_key in df_b.columns:
        join_key = '_' + join_key

    # assign() returns new frames: the caller's objects never gain the helper column.
    left = df_a.assign(**{join_key: 1})
    right = df_b.assign(**{join_key: 1})

    # Joining explicitly on the constant key pairs every left row with every
    # right row, regardless of any other column names the inputs share.
    return pd.merge(left, right, on=join_key, how='outer').drop(columns=join_key)

In [None]:
# Calendar frame: one row per day from 2015-11-01 through 2015-11-30, held in a
# single column named 'cdr'. No explicit join-key column is stored here.
date_value = pd.DataFrame(dict(cdr=pd.date_range(start='11/1/2015', end='11/30/2015')))
date_value.head()

In [None]:
# Stamp every test row with date_block_num 34, then pair each row with every
# calendar day in date_value so there is one row per (test row, day).
test = cross_join(test.assign(date_block_num=34), date_value)

In [None]:
# 'cdr' is already a datetime column (built with pd.date_range), so read the
# calendar parts straight from the .dt accessor instead of converting every
# value to a string and splitting it three separate times.
test['year'] = test['cdr'].dt.year.astype(int)
test['month'] = test['cdr'].dt.month.astype(int)
test['day'] = test['cdr'].dt.day.astype(int)

In [None]:
# Day of week (Monday=0 ... Sunday=6) taken directly from the datetime column;
# avoids a str round-trip plus one datetime.strptime call per row.
test['weekday'] = test['cdr'].dt.weekday.astype(int)

In [None]:
# First rows of the expanded test frame
test.head(n=5)

In [None]:
# Select exactly the training features, in the training column order.
# The train cells add day, month, year while the test cells add year, month,
# day, so simply dropping 'cdr' and 'ID' leaves the columns in a different
# order than X; an estimator that reads features by position would then be fed
# mismatched inputs. Selecting by X.columns also keeps this cell idempotent if
# extra columns are later added to `test`.
X_test = test[X.columns]

In [None]:
# Keyword arguments unpacked into the regressor constructor in the training loop
xgb_params = dict(
    n_estimators=1000,
    learning_rate=0.15,
    max_depth=3,
)

In [None]:
from sklearn.model_selection import KFold
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error

# Shuffled K-fold splitter with a fixed seed so fold membership is repeatable
splits = 5
skf = KFold(n_splits=splits, random_state=19, shuffle=True)

# Out-of-fold prediction buffer: one slot per training row
oof_preds = np.zeros(len(X))

# Running totals that the training loop adds to
preds, model_fi, total_mean_rmse = 0, 0, 0

In [None]:
# Start from clean accumulators so that re-running this cell on its own never
# adds a second round of folds on top of the previous run's totals.
oof_preds = np.zeros(len(X))
preds = np.zeros(len(X_test))
total_mean_rmse = 0.0

for num, (train_idx, valid_idx) in enumerate(skf.split(X)):
    # KFold.split yields positional row numbers, so select by position (.iloc)
    # rather than by index label (.loc).
    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

    model = LGBMRegressor(**xgb_params)
    model.fit(X_train, y_train,
              verbose=False,
              # Track RMSE on both sets; stop when it has not improved for 100 rounds
              eval_set=[(X_train, y_train), (X_valid, y_valid)],
              eval_metric="rmse",
              early_stopping_rounds=100,
              )

    # Add this fold's share of the test prediction.
    # NOTE(review): the trailing "/ 2" means the accumulated value ends up as
    # HALF of the mean over folds, not the mean itself. Nothing in this notebook
    # explains the extra factor - confirm whether it is an intentional scaling
    # or a leftover from blending two models, and remove it if unintended.
    preds += model.predict(X_test) / splits / 2

    # Out-of-fold predictions: each row is predicted by the one model that did
    # not train on it. Used only to score the hyperparameters.
    oof_preds[valid_idx] = model.predict(X_valid)

    # RMSE of this fold's model on its held-out rows
    fold_rmse = np.sqrt(mean_squared_error(y_valid, oof_preds[valid_idx]))
    print(f"Fold {num} RMSE: {fold_rmse}")

    # Running mean of the per-fold RMSE values
    total_mean_rmse += fold_rmse / splits

# This is the average of the fold RMSEs, not an RMSE over all rows pooled together
print(f"\nOverall RMSE: {total_mean_rmse}")

In [None]:
# Attach the accumulated predictions to the test rows. Each row is one
# (test ID, calendar day) pair, so despite the column name these are still
# per-row values; they are summed per ID in a later cell.
test = test.assign(item_cnt_month=preds)
test.head(n=5)

In [None]:
# Collapse the per-day rows into one total per test ID.
# Selecting the prediction column explicitly replaces the previous chain of
# column drops, and removes the follow-up drop of 'predict' / 'item_cnt_day':
# neither column exists in this frame, so that call raised KeyError.
# The result is indexed by ID with a single 'item_cnt_month' column.
submit = test.groupby('ID')[['item_cnt_month']].sum()
submit.head(100)

In [None]:
# Write the submission next to the notebook; the frame's index is written as
# the first CSV column.
submit.to_csv(path_or_buf='./submit.csv')


In [None]:
# Leftover clean-up step: by this point `submit` has already been aggregated
# per ID and none of these feature columns were carried into it, so a strict
# drop raises KeyError. errors='ignore' removes whichever are present and is
# otherwise a no-op, keeping Restart & Run All from failing here.
submit = submit.drop(
    columns=['date_block_num', 'year', 'month', 'day', 'weekday'],
    errors='ignore',
)

In [None]:
# NOTE(review): `items` is never defined anywhere in the visible notebook, and
# `submit` is built by grouping on ID after 'item_id' has been dropped, so it
# has no 'item_id' column to merge on. As written this cell cannot run; it also
# executes after submit.csv has been written, so it cannot affect the saved
# file. Looks like a leftover exploratory cell - confirm and remove, or load
# the items table and merge before the aggregation step instead.
submit = pd.merge(submit, items, on='item_id', how='left')

In [None]:
# Reference: first rows of the provided sample submission, for comparing its
# layout against the file written above.
pd.read_csv(
    filepath_or_buffer="../input/competitive-data-science-predict-future-sales/sample_submission.csv"
).head(n=5)