In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()
import warnings
warnings.filterwarnings('ignore')
import json
from sklearn.metrics import mean_squared_log_error as msle
from sklearn.metrics import mean_squared_error as mse
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from lightgbm import LGBMRegressor
import lightgbm as lgb
import xgboost as xgb
from xgboost import XGBRegressor
import psutil
import os
import datetime

In [None]:
# function to reduce memory usage of pandas dataframes by casting columns to the most memory efficient data type

def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that holds their range.

    Only columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are considered; every other column is left untouched. The frame is modified
    in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are downcast in place.
    verbose : bool, default True
        When true, print the resulting size and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_types = (np.int8, np.int16, np.int32, np.int64)
    float_types = (np.float16, np.float32)
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Bounds are inclusive: a column spanning exactly [-128, 127] fits int8.
            # An empty column has NaN min/max, matches nothing and keeps its dtype.
            for candidate in int_types:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # NOTE(review): float16 only checks the value range, not precision;
            # values may be rounded when stored in half precision.
            for candidate in float_types:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, infinite, or all-NaN: keep full precision.
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a zero-sized frame does not divide by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
# function to obtain lag features

def getLagFeatures(features, NUM_LAG_DAYS):
    """Add ``lag1`` .. ``lag<NUM_LAG_DAYS>`` columns holding earlier ``sales`` of the same ``id``.

    ``lag<k>`` is the ``sales`` value found ``k`` rows earlier within the same
    ``id`` group, so rows are assumed to be in day order within each id
    (TODO confirm against the caller). The lag columns are added to
    ``features`` itself; the returned frame additionally has every row
    containing a missing value removed.
    """
    sales_by_id = features.groupby('id')['sales']
    for offset in range(1, NUM_LAG_DAYS + 1):
        features[f'lag{offset}'] = sales_by_id.shift(offset)
    # Drops the first rows of each id (no history yet) and any other row with a NaN.
    return features.dropna()

In [None]:
# function to obtain rolling window features (mean, sum, max, min, std)

def getAggregates(features, NUM_LAG_DAYS):
    """Add weekly rolling statistics computed from the ``lag*`` columns.

    For each complete week ``i`` (lags ``7*(i-1)+1`` .. ``7*i``) the columns
    ``lag_week<i>_sum/_mean/_min/_max/_std`` are added. ``same_day_mean`` and
    ``same_day_std`` summarise the lags that are multiples of 7, and
    ``lag_week_diff<i-1>`` is the mean of week ``i`` minus the mean of week
    ``i-1``. Columns are added to ``features`` in place, which is also returned.

    Parameters
    ----------
    features : pandas.DataFrame
        Must already contain ``lag1`` .. ``lag<7 * (NUM_LAG_DAYS // 7)>``.
    NUM_LAG_DAYS : int
        Number of lag days available; only complete weeks are aggregated.
    """
    num_weeks = NUM_LAG_DAYS // 7
    # Loop-invariant: lags falling on the same weekday as the target day.
    same_day_cols = ['lag' + str(k * 7) for k in range(1, num_weeks + 1)]
    for i in range(1, num_weeks + 1):
        start = (i - 1) * 7 + 1
        cols = ['lag' + str(j) for j in range(start, start + 7)]
        prefix = 'lag_week' + str(i)
        week = features[cols]  # one slice reused for all five statistics
        features[prefix + '_sum'] = week.sum(axis=1)
        features[prefix + '_mean'] = week.mean(axis=1)
        features[prefix + '_min'] = week.min(axis=1)
        features[prefix + '_max'] = week.max(axis=1)
        features[prefix + '_std'] = week.std(axis=1)
        if i == 1:
            # Computed once, right after the week-1 columns, so the column
            # order matches what the per-iteration recomputation produced.
            same_day = features[same_day_cols]
            features['same_day_mean'] = same_day.mean(axis=1)
            features['same_day_std'] = same_day.std(axis=1)
            continue
        features['lag_week_diff' + str(i - 1)] = (
            features[prefix + '_mean'] - features['lag_week' + str(i - 1) + '_mean']
        )
    return features

In [None]:
# function to obtain price features

def getPriceFeatures(prices,id_map):
    """Derive price statistics per (store, item) and attach them to ``id_map``.

    New columns are written onto ``prices`` itself: previous-row price and the
    difference to it, per-item max/min/mean/std, price relative to the item
    maximum, number of distinct prices per item, and number of distinct items
    sharing a price within a store. The result is ``id_map`` left-joined with
    the enriched prices on ``store_id`` and ``item_id``.

    ``last_week_sell_price`` is the previous row within the (store, item)
    group — presumably the previous week if rows are ordered by week; verify
    against the input file.
    """
    item_key = ['store_id', 'item_id']
    item_price = prices.groupby(item_key)['sell_price']

    prices['last_week_sell_price'] = item_price.shift(1)
    prices['max_sell_price'] = item_price.transform('max')
    prices['min_sell_price'] = item_price.transform('min')
    prices['mean_sell_price'] = item_price.transform('mean')
    prices['price_diff'] = prices['sell_price'] - prices['last_week_sell_price']
    prices['price_std'] = item_price.transform('std')
    prices['price_norm'] = prices['sell_price'] / prices['max_sell_price']
    prices['price_unique'] = item_price.transform('nunique')

    # How many different items are sold at this exact price in the store.
    items_at_price = prices.groupby(['store_id', 'sell_price'])['item_id']
    prices['item_unique'] = items_at_price.transform('nunique')

    return id_map.merge(prices, on=item_key, how='left')

In [None]:
# function to obtain date features

def getDateFeatures(calendar):
    """Return the calendar columns used as model features, with compact dtypes.

    Missing values are replaced by the string ``'None'`` across the whole
    frame, the four event columns become pandas categoricals and the three
    SNAP flags become ``uint8``. The input frame is not modified.

    NOTE(review): an ``is_weekend`` flag (``wday`` in {1, 2}) used to be derived
    here but was never part of the returned columns, so the dead computation
    was removed. Add it to both the frame and the returned column list if it
    is meant to be a feature.

    Returns
    -------
    pandas.DataFrame
        Columns ``d, wm_yr_wk, wday, month, event_name_1, event_name_2,
        event_type_1, event_type_2, snap_CA, snap_TX, snap_WI``.
    """
    event_cols = ['event_type_1', 'event_name_1', 'event_type_2', 'event_name_2']
    snap_cols = ['snap_CA', 'snap_TX', 'snap_WI']
    dtypes = {col: 'category' for col in event_cols}
    dtypes.update({col: 'uint8' for col in snap_cols})

    # fillna returns a new frame, so the caller's calendar stays untouched.
    calendar = calendar.fillna('None').astype(dtypes)
    return calendar[['d','wm_yr_wk','wday','month','event_name_1','event_name_2','event_type_1','event_type_2','snap_CA','snap_TX','snap_WI']]


In [None]:
# helper function to set up a base dataframe for merging purposes

def getBaseFeatures(ids, days, calendar, prices):
    """Build the (day x id) grid and attach calendar and price columns.

    The grid has one row per combination of ``days`` and ``ids``, ordered day
    by day with the ids in their given order inside each day. Calendar columns
    are joined on ``d`` and price columns on ``wm_yr_wk`` and ``id`` (both
    left joins, so grid rows without a match are kept with NaN).

    Parameters
    ----------
    ids : sequence
        Series identifiers; any sequence (list, ndarray, Index) is accepted.
    days : sequence
        Day numbers to cover.
    calendar : pandas.DataFrame
        Must contain ``d`` and ``wm_yr_wk``.
    prices : pandas.DataFrame
        Must contain ``wm_yr_wk`` and ``id``.
    """
    # list() makes ``*`` mean repetition even when an ndarray is passed
    # (ndarray * int would multiply the values instead).
    ids = list(ids)
    base_days = np.repeat(np.asarray(days), len(ids))
    base_ids = ids * len(days)
    base = pd.DataFrame({'id': base_ids, 'd': base_days})
    base = base.merge(calendar, on=['d'], how='left')
    base = base.merge(prices, on=['wm_yr_wk', 'id'], how='left')
    return base

In [None]:
# function to get lag and aggregate features on-the-fly during prediction phase

def getLagFeaturesRecursive(df, d, NUM_LAG_DAYS):
    """Fill the lag and weekly-aggregate columns for day ``d`` from day ``d-1``.

    ``lag1`` of day ``d`` is the ``sales`` of day ``d-1``; ``lag<k>`` of day
    ``d`` is ``lag<k-1>`` of day ``d-1``. Values are copied positionally, so
    the rows of both days must list the series in the same order. The weekly
    statistics, same-day statistics and week-to-week mean differences are then
    recomputed for the rows of day ``d`` only. ``df`` is modified in place and
    returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame with a ``d`` column, ``sales`` and the lag columns.
    d : int
        Day whose features are filled.
    NUM_LAG_DAYS : int
        Number of lag columns to roll forward.
    """
    # ``d`` is never modified here, so both masks can be built once.
    today = df.d == d
    yesterday = df.d == d - 1

    prev = 'sales'
    for i in range(1, NUM_LAG_DAYS + 1):
        curr = 'lag' + str(i)
        df.loc[today, curr] = df.loc[yesterday, prev].values
        prev = curr

    num_weeks = NUM_LAG_DAYS // 7
    same_day_cols = ['lag' + str(k * 7) for k in range(1, num_weeks + 1)]
    for i in range(1, num_weeks + 1):
        start = (i - 1) * 7 + 1
        cols = ['lag' + str(j) for j in range(start, start + 7)]
        prefix = 'lag_week' + str(i)
        # Aggregate only the rows being written instead of the whole frame.
        week = df.loc[today, cols]
        df.loc[today, prefix + '_sum'] = week.sum(axis=1)
        df.loc[today, prefix + '_mean'] = week.mean(axis=1)
        df.loc[today, prefix + '_min'] = week.min(axis=1)
        df.loc[today, prefix + '_max'] = week.max(axis=1)
        df.loc[today, prefix + '_std'] = week.std(axis=1)
        if i == 1:
            # The lag columns do not change inside this loop, so once is enough.
            same_day = df.loc[today, same_day_cols]
            df.loc[today, 'same_day_mean'] = same_day.mean(axis=1)
            df.loc[today, 'same_day_std'] = same_day.std(axis=1)
            continue
        df.loc[today, 'lag_week_diff' + str(i - 1)] = (
            df.loc[today, prefix + '_mean'].values
            - df.loc[today, 'lag_week' + str(i - 1) + '_mean'].values
        )
    return df

In [None]:
def rmse(y_true,y_pred):
    """Root of the mean squared error between ``y_true`` and ``y_pred``."""
    mean_sq_err = mse(y_true, y_pred)
    return np.sqrt(mean_sq_err)

In [None]:
# function to train a LightGBM model

def train_full(store_features,store,desc, NUM_LAG_DAYS):
    """Train a LightGBM regressor for one store and save it to ``<store>_<desc>.txt``.

    Rows with ``1069 + NUM_LAG_DAYS <= d <= 1863`` are used for training and
    rows with ``1863 < d < 1914`` for validation; rows containing any NaN are
    dropped from both. Rows with ``d >= 1914`` are not used here (presumably
    the forecast horizon — confirm against the data layout).

    Identifier columns and all lags except those that are multiples of 7 are
    excluded from the model inputs.

    Parameters
    ----------
    store_features : pandas.DataFrame
        Merged feature frame for one store.
    store : str
        Store identifier, used as the model file prefix.
    desc : str
        Experiment tag, used as the model file suffix.
    NUM_LAG_DAYS : int
        Number of lag columns present in ``store_features``.

    Returns
    -------
    The model object returned by ``lgb.train``.
    """
    model_file = store+'_'+desc+'.txt'
    first_train_day = 1069 + NUM_LAG_DAYS  # earlier rows lack a full lag history
    train = store_features[(store_features.d <= 1863) & (store_features.d >= first_train_day)].dropna()
    val = store_features[(store_features.d > 1863) & (store_features.d < 1914)].dropna()

    drop_cols = ['sales','id','d','wm_yr_wk','original_id','store_id','item_id']
    num_weeks = NUM_LAG_DAYS // 7
    for week in range(1, num_weeks + 1):
        # Drop the six lags before each multiple of 7, keeping lag7, lag14, ...
        drop_cols += ['lag'+str(day) for day in range(week*7-6, week*7)]

    x_train = train.drop(drop_cols,axis=1)
    y_train = train.sales.values
    x_val = val.drop(drop_cols,axis=1)
    y_val = val.sales.values

    # NOTE(review): 'n_estimators' below and num_boost_round=2000 in the
    # lgb.train call both set the number of boosting rounds; LightGBM is
    # documented to prefer the value found in params — confirm for the
    # installed version and keep only one of them.
    lgb_params = {
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': ['rmse'],
        'subsample': 0.8,
        'subsample_freq': 1,
        'learning_rate': 0.03,
        'num_leaves': 2**9-1,
        'min_data_in_leaf': 2**8-1,
        'feature_fraction': 0.8,
        'n_estimators': 5000,
        'early_stopping_rounds': 30,
        'verbose': -1,
        'max_bin':2**9-1
            }
    train_set = lgb.Dataset(x_train, y_train)
    val_set = lgb.Dataset(x_val, y_val)
    lgb_model = lgb.train(lgb_params, train_set, num_boost_round = 2000, valid_sets = [train_set, val_set], verbose_eval = 100)
    lgb_model.save_model(model_file)
    val_pred_lgb = lgb_model.predict(x_val, num_iteration=lgb_model.best_iteration)
    val_score_lgb = rmse(val_pred_lgb, y_val)

    print(f'final val rmse score: {val_score_lgb}')
    return lgb_model

In [None]:
# function to predict sales for a specified day

def predict(features, lgb_model, d,NUM_LAG_DAYS):
    """Predict sales for every row of day ``d``.

    The rows with ``features.d == d`` are selected, the identifier columns and
    every lag that is not a multiple of 7 are dropped, and the remaining
    columns are passed to ``lgb_model.predict`` using the model's
    ``best_iteration``.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature frame containing a ``d`` column and all lag columns.
    lgb_model : object
        Trained model exposing ``predict(X, num_iteration=...)`` and
        ``best_iteration``.
    d : int
        Day to predict.
    NUM_LAG_DAYS : int
        Number of lag columns present in ``features``.

    Returns
    -------
    Whatever ``lgb_model.predict`` returns for the selected rows.
    """
    drop_cols = ['sales','id','d','wm_yr_wk','original_id','store_id','item_id']
    num_weeks = NUM_LAG_DAYS // 7
    for week in range(1, num_weeks + 1):
        # Must mirror the column selection used at training time.
        drop_cols += ['lag'+str(day) for day in range(week*7-6, week*7)]
    X = features[features.d == d].drop(drop_cols,axis=1)
    return lgb_model.predict(X,num_iteration=lgb_model.best_iteration)

In [None]:
# helper function to perform the 3 phases: feature engineering, model training and prediction

def run_store(store,sales,prices,calendar,id_map,desc,NUM_LAG_DAYS):
    """Run feature engineering, training and day-by-day prediction for one store.

    Steps:
      1. Build lag and weekly-aggregate features from the store's sales rows.
      2. Join them onto the (day x id) grid with calendar and price features.
      3. Train a model with ``train_full``.
      4. For days 1914..1969, roll the lag features forward one day at a time
         and write each day's prediction into ``sales`` so the next day can
         use it.
      5. Write ``id, d, sales`` for those days to ``<store>_<desc>.csv``.

    Nothing is returned; the outputs are the model file written by
    ``train_full`` and the prediction CSV.
    """
    first_pred_day = 1914
    end_pred_day = 1970  # exclusive

    store_features = sales[sales.store_id == store]
    store_features = getLagFeatures(store_features,NUM_LAG_DAYS)
    store_features = getAggregates(store_features,NUM_LAG_DAYS)
    store_features = store_features.drop(['item_id','store_id'],axis=1)

    ids = list(store_features.id.unique())
    days = list(calendar.d.values)
    base = getBaseFeatures(ids,days,getDateFeatures(calendar),getPriceFeatures(prices,id_map))
    # Left join keeps every grid row, including days that have no sales row.
    store_features = base.merge(store_features,on=['id','d'],how='left')
    del base  # release the grid before training

    lgb_model = train_full(store_features,store,desc,NUM_LAG_DAYS)
    for day in tqdm(range(first_pred_day, end_pred_day)):
        store_features = getLagFeaturesRecursive(store_features, day, NUM_LAG_DAYS)
        store_features.loc[store_features.d == day,'sales'] = predict(store_features,lgb_model,day,NUM_LAG_DAYS)

    store_pred = store_features[store_features.d >= first_pred_day]
    store_pred = store_pred[['id','d','sales']]
    store_pred.to_csv(store+'_'+desc+'.csv')


In [None]:
def run_full(stores,sales,prices,calendar,id_map,model_file,NUM_LAG_DAYS):
    """Call ``run_store`` once for each entry of ``stores``, with a progress bar.

    ``model_file`` is forwarded to ``run_store`` as its ``desc`` argument.
    """
    desc = model_file
    progress = tqdm(stores)
    for store in progress:
        run_store(store, sales, prices, calendar, id_map, desc, NUM_LAG_DAYS)

In [None]:
# Experiment tag: it is embedded in the names of the saved model files, the
# per-store prediction CSVs and the submission files.
desc = "2104_morePF_newParams_lag28only_2014onwards_increaseNumLeaves"
# Number of daily lag columns (lag1 .. lag28) built for every series.
NUM_LAG_DAYS = 28

In [None]:
# First day index kept for modelling (2014 onwards).
FIRST_DAY = 1069

# initialize sales
# NOTE(review): unpickling can execute arbitrary code; only load a
# sales_grid.pkl that was produced locally or comes from a trusted source.
sales = pd.read_pickle('sales_grid.pkl')
sales = sales[sales.d >= FIRST_DAY]

# initialize calendar
calendar = pd.read_csv('calendar.csv')
# 'd_1069' -> 1069, parsed for the whole column at once; reduce_mem_usage
# below takes care of picking a compact integer dtype.
calendar['d'] = calendar['d'].str.split('_').str[1].astype(int)
# Copy the filtered rows so reduce_mem_usage modifies an independent frame.
calendar = calendar[calendar.d >= FIRST_DAY].copy()
calendar = reduce_mem_usage(calendar)

# initialize prices
prices = pd.read_csv('sell_prices.csv')
prices = reduce_mem_usage(prices)

# initialize id map
id_map = pd.read_csv('id_map.csv')
stores = sales.store_id.unique()

In [None]:
# Runs run_store for every store: each store gets its own model file and a CSV
# of predictions for days 1914-1969. This is the long-running cell.
run_full(stores,sales,prices,calendar,id_map,desc,NUM_LAG_DAYS)

In [None]:
# function to generate kaggle submission file 

def _submission_part(store_df, id_lookup, day_labels, suffix, sub_cols):
    """Turn one 28-day slice of a store's wide predictions into submission rows."""
    part = id_lookup.merge(store_df[['id'] + day_labels], on='id')
    # Replace the last '_'-separated token of the original id with the suffix.
    part['original_id'] = part['original_id'].apply(lambda x: '_'.join(x.split('_')[:-1]) + suffix)
    part = part.drop(['id'], axis=1)
    part.columns = sub_cols
    return part


def submit(desc,stores,id_map):
    """Assemble the submission table from the per-store prediction files.

    For each store, ``<store>_<desc>.csv`` is read and pivoted to one row per
    ``id`` with one column per day 1914..1969. Days 1914-1941 become the
    ``*_validation`` rows and days 1942-1969 the ``*_evaluation`` rows, both
    with columns ``id, F1..F28``. All validation rows (in store order) come
    first, followed by all evaluation rows.

    Parameters
    ----------
    desc : str
        Experiment tag used in the prediction file names.
    stores : iterable of str
        Store identifiers.
    id_map : pandas.DataFrame
        Must contain ``id``, ``original_id``, ``item_id`` and ``store_id``.

    Returns
    -------
    pandas.DataFrame
        Submission rows; the index is not reset. Empty (columns only) when
        ``stores`` is empty.
    """
    sub_cols = ['id'] + ['F'+str(i) for i in range(1,29)]
    all_days = [str(i) for i in range(1914,1970)]
    val_days, eval_days = all_days[:28], all_days[28:]
    # Same lookup for every store, so build it once.
    id_lookup = id_map.drop(['item_id','store_id'],axis=1)

    val_parts = []
    eval_parts = []
    for store in stores:
        filename = store+'_'+desc+'.csv'
        store_df = pd.read_csv(filename)[['id','d','sales']]
        store_df = store_df.pivot_table(index=['id'],columns='d').reset_index()
        # Assumes the file holds every day 1914..1969 for each id; the pivot
        # sorts the day columns, so positional relabelling lines up.
        store_df.columns = ['id'] + all_days
        val_parts.append(_submission_part(store_df, id_lookup, val_days, '_validation', sub_cols))
        eval_parts.append(_submission_part(store_df, id_lookup, eval_days, '_evaluation', sub_cols))

    parts = val_parts + eval_parts
    if not parts:
        return pd.DataFrame(columns=sub_cols)
    # A single concat replaces repeated DataFrame.append calls, which copied
    # the accumulated frame on every iteration and no longer exist in pandas 2.
    return pd.concat(parts)

In [None]:
# Build the submission table from the per-store prediction files.
sub_val = submit(desc, stores, id_map)

# Raw (unrounded) forecasts.
raw_path = 'submission_' + desc + '.csv'
sub_val.reset_index(drop=True).to_csv(raw_path, index=False)

# Same forecasts rounded to whole units; sub_val itself is updated.
sub_cols = [f'F{i}' for i in range(1, 29)]
sub_val[sub_cols] = sub_val[sub_cols].round(0)
rounded_path = 'submission_rounded_' + desc + '.csv'
sub_val.to_csv(rounded_path, index=False)