In [None]:
import numpy as np
import pandas as pd
import os, sys, gc, time, warnings, pickle, psutil, random

from multiprocessing import Pool    

warnings.filterwarnings('ignore')

In [None]:
def model_seed(seed=0):
    """Seed Python's `random` module and NumPy's global RNG with the same value.

    seed: integer handed unchanged to both generators (defaults to 0).
    """
    # Same order as before: stdlib generator first, then NumPy.
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)

# Runs func in parallel for every item in data and joins the results column-wise.
def parallelize(func, data):
    """Apply `func` to each element of `data` in a process pool.

    func: callable executed in the worker processes; each call must return a
          pandas object that can be concatenated along axis=1.
    data: sized iterable of work items (one `func` call per item).

    Returns the per-item results concatenated side by side (axis=1), in the
    order of `data`.

    The pool size is the smaller of the module-level `core_count` and
    len(data). An empty `data` therefore asks for a zero-sized pool, which
    multiprocessing rejects with ValueError.
    """
    cores_number = int(min(core_count, len(data)))
    # The context manager releases the worker processes even when `func`
    # raises inside pool.map; the previous close()/join() pair was skipped
    # in that case and left the workers running.
    with Pool(cores_number) as pool:
        frames = pool.map(func, data)
    return pd.concat(frames, axis=1)

In [None]:
# Day-index boundaries used to split each store's rows into training,
# validation and prediction subsets. They are compared against the numeric
# `temp_d` column that get_store_data derives from the `d` column.
train_set_start = 0    # rows with temp_d >= this value are kept for a store
train_set_end = 1913   # rows with temp_d <= this value form the training mask
prediction_set = 28    # validation mask covers the last `prediction_set` training days

In [None]:
#creation of rolling window columns
def lag_roll(days):
    """Build one shifted rolling-mean feature column from the global `test_base`.

    days: two-item sequence `[shift, roll]`; sales are shifted by `shift`
          rows within each `id` group and then averaged over a window of
          `roll` rows.

    Returns a single-column DataFrame named
    'rolling_mean_tmp_<shift>_<roll>', sharing `test_base`'s index so it can
    be concatenated to it along axis=1.
    """
    shift, roll = days[0], days[1]
    column = 'rolling_mean_tmp_'+str(shift)+'_'+str(roll)
    # Compute the feature as a Series and wrap it afterwards. The previous
    # version added the column onto a column-slice of test_base (a
    # SettingWithCopy pattern) and copied the unused 'd' column on the way.
    rolled = test_base.groupby(['id'])['sales'].transform(
        lambda x: x.shift(shift).rolling(roll).mean()
    )
    return rolled.to_frame(name=column)

In [None]:
# We get the data by a particular store id
def get_store_data(store):
    """Return one store's rows from the global `data` plus the model feature list.

    store: value matched against the 'store_id' column.

    Returns `(df, final_features)`:
      df             - the store's rows with temp_d >= train_set_start, index
                       reset, an added numeric 'temp_d' column and the
                       categorical columns cast to 'category' dtype.
      final_features - every column of the store frame that is not listed in
                       the global `remove`.
    """
    # Explicit copy so the column added below lands on an independent frame
    # instead of on a filtered slice of `data`.
    df = data[data['store_id']==store].copy()
    # 'd' holds labels whose numeric part starts at the third character;
    # parse only this store's rows rather than the whole dataset on each call.
    df['temp_d'] = pd.to_numeric(df['d'].str[2:])

    # NOTE(review): this runs after temp_d is added, so 'temp_d' ends up in
    # the feature list even though 'd' itself is excluded via `remove`.
    # Kept as-is because changing it alters the trained models; confirm intent.
    final_features = [col for col in list(df) if col not in remove]

    #Data fetched using start date
    df = df[df['temp_d']>=train_set_start].reset_index(drop=True)
    category_columns=['id','item_id','dept_id','cat_id','store_id','state_id','event_name_1','event_type_1','event_name_2','event_type_2','event_name_1_lag_1', 'event_type_1_lag_1',
                   'event_name_1_lag_2', 'event_type_1_lag_2', 'event_name_1_lag_3', 'event_type_1_lag_3']
    for col in category_columns:
        df[col] = df[col].astype('category')

    return df, final_features



In [None]:
# Reload the per-store frames that the training loop pickled for prediction
def test_data():
    """Read every 'test_<store>.pkl' file and stack them into one DataFrame.

    Iterates over the global `store_ids`, loads each store's pickle from the
    working directory, overwrites its 'store_id' column with the store id and
    concatenates all frames with a fresh 0..n-1 index.

    Returns an empty DataFrame when `store_ids` is empty.
    """
    frames = []
    for store in store_ids:
        store_frame = pd.read_pickle('test_'+store+'.pkl')
        store_frame['store_id'] = store
        frames.append(store_frame)

    # pd.concat refuses an empty list, so keep the old "no stores" result.
    if not frames:
        return pd.DataFrame()

    # One concat at the end instead of re-copying the accumulated frame on
    # every iteration; it also avoids concatenating with an empty seed frame.
    return pd.concat(frames, ignore_index=True)


In [None]:
# Parameters for the LightGBM models. The same dict is passed to lgb.train for
# every store; a 'seed' entry is added to it later in the configuration cell.
import lightgbm as lgb
lgb_params ={
         'boosting_type': 'gbdt',
         'objective': 'tweedie',
         'tweedie_variance_power': 1.4,
         'metric': 'rmse',                # reported for the set passed via valid_sets
         'subsample': 0.5,
         'subsample_freq': 1,
         'learning_rate': 0.01,
         'num_leaves': 2**11-1,           # = 2047
         'min_data_in_leaf': 2**12-1,     # = 4095
         'feature_fraction': 0.5,
         'max_bin': 100,
         'n_estimators': 700,             # presumably the number of boosting rounds - confirm alias in LightGBM docs
         'boost_from_average': False,
         'verbose': -1,
}

In [None]:
# Run configuration: model file version tag, RNG seed and worker-pool size.
version = 1
seed = 42
model_seed(seed)
lgb_params['seed'] = seed
core_count = psutil.cpu_count()

# Columns excluded from the model feature list (see get_store_data).
remove = ['id','state_id','store_id','date','wm_yr_wk','d','sales']

# NOTE: despite its name this is the path of the CSV file, not a directory.
data_dir = '../input/bestfitting-da-dataset/data.csv'

data = pd.read_csv(data_dir)

# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
data.info()

category_columns=['id','item_id','dept_id','cat_id','store_id','state_id','event_name_1','event_type_1','event_name_2','event_type_2','event_name_1_lag_1', 'event_type_1_lag_1',
                   'event_name_1_lag_2', 'event_type_1_lag_2', 'event_name_1_lag_3', 'event_type_1_lag_3']
# Converted column by column to avoid copying the whole frame at once.
for col in category_columns:
    data[col] = data[col].astype('category')

#fetching STORES ids
store_ids = list(data['store_id'].unique())


# [shift, window] pairs consumed by lag_roll: every combination of the
# shifts 1/7/14 with the rolling windows 7/14/30/60.
rol_split = [[i, j] for i in [1,7,14] for j in [7,14,30,60]]

In [None]:
# lgb.train no longer accepts `verbose_eval` in newer LightGBM releases, where
# the log_evaluation callback gives the same "report every 100 rounds" output.
# Fall back to the old keyword when the callback is not available.
if hasattr(lgb, 'log_evaluation'):
    eval_logging = {'callbacks': [lgb.log_evaluation(period=100)]}
else:
    eval_logging = {'verbose_eval': 100}

# Train and persist one model per store. `store_id` replaces the former loop
# variable `id`, which shadowed the builtin of the same name.
for store_id in store_ids:
    print('Training data for Store: ', store_id)

    # fetching data for current store
    store_df, features_columns = get_store_data(store_id)

    # Row masks on the numeric day index:
    #   training   - every day up to train_set_end
    #   validation - the last `prediction_set` days of the training range
    #   preds      - the final 100 training days plus everything after them,
    #                kept so rolling features can be rebuilt at prediction time
    training_mask = store_df['temp_d']<=train_set_end
    validation_mask = training_mask&(store_df['temp_d']>(train_set_end-prediction_set))
    preds_mask = store_df['temp_d']>(train_set_end-100)

    # Round-trip the training set through LightGBM's binary format and
    # re-open it from disk instead of keeping the pandas-backed Dataset.
    training_data = lgb.Dataset(store_df[training_mask][features_columns],
                       label=store_df[training_mask]['sales'])
    training_data.save_binary('training_data.bin')
    training_data = lgb.Dataset('training_data.bin')

    validation_data = lgb.Dataset(store_df[validation_mask][features_columns],
                       label=store_df[validation_mask]['sales'])

    # Save the prediction rows for later, without the temporary rolling
    # columns ('_tmp_' in the name); those are recomputed for every day.
    store_df = store_df[preds_mask].reset_index(drop=True)
    keep_cols = [col for col in list(store_df) if '_tmp_' not in col]
    store_df = store_df[keep_cols]
    store_df.to_pickle('test_'+store_id+'.pkl')
    del store_df

    # Launch seed again to ensure lgb training 100% deterministic
    model_seed(seed)
    model_estm = lgb.train(lgb_params,
                          training_data,
                          valid_sets = [validation_data],
                          **eval_logging,
                          )

    model_name = 'lgb_model_'+store_id+'_v'+str(version)+'.bin'
    # `with` closes the file handle; the bare open() inside pickle.dump()
    # left it to the garbage collector.
    with open(model_name, 'wb') as model_file:
        pickle.dump(model_estm, model_file)

    # Remove temporary files and objects to free some space.
    # os.remove replaces the `!rm` shell escape, so this also runs outside
    # IPython and on platforms without `rm`.
    os.remove('training_data.bin')
    del training_data, validation_data, model_estm
    gc.collect()

    # Models features used for predictions
    predict_features = features_columns

In [None]:

# Creating DataFrame to store predictions
prediction = pd.DataFrame()

# Reload the rows saved by the training loop: the tail of the training data
# plus the days to forecast, needed to rebuild the recursive rolling features.
test_base = test_data()

# Measuring prediction time
prediction_time = time.time()

# Forecast one day at a time. Each day's predictions are written back into
# test_base['sales'] so the next day's rolling features can use them.
for day in range(1, prediction_set+1):
    print('Prediction Day:', day)
    start_time = time.time()

    # Rebuild the rolling features from the current state of test_base.
    # This used to run twice per day with the first result thrown away.
    # NOTE(review): lag_roll reads the module-level test_base inside worker
    # processes; this presumably relies on fork-style workers created after
    # the update above - confirm on platforms that spawn instead.
    store_df = pd.concat([test_base, parallelize(lag_roll, rol_split)], axis=1)

    # Rows of the day being predicted; computed once instead of per store.
    day_mask = test_base['temp_d']==(train_set_end+day)

    for store_id in store_ids:

        # Load this store's model and predict its rows for the current day.
        model_path = 'lgb_model_'+store_id+'_v'+str(version)+'.bin'

        # pickle executes code on load; acceptable here only because these
        # files are the ones written by the training loop of this notebook.
        with open(model_path, 'rb') as model_file:
            model_estm = pickle.load(model_file)

        mask = day_mask & (test_base['store_id']==store_id)
        # .loc writes into test_base itself. The former chained form
        # test_base['sales'][mask] = ... assigns to a temporary under pandas
        # Copy-on-Write, which would silently drop every prediction.
        test_base.loc[mask, 'sales'] = model_estm.predict(store_df.loc[mask, predict_features])

    # Collect the day's predictions as column F<day>, keyed by id.
    temp_df = test_base.loc[day_mask, ['id','sales']].rename(columns={'sales': 'F'+str(day)})
    if 'id' in list(prediction):
        prediction = prediction.merge(temp_df, on=['id'], how='left')
    else:
        prediction = temp_df.copy()

    print(' %0.2f min round ' % ((time.time() - start_time) / 60),
          ' %0.2f min total ' % ((time.time() - prediction_time) / 60),
          ' %0.2f day sales ' % (temp_df['F'+str(day)].sum()))
    del temp_df

#saving the predictions in a csv file
prediction = prediction.reset_index(drop=True)
prediction.to_csv('final_prediction.csv')
prediction