#  Rasmus og Lasse Walmart 2/2

We have been asked by Walmart to precisely forecast certain products 28 days in advance for 3 different stores across America. The 3 stores are located in Texas, California and Wisconsin. They all have the same departments, where they sell different items.

For this competition our models' accuracy is measured with the Weighted Root Mean Squared Scaled Error (WRMSSE) metric.

![](https://i.imgur.com/uqhsf3d.png)

# Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from statsmodels.tsa.arima_model import ARIMA
import gc

# Importing datasets

In [None]:
# Load the four competition CSV files from the Kaggle input directory.
# calendar is joined to the sales rows on the day label, sell_prices on
# store/item/week, and sample_submission supplies the forecast rows.
calendar = pd.read_csv('../input/m5-forecasting-accuracy/calendar.csv')
train_sales = pd.read_csv('../input/m5-forecasting-accuracy/sales_train_validation.csv')
sample_submission = pd.read_csv('../input/m5-forecasting-accuracy/sample_submission.csv')
sell_prices = pd.read_csv('../input/m5-forecasting-accuracy/sell_prices.csv')

To see the datasets and information about them, please go to our other kernel.

# Merging datasets

From our analysis we can see that the 3 tables calendar, sell_prices and train_sales all need to be merged together into one dataframe, which we want to use to train our model for predictions.

Kaggle offers 16GB of RAM for free, but unfortunately this is not enough for a dataset of this size; therefore we have "found" a method to reduce the memory usage of the datasets.

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the smallest dtype that fits.

    Every column whose dtype is one of int16/int32/int64/float16/float32/
    float64 is converted to the narrowest signed-integer or float dtype whose
    range contains the column's min and max. Other columns are left untouched.

    Args:
        df: DataFrame to shrink. It is modified in place.
        verbose: When true, print the final size and the percentage saved.

    Returns:
        The same ``df`` object, after downcasting.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the dtype's own min/max
                # still fits (e.g. 127 is a valid int8).
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # NOTE: float16 keeps only ~3 significant decimal digits, so values
            # are rounded when a column is narrowed to it.
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max is NaN (all-NaN column).
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard against a zero-sized frame so the percentage never divides by 0.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
# Downcast the two lookup tables before they are merged into the sales rows.
calendar = reduce_mem_usage(calendar)
sell_prices = reduce_mem_usage(sell_prices)

In [None]:
def melt_train_data(sales_train_validation):
    """Reshape the wide sales table to long format and append the forecast rows.

    The wide table (one ``d_<n>`` column per day) is melted so that every row
    is one (series, day) pair with the columns ``day`` and ``demand``. The
    'validation' rows of the module-level ``sample_submission`` are relabelled
    to the day columns d_1914 .. d_1941, enriched with the id columns of the
    matching series, melted the same way and appended.

    Args:
        sales_train_validation: Wide sales DataFrame with the id columns
            id, item_id, dept_id, cat_id, store_id, state_id.

    Returns:
        Long DataFrame with the id columns plus ``day``, ``demand`` and
        ``part`` ('train' for historic rows, 'test' for forecast rows).
    """
    id_columns = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']

    # Wide -> long: one row per series and day.
    long_sales = pd.melt(sales_train_validation, id_vars=id_columns,
                         var_name='day', value_name='demand')
    long_sales = reduce_mem_usage(long_sales)

    # Rows of the submission template that belong to the validation phase.
    submission_ids = sample_submission['id']
    validation_ids = [row_id for row_id in submission_ids if 'validation' in row_id]
    forecast = sample_submission[submission_ids.isin(validation_ids)]

    # Rename the template columns to the 28 day labels that follow the history.
    forecast.columns = ['id'] + ['d_' + str(day) for day in range(1914, 1942)]

    # Attach the descriptive id columns of each series to its forecast row.
    series_info = long_sales[id_columns].drop_duplicates()
    forecast = forecast.merge(series_info, how='left', on='id')
    forecast = pd.melt(forecast, id_vars=id_columns,
                       var_name='day', value_name='demand')

    long_sales['part'] = 'train'
    forecast['part'] = 'test'

    return pd.concat([long_sales, forecast], axis=0)

In this method we use pandas melt on the sales_train_validation set so it becomes a tall dataset instead of a wide one. Basically, you "melt" data so that each row is a unique id-variable combination.

In [None]:
# Build the long-format frame containing both the history and the forecast rows.
data = melt_train_data(train_sales)
# The wide table is not used again; drop the reference and collect so its
# memory can be reclaimed.
del(train_sales)
gc.collect()

In the next section we want to inspect the new demand(day) values 

In [None]:
# Total demand per day label, summed over all series.
demand_pattern = data.groupby('day')['demand'].sum().to_frame().reset_index()
# Numeric part of the label ('d_1914' -> 1914).
demand_pattern['day_number'] = demand_pattern['day'].str.split("_",n = 1, expand = True)[1].astype('int32')
# groupby sorts the string labels lexicographically ('d_1', 'd_10', 'd_100', ...),
# so put the rows back into chronological order and renumber the index.
demand_pattern = demand_pattern.sort_values('day_number').reset_index(drop=True)


In [None]:
# Plot total demand against the numeric day. The rows are sorted by day_number
# first so the line is drawn chronologically regardless of how the string
# labels in 'day' happen to be ordered.
ordered_demand = demand_pattern.sort_values('day_number')
plt.figure(figsize=(15,12))
plt.plot(ordered_demand['day_number'], ordered_demand['demand'])
plt.xlabel('Days', fontsize=18)
plt.ylabel('Demand', fontsize=16)
plt.show()

We can see that at some point around day 1000 some data has gone missing, and that there is a drop in demand right before the missing data.
For this reason we choose to keep only the data from after the drop (days after 1101), because this part of the data is more consistent.

In [None]:
# Keep only the days whose number is greater than 1101.
demand_pattern = demand_pattern[demand_pattern['day_number']>1101]

In [None]:
# Plot the retained days against the numeric day. The rows are sorted by
# day_number first so the line is drawn chronologically regardless of how the
# string labels in 'day' happen to be ordered.
ordered_demand = demand_pattern.sort_values('day_number')
plt.figure(figsize=(15,12))
plt.plot(ordered_demand['day_number'], ordered_demand['demand'])
plt.xlabel('Days', fontsize=18)
plt.ylabel('Demand', fontsize=16)
plt.show()

In the graph above we can see that the last part is missing. These are the values we are going to predict later.

In [None]:
# Restrict the long frame to the day labels that survived the filter above.
data = data[data['day'].isin(demand_pattern['day'])]
# demand_pattern is not used again; release it.
del(demand_pattern)
gc.collect()

In [None]:
def merge_with_calendar(calendar, sales_train_validation):
    """Attach the calendar columns to every sales row.

    Left-joins ``calendar`` onto the sales rows (sales ``day`` == calendar
    ``d``), removes both join-key columns and prints the resulting shape.

    Args:
        calendar: Calendar DataFrame keyed by the column ``d``.
        sales_train_validation: Long sales DataFrame with a ``day`` column.

    Returns:
        The merged DataFrame without the ``d`` and ``day`` columns.
    """
    merged = sales_train_validation.merge(
        calendar, how='left', left_on=['day'], right_on=['d']
    )
    # Both key columns are dropped once the calendar columns are attached.
    merged = merged.drop(columns=['d', 'day'])
    n_rows, n_columns = merged.shape
    print('rows: {} and columns: {}'.format(n_rows, n_columns))
    return merged

In [None]:
# Add the calendar columns to the sales rows.
data = merge_with_calendar(calendar, data)

In [None]:
def merge_with_sales(sell_prices, data):
    """Attach the selling price to every sales row.

    Left-joins ``sell_prices`` on (store_id, item_id, wm_yr_wk), drops the
    ``wm_yr_wk`` key afterwards and prints the resulting shape.

    Args:
        sell_prices: Price DataFrame keyed by store_id, item_id and wm_yr_wk.
        data: Sales DataFrame carrying the same three key columns.

    Returns:
        The merged DataFrame without the ``wm_yr_wk`` column.
    """
    join_keys = ['store_id', 'item_id', 'wm_yr_wk']
    priced = pd.merge(data, sell_prices, on=join_keys, how='left')
    # The week key is only needed for the join.
    priced = priced.drop('wm_yr_wk', axis=1)
    n_rows, n_columns = priced.shape
    print('rows: {} and columns: {}'.format(n_rows, n_columns))
    return priced

In [None]:
# Add the selling price to the sales rows.
data = merge_with_sales(sell_prices,data)

# Cleaning and Transforming our datasets

To avoid NaN values in our dataset we first specify that a NaN value means no_event, and afterwards we transform the non-numeric values to numeric values with the label encoder.

In [None]:
def fill_nan_values(data):
    """Replace missing event names and types with the label 'no_event'.

    Args:
        data: DataFrame with the columns event_name_1, event_type_1,
            event_name_2 and event_type_2. It is modified in place.

    Returns:
        The same ``data`` object.
    """
    event_columns = ('event_name_1', 'event_type_1', 'event_name_2', 'event_type_2')
    for column in event_columns:
        data[column] = data[column].fillna('no_event')
    return data

In [None]:
   def transform_data(data):
       """Fill missing events and label-encode the categorical columns.

       Missing event names/types are first replaced via ``fill_nan_values``.
       Each event column and each id-like column is then replaced by the
       integer codes produced by a LabelEncoder fitted on that column alone.

       Args:
           data: Merged sales DataFrame. It is modified in place.

       Returns:
           The same ``data`` object with the encoded columns.
       """
       data = fill_nan_values(data)

       event_columns = ['event_type_1', 'event_name_1', 'event_type_2', 'event_name_2']
       # Identifiers of departments, categories, stores, items and states.
       id_columns = ['dept_id', 'cat_id', 'store_id', 'item_id', 'state_id']

       encoder = LabelEncoder()
       for column in event_columns + id_columns:
           # fit_transform refits the encoder, so every column gets its own codes.
           data[column] = encoder.fit_transform(data[column])

       return data

In [None]:
# Fill the missing events and label-encode the categorical columns.
data = transform_data(data)

We are removing the column weekday, as its information is covered by a numeric day-of-week column (dayofweek is derived from the date in the feature engineering section below).


In [None]:
# Drop the weekday column.
data = data.drop(columns=['weekday'])

# Feature engineering

Since we have to predict 28 days into the future, we create new columns with the rolling mean and std. of demand, shifted 28 days into the past. We calculate the mean and std. over both a weekly (7-day) and a monthly (30-day) window.

In [None]:
def create_demand_features(data):
    """Add lagged and rolling demand statistics per series.

    For every ``id`` the demand is shifted by 28 rows; the shifted values are
    stored as ``lag`` and their rolling mean and standard deviation over 7 and
    30 rows as ``demand_mean_<w>d`` / ``demand_std_<w>d``.

    Args:
        data: DataFrame with ``id`` and ``demand`` columns; rows of one id are
            used in their existing order. It is modified in place.

    Returns:
        The same ``data`` object with the five new columns.
    """
    horizon = 28
    demand_by_id = data.groupby(['id'])['demand']

    data['lag'] = demand_by_id.transform(lambda series: series.shift(horizon))
    for window in (7, 30):
        # window is bound as a default so each lambda keeps its own value.
        data['demand_mean_' + str(window) + 'd'] = demand_by_id.transform(
            lambda series, w=window: series.shift(horizon).rolling(w).mean()
        )
        data['demand_std_' + str(window) + 'd'] = demand_by_id.transform(
            lambda series, w=window: series.shift(horizon).rolling(w).std()
        )
    return data

Secondly we are creating a few new price features. Through trial and error we have found that std. impacts the model the most. Therefore we calculate std. for the last 7 days and the last 30 days, same as above. We have also calculated the relative price change from day to day.

In [None]:
def create_price_features(data):
    """Add price-change and rolling price-volatility features per series.

    For every ``id``: ``price_change_1d`` is (previous price - current price)
    divided by the previous price, and ``price_std_7d`` / ``price_std_30d``
    are the rolling standard deviations of ``sell_price`` over 7 and 30 rows.

    Args:
        data: DataFrame with ``id`` and ``sell_price`` columns; rows of one id
            are used in their existing order. It is modified in place.

    Returns:
        The same ``data`` object with the three new columns.
    """
    price_by_id = data.groupby(['id'])['sell_price']

    # Temporary helper column holding the previous row's price of the series.
    data['lag_1d'] = price_by_id.transform(lambda prices: prices.shift(1))
    data['price_change_1d'] = (data['lag_1d'] - data['sell_price']) / (data['lag_1d'])

    for window in (7, 30):
        # window is bound as a default so each lambda keeps its own value.
        data['price_std_' + str(window) + 'd'] = price_by_id.transform(
            lambda prices, w=window: prices.rolling(w).std()
        )

    del data['lag_1d']
    return data

The column 'date' was in a format that our model could not use. Therefore we use pandas 'to_datetime' to make it into a value that the model can accept. And while we are at it, we added a column for each datetime component (year, month, week, day and day of week).

In [None]:
def create_time_features(data):
    """Parse ``date`` and derive calendar features from it.

    Converts the ``date`` column to datetime and adds ``year``, ``month``,
    ``week`` (ISO week number), ``day`` (day of month) and ``dayofweek``
    (Monday = 0).

    Args:
        data: DataFrame with a ``date`` column parseable by
            ``pd.to_datetime``. It is modified in place.

    Returns:
        The same ``data`` object with the parsed date and the new columns.
    """
    data['date'] = pd.to_datetime(data['date'])
    date_parts = data['date'].dt

    data['year'] = date_parts.year
    data['month'] = date_parts.month
    if hasattr(date_parts, 'isocalendar'):
        # Series.dt.week was removed in pandas 2.0; isocalendar() is its
        # replacement. It yields a nullable UInt32 column, so cast back to a
        # plain numpy dtype (float when NaT produced missing weeks).
        iso_week = date_parts.isocalendar().week
        data['week'] = iso_week.astype('float64' if iso_week.isna().any() else 'int64')
    else:
        # pandas < 1.1 has no isocalendar(); fall back to the old accessor.
        data['week'] = date_parts.week
    data['day'] = date_parts.day
    data['dayofweek'] = date_parts.dayofweek
    return data

In [None]:
# Add the demand, price and calendar features to the merged frame.
data = create_demand_features(data)
data = create_price_features(data)
data = create_time_features(data)

In [None]:
# Downcast the newly created feature columns before training.
data = reduce_mem_usage(data)

We have now cleaned all of our data and are now ready to create a model and predict.

# Predictive model

In [None]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error

In [None]:
# Columns fed to the model in run_lgb: encoded ids, calendar parts, events,
# SNAP flags, price, and the engineered demand/price statistics.
features = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'year', 'month', 'week', 'day', 'dayofweek', 'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2', 
            'snap_CA', 'snap_TX', 'snap_WI', 'sell_price', 'lag', 'demand_mean_7d', 'demand_std_7d', 'demand_mean_30d', 'demand_std_30d', 'price_change_1d', 'price_std_7d', 'price_std_30d']

In [None]:
# Parameter dictionary passed to lgb.train in run_lgb.
# NOTE(review): the objective is poisson while the reported metric is rmse —
# confirm this pairing is intended.
params = {
        "objective" : "poisson",
        "metric" :"rmse",
        "learning_rate" : 0.1,
        'num_iterations' : 200,
        'num_leaves': 128,
}

We have chosen to use LightGBM as our prediction model. The reason for this choice is memory consumption. LightGBM is a gradient boosting framework that uses tree-based learning algorithms. The model needs little memory to run, which is what we were looking for, since our Kaggle kernel kept restarting.
LightGBM's documentation states that it is a framework designed for use on larger datasets.

In [None]:
def run_lgb(data):
    """Train LightGBM, report a hold-out RMSE and predict the forecast rows.

    Rows dated on or before 2016-03-27 are used for training. The rows of the
    'train' part dated after that cut-off (real, observed demand) are held out
    for evaluation; previously the 'test' rows were used here, whose demand is
    only the placeholder taken from the submission template. The trained model
    then predicts the 'test' rows.

    Uses the module-level ``features`` (column list) and ``params`` (LightGBM
    parameters).

    Args:
        data: Feature DataFrame with ``date``, ``part``, ``demand`` and every
            column named in ``features``.

    Returns:
        A copy of the 'test' rows whose ``demand`` column holds the model's
        predictions.
    """
    cutoff_date = '2016-03-27'

    x_train = data[data['date'] <= cutoff_date]
    y_train = x_train['demand']
    # Hold-out: observed rows after the cut-off, never the forecast rows.
    x_valid = data[(data['date'] > cutoff_date) & (data['part'] == 'train')]
    y_valid = x_valid['demand']
    # Copy so the predictions are written to an independent frame, not a slice.
    prediction_set = data[data['part'] == 'test'].copy()

    has_holdout = len(x_valid) > 0
    train_set = lgb.Dataset(x_train[features], y_train)
    valid_sets = [train_set]
    if has_holdout:
        valid_sets.append(lgb.Dataset(x_valid[features], y_valid))

    del x_train, y_train

    if hasattr(lgb, 'log_evaluation'):
        # LightGBM >= 3.3: periodic logging is configured through a callback
        # (the verbose_eval argument was removed in 4.0).
        model = lgb.train(params, train_set, valid_sets=valid_sets,
                          callbacks=[lgb.log_evaluation(period=20)])
    else:
        model = lgb.train(params, train_set, valid_sets=valid_sets, verbose_eval=20)

    if has_holdout:
        test_pred = model.predict(x_valid[features])
        test_score = np.sqrt(mean_squared_error(y_valid, test_pred))
        print(f'the test root-mean-square error score is {test_score}')

    prediction_set['demand'] = model.predict(prediction_set[features])
    return prediction_set

In [None]:
# Train the model and obtain the forecast rows with predicted demand.
prediction_set = run_lgb(data)

In this final section of the report we create the final submission file and write it out as a CSV file for submission.

In [None]:
def predict(prediction_set, submission):
    """Arrange the predictions in the layout of the submission file.

    The long predictions are pivoted to one row per ``id`` with one column per
    date, and those 28 columns are renamed F1 .. F28. The result is joined to
    the ids of ``submission``; the 'evaluation' rows of ``submission`` are
    appended unchanged.

    Args:
        prediction_set: DataFrame with ``id``, ``date`` and ``demand``.
        submission: Submission template with an ``id`` column and F-columns.

    Returns:
        DataFrame with the predicted rows followed by the evaluation rows.
    """
    horizon_columns = ['F' + str(step) for step in range(1, 29)]

    # Long -> wide: one row per id, one column per forecast date.
    long_predictions = prediction_set[['id', 'date', 'demand']]
    wide_predictions = long_predictions.pivot(
        index='id', columns='date', values='demand'
    ).reset_index()
    wide_predictions.columns = ['id'] + horizon_columns

    # Template rows of the evaluation phase are passed through as they are.
    submission_ids = submission['id']
    evaluation_ids = [row_id for row_id in submission_ids if 'evaluation' in row_id]
    evaluation_part = submission[submission_ids.isin(evaluation_ids)]

    # Inner join keeps the template's row order for the predicted ids.
    validation_part = submission[['id']].merge(wide_predictions, on='id')
    return pd.concat([validation_part, evaluation_part])

In [None]:
# Arrange the predictions in the submission layout.
final = predict(prediction_set,sample_submission) 

In [None]:
# Write the submission to disk without the DataFrame index.
final.to_csv('Submission_2.csv',index=False)