# Objective

* Make a baseline model that predicts demand for the validation period (28 days).
* This competition has 2 stages, so the main objective is to make a model that can predict the demand for the next 28 days

In [None]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import dask.dataframe as dd
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
import dask_xgboost as xgb
import dask.dataframe as dd
from sklearn.cluster import KMeans
from sklearn import preprocessing, metrics
from sklearn.preprocessing import MinMaxScaler
import gc
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of `df` to the smallest dtype that holds their range.

    Integer columns are tried against int8, int16, int32 and int64; all other
    numeric columns against float16 and float32, falling back to float64.
    Columns whose dtype is not listed in `numerics` are left untouched.

    Args:
        df: DataFrame to shrink. Its columns are reassigned in place.
        verbose: When true, print the resulting size and the percentage saved.

    Returns:
        The same DataFrame object, with downcast columns.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Bounds are inclusive: a column spanning exactly -128..127 fits int8.
            # If min/max are NaN (empty column) no candidate matches and the
            # column keeps its dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max is NaN/inf.
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a frame that reports zero bytes.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df


# Functions to read the data and merge it (some columns are ignored because this is a very fast baseline model)


def read_data():
    """Load the four M5 CSV files from the Kaggle input directory.

    The calendar and sell-price tables are passed through reduce_mem_usage();
    the sales table and the sample submission are returned exactly as read.

    Returns:
        Tuple (calendar, sell_prices, sales_train_validation, submission).
    """
    print('Reading files...')

    calendar = reduce_mem_usage(pd.read_csv('/kaggle/input/m5-forecasting-accuracy/calendar.csv'))
    n_rows, n_cols = calendar.shape
    print('Calendar has {} rows and {} columns'.format(n_rows, n_cols))

    sell_prices = reduce_mem_usage(pd.read_csv('/kaggle/input/m5-forecasting-accuracy/sell_prices.csv'))
    n_rows, n_cols = sell_prices.shape
    print('Sell prices has {} rows and {} columns'.format(n_rows, n_cols))

    # The wide sales table is not downcast here.
    sales_train_validation = pd.read_csv('/kaggle/input/m5-forecasting-accuracy/sales_train_validation.csv')
    n_rows, n_cols = sales_train_validation.shape
    print('Sales train validation has {} rows and {} columns'.format(n_rows, n_cols))

    submission = pd.read_csv('/kaggle/input/m5-forecasting-accuracy/sample_submission.csv')

    return calendar, sell_prices, sales_train_validation, submission


def melt_and_merge(calendar, sell_prices, sales_train_validation, submission, nrows = 55000000, merge = False):
    """Build one long (row per id and day) table from the sales and submission frames.

    The wide sales table is melted, the submission rows are split into a
    'validation' part (days d_1914..d_1941) and an 'evaluation' part
    (days d_1942..d_1969), both are melted the same way, and everything is
    stacked with a 'part' column ('train' / 'test1' / 'test2'). The 'test2'
    rows are dropped again before returning.

    Args:
        calendar: Calendar table; only read when `merge` is true. The caller's
            frame is not modified.
        sell_prices: Price table, left-joined on store_id, item_id, wm_yr_wk
            when `merge` is true.
        sales_train_validation: Wide sales table (id columns plus one column per day).
        submission: Sample submission with an 'id' column and 28 forecast columns.
        nrows: Rows of the stacked table whose index label is below this value
            are discarded (`data.loc[nrows:]`), to keep the sample small.
        merge: When true, join calendar and price columns and drop 'd'/'day'.

    Returns:
        The long DataFrame described above.
    """
    id_columns = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']

    # melt sales data, get it ready for training
    sales_train_validation = pd.melt(sales_train_validation, id_vars = id_columns, var_name = 'day', value_name = 'demand')
    print('Melted sales train validation has {} rows and {} columns'.format(sales_train_validation.shape[0], sales_train_validation.shape[1]))
    sales_train_validation = reduce_mem_usage(sales_train_validation)

    # separate test dataframes; copies, so the edits below never write into a
    # filtered view of the caller's `submission`
    test1 = submission[submission['id'].str.contains('validation', regex = False, na = False)].copy()
    test2 = submission[submission['id'].str.contains('evaluation', regex = False, na = False)].copy()

    # change column names: the 28 forecast columns become day labels
    test1.columns = ['id'] + [f'd_{day}' for day in range(1914, 1942)]
    test2.columns = ['id'] + [f'd_{day}' for day in range(1942, 1970)]

    # get product table
    product = sales_train_validation[id_columns].drop_duplicates()

    # merge with product table; evaluation ids are temporarily renamed so they
    # match the '_validation' ids found in the product table
    test2['id'] = test2['id'].str.replace('_evaluation', '_validation', regex = False)
    test1 = test1.merge(product, how = 'inner', on = 'id')
    test2 = test2.merge(product, how = 'inner', on = 'id')
    test2['id'] = test2['id'].str.replace('_validation', '_evaluation', regex = False)

    # melt the test frames into the same long layout as the training data
    test1 = pd.melt(test1, id_vars = id_columns, var_name = 'day', value_name = 'demand')
    test2 = pd.melt(test2, id_vars = id_columns, var_name = 'day', value_name = 'demand')

    sales_train_validation['part'] = 'train'
    test1['part'] = 'test1'
    test2['part'] = 'test2'

    data = pd.concat([sales_train_validation, test1, test2], axis = 0, ignore_index=True)

    del sales_train_validation, test1, test2

    # get only a sample for fast training
    data = data.loc[nrows:]

    # drop some calendar features; non-inplace so the caller's calendar keeps
    # its columns and this function can be called more than once
    calendar = calendar.drop(['weekday', 'wday', 'month', 'year'], axis = 1)

    # delete test2 for now
    data = data[data['part'] != 'test2']

    if merge:
        # notebook crashes with the entire dataset (maybe use tensorflow, dask, pyspark)
        data = pd.merge(data, calendar, how = 'inner', left_on = ['day'], right_on = ['d'])
        data.drop(['d', 'day'], inplace = True, axis = 1)
        # get the sell price data (this feature should be very important)
        data = data.merge(sell_prices, on = ['store_id', 'item_id', 'wm_yr_wk'], how = 'left')
        print('Our final dataset to train has {} rows and {} columns'.format(data.shape[0], data.shape[1]))

    gc.collect()

    return data
        
calendar, sell_prices, sales_train_validation, submission = read_data()


Reducing sales and sell_prices down to one store, in this case CA_1

In [None]:
one_store_sales = sales_train_validation.loc[sales_train_validation['store_id'] == 'CA_1']
one_store_sell_prices =  sell_prices.loc[sell_prices['store_id'] =='CA_1']

Now we melt and merge so that only data for the one store is included

In [None]:
data = melt_and_merge(calendar, one_store_sell_prices, one_store_sales, submission,nrows=50000, merge = True)

In [None]:
data.head()

In [None]:
def run_lgb(data):
    """Fit a LightGBM regressor on `data` and return the test rows with predictions.

    Rows are split on the 'date' column: up to 2016-03-27 for training, the
    following period up to 2016-04-24 for validation, and everything later
    as the test set. The model inputs are the columns named in the
    module-level `features` list.

    Args:
        data: Feature table containing 'date', 'demand' and every column in `features`.

    Returns:
        The test rows, with 'demand' overwritten by the model's predictions.
    """
    # going to evaluate with the last 28 days
    train_end = '2016-03-27'
    val_end = '2016-04-24'
    dates = data['date']

    x_train = data[dates <= train_end]
    y_train = x_train['demand']
    x_val = data[(dates > train_end) & (dates <= val_end)]
    y_val = x_val['demand']
    test = data[(dates > val_end)]

    # release the local references before training
    del data, dates
    gc.collect()

    # define random hyperparameters
    params = dict(
        boosting_type='gbdt',
        metric='rmse',
        objective='regression',
        n_jobs=-1,
        seed=236,
        learning_rate=0.1,
        bagging_fraction=0.75,
        bagging_freq=10,
        colsample_bytree=0.75,
    )

    train_set = lgb.Dataset(x_train[features], y_train)
    val_set = lgb.Dataset(x_val[features], y_val)

    del x_train, y_train

    # NOTE(review): the early_stopping_rounds / verbose_eval keyword arguments
    # are not accepted by every LightGBM release - confirm the installed version.
    model = lgb.train(
        params,
        train_set,
        num_boost_round = 2500,
        early_stopping_rounds = 50,
        valid_sets = [train_set, val_set],
        verbose_eval = 100,
    )

    val_pred = model.predict(x_val[features])
    val_score = np.sqrt(metrics.mean_squared_error(val_pred, y_val))
    print(f'Our val rmse score is {val_score}')

    test['demand'] = model.predict(test[features])
    return test

def predict(test, submission):
    """Write 'submission.csv' from the predicted test rows.

    The long predictions are pivoted to one row per id with columns
    F1..F28, joined onto the ids present in `submission`, and stacked on top
    of the rows of `submission` whose id contains 'evaluation' (copied
    unchanged).

    Args:
        test: Frame with 'id', 'date' and the predicted 'demand'.
        submission: Sample submission frame ('id' plus F1..F28).
    """
    wide = (
        test[['id', 'date', 'demand']]
        .pivot(index = 'id', columns = 'date', values = 'demand')
        .reset_index()
    )
    # one forecast column per date, renamed to the submission layout
    wide.columns = ['id'] + ['F' + str(i + 1) for i in range(28)]

    is_evaluation = submission['id'].map(lambda row: 'evaluation' in row)
    evaluation = submission[is_evaluation]

    validation = submission[['id']].merge(wide, on = 'id')
    pd.concat([validation, evaluation]).to_csv('submission.csv', index = False)
    

# Columns fed to the LightGBM model; run_lgb() reads this module-level list.
# The lag_*/rolling_*/price_change_* columns and year/month/week/day/dayofweek
# are created by simple_fe(). The remaining names presumably come from the
# merged sales/calendar/price table - verify against melt_and_merge().
features = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'year', 'month', 'week', 'day', 'dayofweek', 'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2', 
            'snap_CA', 'snap_TX', 'snap_WI', 'sell_price', 'lag_t28', 'lag_t29', 'lag_t30', 'rolling_mean_t7', 'rolling_std_t7', 'rolling_mean_t30', 'rolling_mean_t90', 
            'rolling_mean_t180', 'rolling_std_t30', 'price_change_t1', 'price_change_t365', 'rolling_price_std_t7', 'rolling_price_std_t30', 'rolling_skew_t30', 'rolling_kurt_t30']


def transform_train_and_eval(data):
    """Run the whole pipeline on `data`: encode, add features, train, write the CSV.

    Chains transform() -> simple_fe() -> reduce_mem_usage() -> run_lgb() and
    hands the predicted test rows to predict() together with the
    module-level `submission` frame. Nothing is returned; the result is the
    file written by predict().
    """
    engineered = simple_fe(transform(data))
    # shrink the freshly added feature columns so training fits in memory
    compact = reduce_mem_usage(engineered)
    predict(run_lgb(compact), submission)
    
transform_train_and_eval(data)

* We have the data to build our first model, so let's build a baseline and predict the validation data (in our case, test1).

In [None]:
def transform(data):
    """Fill missing event values and label-encode the categorical columns.

    Args:
        data: Frame containing the event and id columns listed below. Its
            columns are reassigned in place.

    Returns:
        The same frame, with the event columns' missing values replaced by
        'unknown' and every column in `cat` replaced by integer codes from a
        fresh LabelEncoder fitted on that column.
    """
    nan_features = ['event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
    for feature in nan_features:
        # Assign the result back: fillna(inplace=True) on a column selection is
        # a chained update that is not guaranteed to reach `data` (it is a
        # no-op under pandas copy-on-write).
        data[feature] = data[feature].fillna('unknown')

    cat = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
    for feature in cat:
        encoder = preprocessing.LabelEncoder()
        data[feature] = encoder.fit_transform(data[feature])

    return data

def simple_fe(data):
    """Add lagged-demand, price and calendar features to `data` and return it.

    All demand and price features are computed per 'id' with shift/rolling,
    i.e. over the row order inside each id (assumes rows are already in
    chronological order within an id - TODO confirm against the caller).

    Args:
        data: Frame with 'id', 'demand', 'sell_price' and 'date' columns. New
            columns are added in place and 'date' is converted to datetime.

    Returns:
        The same frame with the feature columns added.
    """
    demand_by_id = data.groupby(['id'])['demand']
    price_by_id = data.groupby(['id'])['sell_price']

    def shifted_rolling(window, stat):
        # Rolling statistic `stat` over `window` rows of demand shifted by 28 rows.
        return demand_by_id.transform(lambda x: getattr(x.shift(28).rolling(window), stat)())

    # rolling demand features
    data['lag_t28'] = demand_by_id.transform(lambda x: x.shift(28))
    data['lag_t29'] = demand_by_id.transform(lambda x: x.shift(29))
    data['lag_t30'] = demand_by_id.transform(lambda x: x.shift(30))
    data['rolling_mean_t7'] = shifted_rolling(7, 'mean')
    data['rolling_std_t7'] = shifted_rolling(7, 'std')
    data['rolling_mean_t30'] = shifted_rolling(30, 'mean')
    data['rolling_mean_t90'] = shifted_rolling(90, 'mean')
    data['rolling_mean_t180'] = shifted_rolling(180, 'mean')
    data['rolling_std_t30'] = shifted_rolling(30, 'std')
    data['rolling_skew_t30'] = shifted_rolling(30, 'skew')
    data['rolling_kurt_t30'] = shifted_rolling(30, 'kurt')

    # price features; the two helper series stay local instead of being added
    # to `data` and dropped again
    lag_price_t1 = price_by_id.transform(lambda x: x.shift(1))
    data['price_change_t1'] = (lag_price_t1 - data['sell_price']) / lag_price_t1
    rolling_price_max_t365 = price_by_id.transform(lambda x: x.shift(1).rolling(365).max())
    data['price_change_t365'] = (rolling_price_max_t365 - data['sell_price']) / rolling_price_max_t365
    data['rolling_price_std_t7'] = price_by_id.transform(lambda x: x.rolling(7).std())
    data['rolling_price_std_t30'] = price_by_id.transform(lambda x: x.rolling(30).std())

    # time features
    data['date'] = pd.to_datetime(data['date'])
    date_parts = data['date'].dt
    data['year'] = date_parts.year
    data['month'] = date_parts.month
    if hasattr(date_parts, 'isocalendar'):
        # Series.dt.week no longer exists in recent pandas; isocalendar().week
        # yields the same ISO week number. Cast away the nullable dtype so the
        # column is a plain numeric one (float only if dates are missing).
        iso_week = date_parts.isocalendar().week
        data['week'] = iso_week.astype('float64' if iso_week.isna().any() else 'int64')
    else:
        # older pandas without isocalendar()
        data['week'] = date_parts.week
    data['day'] = date_parts.day
    data['dayofweek'] = date_parts.dayofweek

    return data


We then run the data through the transform and simple_fe functions

In [None]:
data = transform(data)
data = simple_fe(data)

In [None]:
data.head()

Next, a new dataset was created containing just the unique item ids, department ids, and category ids. This prepares a dataset for unsupervised clustering.

In [None]:
unique_keys = data[['item_id', 'dept_id', 'cat_id']].drop_duplicates() 


In [None]:
unique_keys.sort_values(by='item_id')

Below I summed the daily demand of each item within each week, and then averaged those weekly totals to get each item's average weekly demand.

In [None]:
# Total demand per (item, week). 'demand' is selected before aggregating so the
# sum is not also computed over every other column (ids, dates, features).
total_demand_by_week = data.groupby(['item_id','wm_yr_wk'])['demand'].sum()
total_demand_by_week.head(10)


In [None]:
avg_demand_by_week = total_demand_by_week.groupby(['item_id']).mean()
avg_demand_by_week.head(10)

In [None]:
unique_keys = unique_keys.merge(avg_demand_by_week, on = 'item_id')
unique_keys.head(10)

Now I found the median sell price per item id 

In [None]:
# NOTE: this binds a second name to the same frame; no copy is made.
data_copy = data
# Median sell price of every item, as a Series indexed by item_id.
median_price_by_item = data_copy.groupby(['item_id'])['sell_price'].median()
median_price_by_item.head(10)

In [None]:
unique_keys = unique_keys.merge(median_price_by_item, on = 'item_id')

In [None]:
unique_keys.sort_values(by = 'item_id').head(10)

I then normalized both demand and sell price

In [None]:
min_max_scaler = MinMaxScaler()
unique_keys[['demand', 'sell_price']] = min_max_scaler.fit_transform(unique_keys[['demand', 'sell_price']])

In [None]:
unique_keys.sort_values(by = 'item_id').head(10)

In these following steps dummy variables were created for the department and category ids

In [None]:
unique_keys_new = pd.concat([unique_keys,pd.get_dummies(unique_keys['dept_id'], prefix = 'dept_id', dummy_na = False)], axis=1).drop(['dept_id'], axis = 1)
unique_keys_new.head(10)

In [None]:
unique_keys_new2 = pd.concat([unique_keys_new,pd.get_dummies(unique_keys_new['cat_id'], prefix = 'cat_id', dummy_na = False)], 
                             axis=1).drop(['cat_id'], axis = 1)
unique_keys_new2.head(10)

All columns besides item_id are retained for the clustering

In [None]:
cluster_input = unique_keys_new2.loc[:, unique_keys_new2.columns!= 'item_id']
cluster_input.shape

This function creates an SSE plot

In [None]:
def sse_generation(input_data, max_k=10, random_state=None):
    """Plot the k-means SSE (inertia) for k = 1..max_k.

    Args:
        input_data: Feature matrix to cluster (one row per sample).
        max_k: Largest number of clusters to try. Capped at the number of
            rows, because KMeans cannot fit more clusters than samples.
        random_state: Optional seed forwarded to KMeans so the curve can be
            reproduced; None keeps KMeans' default unseeded behaviour.
    """
    max_k = min(max_k, len(input_data))
    cluster_counts = range(1, max_k + 1)
    sse = [
        KMeans(n_clusters=k, random_state=random_state).fit(input_data).inertia_
        for k in cluster_counts
    ]
    plt.plot(cluster_counts, sse)
    plt.xlabel("Number of Clusters")
    plt.ylabel("SSE")
    plt.grid()
    plt.show()

In [None]:
sse_generation(cluster_input)

I found that 7 clusters would be appropriate per the SSE graph

In [None]:
num_k = 7
# Fixed seed: the per-cluster cells further down hard-code cluster ids 0..6,
# so the assignment of items to ids must not change between runs.
kmeans = KMeans(n_clusters = num_k, random_state = 236)
kmeans.fit(cluster_input)
unique_keys_new2['cluster_id'] = kmeans.labels_
unique_keys_new2.sort_values(by = 'item_id').head(20)


A new dataset was created that only contained the item id and cluster id. That way we could remove all the other features used for clustering.

In [None]:
data_with_clusters = data
unique_keys_clusters = unique_keys_new2[['item_id', 'cluster_id']]
unique_keys_clusters.head(10)

I then merged the dataset with a copy of the original dataset. 

In [None]:
data_with_clusters = data_with_clusters.merge(unique_keys_clusters, on ='item_id', how = 'inner')
data_with_clusters.head(10)

Then new dataframes were created with the data for each cluster

In [None]:
# One frame per cluster id 0..6, each holding the rows of data_with_clusters
# whose cluster_id equals that id.
cluster_0, cluster_1, cluster_2, cluster_3, cluster_4, cluster_5, cluster_6 = [
    data_with_clusters.loc[data_with_clusters["cluster_id"] == cluster_number]
    for cluster_number in range(7)
]

In [None]:
cluster_6.head(10)

In [None]:
# Train one model per cluster. transform_train_and_eval() writes its result to
# 'submission.csv', so the file is read back after every run, before the next
# run overwrites it.
cluster_results = []
for cluster_data in (cluster_0, cluster_1, cluster_2, cluster_3, cluster_4, cluster_5, cluster_6):
    transform_train_and_eval(cluster_data)
    cluster_submission = pd.read_csv('submission.csv')
    # Keep only the predicted validation rows. Selecting them by id instead of
    # by F1 != 0 still discards the unchanged evaluation rows, but no longer
    # loses a validation row whose first-day forecast happens to be exactly 0.
    is_validation = cluster_submission['id'].str.contains('validation', regex = False)
    cluster_results.append(cluster_submission[is_validation])

(cluster_0_results, cluster_1_results, cluster_2_results, cluster_3_results,
 cluster_4_results, cluster_5_results, cluster_6_results) = cluster_results

In [None]:
concatted_df = pd.concat([cluster_0_results, cluster_1_results, cluster_2_results, cluster_3_results, cluster_4_results,cluster_5_results, cluster_6_results])

In [None]:
# index = False, as in predict(): otherwise the row index is written as an
# extra unnamed first column of the submission file.
concatted_df.to_csv('submission.csv', index = False)