# Objective

* Build a baseline model that predicts demand for the validation period (28 days).
* This competition has 2 stages, so the main objective is a model that can predict the demand for the next 28 days.

In [None]:
!pip install kneed

In [None]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import dask.dataframe as dd
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
import dask_xgboost as xgb
import dask.dataframe as dd
from sklearn import preprocessing, metrics
from kneed import KneeLocator
import gc
import os
# Print the full path of every file found under /kaggle/input.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that fits their value range.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame whose int16/int32/int64/float16/float32/float64 columns
            are candidates for downcasting; other columns are left untouched.
        verbose: When True, print the final memory footprint and the
            percentage saved.

    Returns:
        The same ``df`` object with downcast columns.

    Note:
        Float columns may be narrowed to float16, which loses precision.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Bounds are inclusive: a column whose max is exactly 127 fits int8.
            # A column with no comparable min/max (e.g. empty) is left as it is.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max is NaN: keep full precision.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-sized starting frame.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df


# Functions to read the data and merge it (some columns are ignored; this is a very fast baseline model)


def read_data():
    """Load the four M5 competition CSV files from the Kaggle input directory.

    The calendar and sell-price tables are passed through ``reduce_mem_usage``;
    the sales and sample-submission tables are returned as read. The shape of
    each table is printed after it is loaded.

    Returns:
        Tuple ``(calendar, sell_prices, sales_train_validation, submission)``.
    """
    print('Reading files...')

    calendar = reduce_mem_usage(pd.read_csv('/kaggle/input/m5-forecasting-accuracy/calendar.csv'))
    n_rows, n_cols = calendar.shape[0], calendar.shape[1]
    print('Calendar has {} rows and {} columns'.format(n_rows, n_cols))

    sell_prices = reduce_mem_usage(pd.read_csv('/kaggle/input/m5-forecasting-accuracy/sell_prices.csv'))
    n_rows, n_cols = sell_prices.shape[0], sell_prices.shape[1]
    print('Sell prices has {} rows and {} columns'.format(n_rows, n_cols))

    sales_train_validation = pd.read_csv('/kaggle/input/m5-forecasting-accuracy/sales_train_validation.csv')
    n_rows, n_cols = sales_train_validation.shape[0], sales_train_validation.shape[1]
    print('Sales train validation has {} rows and {} columns'.format(n_rows, n_cols))

    submission = pd.read_csv('/kaggle/input/m5-forecasting-accuracy/sample_submission.csv')
    n_rows, n_cols = submission.shape[0], submission.shape[1]
    print('Submission has {} rows and {} columns'.format(n_rows, n_cols))

    return calendar, sell_prices, sales_train_validation, submission


def melt_and_merge(calendar, sell_prices, sales_train_validation, submission, nrows = 55000000, merge = False):
    """Reshape sales and submission tables to long format and optionally join calendar and prices.

    None of the input frames is modified.

    Args:
        calendar: Calendar table; must contain ``d`` and ``wm_yr_wk`` plus the
            ``weekday``, ``wday``, ``month`` and ``year`` columns, which are
            dropped before merging.
        sell_prices: Price table keyed by ``store_id``, ``item_id``, ``wm_yr_wk``.
        sales_train_validation: Wide sales table (one column per day).
        submission: Sample submission with an ``id`` column and 28 forecast columns.
        nrows: Rows of the concatenated long table whose index label is below
            this value are discarded, to keep training fast.
        merge: When True, left-join the calendar and the sell prices.

    Returns:
        Long-format DataFrame holding the ``train`` and ``test1`` parts
        (``test2`` rows are removed).
    """
    id_columns = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']

    # melt sales data, get it ready for training
    sales_train_validation = pd.melt(sales_train_validation, id_vars = id_columns, var_name = 'day', value_name = 'demand')
    print('Melted sales train validation has {} rows and {} columns'.format(sales_train_validation.shape[0], sales_train_validation.shape[1]))
    sales_train_validation = reduce_mem_usage(sales_train_validation)

    # separate test dataframes; .copy() so the renames below never touch `submission`
    submission_ids = submission['id']
    test1 = submission[submission_ids.isin([row for row in submission_ids if 'validation' in row])].copy()
    test2 = submission[submission_ids.isin([row for row in submission_ids if 'evaluation' in row])].copy()

    # rename F1..F28 to the day labels they stand for: d_1914..d_1941 and d_1942..d_1969
    test1.columns = ['id'] + ['d_{}'.format(day) for day in range(1914, 1942)]
    test2.columns = ['id'] + ['d_{}'.format(day) for day in range(1942, 1970)]

    # get product table
    product = sales_train_validation[id_columns].drop_duplicates()

    # merge with product table; evaluation ids are temporarily renamed so they match the product ids
    test2['id'] = test2['id'].str.replace('_evaluation', '_validation', regex = False)
    test1 = test1.merge(product, how = 'inner', on = 'id')
    test2 = test2.merge(product, how = 'inner', on = 'id')
    test2['id'] = test2['id'].str.replace('_validation', '_evaluation', regex = False)

    # bring the test frames into the same long layout as the training data
    test1 = pd.melt(test1, id_vars = id_columns, var_name = 'day', value_name = 'demand')
    test2 = pd.melt(test2, id_vars = id_columns, var_name = 'day', value_name = 'demand')

    sales_train_validation['part'] = 'train'
    test1['part'] = 'test1'
    test2['part'] = 'test2'

    data = pd.concat([sales_train_validation, test1, test2], axis = 0, ignore_index=True)

    del sales_train_validation, test1, test2

    # get only a sample for fast training
    data = data.loc[nrows:]

    # drop some calendar features on a new frame, so the caller's calendar
    # stays intact and the function can be called again with it
    calendar = calendar.drop(['weekday', 'wday', 'month', 'year'], axis = 1)

    # delete test2 for now
    data = data[data['part'] != 'test2']

    if merge:
        # the notebook crashes with the entire dataset, hence the nrows sampling above
        data = pd.merge(data, calendar, how = 'left', left_on = ['day'], right_on = ['d'])
        data = data.drop(['d', 'day'], axis = 1)
        # get the sell price data (this feature should be very important)
        data = data.merge(sell_prices, on = ['store_id', 'item_id', 'wm_yr_wk'], how = 'left')
        print('Our final dataset to train has {} rows and {} columns'.format(data.shape[0], data.shape[1]))

    gc.collect()

    return data
  
# Load the four raw tables once; the cells below filter them per store.
calendar, sell_prices, sales_train_validation, submission = read_data()

# sales_train_validation.head()
# data = melt_and_merge(calendar, sell_prices, sales_train_validation, submission, nrows = 27500000, merge = True)

* We now have the data for our first model. Let's build a baseline and predict the validation period (in our case, `test1`).

In [None]:
def transform(data):
    """Fill missing event fields and label-encode the categorical columns of ``data``.

    The frame is modified in place and also returned.

    Args:
        data: DataFrame containing the event columns and the id/category
            columns listed below.

    Returns:
        The same ``data`` object with integer-encoded categorical columns.
    """
    nan_features = ['event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
    for feature in nan_features:
        # Assign the filled column back instead of calling fillna(inplace=True)
        # on the selection: the chained in-place form acts on a temporary
        # object and is not guaranteed to reach ``data``.
        data[feature] = data[feature].fillna('unknown')

    cat = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
    for feature in cat:
        # A fresh encoder per column; the fitted encoders are not kept.
        encoder = preprocessing.LabelEncoder()
        data[feature] = encoder.fit_transform(data[feature])

    return data

def simple_fe(data):
    """Add lag, rolling-window, price-change and calendar features to ``data``.

    The frame is modified in place and also returned. Demand statistics are
    computed per ``id`` on demand shifted by 28 rows, so every demand feature
    only uses values at least 28 rows in the past.

    Args:
        data: Long-format DataFrame with ``id``, ``demand``, ``sell_price``
            and ``date`` columns. Rows are presumably ordered by date within
            each ``id`` (the shifts rely on row order) -- verify at the caller.

    Returns:
        The same ``data`` object with the feature columns added.
    """
    def by_id(column):
        # One series group per ``id`` for the requested column.
        return data.groupby(['id'])[column]

    # lagged demand
    for lag in (28, 29, 30):
        data['lag_t{}'.format(lag)] = by_id('demand').transform(lambda x, lag=lag: x.shift(lag))

    # rolling demand features: (column name, window length, statistic)
    rolling_specs = [
        ('rolling_mean_t7', 7, 'mean'),
        ('rolling_std_t7', 7, 'std'),
        ('rolling_mean_t30', 30, 'mean'),
        ('rolling_mean_t90', 90, 'mean'),
        ('rolling_mean_t180', 180, 'mean'),
        ('rolling_std_t30', 30, 'std'),
        ('rolling_skew_t30', 30, 'skew'),
        ('rolling_kurt_t30', 30, 'kurt'),
    ]
    for name, window, stat in rolling_specs:
        data[name] = by_id('demand').transform(
            lambda x, window=window, stat=stat: getattr(x.shift(28).rolling(window), stat)())

    # price features
    data['lag_price_t1'] = by_id('sell_price').transform(lambda x: x.shift(1))
    data['price_change_t1'] = (data['lag_price_t1'] - data['sell_price']) / (data['lag_price_t1'])
    data['rolling_price_max_t365'] = by_id('sell_price').transform(lambda x: x.shift(1).rolling(365).max())
    data['price_change_t365'] = (data['rolling_price_max_t365'] - data['sell_price']) / (data['rolling_price_max_t365'])
    data['rolling_price_std_t7'] = by_id('sell_price').transform(lambda x: x.rolling(7).std())
    data['rolling_price_std_t30'] = by_id('sell_price').transform(lambda x: x.rolling(30).std())
    # the two helper columns were only needed to derive the price changes
    data.drop(['rolling_price_max_t365', 'lag_price_t1'], inplace = True, axis = 1)

    # time features
    data['date'] = pd.to_datetime(data['date'])
    dates = data['date'].dt
    data['year'] = dates.year
    data['month'] = dates.month
    if hasattr(dates, 'isocalendar'):
        # ``.dt.week`` no longer exists in current pandas; isocalendar() gives
        # the same ISO week number but as a nullable integer, so convert it
        # back to a plain numeric column (float only when dates are missing).
        iso_week = dates.isocalendar().week
        data['week'] = iso_week.astype('float64') if iso_week.isna().any() else iso_week.astype('int64')
    else:
        # older pandas without isocalendar()
        data['week'] = dates.week
    data['day'] = dates.day
    data['dayofweek'] = dates.dayofweek

    return data

def run_lgb(data):
    """Train a LightGBM regressor on ``data`` and return the forecast rows with predicted demand.

    Rows are split on ``date``: up to 2016-03-27 for training, after
    2016-03-27 through 2016-04-24 for validation and early stopping, and
    everything after 2016-04-24 is scored. The model input columns come from
    the module-level ``features`` list.

    Args:
        data: Feature-engineered DataFrame with ``date``, ``demand`` and every
            column named in ``features``.

    Returns:
        A copy of the rows dated after 2016-04-24 with ``demand`` replaced by
        the model predictions.
    """
    # the last 28 labelled days are held out for validation
    x_train = data[data['date'] <= '2016-03-27']
    y_train = x_train['demand']
    x_val = data[(data['date'] > '2016-03-27') & (data['date'] <= '2016-04-24')]
    y_val = x_val['demand']
    # explicit copy: predictions are written into this frame below, and it
    # must not be a slice of ``data``
    test = data[(data['date'] > '2016-04-24')].copy()
    del data
    gc.collect()

    # hyperparameters were picked by hand, not tuned
    params = {
        'boosting_type': 'gbdt',
        'metric': 'rmse',
        'objective': 'regression',
        'n_jobs': -1,
        'seed': 236,
        'learning_rate': 0.1,
        'bagging_fraction': 0.75,
        'bagging_freq': 10,
        'colsample_bytree': 0.75}
    train_set = lgb.Dataset(x_train[features], y_train)
    val_set = lgb.Dataset(x_val[features], y_val)

    del x_train, y_train

    # NOTE(review): early_stopping_rounds / verbose_eval are keyword arguments
    # of older LightGBM releases; newer releases expect callbacks -- confirm
    # against the installed version.
    model = lgb.train(params, train_set, num_boost_round = 2500, early_stopping_rounds = 50, valid_sets = [train_set, val_set], verbose_eval = 100)
    val_pred = model.predict(x_val[features])
    # scikit-learn's argument order is (y_true, y_pred)
    val_score = np.sqrt(metrics.mean_squared_error(y_val, val_pred))
    print(f'Our val rmse score is {val_score}')
    y_pred = model.predict(test[features])
    test['demand'] = y_pred
    return test

def predict(test, submission, filename):
    """Write a submission CSV that combines predicted validation rows with the untouched evaluation rows.

    Args:
        test: Long-format predictions with ``id``, ``date`` and ``demand``;
            must hold exactly 28 distinct dates.
        submission: Sample submission providing the ``id`` order and the
            evaluation rows.
        filename: Path of the CSV file to write.
    """
    # one row per id, one column per forecast date
    wide = test[['id', 'date', 'demand']].pivot(index = 'id', columns = 'date', values = 'demand')
    wide = wide.reset_index()
    wide.columns = ['id'] + ['F' + str(day) for day in range(1, 29)]

    # evaluation rows are passed through exactly as they are in the sample submission
    evaluation_ids = [sub_id for sub_id in submission['id'] if 'evaluation' in sub_id]
    evaluation_part = submission[submission['id'].isin(evaluation_ids)]

    # the inner join keeps only the ids for which predictions exist
    validation_part = submission[['id']].merge(wide, on = 'id')

    pd.concat([validation_part, evaluation_part]).to_csv(filename, index = False)
    

# Model input columns, grouped by origin. run_lgb selects these columns, in this order.
features = (
    # encoded identifiers
    ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
    # calendar parts derived from the date
    + ['year', 'month', 'week', 'day', 'dayofweek']
    # calendar events
    + ['event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
    # SNAP flags and the current price
    + ['snap_CA', 'snap_TX', 'snap_WI', 'sell_price']
    # lagged and rolling demand
    + ['lag_t28', 'lag_t29', 'lag_t30', 'rolling_mean_t7', 'rolling_std_t7', 'rolling_mean_t30', 'rolling_mean_t90']
    + ['rolling_mean_t180', 'rolling_std_t30']
    # price dynamics
    + ['price_change_t1', 'price_change_t365', 'rolling_price_std_t7', 'rolling_price_std_t30']
    # higher moments of rolling demand
    + ['rolling_skew_t30', 'rolling_kurt_t30']
)


def transform_train_and_eval(data, filename):
    """Run the whole pipeline on ``data``: encode, add features, train, and write the submission file.

    Args:
        data: Merged long-format DataFrame as produced by ``melt_and_merge``.
        filename: Path of the submission CSV to write.

    Uses the module-level ``submission`` frame when writing the output.
    """
    engineered = simple_fe(transform(data))
    # shrink the newly added feature columns so training fits in memory
    engineered = reduce_mem_usage(engineered)
    scored = run_lgb(engineered)
    predict(scored, submission, filename)

# **1. Evaluate CA_1 store data** 

Below code illustrates the results and metrics after applying the lgb model on CA_1 one store data. 

In [None]:
# for all CA_1 store data
# Restrict sales and prices to store CA_1, build the merged long table
# (dropping the first 50000 melted rows), then train and write submission_ca.csv.
store_sales_train_validation = sales_train_validation.loc[sales_train_validation['store_id'] == 'CA_1']
store_sell_prices =  sell_prices.loc[sell_prices['store_id'] == 'CA_1']
data = melt_and_merge(calendar, store_sell_prices, store_sales_train_validation, submission, nrows = 50000, merge = True)
transform_train_and_eval(data,'submission_ca.csv')

# **2. Cluster and Evaluate CA_1 store data** 
#### Cluster the CA_1 store items based on aggregated monthly demand and sell price of each item in the store

#### **2.1 Data Collection**

Load the Walmart dataset with its items and features, and filter it to the records of store CA_1. Both the sales_train_validation and sell_prices datasets are restricted to CA_1 store data. The submission dataset is left intact, with no filtering at the store level. The date column is converted to a datetime type so the data can be resampled by month (see section *2.3* for more details).

In [None]:
# for clustering CA_1 store data by monthly demand 
# The raw tables are read again before the merged CA_1 table is rebuilt.
calendar, sell_prices, sales_train_validation, submission = read_data()
store_sales_train_validation = sales_train_validation.loc[sales_train_validation['store_id'] == 'CA_1']
store_sell_prices =  sell_prices.loc[sell_prices['store_id'] == 'CA_1']
data_ca = melt_and_merge(calendar, store_sell_prices, store_sales_train_validation, submission, nrows = 50000, merge = True)
# datetime dtype is required for the monthly resampling done later
data_ca['date'] = pd.to_datetime(data_ca['date'])

#### **2.2 Data Exploration**

Once all the necessary cleanup steps have been completed, viewing the dataset to understand the features available and their respective values.

In [None]:
# Preview the first rows of the merged CA_1 table.
data_ca.head()

**The below code illustrates the total number of items present in the CA_1 store data**

In [None]:
# Number of distinct item_id values in the CA_1 table.
len(data_ca['item_id'].unique())

#### **2.3 Define functions**

***1. getDataForClusters()*** function is defined to get the average monthly demand and the median sell price for each item. The dataset includes 64 months of historical data and so we expect each item to have value for each of these months. 

This function takes in two input parameters of type DataFrame and List and returns two Lists 

**Parameters:** 
1. ***df***: Dataframe which includes date, demand, item_id, sell_price features at the least.
2. ***items***: List which includes unique items in the df parameter defined above.

**Returns:**
1. ***data_demand***: Includes list of all the average demand aggregated values for each item for all the 64 months.
2. ***data_sellPrice***: Includes list of all the median sell price aggregated values for each item for all the 64 months.

***2. getDataForClustersPlot()*** function is defined to get the average monthly demand for each cluster for all items. 

This function takes in two input parameters of type DataFrame and List and returns a List

**Parameters:** 
1. ***df***: Dataframe which includes date, demand, item_id features at the least.
2. ***clusters***: List which includes unique clusters obtained from k-means clustering.

**Returns:**
1. ***data_demand***: Includes list of all the average demand aggregated values for each cluster for all the 64 months.


In [None]:
# define function to get aggregated data for clustering
def getDataForClusters(df,items):
    """Aggregate monthly mean demand and monthly median sell price for each item.

    Args:
        df: DataFrame with ``item_id``, ``date`` (datetime), ``demand`` and
            ``sell_price`` columns.
        items: Iterable of item ids to aggregate, in output order.

    Returns:
        Tuple ``(data_demand, data_sellPrice)``: two lists with one numpy
        array per item, holding one value per calendar month.
    """
    # An offset object instead of the 'M' string alias, which pandas has deprecated.
    month_end = pd.offsets.MonthEnd()
    data_demand = []
    data_sellPrice = []
    for item in items:
        # Resample once per item and reuse the result for both statistics.
        monthly = df[df['item_id'] == item].resample(month_end, on='date')
        data_demand.append(monthly['demand'].mean().values)
        data_sellPrice.append(monthly['sell_price'].median().values)
    return data_demand, data_sellPrice

# aggregate data with average monthly demand for each cluster
def getDataForClustersPlot(df,clusters):
    """Compute the monthly mean demand over all rows of each cluster.

    Args:
        df: DataFrame with ``cluster``, ``date`` (datetime) and ``demand`` columns.
        clusters: Iterable of cluster labels to aggregate, in output order.

    Returns:
        List with one numpy array per cluster, holding one mean per calendar month.
    """
    # An offset object instead of the 'M' string alias, which pandas has deprecated.
    month_end = pd.offsets.MonthEnd()
    data_demand = []
    for cluster in clusters:
        cluster_rows = df[df['cluster'] == cluster]
        data_demand.append(cluster_rows.resample(month_end, on='date')['demand'].mean().values)
    return data_demand

#### **2.4 Data Aggregation**

After defining the data aggregation function in earlier step, now its time to put these into play and make the dataset ready for clustering. Below are required steps:

1. Get all the distinct items available in CA_1 store using .unique() 
2. Once we identify the unique items, we feed in the dataset and items as arguments and make ***getDataForClusters()*** call.
3. Capture the monthly average demand and median sell price into two separate list variables by unpacking the tuple returned from the above function call.

In [None]:
# Get data for clustering
# One monthly demand array and one monthly price array per distinct item.
total_items = data_ca['item_id'].unique()
data_demand, data_sellPrice = getDataForClusters(data_ca,total_items)

#### **2.5 Derived Feature**

In order to take both demand and sell prices into consideration for grouping items in the CA_1 store - crafted a new derived feature which is the product of average demand and median sell price obtained from the aggregated data above for each month.

**Output**: Is a list of product values of mean demand and median sell price

**Technical details**: Used list comprehensions to do the element wise product after zipping both the mean demand and median sell price lists. Alternatively, the same operation can be done using numpy.

In [None]:
# Element-wise product of each item's monthly mean demand and monthly median price.
data_aggregated = [avgDemand * medianSellPrice for avgDemand, medianSellPrice in zip(data_demand, data_sellPrice)]

#### **2.6 Data for Clustering**

Now that we have everything required for clustering (the items and, for each month, the product of average demand and median sell price), we can load it into a dataframe and inspect the values. Sales data for the 64th month is not available and every value defaults to 0, so this month is explicitly removed to avoid unintended noise during grouping.

In [None]:
# One row per item, one column per month; column 63 (the last month) is dropped
# because it carries no sales data, and remaining gaps are filled with 0.
data_for_clustering = pd.DataFrame(data_aggregated, index=total_items).drop(columns=[63]).fillna(0)
data_for_clustering.head()

#### **2.7 Clustering**

Below are the steps:

1. **Normalization**: Transform all attributes, which may differ by orders of magnitude, onto a similar scale so that they can be compared and contribute equally to the distance computations used for clustering.
2. **K-means**: Use K-means for clustering, as the data contains numerical features.
3. **Plot SSE**: Find the optimal value for K by fitting multiple values of K, capturing the corresponding SSE values, and visualizing the elbow curve of all SSE values obtained.
4. **Find Optimal K**: Since the right value may not be obvious from eyeballing the elbow plot, use KneeLocator and read the optimal K from its *elbow* attribute.
5. **Fit K-means**:  Fit the data with the optimal number of K obtained above and determine the clusters.
6. **Create Cluster Map**: Create a map containing the item id and the cluster it belongs to.

In [None]:
# K Means clustering
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (5,5)

# Scale every month column to [0, 1]. The result gets its own name so it does
# not overwrite the module-level `features` list that run_lgb reads.
scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(data_for_clustering)

# Fit k-means for k = 1..9 and record the SSE (inertia) of each fit.
# A fixed random_state makes the fits and the cluster labels repeatable.
candidate_ks = range(1, 10)
sse = []
for k in candidate_ks:
    kmeans = KMeans(n_clusters=k, random_state=236)
    kmeans.fit(scaled_features)
    sse.append(kmeans.inertia_)

# elbow curve to find the number of clusters
plt.plot(candidate_ks, sse)
plt.xticks(candidate_ks)
plt.xlabel("Number of Clusters")
plt.ylabel("SSE")
plt.show()

# determine number of clusters
knee = KneeLocator(candidate_ks, sse, curve="convex", direction="decreasing")
print("Optimal number of clusters:",knee.elbow)

# Fit the final model. n_clusters stays fixed at 3 because the cells further
# down address clusters 0, 1 and 2 explicitly; compare with knee.elbow above.
kmeans = KMeans(n_clusters=3, random_state=236)
kmeans.fit(scaled_features)

# create a map of items and corresponding cluster it belongs to
clusterMap = pd.DataFrame()
clusterMap['item_id'] = data_for_clustering.index.values
clusterMap['cluster'] = kmeans.labels_

**Viewing the cluster map**

In [None]:
# Preview the item_id -> cluster assignments.
clusterMap.head()

**Join the assigned clusters back to the initial data set based on item_id**

In [None]:
# Attach each row's cluster label via a left join on item_id.
data_with_clusters = data_ca.merge(clusterMap, how='left', on='item_id')
data_with_clusters.head()

#### **2.8 Data for Plots**

Steps below:

1. Get all the distinct clusters for CA_1 store items using .unique() 
2. Once we identify the unique clusters, we feed in the dataset with clusters attribute and clusters as arguments and make ***getDataForClustersPlot()*** call.
3. Capture the monthly average demand by each cluster to a list variable returned from the above function call.
4. We can now feed this data to a dataframe and view the values.
5. The 64th month sales data is not available and every value is defaulted to 0, so explicitly removed this month to avoid some unintended noise in plots


In [None]:
# Monthly mean demand per cluster: one row per cluster label, one column per month.
unique_clusters = clusterMap['cluster'].unique()
avg_monthly_demand = getDataForClustersPlot(data_with_clusters,unique_clusters)
# column 63 (the last month) is dropped, as in the clustering input
df_avg_monthly_demand = pd.DataFrame(avg_monthly_demand,unique_clusters).drop(columns=[63])
df_avg_monthly_demand.head()

#### **2.9 Plot Clusters**

The below code plots average monthly demand for items in each cluster

**X-axis**: Month numbers (0-62): total 63 months

**Y-axis**: Average demand for each month of all items in each cluster

In [None]:
plt.rcParams["figure.figsize"] = (20,5)
# Legend text keyed by cluster id, so each line gets the label of its own
# cluster regardless of the order in which unique_clusters lists them.
cluster_labels = {
    2: 'Slow demand Cluster(2)',
    0: 'Medium demand Cluster(0)',
    1: 'High demand Cluster(1)',
}
for i in unique_clusters:
    plt.plot(df_avg_monthly_demand.columns, df_avg_monthly_demand.loc[i].values, lw=3,
             label=cluster_labels.get(i, 'Cluster({})'.format(i)))
# Ticks and legend only need to be set once, after all lines are drawn.
plt.xticks(np.arange(0, 63, 1))
plt.legend()

#### **2.10 Defining Clusters**

1. **Cluster 1**: **High** demand

The items belonging to this cluster have a higher average demand than those in the other clusters. The average demand for this group ranges from 10 to 25, reaching its peak during months 30 to 33.

2. **Cluster 0**: **Medium** demand

The items belonging to this cluster have a medium average demand compared with the other clusters. The average demand for this group ranges from 2 to 5, reaching its peak during months 30 to 33. The items in this group tend to follow a trend similar to that of the fast-selling items.

3. **Cluster 2**: **Low** demand

The items belonging to this cluster sell much more slowly than those in the other clusters. The average demand for this group ranges from 0 to 1. There is an increasing trend for these items over the years.

In [None]:
# One figure per cluster: (cluster id, legend text, extra line styling).
single_cluster_plots = [
    (1, 'High demand Cluster(1)', {'c': 'green'}),
    (0, 'Medium demand Cluster(0)', {'c': 'orange'}),
    (2, 'Slow demand Cluster(2)', {}),
]
# Plot against the month numbers themselves (the frame's column labels) so
# every point sits on its integer tick; np.linspace(0, 63, 63) produced
# fractional positions that drifted away from the ticks.
months = df_avg_monthly_demand.columns
for cluster_id, legend_text, line_style in single_cluster_plots:
    plt.plot(months, df_avg_monthly_demand.loc[cluster_id].values, lw=3, **line_style)
    plt.xticks(np.arange(0, 63, 1))
    plt.legend([legend_text])
    plt.show()

In [None]:
def transform(df_data):
    """Fill missing event fields and label-encode the categorical columns of ``df_data``.

    The frame is modified in place and also returned.

    Args:
        df_data: DataFrame containing the event columns and the id/category
            columns listed below.

    Returns:
        The same ``df_data`` object with integer-encoded categorical columns.
    """
    nan_features = ['event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
    for feature in nan_features:
        # Assign the filled column back instead of calling fillna(inplace=True)
        # on the selection: the chained in-place form acts on a temporary
        # object and is not guaranteed to reach ``df_data``.
        df_data[feature] = df_data[feature].fillna('unknown')

    cat = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
    for feature in cat:
        # A fresh encoder per column; the fitted encoders are not kept.
        encoder = preprocessing.LabelEncoder()
        df_data[feature] = encoder.fit_transform(df_data[feature])

    return df_data

def simple_fe(df_data):
    """Add lag, rolling-window, price-change and calendar features to ``df_data``.

    The frame is modified in place and also returned. Demand statistics are
    computed per ``id`` on demand shifted by 28 rows, so every demand feature
    only uses values at least 28 rows in the past.

    Args:
        df_data: Long-format DataFrame with ``id``, ``demand``, ``sell_price``
            and ``date`` columns. Rows are presumably ordered by date within
            each ``id`` (the shifts rely on row order) -- verify at the caller.

    Returns:
        The same ``df_data`` object with the feature columns added.
    """
    def by_id(column):
        # One series group per ``id`` for the requested column.
        return df_data.groupby(['id'])[column]

    # lagged demand
    for lag in (28, 29, 30):
        df_data['lag_t{}'.format(lag)] = by_id('demand').transform(lambda x, lag=lag: x.shift(lag))

    # rolling demand features: (column name, window length, statistic)
    rolling_specs = [
        ('rolling_mean_t7', 7, 'mean'),
        ('rolling_std_t7', 7, 'std'),
        ('rolling_mean_t30', 30, 'mean'),
        ('rolling_mean_t90', 90, 'mean'),
        ('rolling_mean_t180', 180, 'mean'),
        ('rolling_std_t30', 30, 'std'),
        ('rolling_skew_t30', 30, 'skew'),
        ('rolling_kurt_t30', 30, 'kurt'),
    ]
    for name, window, stat in rolling_specs:
        df_data[name] = by_id('demand').transform(
            lambda x, window=window, stat=stat: getattr(x.shift(28).rolling(window), stat)())

    # price features
    df_data['lag_price_t1'] = by_id('sell_price').transform(lambda x: x.shift(1))
    df_data['price_change_t1'] = (df_data['lag_price_t1'] - df_data['sell_price']) / (df_data['lag_price_t1'])
    df_data['rolling_price_max_t365'] = by_id('sell_price').transform(lambda x: x.shift(1).rolling(365).max())
    df_data['price_change_t365'] = (df_data['rolling_price_max_t365'] - df_data['sell_price']) / (df_data['rolling_price_max_t365'])
    df_data['rolling_price_std_t7'] = by_id('sell_price').transform(lambda x: x.rolling(7).std())
    df_data['rolling_price_std_t30'] = by_id('sell_price').transform(lambda x: x.rolling(30).std())
    # the two helper columns were only needed to derive the price changes
    df_data.drop(['rolling_price_max_t365', 'lag_price_t1'], inplace = True, axis = 1)

    # time features
    df_data['date'] = pd.to_datetime(df_data['date'])
    dates = df_data['date'].dt
    df_data['year'] = dates.year
    df_data['month'] = dates.month
    if hasattr(dates, 'isocalendar'):
        # ``.dt.week`` no longer exists in current pandas; isocalendar() gives
        # the same ISO week number but as a nullable integer, so convert it
        # back to a plain numeric column (float only when dates are missing).
        iso_week = dates.isocalendar().week
        df_data['week'] = iso_week.astype('float64') if iso_week.isna().any() else iso_week.astype('int64')
    else:
        # older pandas without isocalendar()
        df_data['week'] = dates.week
    df_data['day'] = dates.day
    df_data['dayofweek'] = dates.dayofweek

    return df_data

def run_lgb(df_data):
    """Train a LightGBM regressor on ``df_data`` and return the forecast rows with predicted demand.

    Rows are split on ``date``: up to 2016-03-27 for training, after
    2016-03-27 through 2016-04-24 for validation and early stopping, and
    everything after 2016-04-24 is scored. The model input columns come from
    the module-level ``features`` list.

    Args:
        df_data: Feature-engineered DataFrame with ``date``, ``demand`` and
            every column named in ``features``.

    Returns:
        A copy of the rows dated after 2016-04-24 with ``demand`` replaced by
        the model predictions.
    """
    # the last 28 labelled days are held out for validation
    x_train = df_data[df_data['date'] <= '2016-03-27']
    y_train = x_train['demand']
    x_val = df_data[(df_data['date'] > '2016-03-27') & (df_data['date'] <= '2016-04-24')]
    y_val = x_val['demand']
    # explicit copy: predictions are written into this frame below, and it
    # must not be a slice of ``df_data``
    test = df_data[(df_data['date'] > '2016-04-24')].copy()
    del df_data
    gc.collect()

    # hyperparameters were picked by hand, not tuned
    params = {
        'boosting_type': 'gbdt',
        'metric': 'rmse',
        'objective': 'regression',
        'n_jobs': -1,
        'seed': 236,
        'learning_rate': 0.1,
        'bagging_fraction': 0.75,
        'bagging_freq': 10,
        'colsample_bytree': 0.75}
    train_set = lgb.Dataset(x_train[features], y_train)
    val_set = lgb.Dataset(x_val[features], y_val)

    del x_train, y_train

    # NOTE(review): early_stopping_rounds / verbose_eval are keyword arguments
    # of older LightGBM releases; newer releases expect callbacks -- confirm
    # against the installed version.
    model = lgb.train(params, train_set, num_boost_round = 2500, early_stopping_rounds = 50, valid_sets = [train_set, val_set], verbose_eval = 100)
    val_pred = model.predict(x_val[features])
    # scikit-learn's argument order is (y_true, y_pred)
    val_score = np.sqrt(metrics.mean_squared_error(y_val, val_pred))
    print(f'Our val rmse score is {val_score}')
    y_pred = model.predict(test[features])
    test['demand'] = y_pred
    return test

def predict(test, submission, filename):
    """Write the predicted validation rows to a CSV file in submission layout.

    Unlike a full submission, only the ids that have predictions are written;
    evaluation rows are not appended.

    Args:
        test: Long-format predictions with ``id``, ``date`` and ``demand``;
            must hold exactly 28 distinct dates.
        submission: Sample submission providing the ``id`` order.
        filename: Path of the CSV file to write.
    """
    # one row per id, one column per forecast date
    wide = test[['id', 'date', 'demand']].pivot(index = 'id', columns = 'date', values = 'demand')
    wide = wide.reset_index()
    wide.columns = ['id'] + ['F' + str(day) for day in range(1, 29)]

    # the inner join keeps only the ids for which predictions exist
    validation_part = submission[['id']].merge(wide, on = 'id')
    validation_part.to_csv(filename, index = False)
    

# Model input columns, grouped by origin. run_lgb selects these columns, in this order.
features = (
    # encoded identifiers
    ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
    # calendar parts derived from the date
    + ['year', 'month', 'week', 'day', 'dayofweek']
    # calendar events
    + ['event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
    # SNAP flags and the current price
    + ['snap_CA', 'snap_TX', 'snap_WI', 'sell_price']
    # lagged and rolling demand
    + ['lag_t28', 'lag_t29', 'lag_t30', 'rolling_mean_t7', 'rolling_std_t7', 'rolling_mean_t30', 'rolling_mean_t90']
    + ['rolling_mean_t180', 'rolling_std_t30']
    # price dynamics
    + ['price_change_t1', 'price_change_t365', 'rolling_price_std_t7', 'rolling_price_std_t30']
    # higher moments of rolling demand
    + ['rolling_skew_t30', 'rolling_kurt_t30']
)


def transform_train_and_eval(df_data, filename):
    """Run the whole pipeline on ``df_data``: encode, add features, train, and write the predictions.

    Args:
        df_data: Merged long-format DataFrame (optionally with a cluster column).
        filename: Path of the CSV file to write.

    Uses the module-level ``submission`` frame when writing the output.
    """
    engineered = simple_fe(transform(df_data))
    # shrink the newly added feature columns so training fits in memory
    engineered = reduce_mem_usage(engineered)
    scored = run_lgb(engineered)
    predict(scored, submission, filename)

#### **2.11 Modeling on Clusters**


In [None]:
# Split the clustered CA_1 data by cluster label and run the full
# transform / train / predict pipeline on each subset, one CSV per cluster.

# Slow demand items cluster
group_2 = data_with_clusters[data_with_clusters['cluster'] == 2]

# Medium demand items cluster
group_0 = data_with_clusters[data_with_clusters['cluster'] == 0]

# High demand items cluster
group_1 = data_with_clusters[data_with_clusters['cluster'] == 1]

# Validate model on each individual groups created above
print("\nGroup 0")
transform_train_and_eval(group_0,'submission0.csv')

print("\nGroup 1")
transform_train_and_eval(group_1,'submission1.csv')

print("\nGroup 2")
transform_train_and_eval(group_2,'submission2.csv')

#### **2.12 Make Submission to the Competition**

We now have predictions from all three clusters. We take the cluster with the lowest RMSE score, substitute its item predictions into the full submission dataset (which holds predictions for all stores), and make a final submission to the competition.

**Load the final submission files for each cluster after modeling**

In [None]:
# Read back the prediction files written by the modeling cells above.
submission0 = pd.read_csv('./submission0.csv')
submission1 = pd.read_csv('./submission1.csv')
submission2 = pd.read_csv('./submission2.csv')
submission_ca = pd.read_csv('./submission_ca.csv')

**View the data for cluster 2 having the lowest RMSE score**

In [None]:
# Preview the cluster-2 predictions.
submission2.head()

**Get the full submission file containing the submission for all the stores**

**Reference:** https://www.kaggle.com/kneroma/m5-first-public-notebook-under-0-50/notebook

In [None]:
# Load an externally produced submission file from the attached Kaggle dataset.
full_submission = pd.read_csv('/kaggle/input/d/josephtb2/output/submission.csv')

In [None]:
# Preview the externally produced submission.
full_submission.head()

**Joining the all stores submission file with the cluster 2 predicted values**

In [None]:
# Overlay the cluster-2 predictions on the all-stores submission, matching rows by id.
# (A left merge whose result was never assigned used to run here; it had no
# effect on `final` and has been removed.)
left = full_submission.set_index('id')
right = submission2.set_index('id')
# Make sure every column of either frame exists before updating.
final = left.reindex(columns=left.columns.union(right.columns))
# update() aligns on the id index and overwrites values in place.
final.update(right)
final.reset_index(inplace=True)
final

**Write the final submission file to a csv and use this file to make a submission to the competition**

In [None]:
# Write the combined predictions without the index column.
final.to_csv("submission.csv",index=False)