# Objective

* Make a baseline model that predicts the validation period (28 days).
* This competition has 2 stages, so the main objective is to build a model that can predict the demand for the next 28 days.

In [None]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import dask.dataframe as dd
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
import dask_xgboost as xgb
import dask.dataframe as dd
from sklearn import preprocessing, metrics
import gc
import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that fits.

    Columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are inspected; every other column (objects, int8, unsigned ints, ...) is
    left untouched. Integer columns are cast to the smallest of
    int8/int16/int32/int64 whose range contains the column's min and max.
    Float columns are cast to float16 or float32 when their range allows,
    otherwise to float64.

    NOTE: the float16/float32 choice looks only at the value range, not at
    precision, so float columns can lose precision when downcast.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Its columns are reassigned in place.
    verbose : bool
        When true, print the final memory usage and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if str(col_type) not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Bounds are inclusive: a column whose max is exactly 127 fits int8.
            # An empty column has NaN min/max, matches nothing and is left as is.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, infinite, or all-NaN: keep full width.
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a frame reporting zero bytes does not divide by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df


# function to read the data and merge it (ignoring some columns, this is a very fast model)


def read_data(data_dir='/kaggle/input/m5-forecasting-accuracy'):
    """Load the four competition CSV files from ``data_dir``.

    The calendar and sell-price tables are passed through
    ``reduce_mem_usage`` right after loading; the sales table and the
    submission template are returned exactly as read.

    Parameters
    ----------
    data_dir : str
        Directory holding ``calendar.csv``, ``sell_prices.csv``,
        ``sales_train_validation.csv`` and ``sample_submission.csv``.
        Defaults to the Kaggle input path used by this notebook.

    Returns
    -------
    tuple
        ``(calendar, sell_prices, sales_train_validation, submission)``.
    """
    print('Reading files...')
    calendar = pd.read_csv(os.path.join(data_dir, 'calendar.csv'))
    calendar = reduce_mem_usage(calendar)
    print('Calendar has {} rows and {} columns'.format(calendar.shape[0], calendar.shape[1]))
    sell_prices = pd.read_csv(os.path.join(data_dir, 'sell_prices.csv'))
    sell_prices = reduce_mem_usage(sell_prices)
    print('Sell prices has {} rows and {} columns'.format(sell_prices.shape[0], sell_prices.shape[1]))
    sales_train_validation = pd.read_csv(os.path.join(data_dir, 'sales_train_validation.csv'))
    print('Sales train validation has {} rows and {} columns'.format(sales_train_validation.shape[0], sales_train_validation.shape[1]))
    submission = pd.read_csv(os.path.join(data_dir, 'sample_submission.csv'))
    return calendar, sell_prices, sales_train_validation, submission


In [None]:
def melt_and_merge(calendar, sell_prices, sales_train_validation, submission, nrows = 55000000, merge = False):
    """Build the long-format modelling table from the raw competition frames.

    The wide sales table is melted to one row per (id, day). The submission
    template is split into its validation half ("test1", relabelled
    d_1914..d_1941) and its evaluation half ("test2", relabelled
    d_1942..d_1969); both are joined to the product attributes and melted
    the same way. The pieces are stacked with a ``part`` column
    ('train' / 'test1' / 'test2'), and the 'test2' rows are removed again
    before returning.

    Parameters
    ----------
    calendar : pandas.DataFrame
        Calendar table. It is not modified; a trimmed copy without
        weekday/wday/month/year is used for the join.
    sell_prices : pandas.DataFrame
        Price table, joined on store_id, item_id and wm_yr_wk when ``merge``.
    sales_train_validation : pandas.DataFrame
        Wide sales table with the six id columns plus one column per day.
    submission : pandas.DataFrame
        Submission template; not modified.
    nrows : int
        Number of leading rows of the melted training data to skip, so that
        only the later part is kept. The test rows are always kept.
    merge : bool
        When true, left-join the calendar (on the day label) and the sell
        prices, and drop the ``d`` / ``day`` join columns.

    Returns
    -------
    pandas.DataFrame
        Training rows from position ``nrows`` onward followed by the
        'test1' rows.
    """
    id_columns = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']

    # get product table while the frame is still wide, before the melt multiplies its rows
    product = sales_train_validation[id_columns].drop_duplicates()

    # melt sales data, get it ready for training
    sales_train_validation = pd.melt(sales_train_validation, id_vars = id_columns, var_name = 'day', value_name = 'demand')
    print('Melted sales train validation has {} rows and {} columns'.format(sales_train_validation.shape[0], sales_train_validation.shape[1]))
    sales_train_validation = reduce_mem_usage(sales_train_validation)

    # separate test dataframes; copy so the edits below never write into a slice of `submission`
    test1 = submission[submission['id'].str.contains('validation', regex = False, na = False)].copy()
    test2 = submission[submission['id'].str.contains('evaluation', regex = False, na = False)].copy()

    # change column names to the day labels that follow the training period
    test1.columns = ['id'] + ['d_{}'.format(day) for day in range(1914, 1942)]
    test2.columns = ['id'] + ['d_{}'.format(day) for day in range(1942, 1970)]

    # merge with product table; evaluation ids are temporarily renamed so they match the product ids
    test2['id'] = test2['id'].str.replace('_evaluation', '_validation', regex = False)
    test1 = test1.merge(product, how = 'inner', on = 'id')
    test2 = test2.merge(product, how = 'inner', on = 'id')
    test2['id'] = test2['id'].str.replace('_validation', '_evaluation', regex = False)

    # bring the test frames into the same long layout as the training data
    test1 = pd.melt(test1, id_vars = id_columns, var_name = 'day', value_name = 'demand')
    test2 = pd.melt(test2, id_vars = id_columns, var_name = 'day', value_name = 'demand')

    sales_train_validation['part'] = 'train'
    test1['part'] = 'test1'
    test2['part'] = 'test2'

    # get only a sample for fast training: skip the first `nrows` training rows.
    # This is done by position on the training frame, before stacking, because
    # the stacked frame repeats index labels and a label slice on it can raise.
    sales_train_validation = sales_train_validation.iloc[nrows:]

    data = pd.concat([sales_train_validation, test1, test2], axis = 0)

    del sales_train_validation, test1, test2

    # drop some calendar features (on a new frame, so the caller's calendar stays intact)
    calendar = calendar.drop(['weekday', 'wday', 'month', 'year'], axis = 1)

    # delete test2 for now
    data = data[data['part'] != 'test2']

    if merge:
        # notebook crash with the entire dataset (maybe use tensorflow, dask, pyspark)
        data = pd.merge(data, calendar, how = 'left', left_on = ['day'], right_on = ['d'])
        data.drop(['d', 'day'], inplace = True, axis = 1)
        # get the sell price data (this feature should be very important)
        data = data.merge(sell_prices, on = ['store_id', 'item_id', 'wm_yr_wk'], how = 'left')
        print('Our final dataset to train has {} rows and {} columns'.format(data.shape[0], data.shape[1]))

    gc.collect()

    return data

In [None]:
calendar, sell_prices, sales_train_validation, submission = read_data()

In [None]:
# do the results for one store and then extrapolate to all stores
# Only store CA_1 is kept from both the sales and the price tables.

one_store_sales = sales_train_validation.loc[sales_train_validation['store_id']=='CA_1']
one_store_sell_prices = sell_prices.loc[sell_prices['store_id']=='CA_1']
# nrows = 500000 is forwarded to melt_and_merge, which uses it to drop the
# start of the stacked table; merge = True joins the calendar and the prices.
data = melt_and_merge(calendar, one_store_sell_prices, one_store_sales, submission, nrows = 500000, merge = True)


In [None]:
def transform(data):
    """Fill missing event labels and integer-encode the categorical columns.

    The four event columns get the placeholder 'unknown' where they are
    missing. Then each of the nine categorical columns is replaced by the
    integer codes of a ``LabelEncoder`` fitted on that column alone, so the
    codes are only meaningful within the frame passed in.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the event and id columns; its columns are
        overwritten.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object.
    """
    nan_features = ['event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
    for feature in nan_features:
        # Assign the filled column back: fillna(inplace=True) on a column
        # selection is not guaranteed to write through to the frame.
        data[feature] = data[feature].fillna('unknown')

    cat = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
    for feature in cat:
        encoder = preprocessing.LabelEncoder()
        data[feature] = encoder.fit_transform(data[feature])

    return data

def simple_fe(data):
    """Add lagged-demand, price and calendar features to ``data``.

    All per-series features are computed within each ``id`` group, in the
    row order in which the group appears in ``data``. Demand features are
    built on demand shifted by at least 28 rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with at least ``id``, ``demand``, ``sell_price`` and ``date``
        columns. New columns are added to it directly.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object with the feature columns added and ``date``
        converted to datetime.
    """
    # Build each groupby once and reuse it for every feature.
    demand_by_id = data.groupby(['id'])['demand']
    price_by_id = data.groupby(['id'])['sell_price']

    # lagged demand
    for lag in (28, 29, 30):
        data['lag_t{}'.format(lag)] = demand_by_id.shift(lag)

    # rolling demand features: (column name, window length, statistic),
    # each taken over the demand shifted by 28 rows
    rolling_specs = [
        ('rolling_mean_t7', 7, 'mean'),
        ('rolling_std_t7', 7, 'std'),
        ('rolling_mean_t30', 30, 'mean'),
        ('rolling_mean_t90', 90, 'mean'),
        ('rolling_mean_t180', 180, 'mean'),
        ('rolling_std_t30', 30, 'std'),
        ('rolling_skew_t30', 30, 'skew'),
        ('rolling_kurt_t30', 30, 'kurt'),
    ]
    for name, window, stat in rolling_specs:
        # window/stat are bound as defaults so each lambda keeps its own values
        data[name] = demand_by_id.transform(
            lambda x, window=window, stat=stat: getattr(x.shift(28).rolling(window), stat)()
        )

    # price features: relative change versus the previous row and versus the
    # maximum of the preceding 365 rows, plus rolling price volatility
    lag_price_t1 = price_by_id.shift(1)
    data['price_change_t1'] = (lag_price_t1 - data['sell_price']) / lag_price_t1
    rolling_price_max_t365 = price_by_id.transform(lambda x: x.shift(1).rolling(365).max())
    data['price_change_t365'] = (rolling_price_max_t365 - data['sell_price']) / rolling_price_max_t365
    data['rolling_price_std_t7'] = price_by_id.transform(lambda x: x.rolling(7).std())
    data['rolling_price_std_t30'] = price_by_id.transform(lambda x: x.rolling(30).std())

    # time features
    data['date'] = pd.to_datetime(data['date'])
    date_parts = data['date'].dt
    data['year'] = date_parts.year
    data['month'] = date_parts.month
    if hasattr(date_parts, 'isocalendar'):
        # Series.dt.week no longer exists in recent pandas; isocalendar().week
        # is the replacement and is cast back to a plain numeric dtype.
        iso_week = date_parts.isocalendar().week
        data['week'] = iso_week.astype('float64' if iso_week.isna().any() else 'int64')
    else:
        data['week'] = date_parts.week
    data['day'] = date_parts.day
    data['dayofweek'] = date_parts.dayofweek

    return data

In [None]:
# Clustering part, working on the single store selected earlier.
# transform() fills and integer-encodes the categorical columns;
# simple_fe() then adds the lag, price and calendar features.

data = transform(data)
data = simple_fe(data)

In [None]:
data.head()

In [None]:
# Create unique keys of data to map aggregate values to!
data_unique_keys = data[['item_id','dept_id','cat_id']].drop_duplicates()


data_unique_keys.sort_values(by = 'item_id')
# 'demand','sell_price', 'wm_yr_wk']
# unique row id = iddeptidcatidwmy

#sum of all sales in a week
#also mean of the sell price across time

# each row should be a unique row
#pd.concat to create a new individual item id that 
# data_1.shape

In [None]:
# Total demand per item per week (wm_yr_wk); these weekly totals are averaged
# per item in the next step. Only the demand column is aggregated, so the
# string and date columns are never summed.
data_avg_demand_by_week = data.groupby(['item_id','wm_yr_wk'])['demand'].sum()
data_avg_demand_by_week.head(50)

In [None]:
#Get overall average demand in any given week by item and check resulting dataframe
data_avg_demand_by_week_overall = data_avg_demand_by_week.groupby(['item_id']).mean()
data_avg_demand_by_week_overall.head(6)

In [None]:
#Merge the avg weekly demand by item into the unique keys list as a clustering feature
data_unique_keys = data_unique_keys.merge(data_avg_demand_by_week_overall, on = "item_id")
data_unique_keys.head(6)

In [None]:
# Median sell price per item; the shape shows how many items received a value.
data1 = data
data_median_price_calc = data1.groupby(['item_id'])['sell_price'].median()
data_median_price_calc.shape

In [None]:
# confirm data frame head is working as expected with values 
data_median_price_calc.head()

In [None]:
#confirm that all items have a non-null price
data_median_price_calc.isnull().values.any()

In [None]:
# merge and confirm column / row numbers
data_unique_keys = data_unique_keys.merge(data_median_price_calc, on = "item_id")
data_unique_keys.head()

In [None]:
# sort and check that demand and sale price imported correctly into the dataset
data_unique_keys.sort_values( by = 'item_id').head()

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler,MinMaxScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
from scipy import sparse

n_clusters = 4

In [None]:
# NOTE: this is a second name for data_unique_keys, not a copy, so the scaling
# below also rewrites data_unique_keys, which the one-hot cells further down read.
clustering_feature_input = data_unique_keys
# Single-step pipeline that min-max scales the two numeric clustering features.
scaling = Pipeline(
[
    ("scaler", MinMaxScaler())
])

# Overwrite demand and sell_price with their scaled values.
clustering_feature_input[['demand', 'sell_price']] = scaling.fit_transform(clustering_feature_input[['demand', 'sell_price']])
clustering_feature_input.head()

In [None]:
# checking unique category values
catuniques = clustering_feature_input['cat_id'].drop_duplicates()
catuniques

In [None]:
#decided not to use OneHotEncoder, instead using get_dummies within a pd.concat call
# OneHot = OneHotEncoder()
# categ_features = clustering_feature_input[['dept_id', 'cat_id']]
# categ_features = OneHot.fit_transform(categ_features)
# type(categ_features)

In [None]:
# One-hot encode dept_id and drop the original column. The prefix is a plain
# string: with a Series input, a list prefix is formatted into the labels
# verbatim (e.g. "['dept_id']_0").
df = pd.concat([data_unique_keys, pd.get_dummies(data_unique_keys['dept_id'], prefix='dept_id', dummy_na=False)], axis=1).drop(['dept_id'], axis=1)
df.head()

In [None]:
# One-hot encode cat_id and drop the original column. The prefix is a plain
# string: with a Series input, a list prefix is formatted into the labels
# verbatim (e.g. "['cat_id']_0").
df1 = pd.concat([df, pd.get_dummies(df['cat_id'], prefix='cat_id', dummy_na=False)], axis=1).drop(['cat_id'], axis=1)
df1.head()

In [None]:
df1.shape

In [None]:
# Short positional labels for the clustering table: item id, the two numeric
# features, then the department and category indicator columns.
col_name_list = ['item_id', 'demand', 'sell_price', 'dept0', 'dept1', 'dept2', 'dept3', 'dept4', 'dept5', 'dept6', 'cat0', 'cat1', 'cat2']
# set_axis returns a new frame, so the result has to be assigned back for the
# rename to take effect.
df1 = df1.set_axis(col_name_list, axis = 1)
df1.head()

# take out the avg by putting it into a new dataframe then push it back into the unique key dataframe, and then youll have the normalization done.
# add onehot encoder for department and categories or use get dummies, can use .apply since the

In [None]:
# remove item_id from the list of columns to cluster
input_1_store_clusters = df1.loc[:,df1.columns!='item_id']
input_1_store_clusters.shape

In [None]:
# show SSE and Sihouette graph to identify optimal amount of clusters
def cluster_refining(data_input, max_k=10, random_state=None):
    """Plot the k-means SSE and silhouette curves to help choose k.

    One KMeans model is fitted per k in 1..max_k. Its inertia feeds the SSE
    (elbow) plot and, for k >= 2, its labels feed the silhouette plot, so
    both curves describe the same fits.

    Parameters
    ----------
    data_input : array-like
        Feature matrix to cluster.
    max_k : int
        Largest number of clusters to try (default 10).
    random_state : int or None
        Passed to ``KMeans``; set it to make the curves repeatable.

    Returns
    -------
    None
        The two figures are shown with ``plt.show()``.
    """
    sse_ks = range(1, max_k + 1)
    silhouette_ks = range(2, max_k + 1)
    sse = []
    silhouette_coefficients = []
    for k in sse_ks:
        kmeans = KMeans(n_clusters=k, random_state=random_state)
        kmeans.fit(data_input)
        sse.append(kmeans.inertia_)
        # the silhouette score is only computed from two clusters upward
        if k >= 2:
            silhouette_coefficients.append(silhouette_score(data_input, kmeans.labels_))

    plt.style.use("fivethirtyeight")
    plt.plot(sse_ks, sse)
    plt.xticks(sse_ks)
    plt.xlabel("Number of Clusters")
    plt.ylabel("SSE")
    plt.show()

    plt.style.use("fivethirtyeight")
    plt.plot(silhouette_ks, silhouette_coefficients)
    plt.xticks(silhouette_ks)
    plt.xlabel("Number of Clusters")
    plt.ylabel("Silhouette Coefficient")
    plt.show()

    
#call the function
cluster_refining(input_1_store_clusters)

In [None]:
# Number of clusters chosen by reading the plots produced by cluster_refining.
# The per-cluster cells further down are written out for exactly this many
# clusters (ids 0..6), so they have to be edited whenever new_k changes.
new_k = 7
kmeans = KMeans(n_clusters = new_k)
kmeans.fit(input_1_store_clusters)
# input_1_store_clusters was sliced from df1, so labels_ is in df1's row order.
df1['cluster_id'] = kmeans.labels_
df1['cluster_id'].unique()

In [None]:
df1.head()

Now let's evaluate our data using the functions above and split the dataframe into one dataframe per cluster (7, matching `new_k`) for evaluation!

In [None]:
#define lgb function
def run_lgb(data):
    """Train a LightGBM regressor on ``data`` and predict the held-out test rows.

    Rows are split by ``date``: up to 2016-03-27 for training, the following
    28 days (through 2016-04-24) for validation, and everything later as the
    test set to predict. The model inputs are the columns named in the
    module-level ``features`` list, which must be defined before the call.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding ``date``, ``demand`` and every column in ``features``.

    Returns
    -------
    pandas.DataFrame
        A copy of the test rows with ``demand`` replaced by the predictions.
    """
    # going to evaluate with the last 28 days
    x_train = data[data['date'] <= '2016-03-27']
    y_train = x_train['demand']
    x_val = data[(data['date'] > '2016-03-27') & (data['date'] <= '2016-04-24')]
    y_val = x_val['demand']
    # explicit copy: predictions are written into this frame below
    test = data[(data['date'] > '2016-04-24')].copy()
    del data
    gc.collect()

    # define random hyperparameters
    params = {
        'boosting_type': 'gbdt',
        'metric': 'rmse',
        'objective': 'regression',
        'n_jobs': -1,
        'seed': 236,
        'learning_rate': 0.1,
        'bagging_fraction': 0.75,
        'bagging_freq': 10, 
        'colsample_bytree': 0.75}

    train_set = lgb.Dataset(x_train[features], y_train)
    val_set = lgb.Dataset(x_val[features], y_val)

    del x_train, y_train

    # NOTE(review): early_stopping_rounds / verbose_eval are keyword arguments of
    # older LightGBM releases; confirm against the installed version before upgrading.
    model = lgb.train(params, train_set, num_boost_round = 2500, early_stopping_rounds = 50, valid_sets = [train_set, val_set], verbose_eval = 100)
    val_pred = model.predict(x_val[features])
    # arguments in (y_true, y_pred) order
    val_score = np.sqrt(metrics.mean_squared_error(y_val, val_pred))
    print(f'Our val rmse score is {val_score}')
    y_pred = model.predict(test[features])
    test['demand'] = y_pred
    return test

In [None]:
# Two-column lookup table (item_id -> cluster_id) taken from the clustering
# frame; it is used below to pull each cluster's rows out of the combined data.
cluster_list = df1[['item_id', 'cluster_id']].copy()
cluster_list.shape

In [None]:
# One Series of item ids per cluster, for use with .isin when splitting the data.
# The cluster ids 0..6 are written out by hand and correspond to new_k = 7;
# clusterN_item holds the items whose cluster_id is N - 1.
cluster1_item = cluster_list[cluster_list['cluster_id'] == 0]['item_id']
cluster2_item = cluster_list[cluster_list['cluster_id'] == 1]['item_id']
cluster3_item = cluster_list[cluster_list['cluster_id'] == 2]['item_id']
cluster4_item = cluster_list[cluster_list['cluster_id'] == 3]['item_id']
cluster5_item = cluster_list[cluster_list['cluster_id'] == 4]['item_id']
cluster6_item = cluster_list[cluster_list['cluster_id'] == 5]['item_id']
cluster7_item = cluster_list[cluster_list['cluster_id'] == 6]['item_id']
# spot-check the size of one of the lists
cluster5_item.shape

In [None]:
# NOTE: this binds a second name to the same DataFrame; it does not copy it.
# Any in-place change made through data_gbm is also visible through data.
data_gbm = data

In [None]:
# Split the full store dataset into one dataframe per cluster (7 in total),
# keeping the rows whose item_id belongs to that cluster's item list.
cluster_1_items = data_gbm[data_gbm['item_id'].isin(cluster1_item)]
cluster_2_items = data_gbm[data_gbm['item_id'].isin(cluster2_item)]
cluster_3_items = data_gbm[data_gbm['item_id'].isin(cluster3_item)]
cluster_4_items = data_gbm[data_gbm['item_id'].isin(cluster4_item)]
cluster_5_items = data_gbm[data_gbm['item_id'].isin(cluster5_item)]
cluster_6_items = data_gbm[data_gbm['item_id'].isin(cluster6_item)]
cluster_7_items = data_gbm[data_gbm['item_id'].isin(cluster7_item)]

In [None]:
cluster_7_items.head()

step 1: Redo the transformations for the entire dataset: call `melt_and_merge` with no store restriction, then run the `transform` and `simple_fe` functions to add the new features.
step 2: Import the cluster id from the one-store input into the broader dataset, joining on `item_id`. This lets you skip clustering the full dataset.
step 3: Train k individual GBM models, one per cluster id, to get the predictions for each cluster. This is just `run_lgb(data)`.
step 4: Roll up the predictions into one file using the code from the original file. It will work as long as everything is in one dataframe!

In [None]:
#define predictions and feature list
def predict(test, submission, filename):
    """Write a submission CSV from long-format predictions.

    ``test`` is pivoted to one row per id with one column per date, and the
    date columns are relabelled F1..F28. Ids from ``submission`` that have
    predictions come first, in template order. Every template row whose id
    contains 'evaluation' is then appended unchanged.

    Parameters
    ----------
    test : pandas.DataFrame
        Predictions with ``id``, ``date`` and ``demand`` columns.
    submission : pandas.DataFrame
        Submission template with an ``id`` column and the forecast columns.
    filename : str
        Path of the CSV file to write (without the index).
    """
    # one row per id, one column per forecast date
    wide = test[['id', 'date', 'demand']].pivot(index = 'id', columns = 'date', values = 'demand').reset_index()
    wide.columns = ['id'] + ['F' + str(day) for day in range(1, 29)]

    # template ids that received predictions, in template order
    predicted_part = submission[['id']].merge(wide, on = 'id')

    # evaluation rows are taken over from the template as they are
    evaluation_ids = [sub_id for sub_id in submission['id'] if 'evaluation' in sub_id]
    placeholder_part = submission.loc[submission['id'].isin(evaluation_ids)]

    pd.concat([predicted_part, placeholder_part]).to_csv(filename, index = False)
    
# define list of features
features = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'year', 'month', 'week', 'day', 'dayofweek', 'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2', 
            'snap_CA', 'snap_TX', 'snap_WI', 'sell_price', 'lag_t28', 'lag_t29', 'lag_t30', 'rolling_mean_t7', 'rolling_std_t7', 'rolling_mean_t30', 'rolling_mean_t90', 
            'rolling_mean_t180', 'rolling_std_t30', 'price_change_t1', 'price_change_t365', 'rolling_price_std_t7', 'rolling_price_std_t30', 'rolling_skew_t30', 'rolling_kurt_t30']

In [None]:
#transform function 
def transform_train_and_eval(df_data, filename):
    """Encode, featurize, train on and export predictions for one dataframe.

    Chains ``transform`` -> ``simple_fe`` -> ``reduce_mem_usage`` ->
    ``run_lgb`` and hands the predicted test rows to ``predict`` together
    with the module-level ``submission`` template.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Data for one group of items (here: one cluster).
    filename : str
        CSV path that ``predict`` writes to.
    """
    featured = simple_fe(transform(df_data))
    # reduce memory for new features so we can train
    compact = reduce_mem_usage(featured)
    predicted_rows = run_lgb(compact)
    predict(predicted_rows, submission, filename)

In [None]:
# Train one model per cluster and write each cluster's predictions to its own file.
for cluster_frame, out_path in [
    (cluster_1_items, 'submission1.csv'),
    (cluster_2_items, 'submission2.csv'),
    (cluster_3_items, 'submission3.csv'),
    (cluster_4_items, 'submission4.csv'),
    (cluster_5_items, 'submission5.csv'),
    (cluster_6_items, 'submission6.csv'),
    (cluster_7_items, 'submission7.csv'),
]:
    transform_train_and_eval(cluster_frame, out_path)

In [None]:
c1_excel = pd.read_csv('submission1.csv')
c2_excel = pd.read_csv('submission2.csv')
c3_excel = pd.read_csv('submission3.csv')
c4_excel = pd.read_csv('submission4.csv')
c5_excel = pd.read_csv('submission5.csv')
c6_excel = pd.read_csv('submission6.csv')
c7_excel = pd.read_csv('submission7.csv')

In [None]:
final_submission = pd.concat([c1_excel,c2_excel,c3_excel,c4_excel,c5_excel,c6_excel,c7_excel])
final_submission.shape

In [None]:
# Every per-cluster file also carries the template's evaluation rows, so the
# concatenation holds them once per cluster. Remove them by id instead of by
# `F1 != 0`, so that a genuine forecast of exactly zero is not discarded.
final_submission = final_submission[~final_submission['id'].str.contains('evaluation', regex = False)]
final_submission.shape

In [None]:
#final submission file for predictions
final_submission.to_csv('submission.csv',index = False)

Now to repeat the process for all stores; however, my Kaggle kernel crashes every time I run it on all stores.