# Necessary Libraries

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory recursively and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import category_encoders as c_e
import lightgbm as lgb
import numpy as np
import matplotlib.pyplot as plt
import os
import pandas as pd
import pickle
import seaborn as sns
import shap
import time


from calendar import monthrange
from datetime import date, timedelta
from itertools import product
from sklearn import preprocessing
from xgboost import plot_importance

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler

# Functions

## Data Preprocessing Functions

> ### Functions to work with sales data

In [None]:
def prepare_sales_monthly(sales):
    """
    Aggregates daily sales to one row per (date_block_num, shop_id, item_id).
    :arg sales (pandas df) - sales df from sales_train.csv
    :return sales_train_montly_df (pandas df) - sales aggregated monthly with columns
            date_block_num, shop_id, item_id, item_price_avg (mean price),
            date_min (minimum of the raw `date` column), item_cnt_month (summed item_cnt_day)
    """
    group_keys = ['date_block_num', 'shop_id', 'item_id']

    sales_train_df_no_dupl_shops = adjust_duplicated_shops(sales)

    # Select the value columns with a *list*: the tuple form
    # groupby(...)['a', 'b', 'c'] is deprecated and raises in pandas >= 2.0.
    sales_train_montly_df = (
        sales_train_df_no_dupl_shops
        .groupby(group_keys)[['item_price', 'item_cnt_day', 'date']]
        .agg({'item_price': 'mean',
              'date': 'min',
              'item_cnt_day': 'sum'})
        .reset_index()
        # Rename by name instead of overwriting `.columns` positionally, so the
        # result does not depend on the column order produced by `agg`.
        .rename(columns={'item_price': 'item_price_avg',
                         'date': 'date_min',
                         'item_cnt_day': 'item_cnt_month'})
    )
    # NOTE(review): `date` is aggregated exactly as loaded; if it is still a
    # 'dd.mm.yyyy' string the minimum is lexicographic, not chronological - confirm.
    sales_train_montly_df = sales_train_montly_df[
        group_keys + ['item_price_avg', 'date_min', 'item_cnt_month']]

    print('sales_train_montly_df has shape ', sales_train_montly_df.shape)
    return sales_train_montly_df


def adjust_duplicated_shops(df):
    """
    Maps duplicated shop ids onto their canonical shop id.
    The input frame is left untouched; a corrected copy is returned, so passing a
    filtered slice no longer triggers SettingWithCopyWarning or mutates the caller's data.
    :arg df (pandas df) - any frame with a 'shop_id' column
    :return (pandas df) - copy of df with duplicated shop ids replaced
    """
    # from https://www.kaggle.com/taranenkodaria/predict-future-sales-the-russian-forecast
    duplicate_to_canonical = {0: 57, 1: 58, 11: 10, 40: 39, 23: 24}

    deduplicated = df.copy()
    deduplicated['shop_id'] = deduplicated['shop_id'].replace(duplicate_to_canonical)

    return deduplicated


def remove_outliers(df):
    """
    Drops outlier rows from the daily sales df.
    A row is kept only if its item_price is positive and below the 99th percentile
    of item_price, and its item_cnt_day is non-negative and below the 99th
    percentile of item_cnt_day.
    """
    price_cap = np.percentile(df["item_price"], q=99)
    count_cap = np.percentile(df["item_cnt_day"], q=99)

    price_ok = (df["item_price"] < price_cap) & (df["item_price"] > 0)
    count_ok = (df["item_cnt_day"] >= 0) & (df["item_cnt_day"] < count_cap)

    return df[price_ok & count_ok]


def prepare_sales(use_cache):
    """
    Function that applies preprocessing steps to the original sales df (from sales_train.csv)
    :arg use_cache (Bool) if True the resulting data is loaded from already created pickle file (to save time)
                          and, when no usable cache exists, the freshly built result is written to it;
                          if False all processing steps are performed and nothing is cached
    :return sales (pandas df) - sales df ready to be used for modeling
    """
    cache_path = "sales_df.pickle.dat"

    if use_cache:
        try:
            # `with` guarantees the handle is closed even if unpickling fails.
            with open(cache_path, "rb") as infile:
                return pickle.load(infile)
        except (OSError, EOFError, pickle.UnpicklingError):
            # Missing, unreadable or truncated cache: fall through and rebuild it.
            pass

    print('prepare_sales')
    # load and preprocess sales
    sales = pd.read_csv("../input/competitive-data-science-predict-future-sales/sales_train.csv")
    sales = remove_outliers(sales)
    sales = prepare_sales_monthly(sales)
    sales = add_zero_sales(sales, None)

    # Months elapsed since the first date block in which each item appears.
    first_items_sales = sales.groupby('item_id')['date_block_num'].min()
    sales['when_first_sold'] = sales['date_block_num'] - sales['item_id'].map(first_items_sales)

    print('prepare_sales has shape ', sales.shape)
    if use_cache:
        with open(cache_path, "wb") as outfile:
            pickle.dump(sales, outfile)
    return sales


def create_matrix(df):
    """
    Creates the shop_id/item_id matrix based on sales to match the test set distribution.
    For every date block from 0 to the maximum present in df, emits one row for each
    combination of the shops and items that have at least one row in that block,
    together with the calendar Year and Month of the block (block 0 = January 2013).
    :arg df (pandas df) - frame with date_block_num, shop_id and item_id columns
    :return matrix (pandas df) - columns date_block_num, shop_id, item_id, Year, Month, sorted by them
    """
    # adopted from https://www.kaggle.com/gordotron85/future-sales-xgboost-top-3
    cols = ["date_block_num", "shop_id", "item_id", "Year", "Month"]
    first_year = 2013

    blocks = []
    for block_num in range(df.date_block_num.max() + 1):
        # Derive the calendar position arithmetically instead of relying on a
        # ValueError from date.replace(month=13) to roll over the year; the old
        # try/except could also swallow unrelated ValueErrors raised in December.
        years_elapsed, month_index = divmod(block_num, 12)

        block_sales = df[df.date_block_num == block_num]
        combinations = list(product([block_num],
                                    block_sales.shop_id.unique(),
                                    block_sales.item_id.unique(),
                                    [first_year + years_elapsed],
                                    [month_index + 1]))
        # reshape keeps a (0, 5) shape for months without sales so vstack still works.
        # NOTE: int16 limits every id to 32767.
        blocks.append(np.array(combinations, dtype=np.int16).reshape(-1, len(cols)))

    matrix = pd.DataFrame(np.vstack(blocks), columns=cols)
    matrix["date_block_num"] = matrix["date_block_num"].astype(np.int8)
    matrix["shop_id"] = matrix["shop_id"].astype(np.int8)
    matrix["item_id"] = matrix["item_id"].astype(np.int16)
    matrix = matrix.sort_values(cols)

    return matrix


def add_zero_sales(df, matrix):
    """
    Left-joins the sales df onto the shop/item matrix so that every matrix row is
    present in the result; combinations without sales get NaN in the sales columns.
    :arg df (pandas df) - monthly sales
    :arg matrix (pandas df or None) - matrix to join onto; built with create_matrix(df) when None
    :return (pandas df) - matrix rows enriched with the sales columns
    """
    join_keys = ['date_block_num', 'shop_id', 'item_id']
    grid = create_matrix(df) if matrix is None else matrix

    return grid.merge(df, how='left', on=join_keys)

### Functions to work with shops data

In [None]:
def extract_shop_type(df):
    """
    Extracts the type of the shop from shop_name and creates the ordinal-encoded
    shop_type_1 (fine-grained) and shop_type_2 (coarse) columns on df.
    Shops whose name matches none of the markers are labelled 'NONE' before encoding.
    """
    shop_names = df['shop_name']
    has_tc = shop_names.str.contains('ТЦ')
    has_tk = shop_names.str.contains('ТК')
    has_trc = shop_names.str.contains('ТРЦ')
    has_trk = shop_names.str.contains('ТРК')

    # Later assignments win when a name matches several markers.
    fine_grained = ((has_tc, 'type_1'), (has_tk, 'type_2'), (has_trc, 'type_3'), (has_trk, 'type_4'))
    for marker_present, label in fine_grained:
        df.loc[marker_present, 'shop_type_1'] = label

    df.loc[has_tc | has_tk, 'shop_type_2'] = 'type_1'
    df.loc[has_trc | has_trk, 'shop_type_2'] = 'type_2'

    for column in ('shop_type_1', 'shop_type_2'):
        df[column] = df[column].fillna('NONE')
        encoder = preprocessing.OrdinalEncoder(dtype=np.int32)
        df[column] = encoder.fit_transform(df[[column]])

    return df


def extract_shop_city(df):
    """
    Extracts the shop city (first space-separated token of shop_name) and a city type flag.
    Adds shop_city_type (1 for 'Москва' / 'СПб', else 0) and the ordinal-encoded
    shop_city column, and drops shop_name from df in place.
    """
    city = df['shop_name'].str.split(' ').str[0]

    # City type: 1 if city is Moscow or Sankt Petersburg (they are quite different from the rest of Russia)
    df['shop_city_type'] = city.isin(['Москва', 'СПб']).astype('int64')
    df['shop_city'] = city
    df.drop(columns=['shop_name'], inplace=True)

    encoder = preprocessing.OrdinalEncoder(dtype=np.int32)
    df['shop_city'] = encoder.fit_transform(df[['shop_city']])

    return df


def fix_shops(shops_df):
    """
    Returns a cleaned copy of the shops df (the input frame is not modified).
    Removes the 5 shop ids that adjust_duplicated_shops maps onto other shops
    and creates a few more features: shop types, city and city type
    (categorical ones encoded using OrdinalEncoder).
    """
    duplicated_shop_ids = [0, 1, 11, 40, 23]

    # .copy() makes the filtered result an independent frame, so the feature
    # extractors below can add/drop columns without SettingWithCopyWarning.
    shops_df = shops_df.loc[~shops_df['shop_id'].isin(duplicated_shop_ids)].copy()
    shops_df = extract_shop_type(shops_df)
    shops_df = extract_shop_city(shops_df)

    return shops_df

### Functions to work with item_category data

In [None]:
def correct_item_category_name(df):
    """
    Adjusts the format of the "item_category_name" column (in place) so that every
    affected name follows the '<main> - <sub>' pattern.
    """
    corrected_names = {
        'Билеты (Цифра)': 'Билеты - Цифра',
        'Доставка товара': 'Доставка товара - service',
        'Карты оплаты (Кино, Музыка, Игры)': 'Карты оплаты - Кино, Музыка, Игры',
        'Служебные': 'Служебные - none',
        'Чистые носители (шпиль)': 'Чистые носители - шпиль',
        'Чистые носители (штучные)': 'Чистые носители - штучные',
        'Элементы питания': 'Элементы питания - none',
    }

    for old_name, new_name in corrected_names.items():
        df.loc[df['item_category_name'] == old_name, 'item_category_name'] = new_name

    return df


def extract_main_category(df):
    """
    Builds the item_category_main feature: the part of item_category_name before
    the first ' - ', ordinal-encoded as int32. The column is added to df in place.
    """
    df['item_category_main'] = df['item_category_name'].str.split(' - ').str[0]

    encoder = preprocessing.OrdinalEncoder(dtype=np.int32)
    encoded_main = encoder.fit_transform(df[['item_category_main']])
    df['item_category_main'] = encoded_main

    return df


def extract_whether_digital(df):
    """
    Adds is_category_digital to df: 1 when item_category_name contains 'Цифра', else 0.
    """
    mentions_digital = df['item_category_name'].str.contains('Цифра')
    df['is_category_digital'] = mentions_digital.astype('int64')

    return df


def extract_ps_related(df):
    """
    Adds is_category_ps_related to df: 1 when item_category_name contains 'PS'
    (matched case-insensitively), else 0.
    """
    mentions_ps = df['item_category_name'].str.contains('PS', case=False)
    df['is_category_ps_related'] = mentions_ps.astype('int64')

    return df


def fix_item_category(df):
    """
    Applies all item-category helpers written above to the item_category df, in order,
    then drops the raw item_category_name column in place.
    """
    pipeline = (correct_item_category_name,
                extract_main_category,
                extract_whether_digital,
                extract_ps_related)
    for step in pipeline:
        df = step(df)

    df.drop(columns=['item_category_name'], inplace=True)

    return df

### General functions for train_df and test_df

In [None]:
def create_df(use_cache=True):
    """
    Creates the train df: prepared monthly sales joined with shop features and
    item/category features; missing item_cnt_month values are set to 0.
    :arg use_cache (Bool) - forwarded to prepare_sales
    :return merged_df (pandas df)
    """
    sales = prepare_sales(use_cache)

    # fix the shops as we have seen before
    shops = fix_shops(pd.read_csv("../input/competitive-data-science-predict-future-sales/shops.csv"))

    items_category = fix_item_category(
        pd.read_csv("../input/competitive-data-science-predict-future-sales/item_categories.csv"))

    items = pd.read_csv("../input/competitive-data-science-predict-future-sales/items.csv")
    items = items.drop(columns=['item_name'])
    items_to_merge = items.merge(items_category, on='item_category_id')

    sales_with_shops = sales.merge(shops, on='shop_id', how='left')
    print('df size with zero sales ', sales_with_shops.shape)

    merged_df = sales_with_shops.merge(items_to_merge, on='item_id', how='left')
    merged_df['item_cnt_month'] = merged_df['item_cnt_month'].fillna(0)

    return merged_df


def calculate_missing_prices_for_train_set(df):
    """
    Imputes missing item_price_avg values in the train set with the median raw price
    of the same (date_block_num, item_id) combination from sales_train.csv.
    The item_price_avg column ends up as the last column of the returned frame.
    """
    sales_raw = pd.read_csv("../input/competitive-data-science-predict-future-sales/sales_train.csv")
    median_price = sales_raw.groupby(['date_block_num', 'item_id'])['item_price'].median().reset_index()

    with_median = df.merge(median_price, on=['date_block_num', 'item_id'], how='left')
    imputed_price = with_median['item_price_avg'].fillna(with_median['item_price'])

    df_with_price = with_median.drop(['item_price_avg', 'item_price'], axis=1)
    df_with_price['item_price_avg'] = imputed_price

    return df_with_price


def downcast_dtypes(df):
    """
        Changes column types in the dataframe (in place) and returns it:
                `float64` type to `float32`
                `int64`   type to `int32`
    """
    narrower_type = {"float64": np.float32, "int64": np.int32}

    for wide_name, narrow in narrower_type.items():
        wide_cols = [c for c in df if df[c].dtype == wide_name]
        df[wide_cols] = df[wide_cols].astype(narrow)

    return df


def construct_lag(df, original_df, colname, target_var = 'item_cnt_month'):
    '''
    Adds lagged monthly sums of `target_var` (lags of 1, 2, 3 and 12 months) to df.
    For every lag a column 'lag_sum_<target_var>_<joined colname>_<lag>' is appended;
    rows without a matching statistic get 0. The input df is not modified.
    :arg
    df (pandas df) - train df after all transformation after date_block_num filter
    original_df (pandas df) - train df after all transformation without filtering
    colname (list of str) - list of columns to build lag onto (example: ['shop_id'] or ['shop_id','item_id'])
    target_var (str) - name of the parameter to calculate with lag
    :return: df with additional lag columns
    '''

    # calculate monthly statistics over lag columns
    stat = original_df.groupby(['date_block_num', *colname])[target_var].sum().reset_index()
    stat[target_var] = stat[target_var].round(2)
    stat = downcast_dtypes(stat)

    for month_shift in [1,2,3,12]:
        new_colname = 'lag_sum_{}_{}_{}'.format(target_var,'_'.join(colname), month_shift)

        # Expose the statistic's month as 'temp_col' so it can be matched against
        # "current month - lag" without producing date_block_num_x/_y columns.
        lagged_stat = stat.rename(columns={'date_block_num': 'temp_col', target_var: new_colname})

        # assign() returns a new frame, so the caller's df never receives a stray temp_col.
        df = (df.assign(temp_col=df['date_block_num'] - month_shift)
                .merge(lagged_stat, on=['temp_col', *colname], how='left')
                .drop(columns=['temp_col']))

        # Plain assignment instead of df[col].fillna(0, inplace=True): the chained
        # in-place form does not update df under pandas Copy-on-Write.
        df[new_colname] = df[new_colname].fillna(0)

        print(target_var, colname, month_shift, df.shape)

    return df


def add_lag(all_data_df, df_to_add_lag, target_var = 'item_cnt_month'):
    """ adds lag columns for several grouping levels
    :arg all_data_df (pandas df) frame the lagged statistics are computed from
    :arg df_to_add_lag (pandas df) frame that receives the lag columns
    :arg target_var (str) column whose monthly sums are lagged
    :return df (pandas df) rows of df_to_add_lag with lag columns.
            Important (!) rows with date_block_num < 12 (the 2013 data) are deleted
    """
    grouping_levels = (
        ['shop_id','item_id'],
        ['shop_id','item_category_id'],
        ['shop_id'],
        ['item_id'],
        ['item_category_id'],
        ['item_category_main'],
    )

    has_full_history = df_to_add_lag['date_block_num'] >= 12
    lagged = df_to_add_lag.loc[has_full_history]
    for level in grouping_levels:
        lagged = construct_lag(lagged, all_data_df, level, target_var)

    return lagged


def calculate_number_of_particular_days(day_of_week):
    """
    Counts how often a given weekday occurs in each month of 2013-2015
                    (like how many mondays were in 02.2014)
    :arg day_of_week (int) number of weekday (0 - Monday, 6 - Sunday)
    :return: day_of_week_series (pandas series) index - month_number (0 = 01.2013 ... 35 = 12.2015),
             value - number of occurrences of that weekday in the month
    """
    occurrences = []
    for year in (2013, 2014, 2015):
        for month in range(1, 13):
            days_in_month = monthrange(year, month)[1]
            matching_days = sum(
                1
                for day in range(1, days_in_month + 1)
                if date(year, month, day).weekday() == day_of_week
            )
            occurrences.append(matching_days)

    day_of_week_series = pd.Series(index=list(range(len(occurrences))), data=occurrences)

    return day_of_week_series


def add_days_stat(df):
    """
    Adds calendar features to df (in place): the number of saturdays, sundays and
    mondays in the row's date block, and the number of days in the row's Month.
    NOTE: February is always counted as 28 days (no leap-year handling).
    """
    weekday_features = (('number_of_saturdays', 5),
                        ('number_of_sundays', 6),
                        ('number_of_mondays', 0))
    for feature_name, weekday in weekday_features:
        occurrences = calculate_number_of_particular_days(weekday)
        df[feature_name] = df['date_block_num'].map(occurrences).astype(np.int8)

    # Position 0 is a placeholder so that the month number (1-12) indexes directly.
    month_lengths = pd.Series([0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31])
    df['number_of_days_in_month'] = df['Month'].map(month_lengths).astype(np.int8)

    return df


def create_feature_names_list(df):
    """
    Builds the list of all features required to train/test the model: a fixed set
    of base features followed by every column of df whose name contains 'lag'.
    :arg df (pandas df) - train set df
    :return features (list) - list of feature names
    """
    base_features = [
        'date_block_num', 'shop_id', 'item_id', 'Year', 'Month',
        'shop_type_1', 'shop_type_2', 'shop_city_type', 'shop_city',
        'item_category_id', 'item_category_main',
        'is_category_digital', 'is_category_ps_related',
        'item_price_avg', 'when_first_sold',
        'number_of_mondays', 'number_of_saturdays', 'number_of_sundays',
        'number_of_days_in_month',
    ]

    features = list(base_features)
    for column in df.columns:
        if 'lag' in column:
            features.append(column)

    return features


def create_train_set(addition_to_filename = ''):
    """
    Function that creates a train set and saves it in pickle file for further use.
    Writes "train.pickle.dat" (the df) and "features.pickle.dat" (feature names)
    to the working directory.
    :arg addition_to_filename (str) - currently unused; kept so existing calls keep working
    :return df (pandas df) - train set without mean encoding
            features_list (list) - list of feature names
    """
    df = create_df()
    print('original df size is ', df.shape)
    print('original df columns ', df.columns)

    df = calculate_missing_prices_for_train_set(df)
    print('df size after averaging price ', df.shape)
    df = add_lag(all_data_df = df, df_to_add_lag= df)
    df['revenue'] = df['item_price_avg'] * df['item_cnt_month']

    df = add_days_stat(df)
    df = downcast_dtypes(df)

    print(df.columns)

    # `with` closes the file (and flushes the pickle) even if dumping fails.
    with open("train.pickle.dat", "wb") as outfile:
        pickle.dump(df, outfile)

    # save feature names for further use
    features_list = create_feature_names_list(df)
    with open("features.pickle.dat", "wb") as outfile:
        pickle.dump(features_list, outfile)

    return df, features_list


def load_train_set_and_features_list(version):
    """
    Function that loads train set and features list from corresponding pickle files
    '<version>_train.pickle.dat' and '<version>_features.pickle.dat'.
    :param version: (str) part of the pickle filename (for example "20201224-121548_first_ver")
    :return: df (pandas df) train set
             features (list) feature names
    :raises ValueError: if either of the two files does not exist
    """
    filename_train_set = f'{version}_train.pickle.dat'
    filename_features_list = f'{version}_features.pickle.dat'

    if not (os.path.exists(filename_train_set) and os.path.exists(filename_features_list)):
        raise ValueError('Files do not exist!')

    # Context managers close the handles even when unpickling raises.
    # NOTE: only load pickle files you created yourself - unpickling can execute arbitrary code.
    with open(filename_train_set, "rb") as infile:
        df = pickle.load(infile)

    with open(filename_features_list, "rb") as infile:
        features = pickle.load(infile)

    return df, features

        
# TEST SET PREPROCESSING

def add_when_first_sold_to_test(test):
    """
    Adds a 'when_first_sold' column to the test df (in place) and returns it.
    The value is the test date_block_num minus the first date block in which the item
    appears in the outlier-filtered sales_train.csv; items never seen there get 0.
    """
    sales = pd.read_csv("../input/competitive-data-science-predict-future-sales/sales_train.csv")
    sales = remove_outliers(sales)
    first_items_sales = sales.groupby('item_id')['date_block_num'].min()

    months_since_first_sale = test['date_block_num'] - test['item_id'].map(first_items_sales)
    # Assign the filled result instead of calling fillna(inplace=True) on a column
    # selection, which is a no-op under pandas Copy-on-Write.
    test['when_first_sold'] = months_since_first_sale.fillna(0)

    return test


def create_test_df(train_df):
    """
    Builds the base test df: test.csv with date_block_num 34 / Year 2015 / Month 11,
    joined with the shop features and the item/category features.
    :arg train_df - not used by this function
    :return test_merged (pandas df)
    """
    input_test = pd.read_csv("../input/competitive-data-science-predict-future-sales/test.csv")
    test = input_test.assign(date_block_num=34, Year=2015, Month=11)

    # load shops and preprocess it (fix the shops as we have seen before)
    shops = fix_shops(pd.read_csv("../input/competitive-data-science-predict-future-sales/shops.csv"))

    # load item_category and preprocess it
    items_category = fix_item_category(
        pd.read_csv("../input/competitive-data-science-predict-future-sales/item_categories.csv"))

    # load items
    items = pd.read_csv("../input/competitive-data-science-predict-future-sales/items.csv")
    items = items.drop(columns=['item_name'])

    # merge data
    items_to_merge = items.merge(items_category, on='item_category_id')
    test_merged = (test
                   .merge(shops, on='shop_id', how='left')
                   .merge(items_to_merge, on='item_id', how='left'))

    return test_merged


def add_price_col_to_test(test, regime = 'test'):
    """
    Adds an 'item_price_avg' column to a test-like df using a three-step fallback.
    :arg test (pandas df) - must contain the columns ID, shop_id, item_id and item_category_id
    :arg regime (str) - 'val' restricts the raw sales used for pricing to date_block_num < 33;
                        any other value uses all raw sales
    :return test_with_price (pandas df) - rows of test with item_price_avg, sorted by and indexed on ID
    """

    # Algorithm:
    # 1. take the last price for the shop/item_id
    # 2. take the median price for the item_id
    # 3. take the median price for the category

    items = pd.read_csv("../input/competitive-data-science-predict-future-sales/items.csv")
    sales_raw = pd.read_csv("../input/competitive-data-science-predict-future-sales/sales_train.csv")
    sales_raw = adjust_duplicated_shops(sales_raw)
    if regime == 'val':
        sales_raw = sales_raw.loc[sales_raw['date_block_num'] < 33]

    # Step 1: after sorting by shop and month, keep='last' retains a row from the latest
    # month of every shop/item pair. Rows within the same month are not ordered by day.
    last_price = sales_raw.sort_values(['shop_id','date_block_num'], ascending = [True, True])
    last_price = last_price.drop_duplicates(subset = ['shop_id','item_id'], keep = 'last')
    last_price = last_price[['shop_id','item_id','item_price']]
    last_price.columns = ['shop_id','item_id','item_price_avg']
    test_last_price = test.merge(last_price, on = ['shop_id','item_id'], how = 'left').dropna(subset = ['item_price_avg'])
    # Rows that could not be priced in step 1.
    test_last_price_rest = test.loc[~test['ID'].isin(test_last_price['ID'])]

    # Step 2: median price of the item over all shops (the variable is named "mean" but holds the median).
    mean_item_price = pd.DataFrame(sales_raw.groupby('item_id')['item_price'].median()).reset_index()
    mean_item_price.columns = ['item_id','item_price_avg']
    test_mean_item = test_last_price_rest.merge(mean_item_price, on = 'item_id', how = 'left').dropna(subset = ['item_price_avg'])
    id_list = list(test_mean_item['ID']) + list(test_last_price['ID'])
    # Rows still without a price after steps 1 and 2.
    test_mean_item_rest = test.loc[~test['ID'].isin(id_list)]

    sales_items = sales_raw.merge(items[['item_id','item_category_id']], on = 'item_id')

    # Step 3: median price of the item's category; no dropna here, so a category that
    # never appears in the raw sales leaves item_price_avg as NaN.
    median_item_cat_price = pd.DataFrame(sales_items.groupby('item_category_id')['item_price'].median()).reset_index()
    median_item_cat_price.columns = ['item_category_id','item_price_avg']

    test_category_price = test_mean_item_rest.merge(median_item_cat_price, on = 'item_category_id', how = 'left')

    # Recombine the three disjoint partitions and restore the ID order.
    test_with_price = pd.concat([test_last_price,test_mean_item,test_category_price], axis = 0)

    test_with_price.sort_values('ID', inplace = True)

    test_with_price.set_index('ID', inplace = True)

    return test_with_price


def perform_target_encoding(train, test):
    """
    Target-encodes the categorical columns with category_encoders.TargetEncoder
    (smoothing=100). The encoder is fitted on train against 'item_cnt_month' and
    then applied to test.
    :arg train (pandas df) - train set including the 'item_cnt_month' column
    :arg test (pandas df) - test set
    :return (train, test) - encoded train set (target column re-attached) and encoded test set
    """
    categorical_columns = [
        'shop_id', 'Year', 'Month',
        'shop_type_1', 'shop_type_2', 'shop_city_type', 'shop_city',
        'item_category_id', 'item_category_main',
        'is_category_digital', 'is_category_ps_related',
    ]

    target = train['item_cnt_month']
    predictors = train.drop('item_cnt_month', axis=1)

    encoder = c_e.TargetEncoder(cols = categorical_columns, smoothing=100)
    encoded_train = encoder.fit_transform(predictors, target).copy()
    encoded_train['item_cnt_month'] = target
    encoded_test = encoder.transform(test)

    return encoded_train, encoded_test


def create_test_set(train, features):
    """
    Function that creates test set and saves it to "test.pickle.dat"
    :param train: (pandas df) train set from pickle file
    :param features: (list) - list of feature names
    :return: test_with_price (pandas df) - test set restricted to `features`
    """
    test = create_test_df(train)
    print('df base size ', test.shape)

    test = add_when_first_sold_to_test(test)

    test = add_lag(all_data_df = train, df_to_add_lag= test)

    test = add_days_stat(test)

    test_with_price = add_price_col_to_test(test)

    # .copy() detaches the column selection so downcast_dtypes can assign into it
    # without SettingWithCopyWarning.
    test_with_price = test_with_price[features].copy()

    test_with_price = downcast_dtypes(test_with_price)

    with open("test.pickle.dat", "wb") as outfile:
        pickle.dump(test_with_price, outfile)

    return test_with_price


## Modeling Functions

In [None]:
def create_train_val_split(df, features):
    """
    Function that splits the df into train / val splits.
    Date blocks < 33 form the train split, date block 33 the validation split.
    The validation prices are recomputed with add_price_col_to_test(regime='val')
    so that they are derived the same way as for the test set.
    :param df: (pandas df) all training data loaded from pickle file
    :param features: (list) all feature names loaded from pickle file
    :return: X_train, Y_train, X_val, Y_val
    """
    print('Number of features Train_set: ', len(features))
    print('Features: ', features)

    target = ['item_cnt_month']
    calendar_dtypes = {'Year': int, 'Month': int}

    train = df[(df["date_block_num"] < 33)]
    val = df[(df["date_block_num"] == 33)]

    # Adjust price of the val according to test set
    val = val.drop('item_price_avg', axis = 1)
    val = val.reset_index().rename(columns={'index':'ID'})
    val = add_price_col_to_test(val, regime='val')

    # astype() returns new frames, so the casts no longer write into slices of
    # train/val (which raised SettingWithCopyWarning).
    X_train = train[features].astype(calendar_dtypes)
    X_val = val[features].astype(calendar_dtypes)

    Y_train = train[target]
    Y_val = val[target]

    return X_train, Y_train, X_val, Y_val



def build_lgb_model(params, X_train, X_val, y_train, y_val, cat_features):
    """
    Trains a LightGBM model on the train split while evaluating on both the train
    and the validation split (evaluation is logged every 10 iterations).
    :return model - the trained LightGBM booster
    """
    # NOTE(review): `verbose_eval` is not accepted by lgb.train in newer LightGBM
    # releases - confirm against the installed version.
    train_data = lgb.Dataset(X_train, y_train)
    val_data = lgb.Dataset(X_val, y_val)
    evaluation_sets = (train_data, val_data)

    return lgb.train(params=params,
                     train_set=train_data,
                     valid_sets=evaluation_sets,
                     verbose_eval=10,
                     categorical_feature=cat_features)


def build_lgb_model_test(params, X_train, y_train, cat_features):
    """
    Trains a LightGBM model using the training data itself as the only evaluation
    set (evaluation is logged every 10 iterations).
    :return model - the trained LightGBM booster
    """
    train_data = lgb.Dataset(X_train, y_train)

    # The Dataset is passed directly (not wrapped in a tuple), as in the original call.
    return lgb.train(params=params,
                     train_set=train_data,
                     valid_sets=train_data,
                     verbose_eval=10,
                     categorical_feature=cat_features)


## Functions necessary to create submission file

In [None]:
def apply_0_to_not_sold_categories(df):
    """
    Optional function that applies 0 to submission file rows with item_categories that
    were not sold in particular shops. df is modified in place; the number of rows set
    to 0 is printed.
    :arg df (pandas df) - needs shop_id, item_category_id and item_cnt_month columns
    :return df (pandas df)
    """
    items = pd.read_csv("../input/competitive-data-science-predict-future-sales/items.csv")
    sales_raw = pd.read_csv("../input/competitive-data-science-predict-future-sales/sales_train.csv")
    sales_raw = adjust_duplicated_shops(sales_raw)
    sales_with_category = sales_raw.merge(items[['item_id','item_category_id']], on = 'item_id')

    every_category = set(sales_with_category['item_category_id'].unique())
    categories_by_shop = sales_with_category.groupby('shop_id')['item_category_id'].unique()

    zeroed_rows = 0
    for shop_id, sold_categories in categories_by_shop.items():
        never_sold_here = list(every_category - set(sold_categories))
        affected = (df['shop_id'] == shop_id) & (df['item_category_id'].isin(never_sold_here))
        zeroed_rows += int(affected.sum())
        df.loc[affected, 'item_cnt_month'] = 0

    print(zeroed_rows)
    return df


def correct_submission_for_not_sold_items(test_to_correct):
    """
    Sets the prediction to 0 for shop/item pairs that are no longer sold.
    A pair counts as "no longer sold" when the item was sold in that shop at some point
    in sales_train.csv but its latest sale there is in date_block_num <= 30, i.e. it had
    no sales in the last three training months (31-33).
    :arg test_to_correct (pandas df) - predictions with 'ID' and 'item_cnt_month' columns
    :return corrected_submission (pandas df) - columns ID and item_cnt_month; an ID that
            is not present in test.csv gets NaN
    """
    shop_sales = pd.read_csv("../input/competitive-data-science-predict-future-sales/sales_train.csv")
    test = pd.read_csv("../input/competitive-data-science-predict-future-sales/test.csv")

    # Latest month with a sale for every shop/item pair, restricted to test items.
    test_item_sales = shop_sales.loc[shop_sales['item_id'].isin(test['item_id'].unique())]
    last_sale = test_item_sales.groupby(['shop_id', 'item_id'], as_index=False)['date_block_num'].max()
    discontinued = last_sale.loc[last_sale['date_block_num'] <= 30, ['shop_id', 'item_id']].assign(coeff=0)

    # One vectorised merge replaces the per-shop loop, which wrote into slices
    # (SettingWithCopyWarning) and concatenated a growing frame on every iteration.
    coefficients = test[['ID', 'shop_id', 'item_id']].merge(
        discontinued, on=['shop_id', 'item_id'], how='left')
    coefficients['coeff'] = coefficients['coeff'].fillna(1).astype(int)

    corrected = test_to_correct.merge(coefficients[['ID', 'coeff']], on='ID', how='left')
    corrected['item_cnt_month'] = corrected['item_cnt_month'] * corrected['coeff']

    corrected_submission = corrected[['ID', 'item_cnt_month']]

    return corrected_submission


def create_submission_file(test, model=None, y_pred=None, name=None):
    """
    Function that creates and saves a submission-ready file
    '<timestamp>solution_<name>.csv' in the working directory.
    Predictions are clipped to [0, 20] and then passed through
    apply_0_to_not_sold_categories.
    :param test: (pandas df) test set; its index is written out as the ID column
    :param model: trained model with a `predict` method; when given, its predictions are used
    :param y_pred: array-like of predictions, used when no model is given
    :param name: (str) suffix for the file name
    :return: None
    :raises ValueError: if neither `model` nor `y_pred` is provided
    """
    # `is not None` instead of truthiness: estimators that define __len__ or
    # __bool__ must not be skipped just because they evaluate as falsy.
    if model is not None:
        y_pred = model.predict(test)
    if y_pred is None:
        raise ValueError('Either `model` or `y_pred` must be provided')

    submission = test.copy()
    # np.clip also accepts plain lists, unlike calling y_pred.clip directly.
    submission['item_cnt_month'] = np.clip(y_pred, 0, 20)

    submission = apply_0_to_not_sold_categories(submission)
    submission = submission[['item_cnt_month']].reset_index()
    # reset_index yields 'index' only for an unnamed index; a named 'ID' index is kept as is.
    submission = submission.rename(columns={"index": "ID"})

    print('Expect (214200, 2)')
    print('Actual ', submission.shape)
    timestr = time.strftime("%Y%m%d-%H%M%S")
    submission.to_csv(timestr+'solution_{}.csv'.format(name), index=False)
    print("Your submission was successfully saved!")

    return

# SCRIPT

## Create Datasets

In [None]:
# Load the training set and feature list from the pickle cache when both files
# exist; otherwise build them with create_train_set(), which also writes the
# pickles so the next run can reuse them.
if os.path.exists('./train.pickle.dat') and os.path.exists('./features.pickle.dat'):
    with open('./train.pickle.dat', "rb") as infile:
        df = pickle.load(infile)
    with open('./features.pickle.dat', "rb") as infile:
        features_list = pickle.load(infile)
else:
    df, features_list = create_train_set()

In [None]:
# Load the test set from the pickle cache when it exists; otherwise build it
# with create_test_set(), which also writes the pickle for later runs.
if os.path.exists('./test.pickle.dat'):
    with open('./test.pickle.dat', "rb") as infile:
        test = pickle.load(infile)
else:
    test = create_test_set(df, features_list)

## Some EDA

In [None]:
shops_df = pd.read_csv('../input/competitive-data-science-predict-future-sales/shops.csv')

# Mean revenue of each shop within every month ...
df_mean_revenue_per_shop_month = (
    df.groupby(['shop_id', 'date_block_num'])['revenue']
      .mean()
      .reset_index()
)

# ... averaged over the months, joined with the shop names and ranked by revenue.
df_mean_revenue_per_shop = (
    df_mean_revenue_per_shop_month
    .groupby('shop_id')['revenue']
    .mean()
    .reset_index()
    .merge(shops_df, on='shop_id', how='left')
    .sort_values('revenue', ascending=False)
)

In [None]:
# mean revenue per shop
fig, axes = plt.subplots(figsize = (15,6))
sns.barplot(x = df_mean_revenue_per_shop['shop_id'], y = df_mean_revenue_per_shop['revenue'], ax = axes)
print('Top 10 shops by revenue: ')
print(df_mean_revenue_per_shop.head(10))
print('\nShops from Moscow have by far the highest sales!')

In [None]:
# shop_id values of the ten shops with the highest mean revenue
df_mean_revenue_per_shop.head(10)['shop_id']

In [None]:
# Monthly revenue rows of the ten highest-revenue shops only
is_top_10_shop = df_mean_revenue_per_shop_month['shop_id'].isin(df_mean_revenue_per_shop.head(10)['shop_id'])
df_mean_revenue_per_shop_month_top_10 = df_mean_revenue_per_shop_month.loc[is_top_10_shop]

fig, axes = plt.subplots(figsize = (22,5), ncols = 3)

# One entry per panel: (data, hue column, title, upper y limit)
panel_settings = [
    (df_mean_revenue_per_shop_month, None, 'All Shops', 1300),
    (df_mean_revenue_per_shop_month_top_10, None, 'Top 10 Shops by revenue', 1300),
    (df_mean_revenue_per_shop_month_top_10, 'shop_id', 'Top 10 Shops by revenue', 2000),
]
for panel_ax, (panel_data, panel_hue, panel_title, panel_y_max) in zip(axes, panel_settings):
    sns.pointplot(x = 'date_block_num', y = 'revenue', data = panel_data, hue = panel_hue, ax = panel_ax)
    panel_ax.set_ylim(0, panel_y_max)
    panel_ax.set_ylabel('mean revenue per month')
    panel_ax.set_title(panel_title, fontsize = 16)

axes[2].legend(bbox_to_anchor=(1.02, 1.05))

print('December sales are predominant')
print('November sales (test set) are expected to be higher than those of October (validation set)')

## Clean memory

In [None]:
# Drop the DataFrame bound to `df` and force a garbage-collection pass to release its memory.
# NOTE(review): `df` is referenced again further down by
# create_train_val_split(df, features_list); unless `df` is re-created in between,
# a fresh "Restart & Run All" would raise NameError there — confirm.
import gc
del df#, test
gc.collect()

## ML Predictions using Stacking --> LinearRegression(LightGBM + RandomForest + LinearRegression)

The approach is:
1. Train LightGBM + RandomForest + LinearRegression using train/val splits (Level 1)
2. Evaluate models and tune parameters
3. Make predictions on val set for all three models
4. Combine all predictions and Y_val into one table
5. Train Linear Regression on the table from step 4 in order to predict Y_val from 3 models (Level 2)
6. Predict Test Set by models from level 1 and then by level 2 model.
7. Make a submission

### Split the data into train/val

In [None]:
# Build the feature/target pairs for the training and validation parts of `df`
(X_train, Y_train,
 X_val, Y_val) = create_train_val_split(df, features_list)

### Train LightGBM Model

In [None]:
# LightGBM settings for the validation run
params = dict(
    objective='rmse',
    metric='rmse',
    max_depth=10,
    num_leaves=1023,
    min_data_in_leaf=100,
    feature_fraction=0.7,  # ratio of features to be randomly selected at each iteration
    learning_rate=0.03,
    num_rounds=1000,
    early_stopping_rounds=10,
    seed=42,
    device='gpu',
    gpu_platform_id=0,
    gpu_device_id=0,
)

# Columns that are handed to the model builder as categorical features
cat_features = [
    'shop_id',
    'Year',
    'Month',
    'shop_type_1',
    'shop_type_2',
    'shop_city_type',
    'shop_city',
    'item_category_id',
    'item_category_main',
    'is_category_digital',
    'is_category_ps_related',
]

In [None]:
# Fit LightGBM on the training split; the validation split is passed along as well
lgb_model = build_lgb_model(
    params,
    X_train, X_val,
    Y_train, Y_val,
    cat_features,
)

In [None]:
# Draw LightGBM's built-in feature-importance chart on an 8x15 inch figure
fig, axes = plt.subplots(figsize=(8, 15))
lgb.plot_importance(booster=lgb_model, ax=axes)

In [None]:
# Explain the LightGBM model with SHAP on a subsample of the training features.
# The sample is seeded so the summary plot is reproducible across runs, and its size is
# capped at the number of available rows so small training sets do not raise.
sample_df = X_train.sample(n=min(5000, len(X_train)), random_state=42)
shap_values = shap.TreeExplainer(lgb_model).shap_values(sample_df)
shap.summary_plot(shap_values, sample_df)

Interestingly, the SHAP analysis and the LGBM feature importance do not match 100%. This is probably due to the random subset that I provided to SHAP.

In [None]:
# Train the final LightGBM model on the concatenated train and val sets

# Same settings as the validation run except for num_rounds, which is fixed at 170
# NOTE(review): presumably 170 is the iteration count found during the validation run — confirm
params = dict(
    objective='rmse',
    metric='rmse',
    max_depth=10,
    num_leaves=1023,
    min_data_in_leaf=100,
    feature_fraction=0.7,  # ratio of features to be randomly selected at each iteration
    learning_rate=0.03,
    num_rounds=170,
    early_stopping_rounds=10,
    seed=42,
    device='gpu',
    gpu_platform_id=0,
    gpu_device_id=0,
)

# Columns that are handed to the model builder as categorical features
cat_features = [
    'shop_id',
    'Year',
    'Month',
    'shop_type_1',
    'shop_type_2',
    'shop_city_type',
    'shop_city',
    'item_category_id',
    'item_category_main',
    'is_category_digital',
    'is_category_ps_related',
]

lgb_model = build_lgb_model_test(
    params,
    pd.concat([X_train, X_val]),
    pd.concat([Y_train, Y_val]),
    cat_features,
)



In [None]:
# Persist the final LightGBM model; the context manager guarantees the file is
# flushed and closed even if pickling fails.
with open("lgb_model.pickle.dat", "wb") as model_file:
    pickle.dump(lgb_model, model_file)

### Random Forest Model

In [None]:
# RF parameters tuning was done by my teammate Tullio Coppottelli

# Fit the level-1 random forest on the training split only and report the wall-clock time.
ts = time.time()
# `bootstrap` is a boolean switch: the previous value 0.7 was merely truthy (bootstrap
# sampling on) and is rejected by scikit-learn releases that validate parameter types.
# Arguments that only restated library defaults were dropped, including criterion='mse'
# and min_impurity_split=None, which later scikit-learn releases no longer accept.
rf = RandomForestRegressor(n_estimators=50, max_depth=10, max_features=6,
                           bootstrap=True, n_jobs=-1, random_state=42, verbose=1)
rf.fit(X_train, Y_train)
print(f"Time required for RF: {time.time() - ts}")

In [None]:
# Score the train-only random forest on the validation split.
# Arguments are given as (y_true, y_pred); the squared error is symmetric, so the value is unchanged.
Y_val_rf = rf.predict(X_val)
print('Val rmse:', np.sqrt(mean_squared_error(Y_val, Y_val_rf)))

In [None]:
# Explain the random forest with SHAP on a subsample of the training features.
# The sample is seeded so the summary plot is reproducible across runs, and its size is
# capped at the number of available rows so small training sets do not raise.
sample_df = X_train.sample(n=min(5000, len(X_train)), random_state=42)
shap_values = shap.TreeExplainer(rf).shap_values(sample_df)
shap.summary_plot(shap_values, sample_df)

In [None]:
# Training on the whole dataset (train + val) with the same forest settings
ts = time.time()

# `bootstrap` is a boolean switch: the previous value 0.7 was merely truthy (bootstrap
# sampling on) and is rejected by scikit-learn releases that validate parameter types.
# Arguments that only restated library defaults were dropped, including criterion='mse'
# and min_impurity_split=None, which later scikit-learn releases no longer accept.
rf_compl = RandomForestRegressor(n_estimators=50, max_depth=10, max_features=6,
                                 bootstrap=True, n_jobs=-1, random_state=42, verbose=1)
rf_compl.fit(pd.concat([X_train, X_val]), pd.concat([Y_train, Y_val]))
print(f"Time required for RF with combined train/val set: {time.time() - ts}")

In [None]:
# Persist the random forest trained on train + val; the context manager guarantees the
# file is flushed and closed even if pickling fails.
with open("rf_model.pickle.dat", "wb") as model_file:
    pickle.dump(rf_compl, model_file)

### Linear Regression

In [None]:
# Scale the features to be able to train the linear model.
# The scaler learns its min/max from the training split only; the validation and
# test features are then mapped with those same training statistics.
lr_scaler = MinMaxScaler()
lr_train = lr_scaler.fit_transform(X_train)
lr_val, lr_test = (lr_scaler.transform(features) for features in (X_val, test))

In [None]:
# Fit the level-1 linear regression on the scaled training split only.
ts = time.time()
lr_model = LinearRegression(n_jobs=-1)
lr_model.fit(lr_train, Y_train)
# This model sees the training split only, so the message no longer claims train/val
print(f"Time required for LR: {time.time() - ts}")

# Score on the validation split; the label now says "Val" because that is what is measured
Y_val_lr = lr_model.predict(lr_val)
print('Val rmse:', np.sqrt(mean_squared_error(Y_val, Y_val_lr)))

In [None]:
# Train LR on the whole dataset (train + val).
# The features go through the scaler that was fitted on the training split.
ts = time.time()
lr_train_compl = lr_scaler.transform(pd.concat([X_train, X_val]))
lr_model_compl = LinearRegression(n_jobs=-1).fit(
    lr_train_compl,
    pd.concat([Y_train, Y_val]),
)
print(f"Time required for LR with combined train/val set: {time.time() - ts}")

In [None]:
# Persist the linear model trained on train + val; the context manager guarantees the
# file is flushed and closed even if pickling fails.
with open("lr_model.pickle.dat", "wb") as model_file:
    pickle.dump(lr_model_compl, model_file)

### Stacking

In [None]:
# If needed, load the models from pickle.
# These files are produced by this notebook's own pickle.dump cells. Never point
# pickle.load at a file from an untrusted source: unpickling can execute arbitrary code.
# Each file is opened in a context manager so the handle is closed right after loading.
with open("lgb_model.pickle.dat", "rb") as model_file:
    lgb_model = pickle.load(model_file)
with open("rf_model.pickle.dat", "rb") as model_file:
    rf_compl = pickle.load(model_file)
with open("lr_model.pickle.dat", "rb") as model_file:
    lr_model_compl = pickle.load(model_file)

In [None]:
# Perform predictions on val and test sets, one level-1 model at a time.
# The linear model receives the MinMax-scaled features, the tree models the raw ones.
# NOTE(review): lgb_model, rf_compl and lr_model_compl were all fitted on train + val,
# so their predictions for X_val are in-sample; a meta-model trained on them may
# overestimate how well the level-1 models generalize — confirm this is intended.
lgb_pred_val, lgb_pred_test = (lgb_model.predict(features) for features in (X_val, test))
rf_pred_val, rf_pred_test = (rf_compl.predict(features) for features in (X_val, test))
lr_pred_val, lr_pred_test = (lr_model_compl.predict(scaled) for scaled in (lr_val, lr_test))

In [None]:
# Create tables with prediction results

# Level-1 predictions on the validation set, one column per model, plus the true target.
# The LR predictions are indexed as a 2-D array, so their first column is taken.
first_level_df = pd.DataFrame({
    'lgbm': lgb_pred_val,
    'rf': rf_pred_val,
    'lr': lr_pred_val[:, 0],
}).assign(y_val=Y_val['item_cnt_month'].values)

# Same layout for the test set, where no target is available
first_level_test = pd.DataFrame(dict(
    lgbm=lgb_pred_test,
    rf=rf_pred_test,
    lr=lr_pred_test[:, 0],
))

In [None]:
# Train a LR metamodel that maps the three Level-1 predictions to the validation target
meta_model = LinearRegression(n_jobs=-1)
meta_model.fit(first_level_df[['lgbm', 'rf', 'lr']], first_level_df['y_val'])

# Score the metamodel on the same rows it was fitted on (hence "Train rmse")
ensemble_pred = meta_model.predict(first_level_df[['lgbm', 'rf', 'lr']])
print('Train rmse for meta model:', np.sqrt(mean_squared_error(Y_val, ensemble_pred)))

In [None]:
# Create submission: blend the Level-1 test predictions with the metamodel and write them out
y_pred = meta_model.predict(first_level_test)
create_submission_file(
    test,
    None,
    y_pred,
    name='stacking',
)

# SETUP LIGHTGBM GPU (Optional)

In [None]:
#FROM https://www.kaggle.com/dromosys/gpu-accelerated-lightgbm-full

In [None]:
# Delete the lightgbm package directory from the conda site-packages, then clone the
# LightGBM sources together with their git submodules.
# NOTE(review): the site-packages path is hard-coded to python3.6 — confirm it matches
# the Python version of the running image.
!rm -r /opt/conda/lib/python3.6/site-packages/lightgbm
!git clone --recursive https://github.com/Microsoft/LightGBM

In [None]:
# Install the Boost development libraries non-interactively (-y) with minimal output (-qq).
# NOTE(review): presumably a build dependency of the GPU build in the next cell — confirm.
!apt-get install -y -qq libboost-all-dev

In [None]:
%%bash
# Configure and compile LightGBM from the cloned sources with USE_GPU enabled.
# Abort on the first failing command so that a failed `cd` can never make the
# following rm/mkdir/cmake run in the wrong directory.
set -e
cd LightGBM
# Start from a clean build directory; -f tolerates a fresh clone that has no build/ yet
rm -rf build
mkdir build
cd build
cmake -DUSE_GPU=1 -DOpenCL_LIBRARY=/usr/local/cuda/lib64/libOpenCL.so -DOpenCL_INCLUDE_DIR=/usr/local/cuda/include/ ..
# Compile using one job per available processor
make -j$(nproc)

In [None]:
!cd LightGBM/python-package/;python3 setup.py install --precompile

In [None]:
# Create /etc/OpenCL/vendors if needed and write "libnvidia-opencl.so.1" into nvidia.icd there,
# then delete the cloned LightGBM source tree.
!mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd
!rm -r LightGBM