In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Loading Data

In [2]:
def create_final_dataframe(main_data):
    #load all aditional dataframes
    oil_data = pd.read_csv("./Data/oil.csv", parse_dates= ["date"])
    holiday_data = pd.read_csv("./Data/holidays_events.csv", parse_dates= ["date"])
    stores_data = pd.read_csv("./Data/stores.csv")
    trans_data = pd.read_csv("./Data/transactions.csv", parse_dates= ['date'])

    # preprocess dataframes: updating datatypes and filling missing values

    main_datatype = {"store_nbr": "category", "family" :"category"}
    main_data = main_data.astype(main_datatype)

    oil_data.fillna(method = "backfill", inplace = True)

    holiday_datatypes = {"type": "category","locale":"category", "locale_name": "category", "description": "string"}
    holiday_data = holiday_data.astype(holiday_datatypes)

    stores_data = stores_data.astype("category")

    # split holiday dataframe based on holiday types
    holiday_local  = holiday_data[holiday_data["locale"]== "Local"].copy()
    holiday_local.rename(columns={'locale_name': 'city'}, inplace=True)
    holiday_local= holiday_local.drop_duplicates(subset=['date', 'city'])
    holiday_regional  = holiday_data[holiday_data["locale"]== "Regional"].copy()
    holiday_regional.rename(columns={'locale_name': 'state'}, inplace=True)
    holiday_regional= holiday_regional.drop_duplicates(subset=['date', 'state'])
    holiday_national = holiday_data[holiday_data["locale"]== "National"].copy()
    holiday_national= holiday_national.drop_duplicates(subset=['date'])

    # merging all frames to create the final dataframe: 
    df = main_data.merge(oil_data, on = "date", how = "left").fillna(method = 'ffill')
    df = df.merge(stores_data, on = "store_nbr", how = "left", )
    df = df.merge(trans_data, on = ["date", "store_nbr"], how = "left")
    df = df.merge(holiday_local, on = ["date", "city"], how = "left", suffixes= ("_store", "_holiday")) 
    df = df.merge(holiday_regional, on = ["date", "state"], how = "left",)
    df = df.merge(holiday_national, on ="date", how = "left")
    
    # Filing missing values of type holiday variable in the df dataframe
    type_holiday = df['type_holiday'].combine_first(df['type_x']).combine_first(df['type_y'])
    locale = df['locale_x'].combine_first(df['locale_y']).combine_first(df['locale'])
    transferred =  df['transferred_x'].combine_first(df['transferred_y']).combine_first(df['transferred'])
    
    # create the final version of df dataframe by adding the correct columns of holiday dataframe
    df = df.iloc[:, :11]
    df['type_holiday']= type_holiday
    df['locale']= locale
    df['transferred'] = transferred

    # trim the final dataframe by improving its columns' datatypes and imputing the missing values
    df['type_holiday'] = df['type_holiday'].cat.add_categories("IsNotHoliday")  
    df['locale'] = df['locale'].cat.add_categories("IsNotHoliday") 

    fill_values = {"type_holiday" :"IsNotHoliday", "locale":"IsNotHoliday"}
    df.fillna(fill_values, inplace = True)

    df.fillna({"transferred": False, "transactions": 0}, inplace = True)

    df = df.astype({"city":"category", "state": "category"})

    return df



In [3]:
train_data = pd.read_csv("./Data/train.csv", parse_dates = ['date'] ,  index_col='id')

df = create_final_dataframe(train_data)

df


  df.fillna(fill_values, inplace = True)


Unnamed: 0,date,store_nbr,family,sales,onpromotion,dcoilwtico,city,state,type_store,cluster,type_holiday,locale,transferred
0,2013-01-01,1,AUTOMOTIVE,0.000,0,93.14,Quito,Pichincha,D,13,Holiday,National,False
1,2013-01-01,1,BABY CARE,0.000,0,93.14,Quito,Pichincha,D,13,Holiday,National,False
2,2013-01-01,1,BEAUTY,0.000,0,93.14,Quito,Pichincha,D,13,Holiday,National,False
3,2013-01-01,1,BEVERAGES,0.000,0,93.14,Quito,Pichincha,D,13,Holiday,National,False
4,2013-01-01,1,BOOKS,0.000,0,93.14,Quito,Pichincha,D,13,Holiday,National,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3008275,2017-08-15,9,POULTRY,438.133,0,47.57,Quito,Pichincha,B,6,IsNotHoliday,IsNotHoliday,False
3008276,2017-08-15,9,PREPARED FOODS,154.553,1,47.57,Quito,Pichincha,B,6,IsNotHoliday,IsNotHoliday,False
3008277,2017-08-15,9,PRODUCE,2419.729,148,47.57,Quito,Pichincha,B,6,IsNotHoliday,IsNotHoliday,False
3008278,2017-08-15,9,SCHOOL AND OFFICE SUPPLIES,121.000,8,47.57,Quito,Pichincha,B,6,IsNotHoliday,IsNotHoliday,False


In [4]:
test_data = pd.read_csv("./Data/test.csv",  index_col='id', parse_dates = ['date'] )
test_frame = create_final_dataframe(test_data.iloc[:1, :])

test_frame

  df.fillna(fill_values, inplace = True)


Unnamed: 0,date,store_nbr,family,onpromotion,dcoilwtico,city,state,type_store,cluster,type_holiday,locale,transferred
0,2017-08-16,1,AUTOMOTIVE,0,46.8,Quito,Pichincha,D,13,IsNotHoliday,IsNotHoliday,False


# Create new time features



In [5]:
def extract_time_vars(dataframe):
    dataframe['year'] = dataframe["date"].apply(lambda x: x.year)
    dataframe['month'] = dataframe["date"].apply(lambda x: x.month)
    dataframe['day'] = dataframe["date"].apply(lambda x : x.day)
    dataframe['dayweek'] = dataframe["date"].apply(lambda x : x.day_name()).astype("category")
    dataframe["quarter"] = dataframe["date"].apply(lambda x: x.quarter)
    dataframe["weekyear"]= dataframe["date"].apply(lambda x: x.weekofyear).astype(bool)
    dataframe["isweekend"] = dataframe["date"].dt.dayofweek // 5 

    return dataframe

In [6]:
df = extract_time_vars(df)

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3008280 entries, 0 to 3008279
Data columns (total 20 columns):
 #   Column        Dtype         
---  ------        -----         
 0   date          datetime64[ns]
 1   store_nbr     category      
 2   family        category      
 3   sales         float64       
 4   onpromotion   int64         
 5   dcoilwtico    float64       
 6   city          category      
 7   state         category      
 8   type_store    category      
 9   cluster       category      
 10  type_holiday  category      
 11  locale        category      
 12  transferred   bool          
 13  year          int64         
 14  month         int64         
 15  day           int64         
 16  dayweek       category      
 17  quarter       int64         
 18  weekyear      bool          
 19  isweekend     int64         
dtypes: bool(2), category(9), datetime64[ns](1), float64(2), int64(6)
memory usage: 261.1 MB


# Model Training

In [7]:
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import Pipeline

import time
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_squared_log_error
from sklearn.compose import ColumnTransformer

In [8]:
def performance(model):
    train_performance = []
    test_performance = []

    y_hat_train = np.exp(model.predict(X_train))-10
    y_hat_test = np.exp(model.predict(X_test))-10

    y_hat_train[y_hat_train <0] = 0
    y_hat_test[y_hat_test<0] = 0


    train_performance.append(np.sqrt(mean_squared_error(y_train, y_hat_train)))
    test_performance.append(np.sqrt(mean_squared_error(y_test, y_hat_test)))

    train_performance.append(mean_absolute_error(y_train, y_hat_train))
    test_performance.append(mean_absolute_error(y_test, y_hat_test))

    train_performance.append(mean_squared_log_error(y_train, y_hat_train, squared= False))
    test_performance.append(mean_squared_log_error(y_test, y_hat_test, squared= False))

    train_performance.append(r2_score(y_train, y_hat_train))
    test_performance.append(r2_score(y_test, y_hat_test))


    df_performance = pd.DataFrame([train_performance,test_performance], columns=['mean_squared_error', 'mean_absolute_error', 'RMSLE', 'r2_score'], index= ['trian','test'])
    display(df_performance)

    fig ,ax = plt.subplots(1,2, figsize = (10,4))
    sns.scatterplot(x = y_train, y = y_hat_train, ax = ax[0])
    ax[0].set_xlabel("y_real")
    ax[0].set_ylabel("y_prediction")
    ax[0].set_title("Train")
    ax[0].plot([y_train.min(), y_train.max()], [y_train.min(), y_train.max()], 'r--')

    sns.scatterplot(x = y_test, y = y_hat_test, ax = ax[1])
    ax[1].set_xlabel("y_real")
    ax[1].set_ylabel("y_prediction")
    ax[1].set_title("Test")
    ax[1].plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
    plt.show()

    return train_performance, test_performance

# Trainer class

# write code based on functions

In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_squared_log_error

# Define your variables
cont_vars = [ 'onpromotion', 'dcoilwtico']
encode_vars = ['store_nbr', 'city', 'state', 'type_store','cluster','type_holiday', 'locale', 'dayweek']
other_vars = ['year','month', 'day', 'quarter' , 'weekyear', 'isweekend', 'transferred']
cols = cont_vars +encode_vars + other_vars
categories = list(df['family'].unique())


def preprocess_train_v01(model, split_date = "2017-01-01"):

    split_date = pd.to_datetime(split_date) 
    # Split the data
    train_data = df[df.date< split_date]
    test_data = df[df.date >= split_date]
    y_hat_test_all = pd.Series(index=test_data.index) 

    # Create the preprocessing pipeline, we need to use the ColumnTransfer class since each transforemer will be applied on diffrent part of dataframe
    ct = ColumnTransformer(
        [
            ("scaler", StandardScaler(), cont_vars),
            ("onehot_encoder", OneHotEncoder(), encode_vars)
        ],
        remainder='passthrough'
    )

    # Create the main pipeline
    pipeline = Pipeline([
        ('preprocessing', ct),
        ('clf', model)
    ])
    
    print("RMSLE within each category\n", "="*(50))
    for cat in categories:
        X_train = train_data[train_data['family']==cat][cols]
        X_test = test_data[test_data['family']==cat][cols]
        y_train = train_data[train_data['family']==cat]['sales']
        y_test = test_data[test_data['family']==cat]['sales']
        y_train_log = np.log(y_train+10)
        y_test_log = np.log(y_test+10)

        pipeline.fit(X_train, y_train_log)
        y_hat_test = np.exp(pipeline.predict(X_test))-10
        y_hat_test[y_hat_test<0] = 0
        print(f"{cat}:", " "*(30-len(cat)), mean_squared_log_error(y_test, y_hat_test, squared= False))

        y_hat_test_all[test_data.index[test_data['family'] == cat]] = y_hat_test

    
    # Calculate the MSLE for all categories combined
    overall_rmsle = mean_squared_log_error(test_data['sales'], y_hat_test_all, squared= False)
    print("="*(50))
    print("Overall RMSLE:", " "*(30-len("Overall RMSLE")), overall_rmsle)
    



In [10]:
# Encode time features

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_squared_log_error

# Define your variables
cont_vars = [ 'onpromotion', 'dcoilwtico']
encode_vars = ['store_nbr', 'city', 'state', 'type_store','cluster','type_holiday', 'locale', 'dayweek', 'year','month', 'day', 'quarter' , 'weekyear',]
other_vars = [ 'isweekend', 'transferred']
cols = cont_vars + encode_vars + other_vars
categories = list(df['family'].unique())


def preprocess_train_v02(model, split_date = "2017-01-01"):

    split_date = pd.to_datetime(split_date) 
    # Split the data
    train_data = df[df.date< split_date]
    test_data = df[df.date >= split_date]
    y_hat_test_all = pd.Series(index=test_data.index) 

    # Create the preprocessing pipeline, we need to use the ColumnTransfer class since each transforemer will be applied on diffrent part of dataframe
    ct = ColumnTransformer(
        [
            ("scaler", StandardScaler(), cont_vars),
            ("onehot_encoder", OneHotEncoder(handle_unknown= 'infrequent_if_exist', min_frequency= 0.005), encode_vars)
        ],
        remainder='passthrough'
    )

    # Create the main pipeline
    pipeline = Pipeline([
        ('preprocessing', ct),
        ('clf', model)
    ])
    
    print("RMSLE within each category\n", "="*(50))
    for cat in categories:
        X_train = train_data[train_data['family']==cat][cols]
        X_test = test_data[test_data['family']==cat][cols]
        y_train = train_data[train_data['family']==cat]['sales']
        y_test = test_data[test_data['family']==cat]['sales']
        y_train_log = np.log(y_train+10)
        y_test_log = np.log(y_test+10)

        pipeline.fit(X_train, y_train_log)
        y_hat_test = np.exp(pipeline.predict(X_test))-10
        y_hat_test[y_hat_test<0] = 0
        print(f"{cat}:", " "*(30-len(cat)), mean_squared_log_error(y_test, y_hat_test, squared= False))

        y_hat_test_all[test_data.index[test_data['family'] == cat]] = y_hat_test

    
    # Calculate the MSLE for all categories combined
    overall_rmsle = mean_squared_log_error(test_data['sales'], y_hat_test_all, squared= False)
    print("="*(50))
    print("Overall RMSLE:", " "*(30-len("Overall RMSLE")), overall_rmsle)
    



In [11]:
# replace onehot encoding with target encoding

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, TargetEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_squared_log_error

# Define your variables
cont_vars = [ 'onpromotion', 'dcoilwtico']
encode_vars = ['store_nbr', 'city', 'state', 'type_store','cluster','type_holiday', 'locale', 'dayweek']
other_vars = ['year','month', 'day', 'quarter' , 'weekyear', 'isweekend', 'transferred']
cols = cont_vars +encode_vars + other_vars
categories = list(df['family'].unique())


def preprocess_train_v03(model, split_date = "2017-01-01"):

    split_date = pd.to_datetime(split_date) 
    # Split the data
    train_data = df[df.date< split_date]
    test_data = df[df.date >= split_date]
    y_hat_test_all = pd.Series(index=test_data.index) 

    # Create the preprocessing pipeline, we need to use the ColumnTransfer class since each transforemer will be applied on diffrent part of dataframe
    ct = ColumnTransformer(
        [
            ("scaler", StandardScaler(), cont_vars),
            ("onehot_encoder", TargetEncoder(), encode_vars)
        ],
        remainder='passthrough'
    )

    # Create the main pipeline
    pipeline = Pipeline([
        ('preprocessing', ct),
        ('clf', model)
    ])
    
    print("RMSLE within each category\n", "="*(50))
    for cat in categories:
        X_train = train_data[train_data['family']==cat][cols]
        X_test = test_data[test_data['family']==cat][cols]
        y_train = train_data[train_data['family']==cat]['sales']
        y_test = test_data[test_data['family']==cat]['sales']
        y_train_log = np.log(y_train+10)
        y_test_log = np.log(y_test+10)

        pipeline.fit(X_train, y_train_log)
        y_hat_test = np.exp(pipeline.predict(X_test))-10
        y_hat_test[y_hat_test<0] = 0
        print(f"{cat}:", " "*(30-len(cat)), mean_squared_log_error(y_test, y_hat_test, squared= False))

        y_hat_test_all[test_data.index[test_data['family'] == cat]] = y_hat_test

    
    # Calculate the MSLE for all categories combined
    overall_rmsle = mean_squared_log_error(test_data['sales'], y_hat_test_all, squared= False)
    print("="*(50))
    print("Overall RMSLE:", " "*(30-len("Overall RMSLE")), overall_rmsle)
    



In [12]:
# Do not apply encoding on cluster ans store_nbr

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_squared_log_error

# Define your variables
cont_vars = [ 'onpromotion', 'dcoilwtico']
encode_vars = ['city', 'state', 'type_store','type_holiday', 'locale', 'dayweek']
other_vars = ['year','month', 'day', 'quarter' , 'weekyear', 'isweekend', 'transferred', 'store_nbr', 'cluster']
cols = cont_vars + encode_vars + other_vars
categories = list(df['family'].unique())


def preprocess_train_v04(model, split_date = "2017-01-01"):

    split_date = pd.to_datetime(split_date) 
    # Split the data
    train_data = df[df.date< split_date]
    test_data = df[df.date >= split_date]
    y_hat_test_all = pd.Series(index=test_data.index) 

    # Create the preprocessing pipeline, we need to use the ColumnTransfer class since each transforemer will be applied on diffrent part of dataframe
    ct = ColumnTransformer(
        [
            ("scaler", StandardScaler(), cont_vars),
            ("onehot_encoder", OneHotEncoder(), encode_vars)
        ],
        remainder='passthrough'
    )

    # Create the main pipeline
    pipeline = Pipeline([
        ('preprocessing', ct),
        ('clf', model)
    ])
    
    print("RMSLE within each category\n", "="*(50))
    for cat in categories:
        X_train = train_data[train_data['family']==cat][cols]
        X_test = test_data[test_data['family']==cat][cols]
        y_train = train_data[train_data['family']==cat]['sales']
        y_test = test_data[test_data['family']==cat]['sales']
        y_train_log = np.log(y_train+10)
        y_test_log = np.log(y_test+10)

        pipeline.fit(X_train, y_train_log)
        y_hat_test = np.exp(pipeline.predict(X_test))-10
        y_hat_test[y_hat_test<0] = 0
        print(f"{cat}:", " "*(30-len(cat)), mean_squared_log_error(y_test, y_hat_test, squared= False))

        y_hat_test_all[test_data.index[test_data['family'] == cat]] = y_hat_test

    
    # Calculate the MSLE for all categories combined
    overall_rmsle = mean_squared_log_error(test_data['sales'], y_hat_test_all, squared= False)
    print("="*(50))
    print("Overall RMSLE:", " "*(30-len("Overall RMSLE")), overall_rmsle)
    



## v05

In [13]:
# ************ v05 *****************

# Modification of v04
# Do not consider 'quarter' and 'weakyear' and 'isweekend'
# (v04: Do not apply encoding on cluster ans store_nbr)

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_squared_log_error

# Define your variables
cont_vars = [ 'onpromotion', 'dcoilwtico']
encode_vars = ['city', 'state', 'type_store','type_holiday', 'locale', 'dayweek']
other_vars = ['year','month', 'day', 'store_nbr', 'cluster','transferred']
cols = cont_vars +encode_vars + other_vars
categories = list(df['family'].unique())


def preprocess_train_v05(model, split_date = "2017-01-01"):

    split_date = pd.to_datetime(split_date) 
    # Split the data
    train_data = df[df.date< split_date]
    test_data = df[df.date >= split_date]
    y_hat_test_all = pd.Series(index=test_data.index) 

    # Create the preprocessing pipeline, we need to use the ColumnTransfer class since each transforemer will be applied on diffrent part of dataframe
    ct = ColumnTransformer(
        [
            ("scaler", StandardScaler(), cont_vars),
            ("onehot_encoder", OneHotEncoder(), encode_vars)
        ],
        remainder='passthrough'
    )

    # Create the main pipeline
    pipeline = Pipeline([
        ('preprocessing', ct),
        ('clf', model)
    ])
    
    print("RMSLE within each category\n", "="*(50))
    for cat in categories:
        X_train = train_data[train_data['family']==cat][cols]
        X_test = test_data[test_data['family']==cat][cols]
        y_train = train_data[train_data['family']==cat]['sales']
        y_test = test_data[test_data['family']==cat]['sales']
        y_train_log = np.log(y_train+10)
        y_test_log = np.log(y_test+10)

        pipeline.fit(X_train, y_train_log)
        y_hat_test = np.exp(pipeline.predict(X_test))-10
        y_hat_test[y_hat_test<0] = 0
        print(f"{cat}:", " "*(30-len(cat)), mean_squared_log_error(y_test, y_hat_test, squared= False))

        y_hat_test_all[test_data.index[test_data['family'] == cat]] = y_hat_test

    
    # Calculate the MSLE for all categories combined
    overall_rmsle = mean_squared_log_error(test_data['sales'], y_hat_test_all, squared= False)
    print("="*(50))
    print("Overall RMSLE:", " "*(30-len("Overall RMSLE")), overall_rmsle)
    



In [14]:
# Modification of v04 and v05
# apply scaling on : 'year','month', 'day', 'store_nbr', 'cluster'
# (v:05)Do not consider 'quarter' and 'weakyear' and 'isweekend'
# (v04 and v05): Do not apply encoding on cluster ans store_nbr)

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_squared_log_error

# Define your variables
cont_vars = [ 'onpromotion', 'dcoilwtico', 'year','month', 'day', 'store_nbr', 'cluster']
encode_vars = ['city', 'state', 'type_store','type_holiday', 'locale', 'dayweek']
other_vars = ['transferred']
cols = cont_vars +encode_vars + other_vars
categories = list(df['family'].unique())


def preprocess_train_v06(model, split_date = "2017-01-01"):

    split_date = pd.to_datetime(split_date) 
    # Split the data
    train_data = df[df.date< split_date]
    test_data = df[df.date >= split_date]
    y_hat_test_all = pd.Series(index=test_data.index) 

    # Create the preprocessing pipeline, we need to use the ColumnTransfer class since each transforemer will be applied on diffrent part of dataframe
    ct = ColumnTransformer(
        [
            ("scaler", StandardScaler(), cont_vars),
            ("onehot_encoder", OneHotEncoder(), encode_vars)
        ],
        remainder='passthrough'
    )

    # Create the main pipeline
    pipeline = Pipeline([
        ('preprocessing', ct),
        ('clf', model)
    ])
    
    print("RMSLE within each category\n", "="*(50))
    for cat in categories:
        X_train = train_data[train_data['family']==cat][cols]
        X_test = test_data[test_data['family']==cat][cols]
        y_train = train_data[train_data['family']==cat]['sales']
        y_test = test_data[test_data['family']==cat]['sales']
        y_train_log = np.log(y_train+10)
        y_test_log = np.log(y_test+10)

        pipeline.fit(X_train, y_train_log)
        y_hat_test = np.exp(pipeline.predict(X_test))-10
        y_hat_test[y_hat_test<0] = 0
        print(f"{cat}:", " "*(30-len(cat)), mean_squared_log_error(y_test, y_hat_test, squared= False))

        y_hat_test_all[test_data.index[test_data['family'] == cat]] = y_hat_test

    
    # Calculate the MSLE for all categories combined
    overall_rmsle = mean_squared_log_error(test_data['sales'], y_hat_test_all, squared= False)
    print("="*(50))
    print("Overall RMSLE:", " "*(30-len("Overall RMSLE")), overall_rmsle)
    



In [15]:
# v07

# Modification of v04 and v05
# Do not consider  type_holiday 
#(v05: Do not consider 'quarter' and 'weakyear' and 'isweekend')
# (v05: (v04: Do not apply encoding on cluster ans store_nbr))

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_squared_log_error

# Define your variables
cont_vars = [ 'onpromotion', 'dcoilwtico']
encode_vars = ['city', 'state' , 'locale', 'dayweek', 'type_store' ]
other_vars = ['year','month', 'day', 'store_nbr', 'cluster', 'transferred']
cols = cont_vars +encode_vars + other_vars
categories = list(df['family'].unique())


def preprocess_train_v07(model, split_date = "2017-01-01"):

    split_date = pd.to_datetime(split_date) 
    # Split the data
    train_data = df[df.date< split_date]
    test_data = df[df.date >= split_date]
    y_hat_test_all = pd.Series(index=test_data.index) 

    # Create the preprocessing pipeline, we need to use the ColumnTransfer class since each transforemer will be applied on diffrent part of dataframe
    ct = ColumnTransformer(
        [
            ("scaler", StandardScaler(), cont_vars),
            ("onehot_encoder", OneHotEncoder(), encode_vars)
        ],
        remainder='passthrough'
    )

    # Create the main pipeline
    pipeline = Pipeline([
        ('preprocessing', ct),
        ('clf', model)
    ])
    
    print("RMSLE within each category\n", "="*(50))
    for cat in categories:
        X_train = train_data[train_data['family']==cat][cols]
        X_test = test_data[test_data['family']==cat][cols]
        y_train = train_data[train_data['family']==cat]['sales']
        y_test = test_data[test_data['family']==cat]['sales']
        y_train_log = np.log(y_train+10)
        y_test_log = np.log(y_test+10)

        pipeline.fit(X_train, y_train_log)
        y_hat_test = np.exp(pipeline.predict(X_test))-10
        y_hat_test[y_hat_test<0] = 0
        print(f"{cat}:", " "*(30-len(cat)), mean_squared_log_error(y_test, y_hat_test, squared= False))

        y_hat_test_all[test_data.index[test_data['family'] == cat]] = y_hat_test

    
    # Calculate the MSLE for all categories combined
    overall_rmsle = mean_squared_log_error(test_data['sales'], y_hat_test_all, squared= False)
    print("="*(50))
    print("Overall RMSLE:", " "*(30-len("Overall RMSLE")), overall_rmsle)
    



In [16]:

# Modification of v04 and v05
# change encoding type
#v05:
# Do not consider 'quarter' and 'weakyear' and 'isweekend'
# (v04: Do not apply encoding on cluster ans store_nbr)

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_squared_log_error

# Define your variables
cont_vars = [ 'onpromotion', 'dcoilwtico']
encode_vars = ['city', 'state', 'type_store','type_holiday', 'locale', 'dayweek']
other_vars = ['year','month', 'day', 'store_nbr', 'cluster','transferred']
cols = cont_vars + encode_vars + other_vars
categories = list(df['family'].unique())


def preprocess_train_v08(model, split_date = "2017-01-01"):

    split_date = pd.to_datetime(split_date) 
    # Split the data
    train_data = df[df.date< split_date]
    test_data = df[df.date >= split_date]
    y_hat_test_all = pd.Series(index=test_data.index) 

    # Create the preprocessing pipeline, we need to use the ColumnTransfer class since each transforemer will be applied on diffrent part of dataframe
    ct = ColumnTransformer(
        [
            ("scaler", StandardScaler(), cont_vars),
            ("encoder", LabelEncoder(), encode_vars)
        ],
        remainder='passthrough'
    )

    # Create the main pipeline
    pipeline = Pipeline([
        ('preprocessing', ct),
        ('clf', model)
    ])
    
    print("RMSLE within each category\n", "="*(50))
    for cat in categories:
        X_train = train_data[train_data['family']==cat][cols]
        X_test = test_data[test_data['family']==cat][cols]
        y_train = train_data[train_data['family']==cat]['sales']
        y_test = test_data[test_data['family']==cat]['sales']
        y_train_log = np.log(y_train+10)
        y_test_log = np.log(y_test+10)

        pipeline.fit(X_train, y_train_log)
        y_hat_test = np.exp(pipeline.predict(X_test))-10
        y_hat_test[y_hat_test<0] = 0
        print(f"{cat}:", " "*(30-len(cat)), mean_squared_log_error(y_test, y_hat_test, squared= False))

        y_hat_test_all[test_data.index[test_data['family'] == cat]] = y_hat_test

    
    # Calculate the MSLE for all categories combined
    overall_rmsle = mean_squared_log_error(test_data['sales'], y_hat_test_all, squared= False)
    print("="*(50))
    print("Overall RMSLE:", " "*(30-len("Overall RMSLE")), overall_rmsle)
    



# Linear Regression

In [17]:
lr = LinearRegression()
preprocess_train_v01(lr)

  y_hat_test_all = pd.Series(index=test_data.index)


RMSLE within each category
AUTOMOTIVE:                      0.6669351956467529
BABY CARE:                       0.33832031460662515
BEAUTY:                          0.6472716084002237
BEVERAGES:                       1.110922024640117
BOOKS:                           0.32664611938149074
BREAD/BAKERY:                    0.8907091001870384
CELEBRATION:                     0.8034725617144592
CLEANING:                        1.0262416684422149
DAIRY:                           0.987129067073641
DELI:                            0.8037509949088049
EGGS:                            0.8722937507731852
FROZEN FOODS:                    0.788199773633695
GROCERY I:                       1.1943320143386043
GROCERY II:                      0.872924742658103
HARDWARE:                        0.6030856424033304
HOME AND KITCHEN I:              0.7675111535260642
HOME AND KITCHEN II:             0.7001255512831682
HOME APPLIANCES:                 0.45093786855018936
HOME CARE:                       0.957

In [18]:
lr = LinearRegression()
preprocess_train_v02(lr)

  y_hat_test_all = pd.Series(index=test_data.index)


RMSLE within each category
AUTOMOTIVE:                      0.6671210316976374
BABY CARE:                       0.3382779510580358
BEAUTY:                          0.6475447010366043
BEVERAGES:                       1.1111761075972384
BOOKS:                           0.3267930526188479
BREAD/BAKERY:                    0.8908366395686103
CELEBRATION:                     0.8035316301635201
CLEANING:                        1.0264938427890036
DAIRY:                           0.9872151332054797
DELI:                            0.8038242050277523
EGGS:                            0.8723847398827002
FROZEN FOODS:                    0.787739584546886
GROCERY I:                       1.1945576727926062
GROCERY II:                      0.872487783568732
HARDWARE:                        0.603176870978815
HOME AND KITCHEN I:              0.7676472794149192
HOME AND KITCHEN II:             0.7002893660524371
HOME APPLIANCES:                 0.4508823352221312
HOME CARE:                       0.95749

In [19]:
lr = LinearRegression()
preprocess_train_v03(lr)

  y_hat_test_all = pd.Series(index=test_data.index)


RMSLE within each category
AUTOMOTIVE:                      0.6786856901170848
BABY CARE:                       0.33960182661167737
BEAUTY:                          0.670785426852356
BEVERAGES:                       1.080275700316932
BOOKS:                           0.3264587028774208
BREAD/BAKERY:                    0.8734563394552007
CELEBRATION:                     0.830743270480932
CLEANING:                        1.008233208372312
DAIRY:                           0.9596566399311676
DELI:                            0.8070561993499085
EGGS:                            0.8427247722880498
FROZEN FOODS:                    0.7916326445557488
GROCERY I:                       1.1632399430508082
GROCERY II:                      0.8949615243181247
HARDWARE:                        0.6006578153528735
HOME AND KITCHEN I:              0.7693407553701377
HOME AND KITCHEN II:             0.7214435665491046
HOME APPLIANCES:                 0.45381207038183385
HOME CARE:                       0.9789

In [20]:
lr = LinearRegression()
preprocess_train_v04(lr)

  y_hat_test_all = pd.Series(index=test_data.index)


RMSLE within each category
AUTOMOTIVE:                      0.6669351956467529
BABY CARE:                       0.33832031460662515
BEAUTY:                          0.6472716084002237
BEVERAGES:                       1.110922024640117
BOOKS:                           0.32664611938149074
BREAD/BAKERY:                    0.8907091001870384
CELEBRATION:                     0.8034725617144592
CLEANING:                        1.0262416684422149
DAIRY:                           0.987129067073641
DELI:                            0.8037509949088049
EGGS:                            0.8722937507731852
FROZEN FOODS:                    0.788199773633695
GROCERY I:                       1.1943320143386043
GROCERY II:                      0.872924742658103
HARDWARE:                        0.6030856424033304
HOME AND KITCHEN I:              0.7675111535260642
HOME AND KITCHEN II:             0.7001255512831682
HOME APPLIANCES:                 0.45093786855018936
HOME CARE:                       0.957

In [21]:
lr = LinearRegression()
preprocess_train_v05(lr)

  y_hat_test_all = pd.Series(index=test_data.index)


RMSLE within each category
AUTOMOTIVE:                      0.6669351956467529
BABY CARE:                       0.33832031460662515
BEAUTY:                          0.6472716084002237
BEVERAGES:                       1.110922024640117
BOOKS:                           0.32664611938149074
BREAD/BAKERY:                    0.8907091001870384
CELEBRATION:                     0.8034725617144592
CLEANING:                        1.0262416684422149
DAIRY:                           0.987129067073641
DELI:                            0.8037509949088049
EGGS:                            0.8722937507731852
FROZEN FOODS:                    0.788199773633695
GROCERY I:                       1.1943320143386043
GROCERY II:                      0.872924742658103
HARDWARE:                        0.6030856424033304
HOME AND KITCHEN I:              0.7675111535260642
HOME AND KITCHEN II:             0.7001255512831682
HOME APPLIANCES:                 0.45093786855018936
HOME CARE:                       0.957

In [22]:
lr = LinearRegression()
preprocess_train_v06(lr)

  y_hat_test_all = pd.Series(index=test_data.index)


RMSLE within each category
AUTOMOTIVE:                      0.6669351956467529
BABY CARE:                       0.33832031460662515
BEAUTY:                          0.6472716084002237
BEVERAGES:                       1.110922024640117
BOOKS:                           0.32664611938149074
BREAD/BAKERY:                    0.8907091001870384
CELEBRATION:                     0.8034725617144592
CLEANING:                        1.0262416684422149
DAIRY:                           0.987129067073641
DELI:                            0.8037509949088049
EGGS:                            0.8722937507731852
FROZEN FOODS:                    0.788199773633695
GROCERY I:                       1.1943320143386043
GROCERY II:                      0.872924742658103
HARDWARE:                        0.6030856424033304
HOME AND KITCHEN I:              0.7675111535260642
HOME AND KITCHEN II:             0.7001255512831682
HOME APPLIANCES:                 0.45093786855018936
HOME CARE:                       0.957

In [23]:
lr = LinearRegression()
preprocess_train_v07(lr)

  y_hat_test_all = pd.Series(index=test_data.index)


RMSLE within each category
AUTOMOTIVE:                      0.6669351956467529
BABY CARE:                       0.33832031460662515
BEAUTY:                          0.6472716084002237
BEVERAGES:                       1.110922024640117
BOOKS:                           0.32664611938149074
BREAD/BAKERY:                    0.8907091001870384
CELEBRATION:                     0.8034725617144592
CLEANING:                        1.0262416684422149
DAIRY:                           0.987129067073641
DELI:                            0.8037509949088049
EGGS:                            0.8722937507731852
FROZEN FOODS:                    0.788199773633695
GROCERY I:                       1.1943320143386043
GROCERY II:                      0.872924742658103
HARDWARE:                        0.6030856424033304
HOME AND KITCHEN I:              0.7675111535260642
HOME AND KITCHEN II:             0.7001255512831682
HOME APPLIANCES:                 0.45093786855018936
HOME CARE:                       0.957

# XGBoost

In [24]:

xgb = XGBRegressor(max_depth=12, 
        min_child_weight = 1,
        learning_rate=0.02, 
        subsample=0.8,
        colsample_bytree=0.4, 
        #eval_metric= 'mae',
        n_estimators=1000,)


start = time.time()
preprocess_train_v01(xgb)
end = time.time()
print(f"train time:{end-start}")



  y_hat_test_all = pd.Series(index=test_data.index)


RMSLE within each category
AUTOMOTIVE:                      0.5908885352532536
BABY CARE:                       0.34227234620758606
BEAUTY:                          0.5391496644024714
BEVERAGES:                       0.5390630417551031
BOOKS:                           0.29416815887161757
BREAD/BAKERY:                    0.5578030905247099
CELEBRATION:                     0.7194502580706879
CLEANING:                        0.44575379239171414
DAIRY:                           0.42090311672295977
DELI:                            0.4123416317174955
EGGS:                            0.5977397591082554
FROZEN FOODS:                    0.5076144466597666
GROCERY I:                       0.45925579339514044
GROCERY II:                      0.766675991757525
HARDWARE:                        0.576166445699467
HOME AND KITCHEN I:              0.652356168614581
HOME AND KITCHEN II:             0.5643099171302952
HOME APPLIANCES:                 0.435738148267094
HOME CARE:                       0.6

In [25]:
xgb = XGBRegressor(max_depth=12, 
        min_child_weight = 1,
        learning_rate=0.02, 
        subsample=0.8,
        colsample_bytree=0.4, 
        #eval_metric= 'mae',
        n_estimators=1000,
        alpha = 1,
        gamma = 1.5
)

start = time.time()
preprocess_train_v01(xgb)
end = time.time()
print(f"train time:{end-start}")


  y_hat_test_all = pd.Series(index=test_data.index)


RMSLE within each category
AUTOMOTIVE:                      0.5797739448434569
BABY CARE:                       0.3218792244149799
BEAUTY:                          0.5365178944082964


KeyboardInterrupt: 

In [None]:
xgb = XGBRegressor(max_depth=12, 
        min_child_weight = 1,
        learning_rate=0.02, 
        subsample=0.8,
        colsample_bytree=0.4, 
        #eval_metric= 'mae',
        n_estimators=1000,
        alpha = 1,
        gamma = 1.5
)

start = time.time()
preprocess_train_v02(xgb)
end = time.time()
print(f"train time:{end-start}")

  y_hat_test_all = pd.Series(index=test_data.index)


RMSLE within each category
AUTOMOTIVE:                      0.6235512391016124
BABY CARE:                       0.3296097109400333
BEAUTY:                          0.54864737462978
BEVERAGES:                       0.88478320670847
BOOKS:                           0.32809613573726626
BREAD/BAKERY:                    0.7369035546681804
CELEBRATION:                     0.9392652413475157
CLEANING:                        0.7414402847856355
DAIRY:                           0.6679373951318653
DELI:                            0.6342919362409936
EGGS:                            0.7685564881872687


KeyboardInterrupt: 

In [None]:
xgb = XGBRegressor(max_depth=12, 
        min_child_weight = 1,
        learning_rate=0.02, 
        subsample=0.8,
        colsample_bytree=0.4, 
        #eval_metric= 'mae',
        n_estimators=1000,
        alpha = 1,
        gamma = 1.5
)

start = time.time()
preprocess_train_v04(xgb)
end = time.time()
print(f"train time:{end-start}")

  y_hat_test_all = pd.Series(index=test_data.index)


RMSLE within each category
AUTOMOTIVE:                      0.5831445368163105
BABY CARE:                       0.32190845250520794
BEAUTY:                          0.5335974118304553
BEVERAGES:                       0.5916791682283192
BOOKS:                           0.3058147006278013
BREAD/BAKERY:                    0.5779982578933477
CELEBRATION:                     0.6911547313883426
CLEANING:                        0.49772774166627815
DAIRY:                           0.46622801853199564
DELI:                            0.44766519753448086
EGGS:                            0.5992768674124305
FROZEN FOODS:                    0.5049898122419187
GROCERY I:                       0.5177697405926822
GROCERY II:                      0.7703384907717272
HARDWARE:                        0.5741222390442761
HOME AND KITCHEN I:              0.6539918603560618
HOME AND KITCHEN II:             0.5453940918129798
HOME APPLIANCES:                 0.4273128044357963
HOME CARE:                       

In [None]:
## good 0

xgb = XGBRegressor(max_depth=12, 
        min_child_weight = 1,
        learning_rate=0.02, 
        subsample=0.8,
        colsample_bytree=0.4, 
        #eval_metric= 'mae',
        n_estimators=1000,
        alpha = 1,
        gamma = 1.5
)

start = time.time()
preprocess_train_v05(xgb)
end = time.time()
print(f"train time:{end-start}")

  y_hat_test_all = pd.Series(index=test_data.index)


RMSLE within each category
AUTOMOTIVE:                      0.5797739448434569
BABY CARE:                       0.3218792244149799
BEAUTY:                          0.5365178944082964
BEVERAGES:                       0.537762360256638
BOOKS:                           0.28756043820200083
BREAD/BAKERY:                    0.5576009038888907
CELEBRATION:                     0.6994406225519169
CLEANING:                        0.4507075340799367
DAIRY:                           0.42523564263769026
DELI:                            0.4136818759105467
EGGS:                            0.5891842429226757
FROZEN FOODS:                    0.5041243116501487
GROCERY I:                       0.4593485140821199
GROCERY II:                      0.764930920787872
HARDWARE:                        0.5750657674893602
HOME AND KITCHEN I:              0.6423385542635585
HOME AND KITCHEN II:             0.5442342050416726
HOME APPLIANCES:                 0.42877251857395604
HOME CARE:                       0.6

In [None]:


xgb = XGBRegressor(max_depth=12, 
        min_child_weight = 1,
        learning_rate=0.02, 
        subsample=0.8,
        colsample_bytree=0.4, 
        objective = 'reg:pseudohubererror',
        n_estimators=1000,
        alpha = 1,
        gamma = 1.5
)

start = time.time()
preprocess_train_v05(xgb)
end = time.time()
print(f"train time:{end-start}")

  y_hat_test_all = pd.Series(index=test_data.index)


RMSLE within each category
AUTOMOTIVE:                      0.5793842303667394
BABY CARE:                       0.32153626145485686
BEAUTY:                          0.5303027348325552
BEVERAGES:                       0.3599623551777786
BOOKS:                           0.29462671209376434
BREAD/BAKERY:                    0.5059413884962696
CELEBRATION:                     0.6605951613333063
CLEANING:                        0.31888366034703747
DAIRY:                           0.27475330718632757
DELI:                            0.3522441694153562
EGGS:                            0.557403267589672
FROZEN FOODS:                    0.535568889934484
GROCERY I:                       0.2832435044045889
GROCERY II:                      0.7704860659046597
HARDWARE:                        0.5750375567868675
HOME AND KITCHEN I:              0.6376220230494329
HOME AND KITCHEN II:             0.5679972440639717
HOME APPLIANCES:                 0.4302002073032984
HOME CARE:                       0.

In [None]:
xgb = XGBRegressor(max_depth=12, 
        min_child_weight = 1,
        learning_rate=0.02, 
        subsample=0.8,
        colsample_bytree=0.4, 
        #eval_metric= 'mae',
        n_estimators=1000,
        alpha = 1,
        gamma = 1.5
)

start = time.time()
preprocess_train_v06(xgb)
end = time.time()
print(f"train time:{end-start}")

  y_hat_test_all = pd.Series(index=test_data.index)


RMSLE within each category
AUTOMOTIVE:                      0.5778604524742961
BABY CARE:                       0.3212743922655737
BEAUTY:                          0.5331780748181097
BEVERAGES:                       0.5695938469371041
BOOKS:                           0.28785139512457836
BREAD/BAKERY:                    0.5636413110737879
CELEBRATION:                     0.720884544451348
CLEANING:                        0.46211370097585297
DAIRY:                           0.43549501782843025


KeyboardInterrupt: 

In [None]:
xgb = XGBRegressor(max_depth=12, 
        min_child_weight = 1,
        learning_rate=0.02, 
        subsample=0.8,
        colsample_bytree=0.4, 
        #eval_metric= 'mae',
        n_estimators=1000,
        alpha = 1,
        gamma = 1.5
)

start = time.time()
preprocess_train_v07(xgb)
end = time.time()
print(f"train time:{end-start}")

  y_hat_test_all = pd.Series(index=test_data.index)


RMSLE within each category
AUTOMOTIVE:                      0.5812443295168982
BABY CARE:                       0.3216499891450972
BEAUTY:                          0.5341301959976889
BEVERAGES:                       0.6404741116577736
BOOKS:                           0.28887656621835
BREAD/BAKERY:                    0.591066131729136
CELEBRATION:                     0.7294052815189493
CLEANING:                        0.5266924393096498
DAIRY:                           0.4978395916976604
DELI:                            0.4614423501463614
EGGS:                            0.5995318907095537
FROZEN FOODS:                    0.5315241578422607
GROCERY I:                       0.5572816520146946
GROCERY II:                      0.770247151122271
HARDWARE:                        0.573849511240421
HOME AND KITCHEN I:              0.6503665984135555
HOME AND KITCHEN II:             0.5554473562386402
HOME APPLIANCES:                 0.4289639827227662
HOME CARE:                       0.8225251

In [None]:
xgb = XGBRegressor(
        booster = 'dart',
        max_depth=12, 
        min_child_weight = 1,
        learning_rate=0.02, 
        subsample=0.8,
        colsample_bytree=0.4, 
        #eval_metric= 'mae',
        n_estimators=1000,
        alpha = 1,
        gamma = 1.5
)

start = time.time()
preprocess_train_v05(xgb)
end = time.time()
print(f"train time:{end-start}")

  y_hat_test_all = pd.Series(index=test_data.index)


RMSLE within each category
AUTOMOTIVE:                      0.5811728581444301
BABY CARE:                       0.3210188320013521
BEAUTY:                          0.5341914824592229


KeyboardInterrupt: 

In [None]:
## good 1

xgb = XGBRegressor(max_depth=12, 
        min_child_weight = 1,
        learning_rate=0.02, 
        subsample=0.8,
        colsample_bytree=0.4, 
        objective = 'reg:pseudohubererror',
        n_estimators=1000,
        alpha = 1,
        gamma = 1.5
)

start = time.time()
preprocess_train_v07(xgb)
end = time.time()
print(f"train time:{end-start}")

  y_hat_test_all = pd.Series(index=test_data.index)


RMSLE within each category
AUTOMOTIVE:                      0.5793842303667394
BABY CARE:                       0.32153626145485686
BEAUTY:                          0.5303027348325552
BEVERAGES:                       0.3599623551777786
BOOKS:                           0.29462671209376434
BREAD/BAKERY:                    0.5059413884962696
CELEBRATION:                     0.6605951613333063
CLEANING:                        0.31888366034703747
DAIRY:                           0.27475330718632757
DELI:                            0.3522441694153562
EGGS:                            0.557403267589672
FROZEN FOODS:                    0.535568889934484
GROCERY I:                       0.2832435044045889
GROCERY II:                      0.7704860659046597
HARDWARE:                        0.5750375567868675
HOME AND KITCHEN I:              0.6376220230494329
HOME AND KITCHEN II:             0.5679972440639717
HOME APPLIANCES:                 0.4302002073032984
HOME CARE:                       0.

In [None]:


xgb = XGBRegressor(max_depth=12, 
        min_child_weight = 1,
        learning_rate=0.02, 
        subsample=0.8,
        colsample_bytree=0.4, 
        objective = 'reg:pseudohubererror',
        n_estimators=1000,
        alpha = 0,
        gamma = 1.5
)

start = time.time()
preprocess_train_v07(xgb)
end = time.time()
print(f"train time:{end-start}")

  y_hat_test_all = pd.Series(index=test_data.index)


RMSLE within each category
AUTOMOTIVE:                      0.5795240899384702
BABY CARE:                       0.319941403161085
BEAUTY:                          0.5308771358381121
BEVERAGES:                       0.3603940126111651
BOOKS:                           0.2850915189655425
BREAD/BAKERY:                    0.5037758858251871
CELEBRATION:                     0.6567620174730258
CLEANING:                        0.3234142058976469
DAIRY:                           0.27685813493778894
DELI:                            0.355758810895273
EGGS:                            0.5525787309728392
FROZEN FOODS:                    0.5415113158892415
GROCERY I:                       0.27988099360811347
GROCERY II:                      0.7658867928158248
HARDWARE:                        0.5744555363516771
HOME AND KITCHEN I:              0.6390577914741843
HOME AND KITCHEN II:             0.5630641855251592
HOME APPLIANCES:                 0.4285203151907868
HOME CARE:                       0.43

In [None]:


xgb = XGBRegressor(max_depth=12, 
        min_child_weight = 1,
        learning_rate=0.02, 
        subsample=0.8,
        colsample_bytree=0.4, 
        objective = 'reg:pseudohubererror',
        n_estimators=1000,
        alpha = 0,
        gamma = 1.5
)

start = time.time()
preprocess_train_v08(xgb)
end = time.time()
print(f"train time:{end-start}")

RMSLE within each category


  y_hat_test_all = pd.Series(index=test_data.index)


TypeError: fit_transform() takes 2 positional arguments but 3 were given

# Lasso regression

In [None]:
las = Lasso(alpha = .1)
preprocess_train_v05(las)

  y_hat_test_all = pd.Series(index=test_data.index)


RMSLE within each category
AUTOMOTIVE:                      0.7273654651217092
BABY CARE:                       0.3370599885413166
BEAUTY:                          0.8893385062517619
BEVERAGES:                       1.0067629752779397
BOOKS:                           0.3182157957681024
BREAD/BAKERY:                    0.9236539233874613
CELEBRATION:                     0.8949478531321957
CLEANING:                        0.9358594484618442
DAIRY:                           0.9353277297815673
DELI:                            0.8165238744179747
EGGS:                            0.9160959771866697
FROZEN FOODS:                    0.8083948042535437
GROCERY I:                       0.9920186654789324
GROCERY II:                      1.0855916624215853
HARDWARE:                        0.6364739743711829
HOME AND KITCHEN I:              0.8416524281785381
HOME AND KITCHEN II:             0.7916987885801599
HOME APPLIANCES:                 0.48012130175311984
HOME CARE:                       1.0

## Ridge regression

In [None]:
ridge = Ridge(alpha = 10)
preprocess_train_v05(ridge)

  y_hat_test_all = pd.Series(index=test_data.index)


RMSLE within each category
AUTOMOTIVE:                      0.6678201782379335
BABY CARE:                       0.3380262319809103
BEAUTY:                          0.6442345194659213
BEVERAGES:                       1.106233846036458
BOOKS:                           0.32348772554985816
BREAD/BAKERY:                    0.8884225609450436
CELEBRATION:                     0.80592928165151
CLEANING:                        1.021804519929437
DAIRY:                           0.9831834509771745
DELI:                            0.8017536545269759
EGGS:                            0.8704317599733719
FROZEN FOODS:                    0.793213595266784
GROCERY I:                       1.1881042610626562
GROCERY II:                      0.8904451171658151
HARDWARE:                        0.6029942535459136
HOME AND KITCHEN I:              0.7616358947936931
HOME AND KITCHEN II:             0.7002597157924846
HOME APPLIANCES:                 0.4518402013730527
HOME CARE:                       0.977474