In [None]:
import numpy as np 
import pandas as pd 
from IPython.display import display, HTML
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
import matplotlib.dates as md
from multiprocessing import  Pool
from datetime import datetime
from sklearn.decomposition import PCA
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import QuantileTransformer
from sklearn.model_selection import GridSearchCV, train_test_split, RepeatedKFold, KFold, cross_val_score, ShuffleSplit
import xgboost as xgb
import tensorflow as tf
from sklearn.metrics import mean_squared_error
import math

In [None]:
#load data:
#items_df = pd.read_csv("items.csv")
#item_categories_df = pd.read_csv("item_categories.csv")
#shops_df = pd.read_csv("shops.csv")
#train_df = pd.read_csv("sales_train.csv")
#test_df = pd.read_csv("test.csv")

In [None]:
# Load the five competition CSV files from the Kaggle input directory.
PATH = "/kaggle/input/competitive-data-science-predict-future-sales/"
items_df, item_categories_df, shops_df, train_df, test_df = (
    pd.read_csv(PATH + csv_name)
    for csv_name in ("items.csv", "item_categories.csv", "shops.csv", "sales_train.csv", "test.csv")
)

In [None]:
train_df

In [None]:
# Parse the "dd.mm.YYYY" date strings and order the sales chronologically.
# assign() returns a new frame, so train_df itself is left untouched.
eda_df = train_df.assign(
    date=pd.to_datetime(train_df["date"], format='%d.%m.%Y')
).sort_values(by="date", ascending=True)

In [None]:
# Cap item_price and item_cnt_day at their 95th percentile (values above it are
# replaced by the percentile), then derive the revenue of every sales row.
for col in ["item_price", "item_cnt_day"]:
    upper_quantile = eda_df[col].quantile(0.95)
    eda_df[col] = eda_df[col].clip(upper=upper_quantile)

eda_df["sales_per_item"] = eda_df["item_price"].mul(eda_df["item_cnt_day"])

In [None]:
eda_df

In [None]:
# Attach item attributes (via item_id) and category attributes (via item_category_id).
eda_df = (
    eda_df
    .merge(items_df, on='item_id', how='inner')
    .merge(item_categories_df, on='item_category_id', how='inner')
)
eda_df.head()

In [None]:
# Build item_category_info_df: one row of descriptive statistics per item category.
# The cells are filled one by one with .at, so no column dtype is declared up front.
item_category_info_df = pd.DataFrame(
    columns=["name", "num_products", "first_sold", "last_sold", "min_price", "max_price", "mean_price", "median_price",
              "mean_item_cnt_month", "median_item_cnt_month", "mean_cnt_jan", "mean_cnt_feb", "mean_cnt_mar", "mean_cnt_apr",
              "mean_cnt_may", "mean_cnt_jun", "mean_cnt_jul", "mean_cnt_aug", "mean_cnt_sep", "mean_cnt_oct", "mean_cnt_nov", "mean_cnt_dez"],
    index = item_categories_df["item_category_id"].unique())

# Calendar month number -> column holding the mean number of items sold in that month.
# Defined once, outside the loop, because a later cell iterates over this mapping as well.
month_mapping = {1: "mean_cnt_jan", 2: "mean_cnt_feb", 3: "mean_cnt_mar", 4: "mean_cnt_apr", 5: "mean_cnt_may", 6: "mean_cnt_jun",
          7: "mean_cnt_jul", 8: "mean_cnt_aug", 9: "mean_cnt_sep", 10: "mean_cnt_oct", 11: "mean_cnt_nov", 12: "mean_cnt_dez"}

for cid in item_category_info_df.index:
    # .values[0] stores the category name itself instead of a one-element array
    # (the index comes from item_categories_df, so there is always a match).
    item_category_info_df.at[cid, "name"] = item_categories_df[item_categories_df["item_category_id"]==cid]["item_category_name"].values[0]
    item_category_info_df.at[cid, "num_products"] = items_df[items_df["item_category_id"]==cid]["item_id"].nunique()
    cdf = eda_df[eda_df["item_category_id"]==cid].copy()
    item_category_info_df.at[cid, "first_sold"] = cdf["date"].min()
    item_category_info_df.at[cid, "last_sold"] = cdf["date"].max()
    item_category_info_df.at[cid, "min_price"] = cdf["item_price"].min()
    item_category_info_df.at[cid, "max_price"] = cdf["item_price"].max()
    item_category_info_df.at[cid, "mean_price"] = cdf["item_price"].mean()
    item_category_info_df.at[cid, "median_price"] = cdf["item_price"].median()

    # Map every date to the first day of its month and sum per month; this gives
    # the item count and the sales of each month in each year.
    cdf["month"] = cdf["date"].dt.month
    cdf["year"] = cdf["date"].dt.year
    cdf["date"] = pd.to_datetime(cdf[["year", "month"]].assign(DAY=1))
    cdf = cdf[["date", "item_cnt_day", "sales_per_item"]].groupby("date").sum().reset_index()

    # Average the monthly totals per calendar month (1..12). Only the two numeric
    # columns are aggregated and the group key is named "month": averaging the
    # datetime column as well would make reset_index() collide with a "date" index.
    cdf = (
        cdf.groupby(cdf["date"].dt.month.rename("month"))[["item_cnt_day", "sales_per_item"]]
        .mean()
        .reset_index()
    )
    cdf.rename(columns ={"item_cnt_day": "item_cnt_month", "sales_per_item": "sales_per_month"}, inplace=True)

    item_category_info_df.at[cid, "mean_item_cnt_month"] = cdf["item_cnt_month"].mean()
    item_category_info_df.at[cid, "median_item_cnt_month"] = cdf["item_cnt_month"].median()

    # average number of items sold per calendar month (months without sales stay NaN)
    for m, mean_cnt in zip(cdf["month"], cdf["item_cnt_month"]):
        item_category_info_df.at[cid, month_mapping[m]] = mean_cnt

In [None]:
item_category_info_df

In [None]:
# Long-format view of the per-month means: one row per (item category, calendar month).
# The rows are collected first and the frame is built once, instead of enlarging
# an empty frame cell by cell.
sales_dev_records = [
    {"item_cat_id": cat_id, "month": month, "mean_cnt": cat_row[mean_cnt_col]}
    for cat_id, cat_row in item_category_info_df.iterrows()
    for month, mean_cnt_col in month_mapping.items()
]
item_cat_sales_dev_df = pd.DataFrame(
    sales_dev_records, columns=["item_cat_id", "month", "mean_cnt"]
).astype({"item_cat_id": "int8", "month": "int32", "mean_cnt": "float32"})

In [None]:
# Monthly totals over all shops: every sale is re-dated to the 28th of its month,
# then item counts and sales are summed per month.
m_df = eda_df.assign(date=pd.to_datetime(eda_df["date"]))
m_df = m_df.assign(month=m_df["date"].dt.month, year=m_df["date"].dt.year)
m_df = m_df.assign(date=pd.to_datetime(m_df[["year", "month"]].assign(DAY=28)))
m_df = (
    m_df.groupby("date")[["item_cnt_day", "sales_per_item"]]
    .sum()
    .reset_index()
    .rename(columns={"item_cnt_day": "item_cnt_month", "sales_per_item": "sales_per_month"})
)
m_df.head()

In [None]:
# Monthly totals per shop: same re-dating to the 28th of the month as above,
# but summed per (month, shop_id) pair.
m_shop_df = eda_df.assign(date=pd.to_datetime(eda_df["date"]))
m_shop_df = m_shop_df.assign(month=m_shop_df["date"].dt.month, year=m_shop_df["date"].dt.year)
m_shop_df = m_shop_df.assign(date=pd.to_datetime(m_shop_df[["year", "month"]].assign(DAY=28)))
m_shop_df = (
    m_shop_df.groupby(["date", "shop_id"])[["item_cnt_day", "sales_per_item"]]
    .sum()
    .reset_index()
    .rename(columns={"item_cnt_day": "item_cnt_month", "sales_per_item": "sales_per_month"})
)
m_shop_df.head()

In [None]:
# Build shop_info_df: one row of descriptive statistics per shop.
shop_info_df = pd.DataFrame(columns=["shop_name", "num_products", "fist_business_m", "last_business_m",
                                     "min_price", "max_price", "mean_price", "median_price", "mean_sales_pm", "median_sales_pm"],
                            index = m_shop_df["shop_id"].unique())

for sid in shop_info_df.index:
    # Price and product statistics come from the raw (uncapped) training data ...
    shop_sales = train_df[train_df["shop_id"]==sid]
    # ... while the business period and the sales figures use the monthly per-shop
    # totals in m_shop_df. This differs from the category frame on purpose: the
    # average turnover per calendar month is not that interesting for a shop.
    sdf = m_shop_df[m_shop_df["shop_id"]==sid]

    shop_info_df.at[sid, "shop_name"] = shops_df[shops_df["shop_id"]==sid]["shop_name"].values[0]
    shop_info_df.at[sid, "num_products"] = shop_sales["item_id"].nunique()
    shop_info_df.at[sid, "fist_business_m"] = sdf["date"].min()
    shop_info_df.at[sid, "last_business_m"] = sdf["date"].max()
    shop_info_df.at[sid, "min_price"] = shop_sales["item_price"].min()
    shop_info_df.at[sid, "max_price"] = shop_sales["item_price"].max()
    shop_info_df.at[sid, "mean_price"] = shop_sales["item_price"].mean()
    shop_info_df.at[sid, "median_price"] = shop_sales["item_price"].median()
    shop_info_df.at[sid, "mean_sales_pm"] = sdf["sales_per_month"].mean()
    shop_info_df.at[sid, "median_sales_pm"] = sdf["sales_per_month"].median()

# change datatypes: the business months become "YYYY-MM" strings, the statistics int/float
for month_col in ("fist_business_m", "last_business_m"):
    shop_info_df[month_col] = pd.to_datetime(shop_info_df[month_col]).dt.strftime("%Y-%m")
shop_info_df = shop_info_df.astype({
    'num_products': 'int32',
    **{stat_col: 'float32' for stat_col in ("min_price", "max_price", "mean_price",
                                            "median_price", "mean_sales_pm", "median_sales_pm")},
})

**Data cleaning & Grouping by month**

In [None]:
def preprocessing(data, item_data=items_df, shop_data=shops_df, category_data=item_categories_df):
    """
    Clean the daily sales records and aggregate them to one row per month, shop and item.

    Parameters
    ----------
    data : DataFrame with the columns date ("dd.mm.YYYY" strings), date_block_num,
        shop_id, item_id, item_price and item_cnt_day. It is not modified.
    item_data, shop_data, category_data : lookup frames that are inner-joined on
        item_id, shop_id and item_category_id respectively.

    Returns
    -------
    DataFrame with one row per (date_block_num, shop_id, item_id) holding the mean
    item_price, the summed count (renamed to item_cnt_month), the calendar month
    and all columns of the three lookup frames.
    """
    print(50*'-')
    print("preprocessing...")
    # 1). Keep only rows with a positive daily count. The explicit copy of the
    #     filtered rows lets the column assignments below work on an independent
    #     frame instead of a slice of `data` (no SettingWithCopyWarning).
    df = data.loc[data["item_cnt_day"] > 0].copy()
    # 2). Add the calendar month (1-12) of every sale.
    df["date"] = pd.to_datetime(df["date"], format='%d.%m.%Y')
    df["month"] = df["date"].dt.month
    # 3). Aggregate per date_block_num, shop and item.
    df = (
        df.groupby(["date_block_num", "shop_id", "item_id"])
        .agg({"item_price": "mean", "item_cnt_day": "sum", "month": "min"})
        .reset_index()
        .rename(columns={"item_cnt_day": "item_cnt_month"})
    )
    # 4). Add the item columns, 5). the shop columns and 6). the category columns.
    df = pd.merge(df, item_data, on="item_id", how="inner")
    df = pd.merge(df, shop_data, on="shop_id", how="inner")
    df = pd.merge(df, category_data, on="item_category_id", how="inner")
    print("done.")
    print(50*'-')
    return df

In [None]:
piped_df = preprocessing(data=train_df)
piped_df.head()

In [None]:
def add_city_feature(data):
    """
    Derive a "city" and an integer "city_code" column from "shop_name".

    The city is the first space-separated token of the shop name. Two names are
    normalised first: 'Сергиев Посад' is written as one word so that the whole
    city name is the first token, and the '!' prefix of '!Якутск' is removed.
    `data` is modified in place and also returned.
    """
    def _first_token(tokens):
        return tokens[0]

    is_sergiev_posad = data["shop_name"] == 'Сергиев Посад ТЦ "7Я"'
    data.loc[is_sergiev_posad, "shop_name"] = 'СергиевПосад ТЦ "7Я"'

    data["city"] = data["shop_name"].str.split(" ").map(_first_token)

    has_bang_prefix = data["city"] == "!Якутск"
    data.loc[has_bang_prefix, "city"] = "Якутск"

    # codes are numbered in order of first appearance of each city
    city_codes, _ = data["city"].factorize()
    data["city_code"] = city_codes
    return data

In [None]:
piped_df=add_city_feature(piped_df)
piped_df.head()

In [None]:
shop_info_df.head()

In [None]:
# add city
shop_info_df=add_city_feature(shop_info_df)

In [None]:
shop_info_df.head()

In [None]:
# Express the first/last business month of every shop as the number of days since 2013-01-01.
fist_date = datetime.strptime("2013-01-01", "%Y-%m-%d").date()

for business_col in ["fist_business_m", "last_business_m"]:
    shop_info_df[business_col] = pd.to_datetime(shop_info_df[business_col]).map(
        lambda ts: (ts.date() - fist_date).days
    )
shop_info_df.head()

In [None]:
# do pca and clustering
def get_cluster_feature(data, columns, index_name, cluster_name, num_cluster=5, linkage="average", n_pca_components=3, svd_solver="auto", visualize=True):
    """
    Reduce `data[columns]` with PCA, cluster the components and return the cluster per row.

    Parameters
    ----------
    data : DataFrame whose index identifies the entities (e.g. shop ids).
    columns : columns of `data` used as features.
    index_name : name of the id column in the result (filled with `data`'s index).
    cluster_name : name of the cluster-label column in the result.
    num_cluster, linkage : passed to AgglomerativeClustering.
    n_pca_components, svd_solver : passed to PCA.
    visualize : when True, plot the explained variance per component, silhouette
        scores for 2-5 clusters and a scatter plot of the first two components.

    Returns
    -------
    DataFrame with the two columns `index_name` and `cluster_name`, one row per
    row of `data`, in the same order.
    """
    features = data[columns].copy()

    # PCA
    pca = PCA(n_components=n_pca_components, svd_solver=svd_solver)
    components = pd.DataFrame(pca.fit_transform(features))

    # Clustering
    clusterer = AgglomerativeClustering(n_clusters=num_cluster, linkage=linkage)
    labels = clusterer.fit_predict(components)

    # `labels` is positional (row k of the features -> labels[k]), so it is paired
    # with the index by position rather than looked up by index label.
    result = pd.DataFrame({index_name: features.index, cluster_name: labels})

    # plotting:
    if visualize:
        # Evaluation, only needed for the plot.
        # NOTE(review): as before, these runs use the default linkage and score the
        # original features, not `linkage` and the PCA components - confirm intended.
        scorelist = []
        nrange = range(2, 6)
        for n in nrange:
            candidate_labels = AgglomerativeClustering(n_clusters=n).fit_predict(components)
            scorelist.append(silhouette_score(features, candidate_labels))

        x = components[0]
        y = components[1]

        fig = plt.figure(figsize=(25,15))
        gs = fig.add_gridspec(2, 2)
        ax00 = fig.add_subplot(gs[0,0])
        ax01 = fig.add_subplot(gs[0,1])
        ax02 = fig.add_subplot(gs[1,:])
        ax00.tick_params(axis='both', labelsize=15)
        ax01.tick_params(axis='both', labelsize=15)
        ax02.tick_params(axis='both', labelsize=15)
        ax00.set_title('PCA Components', fontsize=20)
        ax01.set_title('Cluster Score by Number of Clusters', fontsize=20)
        ax02.set_title('Clustering', fontsize=20)

        ax00.set(xlabel='component number', ylabel='covered variance')
        ax01.set(xlabel='number of clusters', ylabel='silhouette score')
        ax02.set(xlabel='component 1 score', ylabel='component 2 score')

        sns.barplot(x=list(range(pca.n_components_)), y=pca.explained_variance_ratio_, ax=ax00, palette="Set2")
        sns.lineplot(x=nrange, y=scorelist, ax=ax01, color="darkblue")
        sns.scatterplot(x=x, y=y, hue=labels, ax=ax02, palette="dark")

        fig.subplots_adjust(top=0.9)
        fig.suptitle(f"PCA and Clustering Output (num_cluster = {num_cluster})", fontsize="28")

    return result

In [None]:
shop_info_df.info()

In [None]:
#find numeric columns
numeric_shop_columns = shop_info_df.select_dtypes(include=["int32", "int64", "float32"]).columns
shop_cluster_df = get_cluster_feature(data=shop_info_df, columns=numeric_shop_columns, n_pca_components=2, index_name="shop_id", cluster_name="shop_cluster");

In [None]:
#add shop_cluster_df
piped_df = pd.merge(piped_df, shop_cluster_df, on="shop_id", how="inner")
piped_df.head()

In [None]:
item_category_info_df.head()

In [None]:
# Express the first/last sale date of every category as the number of days since fist_date.
for sold_col in ["first_sold", "last_sold"]:
    item_category_info_df[sold_col] = pd.to_datetime(item_category_info_df[sold_col]).map(
        lambda ts: (ts.date() - fist_date).days
    )
item_category_info_df.head()

In [None]:
#obtain numeric columns
#fill na

item_category_info_df = item_category_info_df.astype({"name": "string"})
numeric_item_columns = item_category_info_df.select_dtypes(include=["float64", "int64", "object"]).columns
item_category_info_df.fillna(0, inplace=True);

In [None]:
item_cat_claster = get_cluster_feature(data=item_category_info_df, columns=numeric_item_columns, index_name="item_category_id", cluster_name="item_cat_cluster")

In [None]:
#add item categorie cluster
piped_df = pd.merge(piped_df, item_cat_claster, on="item_category_id", how="inner")

In [None]:
piped_df.head()

In [None]:
#adding lag feature and label
def add_lag_feature_and_label(args):
    """
    Add the columns lag_1 .. lag_<num_lags> and rename item_cnt_month to "label".

    `args` is a sequence (df, num_lags, target_date_block) so that the function
    can be used with Pool.map:

    * df needs the columns shop_id, item_id, date_block_num and item_cnt_month.
    * lag_k of a row is the item_cnt_month of the same shop and item k date blocks
      earlier, looked up within `df`; it is 0 when there is no such row.
    * When target_date_block is not None, only rows of that date block are returned.

    The input frame is not modified; index and row order are preserved.
    """
    df = args[0].copy()
    num_lags = args[1]
    target_date_block = args[2]

    keys = ["shop_id", "item_id", "date_block_num"]
    # Lookup (shop, item, date block) -> monthly count. If a key occurs more than
    # once, the first occurrence is used.
    history = df.drop_duplicates(subset=keys, keep="first").set_index(keys)["item_cnt_month"]

    for lag in range(1, num_lags + 1):
        wanted = pd.MultiIndex.from_arrays(
            [df["shop_id"], df["item_id"], df["date_block_num"] - lag], names=keys)
        # One vectorised lookup per lag replaces the per-row scans; keys that are
        # missing from the history yield 0.
        df[f"lag_{lag}"] = history.reindex(wanted, fill_value=0).to_numpy()

    # `is not None` so that date block 0 is a valid target as well.
    if target_date_block is not None:
        df = df[df["date_block_num"] == target_date_block].copy()
    df.rename(columns={"item_cnt_month": "label"}, inplace=True)
    return df

def parallelize_lag_and_target_processing(df, num_lags, target_date_block_num=None, func=add_lag_feature_and_label, n_cores=4, shops=None, items=None):
    """
    Run `func` (by default add_lag_feature_and_label) on `df` in `n_cores` worker processes.

    Parameters
    ----------
    df : frame with at least date_block_num, shop_id and item_id; it is not modified.
    num_lags : number of lag columns, forwarded to `func`.
    target_date_block_num : when given, only this date block and the `num_lags`
        blocks before it are processed; forwarded to `func`.
    func : worker called with [chunk, num_lags, target_date_block_num].
    n_cores : number of worker processes and maximum number of chunks.
    shops, items : optional collections of ids used to restrict the rows;
        None or an empty collection means no restriction.

    Returns
    -------
    The concatenated worker results, ordered by shop_id.
    """
    if target_date_block_num is not None:
        # get list of valid date_block_num values:
        valid_date_blocks = range(target_date_block_num - num_lags, target_date_block_num + 1)
        df = df[df["date_block_num"].isin(valid_date_blocks)]
    if shops is not None and len(shops) > 0:
        df = df[df["shop_id"].isin(shops)]
    if items is not None and len(items) > 0:
        df = df[df["item_id"].isin(items)]
    # sort_values returns a new frame, so the caller's frame is never reordered.
    df = df.sort_values(by="shop_id")

    # Split on shop boundaries: all rows of a shop go to the same worker. The lags
    # are looked up within a chunk, so cutting a shop in two would lose lag values.
    shop_groups = np.array_split(df["shop_id"].unique(), n_cores)
    df_split = [df[df["shop_id"].isin(group)] for group in shop_groups if len(group) > 0]
    if not df_split:
        df_split = [df]  # nothing to process: still let func add its columns
    param_list = [[df_, num_lags, target_date_block_num] for df_ in df_split]

    # The context manager shuts the pool down even when a worker raises.
    with Pool(n_cores) as pool:
        results = pool.map(func, param_list)
    return pd.concat(results)

**Model**

In [None]:

#evaluate model
def evaluate_xgboost(model):
    """
    Plot the feature importances and the rmse history of a fitted XGBoost model.

    The model must have been fitted with an eval_set of two entries that recorded
    the "rmse" metric: the first entry ("validation_0") is plotted as "loss", the
    second ("validation_1") as "validation loss".
    """
    results = model.evals_result()
    train_rmse = results["validation_0"]["rmse"]
    validation_rmse = results["validation_1"]["rmse"]

    fig = plt.figure(figsize=(25,10))
    gs = fig.add_gridspec(1, 2)
    ax00 = fig.add_subplot(gs[0,0])
    ax01 = fig.add_subplot(gs[0,1])
    ax00.tick_params(axis='both', labelsize=15)
    ax01.tick_params(axis='both', labelsize=15)
    ax00.set_title('Feature Importance', fontsize=20)
    ax01.set_title('loss vs validation loss', fontsize=20)
    ax00.set(xlabel='Importance', ylabel='Feature')
    ax01.set(xlabel='n_estimators', ylabel='rmse')
    sns.barplot(y=model.get_booster().feature_names, x=model.feature_importances_, ax=ax00, palette="Set2", orient="h")
    # The x axis follows the recorded history, which is shorter than
    # model.n_estimators when training stopped early.
    sns.lineplot(x=range(len(train_rmse)), y=train_rmse, ax=ax01, color="darkblue", label="loss")
    sns.lineplot(x=range(len(validation_rmse)), y=validation_rmse, ax=ax01, color="orange", label="validation loss")
    fig.subplots_adjust(top=0.9)
    fig.suptitle("Model Evaluation", fontsize="28")

In [None]:
#adding lag feature and label
#avg_lags per item feature
def add_avg_lag_feature(args):
    """
    Add the columns avg_lag_1 .. avg_lag_<num_lags>.

    `args` is a sequence (df, num_lags, target_date_block) so that the function
    can be used with Pool.map:

    * df needs the columns item_id, date_block_num and item_cnt_month.
    * avg_lag_k of a row is the mean item_cnt_month over all rows of `df` with the
      same item k date blocks earlier; it is 0 when there are no such rows.
    * When target_date_block is not None, only rows of that date block receive
      values; all other rows keep 0. No rows are dropped.

    The input frame is not modified; index and row order are preserved.
    """
    df = args[0].copy()
    num_lags = args[1]
    target_date_block = args[2]

    keys = ["item_id", "date_block_num"]
    # Lookup (item, date block) -> mean monthly count over all rows of that item.
    monthly_mean = df.groupby(keys)["item_cnt_month"].mean()

    if target_date_block is None:
        receives_lags = np.ones(len(df), dtype=bool)
    else:
        receives_lags = (df["date_block_num"] == target_date_block).to_numpy()

    for lag in range(1, num_lags + 1):
        wanted = pd.MultiIndex.from_arrays(
            [df["item_id"], df["date_block_num"] - lag], names=keys)
        # One vectorised lookup per lag replaces the per-row scans; keys that are
        # missing from the lookup yield 0.
        lagged = monthly_mean.reindex(wanted, fill_value=0).to_numpy()
        df[f"avg_lag_{lag}"] = np.where(receives_lags, lagged, 0)
    return df

def parallelize_avg_lag_processing(df, num_lags, target_date_block_num=None, func=add_avg_lag_feature, n_cores=4, shops=None, items=None):
    """
    Run `func` (by default add_avg_lag_feature) on `df` in `n_cores` worker processes.

    Parameters
    ----------
    df : frame with at least date_block_num, shop_id and item_id; it is not modified.
    num_lags : number of average-lag columns, forwarded to `func`.
    target_date_block_num : when given, only this date block and the `num_lags`
        blocks before it are processed; forwarded to `func`.
    func : worker called with [chunk, num_lags, target_date_block_num].
    n_cores : number of worker processes and maximum number of chunks.
    shops, items : optional collections of ids used to restrict the rows;
        None or an empty collection means no restriction.

    Returns
    -------
    The concatenated worker results, ordered by item_id.
    """
    if target_date_block_num is not None:
        # get list of valid date_block_num values:
        valid_date_blocks = range(target_date_block_num - num_lags, target_date_block_num + 1)
        df = df[df["date_block_num"].isin(valid_date_blocks)]
    if shops is not None and len(shops) > 0:
        df = df[df["shop_id"].isin(shops)]
    if items is not None and len(items) > 0:
        df = df[df["item_id"].isin(items)]
    # sort_values returns a new frame, so the caller's frame is never reordered.
    df = df.sort_values(by="item_id")

    # Split on item boundaries: all rows of an item go to the same worker. The
    # means are computed within a chunk, so cutting an item in two would give
    # wrong averages.
    item_groups = np.array_split(df["item_id"].unique(), n_cores)
    df_split = [df[df["item_id"].isin(group)] for group in item_groups if len(group) > 0]
    if not df_split:
        df_split = [df]  # nothing to process: still let func add its columns
    param_list = [[df_, num_lags, target_date_block_num] for df_ in df_split]

    # The context manager shuts the pool down even when a worker raises.
    with Pool(n_cores) as pool:
        results = pool.map(func, param_list)
    return pd.concat(results)

In [None]:
#combines previous steps without the preprocessing function
def pipeline(data, num_lags, num_avg_lags, target_date_block_num=None, first_data_str="2013-01-01",
             shop_info_data=shop_info_df, categorie_info_data=item_category_info_df,
             shops=None, items=None):
    """
    Feature pipeline that combines the previous steps (without the preprocessing function).

    Adds the city, shop_cluster and item_cat_cluster features, the avg_lag_* and
    lag_* columns and finally drops the text columns.

    Parameters
    ----------
    data : output of preprocessing(); add_city_feature adds its columns to this
        frame in place.
    num_lags : number of lag_* columns.
    num_avg_lags : number of avg_lag_* columns. With a target date block it should
        not be smaller than num_lags, because the average-lag step already
        restricts the rows to the blocks it needs.
    target_date_block_num : optional date block to restrict the result to.
    first_data_str : reference date ("YYYY-MM-DD") for the day offsets.
    shop_info_data, categorie_info_data : per-shop / per-category statistics used
        for clustering. Copies are used, so the passed frames (by default the
        notebook globals) are not modified.
    shops, items : optional id collections forwarded to the lag steps.

    Returns
    -------
    DataFrame with the features and the "label" column.
    """
    def _as_day_offsets(column, first_date):
        # A column that already holds numbers was converted to day offsets by an
        # earlier cell. Running pd.to_datetime on it again would read the numbers
        # as nanoseconds since 1970 and collapse all values to nearly the same offset.
        if pd.api.types.infer_dtype(column, skipna=True) in ("integer", "floating", "mixed-integer-float"):
            return column
        return pd.to_datetime(column).map(lambda ts: (ts.date() - first_date).days)

    print(100*"#")
    print(f"running pipeline for target_date_block_num {target_date_block_num}  with {num_lags} lags...")
    fist_date = datetime.strptime(first_data_str, "%Y-%m-%d").date()
    print("adding city feature...")
    df = add_city_feature(data)
    print("done.")
    print("adding shop_cluster feature...")
    # preprocessing shop_info_df (on a copy)
    shop_info_data = shop_info_data.copy()
    for business_col in ("fist_business_m", "last_business_m"):
        shop_info_data[business_col] = _as_day_offsets(shop_info_data[business_col], fist_date)
    numeric_shop_columns = shop_info_data.select_dtypes(include=["int32", "int64", "float32", "float64"]).columns
    shop_cluster_df = get_cluster_feature(data=shop_info_data, columns=numeric_shop_columns, n_pca_components=2,
                                          index_name="shop_id", cluster_name="shop_cluster", visualize=False)
    # add shop_cluster
    df = pd.merge(df, shop_cluster_df, on="shop_id", how="inner")
    print("done.")
    print("adding item_cat_cluster feature...")
    # preprocessing categorie_info_data (on a copy)
    categorie_info_data = categorie_info_data.copy()
    for sold_col in ("first_sold", "last_sold"):
        categorie_info_data[sold_col] = _as_day_offsets(categorie_info_data[sold_col], fist_date)
    # "name" is cast to string so that it is not picked up by the "object" selection below
    categorie_info_data = categorie_info_data.astype({"name": "string"})
    numeric_item_columns = categorie_info_data.select_dtypes(include=["int32", "int64", "float32", "float64", "object"]).columns
    # fill missing statistics in the feature columns only; the name column is not a feature
    categorie_info_data[numeric_item_columns] = categorie_info_data[numeric_item_columns].fillna(0)
    item_cluster_df = get_cluster_feature(data=categorie_info_data, columns=numeric_item_columns, n_pca_components=3,
                                          index_name="item_category_id", cluster_name="item_cat_cluster", visualize=False)
    # add item_cat_cluster
    df = pd.merge(df, item_cluster_df, on="item_category_id", how="inner")
    print("done.")
    print("adding avg lag features...")
    # num_avg_lags (not num_lags) controls the number of avg_lag_* columns
    df = parallelize_avg_lag_processing(df=df, shops=shops, items=items, num_lags=num_avg_lags, target_date_block_num=target_date_block_num)
    print("done.")
    print("adding lag features...")
    df = parallelize_lag_and_target_processing(df, shops=shops, items=items, num_lags=num_lags, target_date_block_num=target_date_block_num)
    print("done.")

    print("dropping item_name, shop_name, item_category_name and city...")
    df = df.drop(columns=["item_name", "shop_name", "item_category_name", "city"])
    print("done.")
    print(100*"#")
    return df

def get_training_data(num_lags, num_avg_lags,  target_data_block_number=None, train_data=train_df):
    """
    Run preprocessing and the feature pipeline on `train_data` and return the model frame.

    The label and all columns whose name starts with "lag" are clipped to at most
    20 (columns starting with "avg_lag" are not matched by that prefix test and
    stay unclipped). Rows of the first `num_lags` date blocks are dropped; only
    date blocks num_lags..33 are kept.

    Returns the features together with the "label" column in one DataFrame;
    callers split off the label themselves.
    """
    df = pipeline(data=preprocessing(data=train_data), num_lags=num_lags, num_avg_lags=num_avg_lags,
                  target_date_block_num=target_data_block_number)
    # clip the label and all lag features:
    clip_columns = ["label"] + [col for col in df.columns if col[:3]=="lag"]
    for col in clip_columns:
        df[col] = df[col].clip(upper=20)
    # delete the first "num_lags" date_block_nums
    valid_data_block_num = range(num_lags,34)
    return df[df["date_block_num"].isin(valid_data_block_num)]

In [None]:
#obtain training data
#split into test set which is the final month and training set which is first 33 months
DF = get_training_data(num_lags=3, num_avg_lags=3)

# date block 33 (the final month) is held out as test set, everything before it is training data
train = DF[DF.date_block_num != 33]
test = DF[DF.date_block_num == 33]

# features are all columns except the label
X_train, y_train = train.drop(columns=["label"]), train["label"]
X_test, y_test = test.drop(columns=["label"]), test["label"]

In [None]:
#train model
# The order of the eval_set entries matters: evaluate_xgboost reads the history of
# the first entry as "validation_0" (plotted as "loss", here the training data) and
# of the second entry as "validation_1" (plotted as "validation loss", here the test data).
evalset = [(X_train, y_train), (X_test,y_test)]
xgb_model2 = xgb.XGBRegressor(max_depth=5, n_estimators=100, subsample=0.6, eval_metric='rmse', learning_rate=0.1)
xgb_model2.fit(X_train, y_train, eval_set=evalset)

In [None]:
# predict and calculate the rmse for train & test data
train_prediction = xgb_model2.predict(X_train)
test_prediction = xgb_model2.predict(X_test)
rmse_train = mean_squared_error(y_true=y_train, y_pred=train_prediction) ** 0.5
rmse_test = mean_squared_error(y_true=y_test, y_pred=test_prediction) ** 0.5
print(50*"*", f"RMSE test: {rmse_test}", f"RMSE train: {rmse_train}", 50*"*", sep="\n")
#evaluate model
evaluate_xgboost(xgb_model2)