In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import dateutil.easter as easter


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Nearly all of our data is categorical, and we do not know of a clear relationship between the categories and num_sold, so we one-hot encode them using category_encoders' OneHotEncoder
from sklearn.preprocessing import OneHotEncoder
import category_encoders as ce
def dataProcess(x):
    """One-hot encode the ``country``, ``store`` and ``product`` columns of ``x``.

    A fresh ``ce.OneHotEncoder`` is fitted for each column, in that order, and
    the frame produced by the last ``fit_transform`` call is returned.

    NOTE(review): the encoders are re-fitted on every call, so two frames
    passed through this function are encoded independently of each other --
    confirm that both end up with the same column layout.
    """
    for column in ('country', 'store', 'product'):
        x = ce.OneHotEncoder(cols=[column]).fit_transform(x)
    return x
    

def dateProcess1(df, gdp_df):
    """Add calendar, holiday and GDP features to ``df`` in place and drop ``date``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``date`` column (``YYYY-MM-DD`` strings or anything
        ``pd.to_datetime`` accepts) and a ``country`` column whose values are
        ``'Finland'``, ``'Norway'`` or ``'Sweden'``.
    gdp_df : pandas.DataFrame
        Needs a ``GDP`` column. Rows are read by position as
        ``3 * (year - 2015) + offset`` with offsets Finland=0, Norway=1,
        Sweden=2.

    Returns
    -------
    None
        ``df`` is modified in place. Columns are added in this order:
        ``day_of_month``, ``month``, ``year`` (years since 2015), ``gdp_list``,
        ``weekend``, ``friday``, ``day_of_year``, ``xmas_adjust`` (string),
        ``easter_adj`` (float), ``days_from_black_friday``,
        ``days_from_wed_jun``, ``days_from_sun_nov``. ``date`` is removed.

    Raises
    ------
    ValueError
        If ``country`` holds a value other than the three supported ones.
        The check runs before ``df`` is touched.
    IndexError
        If a computed GDP row position lies outside ``gdp_df``.
    """
    # Validate up front so a bad country cannot leave df half-transformed.
    country_offset = df['country'].map({'Finland': 0, 'Norway': 1, 'Sweden': 2})
    unknown = country_offset.isna()
    if unknown.any():
        names = sorted(df.loc[unknown, 'country'].astype(str).unique())
        raise ValueError(f"dateProcess1: unsupported country value(s): {names}")

    dates = pd.to_datetime(df['date'])
    years = dates.dt.year

    def days_from(anchor_by_year):
        """Signed day offset of each row from its year's anchor date.

        Years missing from ``anchor_by_year`` yield a missing value.
        """
        anchors = years.map({yr: pd.Timestamp(day) for yr, day in anchor_by_year.items()})
        return (dates - anchors).dt.days

    # Basic calendar parts (vectorised; no per-row Python loop).
    df['day_of_month'] = dates.dt.day.astype('int64')
    df['month'] = dates.dt.month.astype('int64')
    df['year'] = (years - 2015).astype('int64')

    # Positional GDP lookup: three rows per year, one per country.
    gdp_rows = 3 * df['year'].to_numpy() + country_offset.to_numpy(dtype='int64')
    df['gdp_list'] = gdp_df['GDP'].to_numpy()[gdp_rows]

    weekday = dates.dt.weekday
    df['weekend'] = weekday >= 5  # Saturday and Sunday
    df['friday'] = weekday == 4  # Friday
    df['day_of_year'] = dates.dt.dayofyear

    # Christmas: offset from 25 December of the same year, clipped to [-20, 16]
    # and stored as text.
    xmas_date = pd.to_datetime(years.astype(str) + '-12-25')
    df['xmas_adjust'] = (dates - xmas_date).dt.days.clip(lower=-20, upper=16).astype(str)

    # Easter: computed once per distinct year instead of once per row.
    easter_by_year = {int(yr): pd.Timestamp(easter.easter(int(yr))) for yr in years.unique()}
    easter_adj = (dates - years.map(easter_by_year)).dt.days.clip(lower=-3, upper=60).astype(float)
    # Offsets 12..38 are collapsed into the single value 12.
    df['easter_adj'] = easter_adj.mask(easter_adj.between(12, 38), 12.0)

    # Black Friday
    df['days_from_black_friday'] = days_from({
        2015: '2015-11-27',
        2016: '2016-11-25',
        2017: '2017-11-24',
        2018: '2018-11-23',
        2019: '2019-11-29',
    }).clip(-5, 5)

    # Last Wednesday of June
    df['days_from_wed_jun'] = days_from({
        2015: '2015-06-24',
        2016: '2016-06-29',
        2017: '2017-06-28',
        2018: '2018-06-27',
        2019: '2019-06-26',
    }).clip(-5, 5)

    # First Sunday of November (second Sunday is Father's Day)
    df['days_from_sun_nov'] = days_from({
        2015: '2015-11-1',
        2016: '2016-11-6',
        2017: '2017-11-5',
        2018: '2018-11-4',
        2019: '2019-11-3',
    }).clip(-1, 9)

    # Callers rely on the in-place contract, so the raw column is dropped in place.
    df.drop(columns=['date'], inplace=True)



In [None]:
# Load the competition train/test tables from the Kaggle input directory
# (paths are relative to the notebook's working directory).
train_df = pd.read_csv('../input/tabular-playground-series-jan-2022/train.csv')
test_df = pd.read_csv('../input/tabular-playground-series-jan-2022/test.csv')
# NOTE(review): this frame is named nordic_gdp and dateProcess1 reads its 'GDP'
# column, but the file itself is called Best_CPI.csv -- confirm which indicator it holds.
nordic_gdp = pd.read_csv('../input/consumer-price-index-20152019-nordic-countries/Best_CPI.csv')

In [None]:
# dateProcess1 mutates each frame in place: it adds the date-derived and GDP
# columns and drops 'date'. Re-running this cell therefore fails unless the
# CSVs are reloaded first.
dateProcess1(train_df, nordic_gdp)
dateProcess1(test_df, nordic_gdp)

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Columns sent through the ordinal encoder: the two boolean flags and the
# string-typed xmas_adjust produced by dateProcess1.
# NOTE(review): xmas_adjust holds numbers stored as strings, so the resulting
# codes presumably follow string sort order rather than numeric order -- verify.
object_cols = ['weekend', 'friday', 'xmas_adjust']

# Fit on the training frame only and reuse the fitted encoder on the test
# frame so both share the same value-to-code mapping.
ordinal_encoder = OrdinalEncoder()
train_df[object_cols] = ordinal_encoder.fit_transform(train_df[object_cols])
test_df[object_cols] = ordinal_encoder.transform(test_df[object_cols])

In [None]:
# One-hot encode country/store/product. dataProcess fits fresh encoders on
# every call, so the train and test frames are encoded independently.
train_df = dataProcess(train_df)
test_df = dataProcess(test_df)
display(train_df)
display(test_df)
# Keep the test ids for the submission file; pop also removes the column from test_df.
row_id = test_df.pop('row_id')

In [None]:
# Inspect the test feature columns after encoding and removal of row_id.
test_df.columns

In [None]:
# Work on a copy so train_df keeps the target column; y is the target.
# Note: X still contains 'row_id' here -- it is only removed from x_train in a later cell.
X = train_df.copy()
y = X.pop('num_sold')

from sklearn.model_selection import train_test_split
# Hold-out split with the library's default proportions; random_state=0 keeps it reproducible.
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state = 0)
x_train.columns

In [None]:
def smape(act, forc):
    """Symmetric mean absolute percentage error, in percent.

    Computes ``100 / n * sum(2 * |forc - act| / (|act| + |forc|))``.

    Parameters
    ----------
    act, forc : array-like
        Actual and forecast values (lists, NumPy arrays or pandas Series).
        Values are compared by position; pandas index labels are ignored so
        that two Series with different indexes cannot silently misalign.

    Returns
    -------
    float
        The SMAPE score. A pair where both the actual and the forecast value
        are zero contributes 0 (a perfect prediction) instead of NaN.

    Raises
    ------
    ZeroDivisionError
        If ``act`` is empty.
    """
    actual = np.asarray(act, dtype=float)
    forecast = np.asarray(forc, dtype=float)
    numerator = 2 * np.abs(forecast - actual)
    denominator = np.abs(actual) + np.abs(forecast)
    # Skip the division where the denominator is zero; those terms stay at 0.
    ratio = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0,
    )
    return 100 / len(actual) * np.sum(ratio)

In [None]:
import seaborn as sns
def corrplot(df, method="pearson", annot=True, **kwargs):
    """Draw a clustered heatmap of the pairwise correlations of ``df``.

    ``method`` is forwarded to ``DataFrame.corr``. ``annot`` and any extra
    keyword arguments are forwarded to ``sns.clustermap``, which is called
    with a fixed colour scale of [-1, 1], the "icefire" colormap and
    "complete" linkage. Nothing is returned.
    """
    correlations = df.corr(method)
    fixed_options = dict(
        vmin=-1.0,
        vmax=1.0,
        cmap="icefire",
        method="complete",
        annot=annot,
    )
    sns.clustermap(correlations, **fixed_options, **kwargs)


# Clustered correlation heatmap over every column of train_df, without
# per-cell annotations.
corrplot(train_df, annot=None)

In [None]:


def cluster_labels(df, features, n_clusters=20):
    """Assign each row of ``df`` to a k-means cluster.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data; it is not modified.
    features : list of str
        Columns to cluster on. Each is standardised (mean 0, sample standard
        deviation 1) first. A constant column is left at 0 after centring
        instead of being divided by a zero standard deviation.
    n_clusters : int, default 20
        Number of clusters.

    Returns
    -------
    pandas.DataFrame
        One column, ``"Cluster"``, holding the label of each row. It carries
        ``df``'s index so it can be joined back onto ``df`` directly.
    """
    # Imported locally: no cell of the notebook imports KMeans, so without
    # this the function raises NameError when called.
    from sklearn.cluster import KMeans

    selected = df.loc[:, features]
    spread = selected.std(axis=0).replace(0, 1.0)
    X_scaled = (selected - selected.mean(axis=0)) / spread
    kmeans = KMeans(n_clusters=n_clusters, n_init=50, random_state=0)
    return pd.DataFrame({"Cluster": kmeans.fit_predict(X_scaled)}, index=df.index)


def cluster_distance(df, features, n_clusters=20):
    """Return each row's distance to every k-means centroid.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data; it is not modified.
    features : list of str
        Columns to cluster on. Each is standardised (mean 0, sample standard
        deviation 1) first. A constant column is left at 0 after centring
        instead of being divided by a zero standard deviation.
    n_clusters : int, default 20
        Number of clusters, and therefore the number of output columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``Centroid_0`` .. ``Centroid_{n_clusters - 1}`` taken from
        ``KMeans.fit_transform``. The frame carries ``df``'s index so it can
        be joined back onto ``df`` directly.
    """
    # Imported locally: no cell of the notebook imports KMeans, so without
    # this the function raises NameError when called.
    from sklearn.cluster import KMeans

    selected = df.loc[:, features]
    spread = selected.std(axis=0).replace(0, 1.0)
    X_scaled = (selected - selected.mean(axis=0)) / spread
    # Honour the n_clusters argument rather than a fixed cluster count.
    kmeans = KMeans(n_clusters=n_clusters, n_init=50, random_state=0)
    distances = kmeans.fit_transform(X_scaled)
    return pd.DataFrame(
        distances,
        columns=[f"Centroid_{i}" for i in range(distances.shape[1])],
        index=df.index,
    )


In [None]:
# new_df is just another name for x_train; no copy is made.
new_df = x_train

# Boolean mask over the columns whose dtype is object, then the names it selects.
s = (new_df.dtypes == 'object')
# Rebinds object_cols, which an earlier cell used for the ordinal-encoded column list.
object_cols = list(s[s].index)
object_cols


In [None]:
# Inspect the encoded xmas_adjust feature in the training split.
x_train.xmas_adjust

In [None]:
from xgboost import  XGBRegressor

# row_id is only an identifier and is removed before the final model is
# fitted, so it is left out here too; the validation score then reflects the
# same feature set. Selecting columns by name (instead of mutating the
# frames) also keeps this cell re-runnable after row_id has been removed
# from x_train.
val_features = [col for col in x_train.columns if col != 'row_id']

model_x = XGBRegressor(n_estimators = 382, learning_rate = 0.008281241354242047, max_depth=10, random_state=0)
model_x.fit(x_train[val_features], y_train)
preds_x = model_x.predict(x_test[val_features])
# smape takes (actual, forecast) in that order.
score = smape(y_test, preds_x)

score

In [None]:
# Remove the identifier column. Unlike pop, drop with errors='ignore' does not
# raise KeyError when this cell is run a second time.
train_df = train_df.drop(columns=['row_id'], errors='ignore')

In [None]:
# Display the current state of train_df.
train_df

In [None]:
# Inspect the test feature columns to compare against the training columns.
test_df.columns

In [None]:
# Remove the identifier so the training columns match test_df (whose row_id
# was popped earlier). drop with errors='ignore' keeps the cell re-runnable,
# whereas pop raises KeyError on a second run.
x_train = x_train.drop(columns=['row_id'], errors='ignore')
x_train.columns

In [None]:
# Fit the submission model on every labelled row (X, y) rather than only on
# the x_train split, leaving out the row_id identifier that test_df no longer has.
X_full = X.drop(columns=['row_id'], errors='ignore')
final = XGBRegressor(n_estimators = 382, learning_rate = 0.008281241354242047, max_depth=10, random_state=0)
final.fit(X_full, y)
# Select the test columns by name so they line up with the training columns;
# a missing column raises KeyError here instead of producing misaligned features.
preds = final.predict(test_df[X_full.columns])


In [None]:
# Pair the test ids saved earlier with the predictions and write them, without
# the DataFrame index, to my_submission.csv in the current working directory.
output = pd.DataFrame({'row_id': row_id, 'num_sold': preds})
output.to_csv('my_submission.csv', index=False)
print("Submission was successfully saved!")