In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns; sns.set_theme(color_codes=True)
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder, OneHotEncoder
from sklearn.feature_selection import mutual_info_regression
from sklearn.metrics import mean_squared_error,mean_absolute_error,explained_variance_score, r2_score

from xgboost import XGBRegressor

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Gather every file path below the read-only Kaggle input directory, then print them.
input_paths = [
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Tabular Playground Jan 2022

### GOAL

Create a model that will predict the num_sold for the given new data


### Overview

Due to the nature of the dataset, we'll first engineer and preprocess the features and then do some EDA to look for correlations. After that, I'll train a gradient-boosted tree regressor (XGBoost's `XGBRegressor`) on the logarithm of `num_sold`.

In [None]:
# Competition data.
df_train = pd.read_csv('../input/tabular-playground-series-jan-2022/train.csv')
df_test = pd.read_csv('../input/tabular-playground-series-jan-2022/test.csv')

# Holidays calendar, available here:
# https://www.kaggle.com/vpallares/public-and-unofficial-holidays-nor-fin-swe-201519
df_holidays = pd.read_csv(
    '../input/public-and-unofficial-holidays-nor-fin-swe-201519/holidays.csv'
)

# Economics (OECD monthly indicators) dataset, available here:
# https://www.kaggle.com/siukeitin/oecd-data-fin-nor-swe-20152019
df_oecd = pd.read_csv(
    '../input/oecd-data-fin-nor-swe-20152019/oecd_monthly_data.csv'
)

# GDP dataset (the GDP column is read from the CPI file), available here:
# https://www.kaggle.com/sardorabdirayimov/consumer-price-index-20152019-nordic-countries
df_gdp = pd.read_csv(
    '../input/consumer-price-index-20152019-nordic-countries/Best_CPI.csv'
)


In [None]:
df_train.isnull().sum() #there are no null values, so we don't have to clean the df

### Preliminary Feature Engineering

The raw date field is not directly useful for training the model, so I'll break it up into calendar features (day of week, day of month, weekend flag, week, month, quarter and year) to describe the time period.

In [None]:
def set_date_features(df):
    """Add calendar features derived from the ``date`` column.

    The frame is modified in place (``date`` is converted to datetime and the
    feature columns are added) and the same object is returned.

    Added columns: ``day_of_week`` (Monday=0), ``day_of_month``, ``weekend``
    (1 for Saturday/Sunday, else 0), ``week`` (ISO week number, capped at 52),
    ``month``, ``quarter`` and ``year``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``date`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the extra columns.
    """
    df['date'] = pd.to_datetime(df['date'])
    dates = df['date'].dt

    df['day_of_week'] = dates.dayofweek
    df['day_of_month'] = dates.day
    # Days 5 and 6 (Saturday, Sunday) are the only ones whose floor-division by 5 is 1.
    df['weekend'] = (df['day_of_week'] // 5 == 1).astype('int64')
    # ISO years can have a week 53; fold it into week 52. `clip` replaces the
    # former chained-index assignment, which pandas warns about and which has
    # no effect under Copy-on-Write.
    df['week'] = dates.isocalendar().week.clip(upper=52).astype('int64')
    df['month'] = dates.month
    df['quarter'] = dates.quarter
    df['year'] = dates.year
    return df

In [None]:
# Derive the calendar features for both the training and the test frame.
df_train, df_test = (set_date_features(frame) for frame in (df_train, df_test))

Holidays

In [None]:
df_holidays['date'] = pd.to_datetime(df_holidays['date'])   

In [None]:
def holiday_fe(row):
    """Return the holiday name for a row's date and country.

    Looks the row up in the module-level ``df_holidays`` frame (matching on
    ``date`` and ``country``) and returns the ``event`` of the first match,
    or the string ``'None'`` when there is no match.

    ``row`` must provide ``date``, ``country``, ``month`` and ``day_of_month``.
    """
    same_date = df_holidays['date'] == row['date']
    same_country = df_holidays['country'] == row['country']
    matches = df_holidays[same_date & same_country]

    event = matches.iloc[0]['event'] if len(matches) > 0 else 'None'

    # Works around a gap in the holiday dataset: New Year's Eve is missing for
    # 2019, so any unmatched 31 December is labelled explicitly.
    is_december_31 = row['month'] == 12 and row['day_of_month'] == 31
    if event == 'None' and is_december_31:
        return "New Year's Eve"

    return event

In [None]:
# Row-wise lookup of the holiday name; `holiday_fe` is passed directly, no wrapper needed.
df_train['Holiday'] = df_train.apply(holiday_fe, axis=1)
df_test['Holiday'] = df_test.apply(holiday_fe, axis=1)

In [None]:
# Binary flag: 1 when the row falls on any holiday, 0 when 'Holiday' is the string 'None'.
df_train['Holiday_purchase'] = (df_train['Holiday'] != 'None').astype('int64')
df_test['Holiday_purchase'] = (df_test['Holiday'] != 'None').astype('int64')

OECD

In [None]:
# The OECD 'date' strings are dash-separated; the first two parts are year and month.
oecd_date_parts = df_oecd['date'].str.split('-', expand=True)
df_oecd['year'] = oecd_date_parts[0].astype('int64')
df_oecd['month'] = oecd_date_parts[1].astype('int64')

In [None]:
# CCI values are missing for Norway, so the column is dropped.
df_oecd = df_oecd.drop(columns=['CCI'])

In [None]:
def oecd_fe(df):
    """Left-join the OECD indicators onto ``df`` and add a ``Confidence`` flag.

    The join uses the module-level ``df_oecd`` frame on ``year``, ``month`` and
    ``country``. The OECD frame's own date column (``date_y`` after the merge)
    is discarded and ``date_x`` is renamed back to ``date``.

    ``Confidence`` is 1 where ``BCI`` is above 100 and 0 otherwise.
    Background on the index:
    https://data.oecd.org/leadind/business-confidence-index-bci.htm

    Returns a new DataFrame; ``df`` itself is not modified.
    """
    merged = (
        df.merge(df_oecd, how='left', on=['year', 'month', 'country'])
        .drop(columns='date_y')
        .rename(columns={'date_x': 'date'})
    )
    # BCI > 100 means good market "confidence"; a missing BCI compares False and yields 0.
    merged['Confidence'] = (merged['BCI'] > 100).astype('int64')
    return merged

In [None]:
# Attach the OECD indicators and the Confidence flag to both frames.
df_train, df_test = (oecd_fe(frame) for frame in (df_train, df_test))

GDP

In [None]:
# Yearly GDP per country, left-joined so every original row is kept.
gdp_by_year_country = df_gdp[['year', 'country', 'GDP']]
df_train = df_train.merge(gdp_by_year_country, how='left', on=['year', 'country'])
df_test = df_test.merge(gdp_by_year_country, how='left', on=['year', 'country'])

### EDA

Let's plot out the num_sold based on the date

In [None]:
# One panel per year (two per row), one coloured line per country:
# num_sold plotted against month.
g = sns.FacetGrid(
    data=df_train,
    col="year",
    col_wrap=2,
    hue='country',
    height=5,
    aspect=2,
    sharex=True,
    xlim=(0, 13),
)
g.map_dataframe(sns.lineplot, 'month', 'num_sold')
g.add_legend()

Looking at the value that we're trying to predict, we can see that there's a big difference between the most common values and the outliers. So, instead of dropping them, we'll try to predict the logarithm, and then exponentiate the predictions to convert them back before the submission.

In [None]:
# Kernel density of the raw num_sold values, one curve per country.
fig, ax = plt.subplots(figsize=(12, 6))

# Per-country subsets (these names are reused by later cells).
df_no = df_train[df_train['country'] == 'Norway']
df_fi = df_train[df_train['country'] == 'Finland']
df_se = df_train[df_train['country'] == 'Sweden']

sns.kdeplot(x=df_no['num_sold'], label='Norway', ax=ax)
sns.kdeplot(x=df_fi['num_sold'], label='Finland', ax=ax)
sns.kdeplot(x=df_se['num_sold'], label='Sweden', ax=ax)

# Set the title on the axes. The previous `plt.title = '...'` rebound the
# pyplot `title` function to a string instead of drawing a title.
ax.set_title('Normal num_sold values')
ax.legend()

We could also see that, logarithm-wise, the distribution of the values through the countries is quite homogeneous

In [None]:
# Kernel density of log(num_sold), one curve per country.
plt.figure(figsize=(12,6))

# Per-country subsets (these names are reused by later cells).
df_no = df_train[df_train['country'] == 'Norway']
df_fi = df_train[df_train['country'] == 'Finland']
df_se = df_train[df_train['country'] == 'Sweden']

for country_label, country_rows in (('Norway', df_no), ('Finland', df_fi), ('Sweden', df_se)):
    sns.kdeplot(x=np.log(country_rows['num_sold']), label=country_label)

plt.legend()

Checking if there's some kind of correlation between the economics info

In [None]:
# Pairwise scatter plots of the economic indicators, coloured by country.
econ_cols = ['BCI', 'CLI', 'CPI', 'GDP']
sns.pairplot(data=df_train[econ_cols + ['country']], x_vars=econ_cols, hue='country')

Seems like there's something between:
- CLI -> BCI
- CPI -> BCI

In [None]:
# BCI against CLI with a fitted regression line, one overlay per country subset.
for country_rows in (df_no, df_fi, df_se):
    sns.regplot(data=country_rows, x='CLI', y='BCI')

In [None]:
# BCI against CPI with a fitted regression line, one overlay per country subset.
for country_rows in (df_no, df_fi, df_se):
    sns.regplot(data=country_rows, x='CPI', y='BCI')

We'll try to create some "ratio" feature between CPI/CLI and BCI

### Further Feature Engineering

Economics new features

In [None]:
# Ratio features relating CPI and CLI to BCI, added to both frames.
for frame in (df_train, df_test):
    frame['CPI_BCI_ratio'] = frame['CPI'] / frame['BCI']
    frame['CLI_BCI_ratio'] = frame['CLI'] / frame['BCI']

### Splitting

In [None]:
# Columns that identify a row rather than describe it.
non_feature_cols = ['date', 'row_id']

X = df_train.drop(columns=non_feature_cols + ['num_sold'])
# As noted above, the logarithm is easier to predict than the raw value
# because of the outliers.
y = np.log(df_train['num_sold'])
X_test = df_test.drop(columns=non_feature_cols)

X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=35,
    shuffle=True,
)

In [None]:
# Names of the columns stored with the generic "object" dtype (the categorical text columns).
object_cols = [name for name, dtype in df_train.dtypes.items() if dtype == "object"]
object_cols

### Encoding Categorical Features

In [None]:
# Object columns with more than 10 distinct values in df_train get ordinal (integer) codes.
high_cardinality_cols = [col for col in object_cols if df_train[col].nunique() > 10]

# NOTE(review): the encoder is created with default settings. An earlier draft considered
# handle_unknown='use_encoded_value', unknown_value=100 -- presumably to tolerate categories
# absent from the training split; confirm whether that is needed.
ordinal_encoder = OrdinalEncoder()
# Work on copies so the frames produced by the split stay untouched.
X_train_encoded = X_train.copy()
X_valid_encoded = X_valid.copy()
X_test_encoded = X_test.copy()

# Fit on the training split only, then reuse the fitted mapping for validation and test.
X_train_encoded[high_cardinality_cols] = ordinal_encoder.fit_transform(X_train[high_cardinality_cols])
X_valid_encoded[high_cardinality_cols] = ordinal_encoder.transform(X_valid[high_cardinality_cols])
X_test_encoded[high_cardinality_cols] = ordinal_encoder.transform(X_test[high_cardinality_cols])

# Cast the encoded columns to int64.
X_train_encoded[high_cardinality_cols] = X_train_encoded[high_cardinality_cols].astype('int64')
X_valid_encoded[high_cardinality_cols] = X_valid_encoded[high_cardinality_cols].astype('int64')
X_test_encoded[high_cardinality_cols] = X_test_encoded[high_cardinality_cols].astype('int64')

In [None]:
# Every object column that was not ordinal-encoded is one-hot encoded.
# Taking the complement of `high_cardinality_cols` (instead of a separate
# `nunique() < 10` test) guarantees each object column lands in exactly one
# group: a column with exactly 10 categories is no longer left unencoded, and
# an already ordinal-encoded column can never be picked up a second time.
low_cardinality_cols = [col for col in object_cols if col not in high_cardinality_cols]

In [None]:
def to_int(oh_cols):
    """Cast every column of ``oh_cols`` to int64, in place.

    The same DataFrame object is returned after its columns are replaced.
    """
    for name, as_int in oh_cols.astype('int64').items():
        oh_cols[name] = as_int
    return oh_cols

In [None]:
def encode_low_card(feature):
    """One-hot encode ``feature`` in the train, validation and test frames.

    Operates on the module-level ``X_train_encoded``, ``X_valid_encoded`` and
    ``X_test_encoded`` frames. The encoder is fitted on the training frame
    only; categories unseen there encode as all zeros (``handle_unknown='ignore'``).

    Side effect: ``feature`` is dropped in place from the three module-level
    frames.

    Parameters
    ----------
    feature : str
        Name of the categorical column to encode.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, valid, test)`` frames with ``feature`` replaced by int64
        one-hot columns named ``<feature>_<category>``.
    """
    OH_encoder = OneHotEncoder(handle_unknown='ignore')
    frames = (X_train_encoded, X_valid_encoded, X_test_encoded)

    # Use the encoder's default sparse output and densify it explicitly: the
    # keyword that requests dense output was renamed between scikit-learn
    # releases (`sparse` -> `sparse_output`), `.toarray()` works with all.
    encoded = [OH_encoder.fit_transform(X_train_encoded[[feature]]).toarray()]
    encoded += [OH_encoder.transform(frame[[feature]]).toarray() for frame in frames[1:]]

    # `get_feature_names` was replaced by `get_feature_names_out`; prefer the
    # new name and fall back for older scikit-learn versions.
    if hasattr(OH_encoder, 'get_feature_names_out'):
        col_names = OH_encoder.get_feature_names_out([feature])
    else:
        col_names = OH_encoder.get_feature_names([feature])

    results = []
    for frame, values in zip(frames, encoded):
        # Restore the original index (the encoder returns a bare array) and
        # cast to int64 for ALL three frames -- previously the test frame was
        # skipped by a copy-paste slip and kept float columns.
        oh_cols = pd.DataFrame(values, columns=col_names, index=frame.index).astype('int64')
        # Remove the categorical column; the one-hot columns replace it.
        frame.drop([feature], axis=1, inplace=True)
        results.append(pd.concat([frame, oh_cols], axis=1))

    return tuple(results)

In [None]:
for feature in low_cardinality_cols:
    X_train_encoded, X_valid_encoded, X_test_encoded = encode_low_card(feature)


In [None]:
X_train_encoded.info()

### MUTUAL INFORMATION

In [None]:
def make_mi_scores(X, y, random_state=0):
    """Compute mutual-information scores between each feature and the target.

    Parameters
    ----------
    X : pandas.DataFrame
        Numeric feature matrix; its column names label the scores.
    y : array-like
        Target values.
    random_state : int or None, default 0
        Seed forwarded to ``mutual_info_regression`` so the scores are
        reproducible between runs (the estimator is randomised). Pass ``None``
        for the previous unseeded behaviour.

    Returns
    -------
    pandas.Series
        Scores named ``"MI Scores"``, indexed by feature name and sorted from
        highest to lowest.
    """
    mi_scores = mutual_info_regression(X, y, random_state=random_state)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    return mi_scores.sort_values(ascending=False)

In [None]:
mi_scores = make_mi_scores(X_train_encoded, y_train)
mi_scores

In [None]:
def plot_mi_scores(scores):
    """Draw a horizontal bar chart of the scores on the current figure.

    Bars are ordered by ascending score and labelled with the Series index.
    """
    ordered = scores.sort_values(ascending=True)
    positions = np.arange(len(ordered))
    plt.barh(positions, ordered)
    plt.yticks(positions, list(ordered.index))


plt.figure(dpi=100, figsize=(8, 5))
plot_mi_scores(mi_scores)

### Modelling

In [None]:
# From here on the modelling code uses the fully encoded frames.
X_train, X_valid, X_test = X_train_encoded, X_valid_encoded, X_test_encoded

In [None]:
X_train.shape

In [None]:
X_valid.shape

In [None]:
# Gradient-boosted trees with up to 600 rounds; training stops early once the
# validation metric has not improved for 5 consecutive rounds.
# `early_stopping_rounds` is passed to the constructor: supplying it to
# `fit()` is deprecated and rejected by current xgboost releases
# (the constructor argument requires xgboost >= 1.6).
# NOTE(review): the same validation split is used for early stopping and for
# the metrics reported below, so those metrics are likely optimistic.
model = XGBRegressor(n_estimators=600,
                     learning_rate=0.05,
                     early_stopping_rounds=5)
model.fit(X_train, y_train,
          eval_set=[(X_valid, y_valid)],
          verbose=False)

### Model Evaluation

In [None]:
# Validation predictions (log scale, like y_valid), squeezed to drop any length-1 axes.
predictions = np.squeeze(model.predict(X_valid))

In [None]:
plt.scatter(y_valid,predictions)

In [None]:
mean_absolute_error(y_valid,predictions)

In [None]:
np.sqrt(mean_squared_error(y_valid,predictions))

In [None]:
explained_variance_score(y_valid,predictions)

In [None]:
r2_score(y_valid,predictions)

In [None]:
def smape(a, f):
    """Symmetric mean absolute percentage error, in percent.

    Computes ``100 * mean(2 * |f - a| / (|a| + |f|))``. Terms where both the
    actual and the forecast value are zero count as zero error instead of
    producing NaN.

    Parameters
    ----------
    a : array-like
        Actual values.
    f : array-like
        Forecast values, aligned with ``a`` by position.

    Returns
    -------
    float
        SMAPE between 0 and 200.

    Raises
    ------
    ValueError
        If the inputs are empty.
    """
    # Convert up front so lists work and pandas inputs are compared by
    # position rather than by index label.
    a = np.asarray(a, dtype=float)
    f = np.asarray(f, dtype=float)

    denominator = np.abs(a) + np.abs(f)
    if denominator.size == 0:
        raise ValueError("smape requires at least one value")

    numerator = 2 * np.abs(f - a)
    # Only divide where the denominator is non-zero; 0/0 terms stay at 0.
    ratio = np.divide(numerator, denominator,
                      out=np.zeros_like(denominator),
                      where=denominator != 0)
    return 100 * np.mean(ratio)

In [None]:
# SMAPE is a relative error on the quantity itself, so undo the log transform
# first: both y_valid and predictions are on the log scale at this point.
smape(np.exp(y_valid), np.exp(predictions))

In [None]:
# The model predicts log(num_sold); exponentiate to return to the original scale.
predictions = np.squeeze(np.exp(model.predict(X_test)))

# Pair each prediction with its row_id and write the submission file.
output = df_test[['row_id']].assign(num_sold=predictions)
output.to_csv('submission.csv', index=False)