In [None]:
import numpy as np
import pandas as pd

# Load the competition's train and test splits, reporting each shape right after it is read.

train = pd.read_csv('../input/tabular-playground-series-jan-2022/train.csv')
print('train shape : ', train.shape)
test = pd.read_csv('../input/tabular-playground-series-jan-2022/test.csv')
print('test shape : ', test.shape)

In [None]:
train.head()

In [None]:
test.head()

In [None]:
# Re-index both frames by their `row_id` column.
train, test = (frame.set_index('row_id') for frame in (train, test))

In [None]:
#Initial EDA 

train.head(10)

In [None]:
train.info()

In [None]:
#Check for null values

train.isnull().sum()

In [None]:
test.isnull().sum()

Neither the train set nor the test set contains any null values.

In [None]:
#Looking at basic stats

#For numeric cols

train.describe().T  #The transpose is used for better view.

In [None]:
test.describe().T

In [None]:
#For categorical columns

train.describe(include=['O'])

The training set therefore contains 3 unique countries, 2 unique stores (KaggleMart and KaggleRama), and 3 unique products.

In [None]:
test.describe(include=['O'])

In [None]:
# Frequency distribution of the key categorical variables: country, store and product.
# The three tables are printed one after another, separated by two blank lines.

print(
    *(train[column].value_counts() for column in ('country', 'store', 'product')),
    sep='\n\n\n',
)



One interesting aspect is that the country and product value counts are identical (8766 rows each). Does that mean a particular country produces just one of those products? Let's check it out.

In [None]:
train.groupby(['country','product']).size()

In [None]:
#Looking at the dates feature to understand what period the data belongs to. 

def min_max_dt(df, name='train'):
    """Print the smallest and largest values found in ``df['date']``.

    Parameters
    ----------
    df : DataFrame holding a ``date`` column.
    name : label inserted into the printed message (defaults to ``'train'``).

    Returns
    -------
    None; the range is only printed.
    """
    # The column is compared exactly as stored; no datetime conversion happens here.
    dates = df['date']
    earliest, latest = dates.min(), dates.max()
    print(f'For the {name} data : Min_date - {earliest} / Max_date - {latest}')
          
          
min_max_dt(train, 'train')
min_max_dt(test, 'test')


The train data runs from Jan 2015 to Dec 2018, while the test data covers Jan 2019 to Dec 2019.

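The competition is evaluated with SMAPE (symmetric mean absolute percentage error), where $y$ is the true value and $\hat{y}$ the prediction:

$$\text{SMAPE} = \frac{1}{n}\sum_{i=1}^{n}\frac{200\,|y_i - \hat{y}_i|}{|y_i| + |\hat{y}_i|}$$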

In [None]:

def smape_loss(y_true, y_pred):
    """Element-wise symmetric absolute percentage error on a 0-200 scale.

    Computes ``200 * |y_true - y_pred| / (|y_true| + |y_pred|)`` for every pair
    of values.

    Parameters
    ----------
    y_true : array-like of observed values.
    y_pred : array-like of predicted values, broadcastable against ``y_true``.

    Returns
    -------
    numpy.ndarray of per-element errors. Positions where both the true and the
    predicted value are zero yield 0.0 rather than NaN.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    abs_error = np.abs(y_true - y_pred)
    # The symmetric scale uses magnitudes on both sides, so a negative true
    # value cannot shrink (or zero out) the denominator.
    scale = np.abs(y_true) + np.abs(y_pred)

    # Swap in a dummy divisor where the scale is zero so no 0/0 is evaluated;
    # those positions are forced to 0.0 by the outer np.where.
    safe_scale = np.where(scale == 0, 1.0, scale)
    return np.where(scale == 0, 0.0, abs_error / safe_scale * 200)

# Example: smape_loss(np.array([1, 2]), np.array([3, 4])) evaluates to [100, 66.6667]

In [None]:
#Credit to https://www.kaggle.com/jaredfeng/tps-jan22-inprog-v5

holiday_path = '../input/holidays-finland-norway-sweden-20152019/Holidays_Finland_Norway_Sweden_2015-2019.csv'

def GetHoliday(holiday_path, df):
    """
    Add a 0/1 ``holiday`` column marking rows whose date is a holiday in that
    row's own country.

    Parameters
    ----------
    holiday_path : path to a CSV file with (at least) ``Country`` and ``Date``
        columns, one row per holiday.
    df : DataFrame with ``country`` and ``date`` columns. It is modified in
        place; the ``date`` column itself is left untouched.

    Returns
    -------
    The same ``df`` object, with an integer ``holiday`` column added
    (1 = holiday in the row's country, 0 = otherwise).
    """
    holiday = pd.read_csv(holiday_path)

    # Compare calendar days as ISO strings so the lookup does not depend on
    # whether either date column currently holds strings or datetimes.
    holiday_days = pd.to_datetime(holiday['Date']).dt.strftime('%Y-%m-%d')
    row_days = pd.to_datetime(df['date']).dt.strftime('%Y-%m-%d')

    # A row is a holiday only when its (country, day) pair is listed, so a
    # holiday in one country never flags a sale made in another. This works
    # for any country present in the holiday file, without temporary columns.
    holiday_keys = set(zip(holiday['Country'], holiday_days))
    is_holiday = [key in holiday_keys for key in zip(df['country'], row_days)]

    # An explicit integer array keeps the column dtype stable (also for an
    # empty frame) and is assigned positionally, independent of df's index.
    df['holiday'] = np.array(is_holiday, dtype=int)
    return df

train = GetHoliday(holiday_path, train)
test = GetHoliday(holiday_path, test)


In [None]:
# Credit to https://www.kaggle.com/ranjeetshrivastav/tps-jan-21-base-xgb
# and https://www.kaggle.com/bernhardklinger/tps-jan-2022/notebook

def feature_eng(df):
    """Replace the ``date`` column of ``df`` with calendar features, in place.

    Columns added (in this order): ``week`` (ISO week number), ``quarter``
    (``'Q1'``..``'Q4'``), ``day`` (day of month), ``dayofyear``, ``weekend``
    (True for Saturday/Sunday) and ``weekday`` (``'WD0'`` = Monday ..
    ``'WD6'`` = Sunday). The ``date`` column is dropped afterwards, so the
    function cannot be applied twice to the same frame.

    Parameters
    ----------
    df : DataFrame with a ``date`` column that ``pd.to_datetime`` can parse.
        Assumes no missing dates (the integer cast of ``week`` would fail on
        them).

    Returns
    -------
    None; ``df`` is modified in place.
    """
    dates = pd.to_datetime(df['date'])

    # `Series.dt.week` is deprecated and was removed in pandas 2.0;
    # `isocalendar().week` provides the ISO week number instead. It comes back
    # as nullable UInt32, so cast to a plain integer column.
    df['week'] = dates.dt.isocalendar().week.astype(int)
    #df['year'] = 'Y' + df['date'].dt.year.astype(str)
    df['quarter'] = 'Q' + dates.dt.quarter.astype(str)
    df['day'] = dates.dt.day

    # In leap years, shift every day number from 60 (Feb 29) onward down by
    # one, so Feb 29 shares 59 with Feb 28 and all later dates carry the same
    # number they have in a non-leap year.
    day_of_year = dates.dt.dayofyear
    shifted = dates.dt.is_leap_year & (day_of_year >= 60)
    df['dayofyear'] = day_of_year.where(~shifted, day_of_year - 1)

    weekday_number = dates.dt.weekday
    df['weekend'] = weekday_number >= 5
    df['weekday'] = 'WD' + weekday_number.astype(str)
    df.drop(columns=['date'], inplace=True)

feature_eng(train)
feature_eng(test)

In [None]:
train.columns

In [None]:
train.head()

In [None]:
test.head()

In [None]:
train.info()

In [None]:
train['day'].value_counts()

Steps to take

1. Convert the categorical variables into numeric
2. Align the train and test columns
3. Run the baseline linear model
4. Predict and submit predictions

In [None]:
# Separate the target from the predictors; the test frame is used as-is.
y_train = train['num_sold']
X_train = train.drop(columns=['num_sold'])
X_test = test


In [None]:
X_train.columns

Step 1 - Converting categorical variables into numeric & Step 2 - Align the train and test columns


In [None]:
# One-hot encode the categorical columns of each frame independently.
X_train_ohe = pd.get_dummies(X_train)
X_test_ohe = pd.get_dummies(X_test)

# join='left' keeps exactly the training columns, in training order. A dummy
# column that exists only in train is created in test; fill_value=0 marks it as
# "category absent" instead of the default NaN, which the downstream scaler and
# linear model could not use for prediction.
final_train, final_test = X_train_ohe.align(X_test_ohe, join='left', axis=1, fill_value=0)

# Please refer to Dan Becker's notebook for background on aligning one-hot encoded train/test columns.



In [None]:
final_train.head()

In [None]:
final_test.head()

3. Run the baseline linear regression model

In [None]:
#Standardising the data
from sklearn.preprocessing import StandardScaler
# Learn the per-column scaling statistics from the training matrix only, then
# apply that same fitted transformation to both matrices.
scaler = StandardScaler().fit(final_train)
X_train_scaled = scaler.transform(final_train)
X_test_scaled = scaler.transform(final_test)



In [None]:
#Creating a custom scoring for cross_validation

from sklearn.metrics import fbeta_score, make_scorer

# Credit to https://www.kaggle.com/c/web-traffic-time-series-forecasting/discussion/36414
def SMAPE(y_true, y_pred):
    """Mean symmetric absolute percentage error on a 0-200 scale.

    Averages ``200 * |y_true - y_pred| / (|y_true| + |y_pred|)`` over all
    elements. Elements where both values are zero contribute 0.

    Parameters
    ----------
    y_true : array-like of observed values (list, ndarray or Series).
    y_pred : array-like of predictions, same length as ``y_true``.

    Returns
    -------
    float : the mean error; lower is better.
    """
    # Work on plain arrays so values are compared positionally: pandas inputs
    # with differing indexes are not re-aligned, and lists are accepted too.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    abs_error = np.abs(y_true - y_pred)
    # Magnitudes on both sides keep the denominator non-negative even when a
    # true value is negative.
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 200.0

    # Divide only where the denominator is non-zero; the remaining entries keep
    # the 0.0 they were initialised with, so no divide-by-zero is evaluated.
    diff = np.divide(abs_error, denominator,
                     out=np.zeros_like(abs_error), where=denominator != 0)
    return np.mean(diff)

# Wrap SMAPE as a scikit-learn scorer. greater_is_better=False declares it a
# loss, so values reported through this scorer are negated SMAPE values
# (closer to zero is better).
smape_score = make_scorer(SMAPE, greater_is_better=False)


Now let's run linear regression as our baseline model. Since we want to model the log of the dependent variable, we can use scikit-learn's `TransformedTargetRegressor`, which applies the log before fitting and the exponential when predicting.

https://scikit-learn.org/stable/modules/generated/sklearn.compose.TransformedTargetRegressor.html



In [None]:

from sklearn.linear_model import LinearRegression
from sklearn.compose import TransformedTargetRegressor
from sklearn.model_selection import cross_val_score


# Linear regression fitted on log(num_sold); predictions are mapped back with exp.
tt = TransformedTargetRegressor(
    regressor=LinearRegression(),
    func=np.log,
    inverse_func=np.exp,
)

# 5-fold cross-validation scored with the custom SMAPE scorer.
scores = cross_val_score(estimator=tt, X=X_train_scaled, y=y_train, cv=5, scoring=smape_score)


In [None]:
scores

In [None]:
#Lets fit on the training data
tt.fit(X_train_scaled, y_train)



4. Predict and submit predictions

In [None]:

test_y_pred = tt.predict(X_test_scaled)



In [None]:
test_y_pred

In [None]:
#submission

# There must be exactly one prediction per test row.
assert len(test_y_pred) == len(test.index)

# Pair each row_id with its prediction and write the file Kaggle expects.
submission_df = pd.DataFrame({'row_id': test.index, 'num_sold': test_y_pred})
submission_df.to_csv('submission.csv', index=False)

In [None]:
submission_df

Next steps:
    
    1. Train on the entire training set and predict
    2. Create additional rolling features
    3. Use Gradient Boosting methods to add to the model