In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the train and test CSV files of the Tabular Playground Series (Jan 2022) dataset.
train_df = pd.read_csv('/kaggle/input/tabular-playground-series-jan-2022/train.csv')
test_df = pd.read_csv('/kaggle/input/tabular-playground-series-jan-2022/test.csv')

In [None]:
# Preview the first 10 rows of the training data.
train_df.head(10)

In [None]:
# (rows, columns) of the training data.
train_df.shape

In [None]:
# Number of missing values in each column.
train_df.isnull().sum()

In [None]:
# Distinct values of the categorical columns: country, store and product.
train_df['country'].unique()

In [None]:
train_df['store'].unique()

In [None]:
train_df['product'].unique()

In [None]:
# Move row_id into the index, which keeps it out of the feature columns built later from train_df.
train_df = train_df.set_index('row_id')

In [None]:
# Column dtypes before parsing the date column.
train_df.info()

In [None]:
# Parse the date column into pandas datetimes so the .dt accessor can be used on it.
train_df['date'] = pd.to_datetime(train_df['date'])

In [None]:
# Column dtypes after parsing, to confirm the conversion.
train_df.info()

In [None]:
train_df.head()

In [None]:
# Calendar feature: month number taken from the parsed date.
train_df['month'] = train_df['date'].dt.month

In [None]:
# Calendar feature: day of week as an integer, as returned by pandas' dt.dayofweek.
train_df['day_of_week'] = train_df['date'].dt.dayofweek

In [None]:
# Check the two new columns.
train_df.head()

In [None]:
# Distribution of num_sold for each store.
sns.boxplot(data=train_df,x='store',y='num_sold')

In [None]:
# Distribution of num_sold for each product.
sns.boxplot(data=train_df,x='product',y='num_sold')

In [None]:
# Distribution of num_sold per product, split by country, drawn on a 10x8 figure.
fig, ax = plt.subplots(figsize=(10, 8))
sns.boxplot(data=train_df, x='product', y='num_sold', hue='country', ax=ax)

Norway appears to have the highest sales.

In [None]:
# Total num_sold per month, shown as a horizontal bar chart.
train_df.groupby('month')['num_sold'].sum().plot(kind='barh')

In [None]:
# Total num_sold per day of week, shown as a horizontal bar chart.
train_df.groupby('day_of_week')['num_sold'].sum().plot(kind='barh')

In [None]:
## The month and day-of-week features have been extracted, so the raw date column is dropped
train_df = train_df.drop(columns=['date'])

In [None]:
train_df.head()

In [None]:
# Separate the target (num_sold) from the remaining columns, which become the features.
y= train_df['num_sold']
X= train_df.drop('num_sold',axis=1)

In [None]:
# Preview the feature frame before encoding.
X.head()

In [None]:
## one hot encoding of the dataset
# drop_first=True removes one reference level per categorical column.
# dtype=int keeps the dummy columns numeric: with pandas >= 2.0 the default is
# bool, and a frame mixing bool and int columns is converted to an object array
# that statsmodels' OLS refuses to fit.
category = ['country','store','product']
X = pd.get_dummies(data=X, columns=category, drop_first=True, dtype=int)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; random_state=42 makes the split reproducible.
# NOTE(review): this is a random row split, so validation rows presumably come from the
# same date range as the training rows — confirm that is acceptable if the test set
# covers a later period.
X_train,X_validate,y_train,y_validate = train_test_split(X,y,test_size=0.2,random_state=42)

In [None]:
# Shapes of the four pieces produced by the split.
print(X_train.shape)
print(X_validate.shape)
print(y_train.shape)
print(y_validate.shape)

In [None]:
# Feature columns the models are trained on.
X_train.columns

In [None]:
## transforming the test data in the same way as the training data
test_df = (
    test_df
    .set_index('row_id')
    .assign(
        date=lambda frame: pd.to_datetime(frame['date']),
        month=lambda frame: frame['date'].dt.month,
        day_of_week=lambda frame: frame['date'].dt.dayofweek,
    )
)

In [None]:
# The raw date is no longer needed once month and day_of_week exist.
test_df = test_df.drop(columns=['date'])

In [None]:
# One-hot encode the same categorical columns as in the training data.
# dtype=int keeps the dummy columns numeric; pandas >= 2.0 would otherwise
# produce bool columns, which turn the frame into an object array once the
# float intercept column is added for the statsmodels prediction.
category = ['country','store','product']
test_df = pd.get_dummies(data=test_df, columns=category, dtype=int)

In [None]:
# Columns of the encoded test frame.
test_df.columns

In [None]:
## aligning the test features with the training features
# Reindexing to X's columns removes the reference levels (country_Finland,
# store_KaggleMart, product_Kaggle Hat) that drop_first=True removed from the
# training data, and guarantees the same column set and order as X.
# A level that is absent from the test data becomes an all-zero column, and
# the cell can be re-run without raising a KeyError.
test_df = test_df.reindex(columns=X.columns, fill_value=0)

In [None]:
# Remaining test columns; these should match X_train.columns.
test_df.columns

In [None]:
# (rows, columns) of the prepared test frame.
test_df.shape

## Model Building

### 1. Linear Regression (base model)

In [None]:
import statsmodels.api as sm

In [None]:
# Add an explicit intercept column, fit ordinary least squares on the training split,
# and display the regression summary.
X_train_c = sm.add_constant(X_train)
model  = sm.OLS(y_train,X_train_c)
results = model.fit()
results.summary()

Since the p-value of every feature is below 0.05, all features are statistically significant.

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# variance_inflation_factor regresses each column on all the others WITHOUT
# adding an intercept itself, so the design matrix must already contain one;
# otherwise the R-squared is uncentered and the VIFs are inflated.
# The matrix is cast to float and passed as a plain ndarray so the call does
# not depend on DataFrame support or on the dummy-column dtype.
vif_exog = sm.add_constant(X_train).astype(float)
vifs = [
    variance_inflation_factor(vif_exog.values, idx)
    for idx, col in enumerate(vif_exog.columns)
    if col != 'const'  # the intercept's own VIF is not of interest
]
vif_df = pd.DataFrame({'cols':X_train.columns,'vif':vifs})
vif_df

The VIF values are well within the acceptable range, so there is no sign of problematic multicollinearity.

In [None]:
# Predict on the validation split; the intercept column is added just as it was for training.
y_validate_predict = results.predict(sm.add_constant(X_validate))

In [None]:
from sklearn.metrics import r2_score

# Score the linear baseline once, report it, and keep it for later comparison.
lr_r2 = r2_score(y_validate, y_validate_predict)
print('R2 score on validation set is ', lr_r2)
validation_scores = {'lr': lr_r2}

Treating this linear regression as the baseline, we now move on to other types of models.

In [None]:
from sklearn.metrics import mean_squared_error
# Root mean squared error of the linear baseline on the validation split.
mse = mean_squared_error(y_validate,y_validate_predict)
rmse = mse**0.5  # square root of the MSE
print('Root mean squared error is ', rmse)

In [None]:
def smape(A, F):
    """Return the symmetric mean absolute percentage error (SMAPE), in percent.

    SMAPE = 100 / n * sum(2 * |F - A| / (|A| + |F|))

    Parameters
    ----------
    A : array-like
        Actual values.
    F : array-like
        Forecast values, matched to ``A`` by position.

    Returns
    -------
    float
        SMAPE between 0 and 200. A term whose actual and forecast values are
        both zero contributes 0 instead of producing NaN from 0/0.

    Raises
    ------
    ZeroDivisionError
        If ``A`` is empty.

    Notes
    -----
    Inputs are converted to NumPy arrays first, so two pandas Series are
    compared position by position rather than being aligned on their index.
    """
    actual = np.asarray(A, dtype=float)
    forecast = np.asarray(F, dtype=float)
    n_obs = len(actual)
    if n_obs == 0:
        raise ZeroDivisionError("smape() requires at least one observation")

    numerator = 2.0 * np.abs(forecast - actual)
    denominator = np.abs(actual) + np.abs(forecast)
    # Only divide where the denominator is non-zero; the remaining (0/0) terms stay 0.
    terms = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(denominator),
        where=denominator != 0,
    )
    return 100 / n_obs * np.sum(terms)

In [None]:
# SMAPE of the linear baseline on the validation split.
print(smape(y_validate,y_validate_predict))

In [None]:
# Predict with the linear model on exactly the training feature columns, in
# the training order. Selecting them explicitly keeps the cell re-runnable
# after prediction columns have been added to test_df.
# has_constant='add' always adds the intercept column, even if a feature
# happens to be constant in the test data.
test_df['pred'] = results.predict(
    sm.add_constant(test_df[X_train.columns], has_constant='add')
)

In [None]:
# Inspect the test frame together with the new 'pred' column.
test_df

In [None]:
## negative predictions are replaced with 0
test_df['pred'] = test_df['pred'].clip(lower=0)

In [None]:
test_df.reset_index(inplace=True)

In [None]:
test_df.rename(columns={'pred':'num_sold'},inplace=True)

In [None]:
#test_df[['row_id','num_sold']].to_csv('./submission.csv',index=False)

### 2. Decision tree based model

1) Base Decision Tree

In [None]:
from sklearn.tree import DecisionTreeRegressor
# Decision tree with default hyperparameters; random_state fixes the seed.
dt = DecisionTreeRegressor(random_state=42)


In [None]:
# Fit on the training split and predict on both splits to compare the fit quality.
dt.fit(X_train,y_train)
y_validate_pred_dt = dt.predict(X_validate)
y_train_pred_dt = dt.predict(X_train)

In [None]:
# A train score far above the validation score would indicate overfitting.
print('R2 score for Validation set is ',r2_score(y_validate,y_validate_pred_dt))
print('R2 score for Train set is',r2_score(y_train,y_train_pred_dt))

2) Hyperparameter tuning

In [None]:
%%time
from sklearn.model_selection import GridSearchCV
# Grid search over 4 x 3 = 12 combinations of tree depth and minimum split size,
# scored by R2; train scores are recorded as well (return_train_score=True).
dt = DecisionTreeRegressor(random_state=42)
paramgrid = {
    'max_depth':[5,10,15,20],
    'min_samples_split':[20,40,50]
}
grid = GridSearchCV(dt,param_grid=paramgrid,scoring='r2',return_train_score=True,verbose=1,n_jobs=-1)

In [None]:
%%time
# Run the decision-tree grid search on the training split.
grid.fit(X_train,y_train)

In [None]:
# Hyperparameter combination selected by the search.
grid.best_params_

In [None]:
# Cross-validated score (R2, per scoring='r2') of the selected combination.
grid.best_score_

In [None]:
# Keep the selected estimator for validation and test predictions.
dt_model = grid.best_estimator_

In [None]:
# SMAPE of the tuned decision tree on the validation split.
y_validate_pred = dt_model.predict(X_validate)
smape(y_validate,y_validate_pred)

In [None]:
# Predict with the tuned decision tree on the training feature columns.
# X_train.columns is used instead of a hand-copied list of names so the
# columns and their order always match what the model was fitted on.
test_df['dt_pred'] = dt_model.predict(test_df[X_train.columns])

In [None]:
# The 'num_sold' column holds the linear-regression predictions; label it as such.
test_df = test_df.rename(columns={'num_sold': 'lr_pred'})

In [None]:
#test_df[['row_id','dt_pred']].rename(columns={'dt_pred':'num_sold'}).to_csv('./submission.csv',index=False)

### 3. Ensemble Training - Random Forest

1.) Base Model

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Random forest with default hyperparameters; random_state fixes the seed.
rf_base = RandomForestRegressor(random_state=42)

In [None]:
rf_base.fit(X_train,y_train)

In [None]:
y_validate_rfbase_pred = rf_base.predict(X_validate)

In [None]:
# Validation R2 of the untuned forest.
print('Base RF model r2score is ',r2_score(y_validate,y_validate_rfbase_pred))

2.) Hyperparameter Tuning

In [None]:
# Search space for the random forest: 4 x 3 x 3 x 3 = 108 combinations.
paramgrid={
    'max_depth':[5,10,15,20],
    'min_samples_split':[20,40,50],
    'n_estimators':[20,30,40],
    'max_features':[3,4,5]
}

In [None]:
rf = RandomForestRegressor(random_state=42)

In [None]:
# Note: this rebinds `grid`, replacing the decision-tree search object.
grid = GridSearchCV(rf,param_grid=paramgrid,n_jobs=-1,verbose=1,scoring='r2')

In [None]:
%%time
# Run the random-forest grid search on the training split.
grid.fit(X_train,y_train)

In [None]:
## params of optimised model
grid.best_params_

In [None]:
# Keep the selected forest for validation and test predictions.
rf_optimum = grid.best_estimator_

In [None]:
y_validate_rfopt_pred = rf_optimum.predict(X_validate)

In [None]:
# Validation R2 of the tuned forest, to compare with the base forest.
print('Optimized RF model r2score is ',r2_score(y_validate,y_validate_rfopt_pred))

In [None]:
## use the optimized model for prediction on test_df
# X_train.columns is used instead of a hand-copied list of names so the
# columns and their order always match what the model was fitted on.
test_df['rf_pred'] = rf_optimum.predict(test_df[X_train.columns])

In [None]:
## negative predictions are replaced with 0
test_df['rf_pred'] = test_df['rf_pred'].clip(lower=0)

In [None]:
# Build the two-column submission (row_id, num_sold) from the random-forest predictions and write it out.
submission = test_df[['row_id', 'rf_pred']].rename(columns={'rf_pred': 'num_sold'})
submission.to_csv('./submission.csv', index=False)