In [None]:
import numpy as np
import pandas as pd
import matplotlib as mp
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the competition train and test tables, using `row_id` as the index.
train, test = (
    pd.read_csv(csv_path, index_col='row_id')
    for csv_path in (
        '../input/tabular-playground-series-jan-2022/train.csv',
        '../input/tabular-playground-series-jan-2022/test.csv',
    )
)

In [None]:
# Preview the first five training rows.
train.head(n=5)

In [None]:
# Preview the first five test rows.
test.head(n=5)

In [None]:
# Summary statistics of the numeric training columns.
# `describe` must be *called*; the bare attribute only displays the bound method's repr.
train.describe()

In [None]:
# List the distinct values of each categorical column.
# The loop variable is intentionally not named `col`: a later cell binds `col`
# to the list of columns passed to `pd.get_dummies`, and re-running this cell
# afterwards would silently replace that list with a single string.
for cat_col in ['country', 'store', 'product']:
    print(cat_col, train[cat_col].unique())

In [None]:
# Report the first and last date covered by each split.
train_first, train_last = train['date'].min(), train['date'].max()
test_first, test_last = test['date'].min(), test['date'].max()
print('train date duration : ', train_first, train_last)
print('test date duration : ', test_first, test_last)

In [None]:
# Row counts per category level in the training split.
# The loop variable is intentionally not named `col`, so that the `col` list
# used later for one-hot encoding is never overwritten when this cell is re-run.
for cat_col in ['country', 'store', 'product']:
    print(train[cat_col].value_counts())
    print('\n')

In [None]:
# Row counts per category level in the test split.
# The loop variable is intentionally not named `col`, so that the `col` list
# used later for one-hot encoding is never overwritten when this cell is re-run.
for cat_col in ['country', 'store', 'product']:
    print(test[cat_col].value_counts())
    print('\n')

In [None]:
# Count missing values per training column.
train.isnull().sum()

In [None]:
# Count missing values per test column.
test.isnull().sum()

In [None]:
# Bar chart of training rows per country, most frequent first.
country_order = train["country"].value_counts().index
fig, ax = plt.subplots(figsize=(8, 4))
sns.countplot(data=train, x='country', order=country_order, ax=ax)
ax.set_title("Total counts of country", size=15)
plt.show()

In [None]:
# Bar chart of training rows per store, most frequent first.
store_order = train["store"].value_counts().index
fig, ax = plt.subplots(figsize=(8, 4))
sns.countplot(data=train, x='store', order=store_order, ax=ax)
ax.set_title("Total counts of stores", size=15)
plt.show()

In [None]:
# Bar chart of training rows for the three most frequent products.
top_products = train["product"].value_counts().index[:3]
fig, ax = plt.subplots(figsize=(7, 2))
sns.countplot(data=train, x='product', order=top_products, palette="flag", ax=ax)
ax.set_title("Total count kaggle products ", size=13)
plt.show()

In [None]:
# Parse the date strings into datetimes. The format is inferred, exactly as the
# test-set cell does for the same column; a hard-coded '%Y/%m/%d' pattern is
# rejected by strict parsers if the file uses a different separator.
train['date'] = pd.to_datetime(train['date'])

# Total units sold per calendar day. Only the target column is aggregated:
# summing the whole frame would also "sum" the string categorical columns.
sld_time = train.groupby('date', as_index=False)['num_sold'].sum()

fig, ax = plt.subplots(figsize=(15, 7))
sns.lineplot(data=sld_time, x='date', y='num_sold', ax=ax)
ax.set_title('number of products sold over time', fontsize=14)
plt.show()

In [None]:
# Mean units sold per calendar month and country.
by_date = train.set_index('date')
train_monthly_country = by_date.groupby([pd.Grouper(freq='M'), 'country'])[['num_sold']].mean()

# One line per country; seaborn resolves `date` and `country` from the index levels.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(12, 7))
sns.lineplot(data=train_monthly_country, x="date", y='num_sold', hue='country', ax=ax)
ax.set_title("Monthly Trend by Country", fontsize=15, fontweight='bold', loc='left')
ax.grid(alpha=0.5)

In [None]:
# Mean units sold per calendar month and store.
by_date = train.set_index('date')
train_monthly_store = by_date.groupby([pd.Grouper(freq='M'), 'store'])[['num_sold']].mean()

# One line per store; seaborn resolves `date` and `store` from the index levels.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(12, 8))
sns.lineplot(data=train_monthly_store, x='date', y='num_sold', hue='store', ax=ax)
ax.set_title('Monthly Trend by Stores', fontsize=15, fontweight='bold', loc='left')
ax.grid(alpha=0.5)

In [None]:
# Derive calendar features from the parsed `date` column.
# Maps each new column name to the `.dt` accessor attribute it is read from.
date_parts = {
    'day': 'day',
    'month': 'month',
    'year': 'year',
    'weekday': 'dayofweek',
    'quarter': 'quarter',
    'yearday': 'dayofyear',
}
for feature, accessor in date_parts.items():
    train[feature] = getattr(train['date'].dt, accessor)

In [None]:
# Preview the first ten rows with the new calendar features.
# (A negative `n` would instead select every row except the last ten.)
train.head(10)

In [None]:
# Sample variance (ddof=1, the pandas default) of the untransformed target.
train['num_sold'].var(ddof=1)

In [None]:
from scipy.stats import boxcox
# With no lambda supplied, `boxcox` returns the transformed values together
# with the fitted lambda. The target column is overwritten with the transformed
# values and `lam` is kept so predictions can be mapped back later.
out = boxcox(train['num_sold'])
train['num_sold'], lam = out

In [None]:
# Distribution of the Box-Cox-transformed target with a KDE overlay.
sns.histplot(data=train, x='num_sold', kde=True)

In [None]:
# Box plot, variance and skewness of the transformed target.
# The series is passed as the `x` keyword: passing it positionally is
# deprecated in older seaborn and is interpreted as `data` in newer releases.
sns.boxplot(x=train['num_sold'])
print(train['num_sold'].var())
train['num_sold'].skew()

In [None]:
# Preview the frame after the target transformation.
train.head(n=5)

In [None]:
# Categorical columns to one-hot encode; the same list is reused for the test set.
col = ['country', 'store', 'product']
# drop_first=True drops one level per column.
train = pd.get_dummies(data=train, columns=col, drop_first=True)
train.head(n=5)

In [None]:
import math
def fourier(new_df, orders=(1, 2, 3)):
    """Append sine/cosine seasonality features to ``new_df`` and return it.

    For every harmonic order ``k`` in ``orders`` six columns are added, in
    this order: ``sin{k}``/``cos{k}`` (day-of-year over a 365-day period),
    ``weeksin{k}``/``weekcos{k}`` (``weekday`` over 7) and
    ``mnthsin{k}``/``mnthcos{k}`` (``month`` over 12).

    Parameters
    ----------
    new_df : pandas.DataFrame
        Must contain a datetime ``date`` column plus numeric ``weekday`` and
        ``month`` columns. The frame is modified in place.
    orders : iterable of int, optional
        Harmonic orders to generate. Defaults to ``(1, 2, 3)``, which
        reproduces the original fixed behaviour.

    Returns
    -------
    pandas.DataFrame
        The same ``new_df`` object, with the new columns appended.
    """
    # Base angle (order 1) of each row within its yearly / weekly / 12-month cycle.
    # The yearly period is fixed at 365 days, so day 366 of a leap year wraps slightly.
    base_angles = {
        '': new_df.date.dt.dayofyear / 365 * 2 * math.pi,
        'week': new_df['weekday'] / 7 * 2 * math.pi,
        'mnth': new_df['month'] / 12 * 2 * math.pi,
    }
    for k in orders:
        for prefix, angle in base_angles.items():
            new_df[f'{prefix}sin{k}'] = np.sin(angle * k)
            new_df[f'{prefix}cos{k}'] = np.cos(angle * k)
    return new_df
# Add the seasonality features to the training frame.
train = fourier(new_df=train)

In [None]:
# The raw timestamp is no longer needed once the calendar and Fourier features
# exist. Reassign rather than mutating with `inplace=True`.
train = train.drop(columns=['date'])
train.head()

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise `year` using statistics learned from the training split only;
# `sc` is kept so the same scaling can be applied to the test split.
sc = StandardScaler().fit(train[['year']])
train[['year']] = sc.transform(train[['year']])

In [None]:
# Separate the (Box-Cox-transformed) target from the feature matrix without
# mutating `train`, so this cell can be re-run safely: `train.pop` would remove
# the column and raise KeyError on a second execution.
y = train['num_sold']
X = train.drop(columns=['num_sold'])

In [None]:
# Preview the final feature matrix.
X.head(n=5)

In [None]:
from lightgbm import LGBMRegressor
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from catboost import CatBoostRegressor

# CatBoost hyper-parameters, collected in one place so they are easy to tune.
catboost_params = dict(
    iterations=5000,
    learning_rate=0.04,
    bootstrap_type='Bayesian',
    boosting_type='Plain',
    loss_function='MAE',
    l2_leaf_reg=5,  # added as regularization
    eval_metric='SMAPE',
)
model = CatBoostRegressor(**catboost_params)

In [None]:
# Fit on the full training set, logging progress every 1000 iterations.
# No `early_stopping_rounds` is passed: early stopping tracks a metric on a
# validation set, and no `eval_set` is supplied here, so the argument had
# nothing to monitor and only suggested a safeguard that was not in effect.
model.fit(
    X, y,
    verbose=1000,
)

In [None]:
# Display the fitted model's feature importances.
# NOTE(review): `prettified=True` presumably requests a labelled, tabular result — confirm against the CatBoost docs.
model.get_feature_importance(prettified=True)

In [None]:
# In-sample predictions on the training features. The model was fitted on the
# Box-Cox-transformed target, so `yp` is on that transformed scale.
yp = model.predict(X)

In [None]:
from scipy.special import inv_boxcox
# Map both the target and the in-sample predictions back to the original
# scale, using the lambda fitted during the forward transform.
y, yp = (inv_boxcox(values, lam) for values in (y, yp))

In [None]:
# Read a fresh copy of the test table (indexed by `row_id`) for preprocessing.
df_test = pd.read_csv(
    '../input/tabular-playground-series-jan-2022/test.csv',
    index_col='row_id',
)
df_test.head(n=5)

In [None]:
# Parse the dates, then derive the same calendar features as for the training set.
df_test['date'] = pd.to_datetime(df_test['date'])
test_dt = df_test['date'].dt
df_test['day'] = test_dt.day
df_test['month'] = test_dt.month
df_test['year'] = test_dt.year
df_test['weekday'] = test_dt.dayofweek
df_test['quarter'] = test_dt.quarter
df_test['yearday'] = test_dt.dayofyear

In [None]:
# Reuse the scaler fitted on the training years (transform only, no re-fit).
df_test[['year']] = sc.transform(df_test[['year']])
# Same encoding and seasonality features as the training pipeline.
df_test = pd.get_dummies(df_test, columns=col, drop_first=True)
df_test = fourier(df_test)
df_test = df_test.drop(columns=['date'])
# The test set is one-hot encoded independently of the training set, so force
# its columns (and their order) to match the matrix the model was fitted on.
# A dummy column absent from the test split is filled with 0.
df_test = df_test.reindex(columns=X.columns, fill_value=0)
df_test.head()

In [None]:
# Predict on the prepared test features. The model was fitted on the
# Box-Cox-transformed target, so `y_pred` is on that transformed scale.
y_pred = model.predict(df_test)

In [None]:
# Sanity check: (rows, columns) of the prepared test feature matrix.
df_test.shape

In [None]:
# Undo the Box-Cox transform, then round each prediction up to a whole number.
restored_sales = inv_boxcox(y_pred, lam)
output = np.ceil(restored_sales)

In [None]:
# Assemble the submission table: one prediction per test `row_id`.
submission_columns = {
    'row_id': df_test.index,
    'num_sold': output,
}
data1 = pd.DataFrame(data=submission_columns)

In [None]:
# Preview the submission table.
data1.head(n=5)

In [None]:
# Write the submission file; the positional index is omitted because `row_id` is already a column.
data1.to_csv(path_or_buf='submission.csv', index=False)