In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input tree and echo the full path of every file found.
for folder, _, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(folder, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# EDA

In [None]:
# Read the training table, the test table and the sample submission, in that order.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/tabular-playground-series-jan-2022/train.csv',
        '/kaggle/input/tabular-playground-series-jan-2022/test.csv',
        '/kaggle/input/tabular-playground-series-jan-2022/sample_submission.csv',
    )
)
# Bare expression so the notebook renders the training frame.
train

In [None]:
# Drop the 'row_id' column from both dataframes (in place).
# errors='ignore' keeps this cell re-runnable: without it, executing the cell a
# second time raises KeyError because the column is already gone.
train.drop(columns = ['row_id'], inplace = True, errors = 'ignore')
test.drop(columns = ['row_id'], inplace = True, errors = 'ignore')

In [None]:
# Check missing values: info() lists each column's dtype and non-null count,
# so a non-null count below the total row count reveals missing entries.
train.info()

In [None]:
# Number of training rows for each distinct value of 'country'.
train['country'].value_counts()

In [None]:
# Number of training rows for each distinct value of 'product'.
train['product'].value_counts()

In [None]:
# Number of training rows for each distinct value of 'store'.
train['store'].value_counts()

In [None]:
# Report the first and last 'date' value found in each dataset.
for label, frame in (('Train', train), ('Test', test)):
    print(f'{label} data duration: {frame["date"].min()} to {frame["date"].max()}')

# Data Visualization

In [None]:
# Parse the 'date' column of both frames into pandas datetimes so the `.dt`
# accessor and time-based grouping can be used for easy handling later on.
train['date'], test['date'] = (pd.to_datetime(df['date']) for df in (train, test))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Mean of 'num_sold' per month-end bin, indexed by 'date'.
train_monthly = (
    train
    .set_index('date')
    .groupby([pd.Grouper(freq = 'M')])[['num_sold']]
    .mean()
)

# Draw the per-date series and the monthly mean on the same axes.
fig, ax = plt.subplots(figsize = (12, 7))
sns.lineplot(data = train, x = 'date', y = 'num_sold', label = 'daily', ax = ax)
sns.lineplot(data = train_monthly, x = 'date', y = 'num_sold', label = 'monthly mean', color = 'black', ax = ax)
ax.set_title('Monthly Trend')
ax.grid(alpha = 0.5)
plt.show()

In [None]:
# Mean of 'num_sold' per (month-end bin, country) pair.
train_monthly_country = (
    train
    .set_index('date')
    .groupby([pd.Grouper(freq = 'M'), 'country'])[['num_sold']]
    .mean()
)

# One line per country, coloured through `hue`.
fig, ax = plt.subplots(figsize = (12, 7))
sns.lineplot(data = train_monthly_country, x = 'date', y = 'num_sold', hue = 'country', ax = ax)
ax.set_title('Monthly Trend by Country')
ax.grid(alpha = 0.5)
plt.show()

In [None]:
# Day of the week as an integer (Monday = 0 ... Sunday = 6).
# `.dt.weekday` is the pandas alias of `.dt.dayofweek`.
train['dayofweek'] = train['date'].dt.weekday
test['dayofweek'] = test['date'].dt.weekday

In [None]:
# Mean of 'num_sold' per (month-end bin, day of week) pair.
train_dayofweek = (
    train
    .set_index('date')
    .groupby([pd.Grouper(freq = 'M'), 'dayofweek'])[['num_sold']]
    .mean()
)

# One line per weekday value, coloured through `hue`.
fig, ax = plt.subplots(figsize = (12, 7))
sns.lineplot(data = train_dayofweek, x = 'date', y = 'num_sold', hue = 'dayofweek', ax = ax)
ax.set_title('Trend by day of the week')
ax.grid(alpha = 0.5)
plt.show()

In [None]:
# weekend
# Flag Saturday/Sunday rows: 'dayofweek' is 0 for Monday, so 5 and 6 are the weekend.
train['weekend'] = train['dayofweek'] >= 5
# The test flag must come from test's own 'dayofweek'. Deriving it from train
# would align on the row index and copy values belonging to unrelated train rows.
test['weekend'] = test['dayofweek'] >= 5

# Mean of 'num_sold' per (month-end bin, weekend flag) pair.
train_weekend = train.set_index('date').groupby([pd.Grouper(freq = 'M'), 'weekend'])[['num_sold']].mean()

plt.figure(figsize = (12, 7))
sns.lineplot(x = 'date', y = 'num_sold', hue = 'weekend', data = train_weekend)
plt.title('Weekend vs. Weekday Trend Comparison')
plt.grid(alpha = 0.5)
plt.show()

# Feature Engineering

In [None]:
import holidays

# Holiday calendars already built, keyed by (country, year). isHoliday is called
# once per dataframe row, so reusing a calendar avoids rebuilding it every time.
_HOLIDAY_CALENDARS = {}

# Check if date is a holiday
def isHoliday(country, date):
    """Return 1 if `date` is a public holiday in `country`, otherwise 0.

    Parameters
    ----------
    country : str
        Country identifier, passed unchanged to `holidays.CountryHoliday`.
    date : datetime.date
        Day to look up; its `.year` selects which calendar is built.

    Returns
    -------
    int
        1 when `date` is found in the country's holiday calendar, else 0.
    """
    key = (country, date.year)
    calendar = _HOLIDAY_CALENDARS.get(key)
    # Compare against None explicitly: a calendar with no entries is falsy
    # but is still a valid cached result.
    if calendar is None:
        calendar = holidays.CountryHoliday(country, years = date.year)
        _HOLIDAY_CALENDARS[key] = calendar
    return int(date in calendar)

In [None]:
# Holiday flag (0/1) per row, computed from each row's country and calendar date.
train['isHoliday'] = [
    isHoliday(country, stamp.date())
    for country, stamp in zip(train['country'], train['date'])
]
test['isHoliday'] = [
    isHoliday(country, stamp.date())
    for country, stamp in zip(test['country'], test['date'])
]

In [None]:
# How many training rows fall on a holiday (1) versus a regular day (0).
train['isHoliday'].value_counts()

In [None]:
# Calendar features for the training frame, all derived from 'date'.
train_dt = train['date'].dt
train['year'] = train_dt.year
train['quarter'] = train_dt.quarter
train['month'] = train_dt.month
# isocalendar() returns a frame; its 'week' column is cast to a plain int dtype.
train['week'] = train_dt.isocalendar().week.astype(int)
train['day'] = train_dt.day
train['dayofyear'] = train_dt.dayofyear
train['daysinmonth'] = train_dt.days_in_month
train['dayofweek'] = train_dt.dayofweek
# dayofweek runs 0-6, so values of 5 and above (Saturday, Sunday) map to 1.
train['weekend'] = train_dt.dayofweek.ge(5).astype(int)

In [None]:
# Calendar features for the test frame, all derived from 'date'.
test_dt = test['date'].dt
test['year'] = test_dt.year
test['quarter'] = test_dt.quarter
test['month'] = test_dt.month
# isocalendar() returns a frame; its 'week' column is cast to a plain int dtype.
test['week'] = test_dt.isocalendar().week.astype(int)
test['day'] = test_dt.day
test['dayofyear'] = test_dt.dayofyear
test['daysinmonth'] = test_dt.days_in_month
test['dayofweek'] = test_dt.dayofweek
# dayofweek runs 0-6, so values of 5 and above (Saturday, Sunday) map to 1.
test['weekend'] = test_dt.dayofweek.ge(5).astype(int)

In [None]:
# Display the training frame, now including the engineered columns.
train

In [None]:
# Drop the 'date' column from both dataframes (in place).
# errors='ignore' keeps this cell re-runnable: without it, executing the cell a
# second time raises KeyError because the column is already gone.
train.drop(columns = ['date'], inplace = True, errors = 'ignore')
test.drop(columns = ['date'], inplace = True, errors = 'ignore')

# Data Preprocessing

In [None]:
# Re-inspect column dtypes and non-null counts after feature engineering;
# the categorical/numeric split below is decided by each column's dtype.
train.info()

In [None]:
# Column to predict.
target = ['num_sold']

# Split the columns available in `test` by the dtype they have in `train`:
# 'object' columns are categorical, every other column is numeric.
cat_features, num_features = [], []
for column in test.columns:
    bucket = cat_features if train[column].dtype == 'object' else num_features
    bucket.append(column)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Numeric columns: standardize only.
num_pipeline = Pipeline([
    ('num_scaler', StandardScaler()),
])

# Build a dense one-hot encoder in a way that works across scikit-learn versions:
# `sparse` was renamed to `sparse_output` in 1.2 and removed in 1.4, while
# releases before 1.2 only know `sparse`. An unknown keyword raises TypeError.
try:
    cat_encoder = OneHotEncoder(sparse_output = False, handle_unknown = 'ignore')
except TypeError:
    cat_encoder = OneHotEncoder(sparse = False, handle_unknown = 'ignore')

# Categorical columns: one-hot encode (categories unseen during fit are ignored),
# then standardize the resulting indicator columns.
cat_pipeline = Pipeline([
    ('cat_encoder', cat_encoder),
    ('cat_scaler', StandardScaler()),
])

In [None]:
from sklearn.compose import ColumnTransformer

# Route each column group through its own pipeline: (name, transformer, columns).
column_steps = [
    ('num', num_pipeline, num_features),
    ('cat', cat_pipeline, cat_features),
]
preprocess_pipeline = ColumnTransformer(column_steps)

In [None]:
# Same column order for both frames: numeric features first, then categorical.
feature_columns = num_features + cat_features

# Fit the preprocessing on the training rows only, then reuse it for the test rows.
X_train = preprocess_pipeline.fit_transform(train[feature_columns])
X_test = preprocess_pipeline.transform(test[feature_columns])

# Target values as a single-column frame.
y_train = train[target]

# LightGBM Regressor

In [None]:
from lightgbm import LGBMRegressor

# Hyper-parameters for the gradient-boosted tree regressor.
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'learning_rate': 0.05,
    'n_jobs': -1,
}

lgbm_reg = LGBMRegressor(**params)
# y_train is a single-column DataFrame. Flatten it to 1-D for fitting so the
# estimator does not emit the "column-vector y was passed" DataConversionWarning.
lgbm_reg.fit(X_train, np.ravel(y_train))

# Submission

In [None]:
# Display the sample submission frame loaded earlier; predictions are written into it below.
submission

In [None]:
# Predict on the preprocessed test matrix and overwrite the placeholder targets.
y_pred = lgbm_reg.predict(X_test)
submission['num_sold'] = y_pred

# Save without the dataframe index.
submission.to_csv(path_or_buf = 'my_submission.csv', index = False)

# Acknowledgements

Please visit these notebooks and upvote if you like them:

https://www.kaggle.com/subinium/tps-jan-happy-new-year/notebook

https://www.kaggle.com/c/tabular-playground-series-jan-2022/discussion/298411

https://www.kaggle.com/c/tabular-playground-series-jan-2022/discussion/298300