# Fast Walkthrough : TPS JAN 2k22

This notebook presents a short, quick approach to building and submitting a solution for a Tabular Playground Series (TPS) competition.

Do **upvote** if you like it.

In [None]:
# Importing basic data manipulation libraries

import numpy as np
import pandas as pd

In [None]:
# Filtering warning so that they do not appear on the notebook

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Locations of the competition CSV files

competition_dir = '../input/tabular-playground-series-jan-2022'
train_data_path = competition_dir + '/train.csv'
test_data_path = competition_dir + '/test.csv'

In [None]:
# Read the training CSV into a DataFrame and preview its first rows

train_df = pd.read_csv(filepath_or_buffer = train_data_path)
train_df.head(n = 5)

In [None]:
# Read the test CSV into a DataFrame and preview its first rows

test_df = pd.read_csv(filepath_or_buffer = test_data_path)
test_df.head(n = 5)

In [None]:
# Split the dash-separated date string into integer year / month / day columns

# One vectorised split instead of three row-by-row `apply` passes.
date_parts = train_df['date'].str.split('-', expand = True)
for position, part_name in enumerate(['year', 'month', 'day']):
    train_df[part_name] = date_parts[position].astype('int64')

# The axis has to be named: the positional form `drop('date', 1)` was
# deprecated in pandas 1.3 and raises a TypeError from pandas 2.0 onwards.
train_df = train_df.drop(columns = 'date')
train_df.head()

In [None]:
# Split the dash-separated date string of the test set into integer
# year / month / day columns, mirroring the training-set transformation.

# One vectorised split instead of three row-by-row `apply` passes.
date_parts = test_df['date'].str.split('-', expand = True)
for position, part_name in enumerate(['year', 'month', 'day']):
    test_df[part_name] = date_parts[position].astype('int64')

# The axis has to be named: the positional form `drop('date', 1)` was
# deprecated in pandas 1.3 and raises a TypeError from pandas 2.0 onwards.
test_df = test_df.drop(columns = 'date')
test_df.head()

In [None]:
train_df.info()

In [None]:
test_df.head()

#### Checking how the data is split across the various categories / features.

In [None]:
train_df.country.value_counts()

In [None]:
test_df.country.value_counts()

In [None]:
train_df.store.value_counts()

In [None]:
test_df.store.value_counts()

In [None]:
train_df.year.value_counts()

In [None]:
test_df.year.value_counts()

In [None]:
train_df.month.value_counts()

In [None]:
test_df.month.value_counts()

In [None]:
train_df.day.value_counts()

In [None]:
test_df.day.value_counts()

In [None]:
train_df['product'].value_counts()

In [None]:
test_df['product'].value_counts()

### EDA : Exploratory Data Analysis

Now let's check how some of the features relate to each other and to the number of items sold.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Number of products sold per year, one line per product.

ax = sns.lineplot(data = train_df, x = 'year', y = 'num_sold', hue = 'product')
plt.show()

We can see that num_sold increases slightly with the year.

In [None]:
# Checking the number of sold products by their respective month.

sns.lineplot(x = 'month', y = 'num_sold', hue = 'year', data = train_df)
plt.show()

In [None]:
# Checking the number of sold products per month through years.

plt.figure(figsize = (20, 6))
sns.barplot(x = 'month', y = 'num_sold', hue = 'year', data = train_df)
plt.show()

In [None]:
# Checking the number of sold products per mdate through years.

plt.figure(figsize = (20, 6))
sns.barplot(x = 'day', y = 'num_sold', hue = 'year', data = train_df)
plt.show()

In [None]:
plt.figure(figsize = (20, 6))
sns.lineplot(x = 'year', y = 'num_sold', hue = 'day', data = train_df)
plt.show()

In [None]:
# Preview of the training frame before encoding
train_df.head(n = 5)

Now that we've checked the features, it's time to encode the categorical ones. In this scenario, the categorical features have only a few unique labels, each fairly uniformly distributed, which is why I've used one-hot encoding; it is a good way to prepare such data accurately.

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# One Hot Encoding

def one_hot_encode(frame, features):
    """Return a copy of `frame` with each column in `features` one-hot encoded.

    Every listed column is removed and replaced by its dummy columns, which
    are named after the category values and appended at the end of the frame,
    one feature at a time.
    """
    encoded = frame.copy()
    for feature in features:
        dummies = pd.get_dummies(encoded[feature])
        # `columns=` names the axis explicitly; the positional form
        # `drop(feature, 1)` no longer works from pandas 2.0 onwards.
        encoded = pd.concat([encoded.drop(columns = feature), dummies], axis = 1)
    return encoded


# Categorical columns are discovered on the test frame, so a column that only
# exists in the training frame is never encoded.
categorical_features = [col for col in test_df.columns if test_df[col].dtype == 'object']

train = one_hot_encode(train_df, categorical_features)
test = one_hot_encode(test_df, categorical_features)

# Encoding each frame on its own can produce different columns when a category
# is missing from one of them. Force `test` to carry exactly the feature
# columns of `train`, in the same order, so that a model fitted on `train`
# can score it. Columns that exist only in the training frame are excluded.
target_columns = set(train_df.columns) - set(test_df.columns)
feature_columns = [col for col in train.columns if col not in target_columns]
test = test.reindex(columns = feature_columns, fill_value = 0)

In [None]:
train.head()

In [None]:
test.head()

Now we shift the year so that it starts at zero; otherwise its large magnitude may give this feature too much importance.

In [None]:
# Express the year as an offset from 2015 in both frames
train['year'] = train['year'] - 2015
test['year'] = test['year'] - 2015

In [None]:
train.head()

In [None]:
test.head()

Now we'll train two regression models, XGBoost and Random Forest, and blend their outputs to obtain the final predictions.

In [None]:
from xgboost import XGBRegressor as xgbr

In [None]:
# XGBoost regressor with the library's default hyper-parameters
baseline_model = xgbr()
print(str(baseline_model))

In [None]:
# Separate the regression target from the feature matrix
target = train['num_sold']
# The axis has to be named: the positional form `drop('num_sold', 1)` was
# deprecated in pandas 1.3 and raises a TypeError from pandas 2.0 onwards.
train = train.drop(columns = 'num_sold')
train.head()

In [None]:
# Distribution of the target values
fig, ax = plt.subplots()
ax.hist(target)
plt.show()

We also have to prepare the train and validation sets so that we can judge how much the model overfits and whether it should be discarded or not.

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
split_parts = train_test_split(
    train,
    target,
    test_size = 0.2,
    random_state = 42,
)
X_train, X_val, y_train, y_val = split_parts

In [None]:
X_train.head()

In [None]:
X_val.head()

In [None]:
%%time
baseline_model.fit(X_train, y_train)

In [None]:
baseline_model.score(X_train,y_train)

In [None]:
baseline_model.score(X_val, y_val)

In [None]:
# Read the sample submission file, which defines the expected output layout
sample_submission_path = '../input/tabular-playground-series-jan-2022/sample_submission.csv'
submission_format = pd.read_csv(sample_submission_path)
submission_format.head(n = 5)

In [None]:
# Work on an independent copy: a plain assignment would only alias
# `submission_format`, so later writes to `submission_data` would silently
# overwrite the loaded template as well.
submission_data = submission_format.copy()
submission_data.head()

In [None]:
from sklearn.ensemble import RandomForestRegressor as rfr

In [None]:
# Random forest with 80 trees. The forest is seeded so that its scores and
# feature importances are the same on every run of the notebook.
rfr_model = rfr(n_estimators = 80, random_state = 42)
print(rfr_model)

In [None]:
%%time
rfr_model.fit(X_train, y_train)

In [None]:
rfr_model.score(X_train, y_train)

In [None]:
rfr_model.score(X_val, y_val)

In [None]:
# Feature importances reported by the fitted random forest, one bar per column
fig, ax = plt.subplots()
ax.bar(X_train.columns, rfr_model.feature_importances_)
ax.tick_params(axis = 'x', labelrotation = 60)
plt.show()

In [None]:
# Columns removed before refitting (presumably chosen from the
# feature-importance plot above). One shared list keeps the three frames in
# sync.
columns_to_drop = ['row_id', 'year', 'Finland', 'Sweden']

# Rebind instead of dropping in place: X_train / X_val come out of
# train_test_split, and in-place edits on such derived frames can trigger
# pandas' chained-assignment warning, which this notebook silences globally.
X_train = X_train.drop(columns = columns_to_drop)
X_val = X_val.drop(columns = columns_to_drop)
test = test.drop(columns = columns_to_drop)

In [None]:
# Fresh estimators for the reduced feature set
revised_xgb_model = xgbr()
# The forest is seeded so that its scores and the submitted predictions are
# the same on every run of the notebook.
revised_rfr_model = rfr(n_estimators = 80, random_state = 42)

In [None]:
%%time
revised_xgb_model.fit(X_train, y_train)
revised_rfr_model.fit(X_train, y_train)

In [None]:
# Print the train score, then the validation score, for XGBoost first and the
# random forest second
for fitted_model in (revised_xgb_model, revised_rfr_model):
    for features, labels in ((X_train, y_train), (X_val, y_val)):
        print(fitted_model.score(features, labels))

# Predictions of each model on the test set
xgb_pred = revised_xgb_model.predict(test)
rfr_pred = revised_rfr_model.predict(test)

Now we'll blend the outputs from both the models and use that as our target prediction.

In [None]:
submission_data.num_sold = rfr_pred
# Pushing the data into a csv file for submission.
submission_data.to_csv('rfr.csv', index = False)

In [None]:
submission_data.num_sold = xgb_pred
# Pushing the data into a csv file for submission.
submission_data.to_csv('xgb.csv', index = False)

In [None]:
submission_data.num_sold = rfr_pred * 0.5 + xgb_pred * 0.5
# Pushing the data into a csv file for submission.
submission_data.to_csv('blend.csv', index = False)

# Thanks for walking through this notebook :)

## You can find more of my work on [kaggle](https://kaggle.com/sagnik1511) or on [github](https://github.com/sagnik1511)

# Thank You :)