## Initialization

In [None]:
# Input data files live in the read-only "../input/" directory;
# walk it and print the full path of every file found.
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for path in (os.path.join(dirname, filename) for filename in filenames):
        print(path)

In [None]:
# Import libraries
import numpy as np 
import pandas as pd 
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score

<br>
<br>
<br>

## Data Acquisition

In [None]:
# Import data: shop, item and item-category tables, the sales records used
# for training, and the test set that predictions are generated for.
df_shops = pd.read_csv('/kaggle/input/competitive-data-science-predict-future-sales/shops.csv')
df_items = pd.read_csv('/kaggle/input/competitive-data-science-predict-future-sales/items.csv')
df_categories = pd.read_csv('/kaggle/input/competitive-data-science-predict-future-sales/item_categories.csv')
df_sales = pd.read_csv('/kaggle/input/competitive-data-science-predict-future-sales/sales_train.csv')
df_test = pd.read_csv('/kaggle/input/competitive-data-science-predict-future-sales/test.csv')

In [None]:
# View shops: print (rows, columns), then preview the first rows
print(df_shops.shape)
df_shops.head()

In [None]:
# View items: print (rows, columns), then preview the first rows
print(df_items.shape)
df_items.head()

In [None]:
# View item categories: print (rows, columns), then preview the first rows
print(df_categories.shape)
df_categories.head()

In [None]:
# View sales train data: print (rows, columns), then preview the first rows
print(df_sales.shape)
df_sales.head()

In [None]:
# View test data: print (rows, columns), then preview the first rows
print(df_test.shape)
df_test.head()

<br>
<br>
<br>

## Preprocessing (Item prices)

In [None]:
# Average the sale price of each item over all sales records and expose it
# as item_mean_price (the result is indexed by item_id).
agg_item_price = {'item_price': 'item_mean_price'}
df_prices = (
    df_sales
    .groupby('item_id')
    .agg({'item_price': 'mean'})
    .rename(columns=agg_item_price)
)
print(df_prices.shape)
df_prices.head()

<br>
<br>
<br>

## Preprocessing (Sales train data)

In [None]:
# Check data types of every sales column
df_sales.dtypes

In [None]:
# Check null values: count of missing entries per sales column
df_sales.isnull().sum()

In [None]:
# Remove date_block_num and the per-record item_price in place; the price is
# replaced by the per-item mean that is merged back in afterwards.
df_sales.drop(columns=['date_block_num', 'item_price'], inplace=True)
print(df_sales.shape)
df_sales.head()

In [None]:
# Attach the mean price per item to every sales record (left join on item_id,
# so every sales row is kept).
df_sales = df_sales.merge(df_prices, how='left', on='item_id')
print(df_sales.shape)
df_sales.head()

In [None]:
# Put the columns in a fixed order: identifiers, price, then the daily count
column_order = ['date', 'shop_id', 'item_id', 'item_mean_price', 'item_cnt_day']
df_sales = df_sales[column_order]
print(df_sales.shape)
df_sales.head()

In [None]:
# Bring in the item table (left join on item_id) so that each sales record
# carries its item_category_id.
df_sales = df_sales.merge(df_items, how='left', on='item_id')
print(df_sales.shape)
df_sales.head()

In [None]:
# item_name came along with the item merge and is not used as a feature
df_sales.drop(columns='item_name', inplace=True)
print(df_sales.shape)
df_sales.head()

In [None]:
# Put the columns in a fixed order, with item_category_id next to item_id
column_order = ['date', 'shop_id', 'item_id', 'item_category_id', 'item_mean_price', 'item_cnt_day']
df_sales = df_sales[column_order]
print(df_sales.shape)
df_sales.head()

In [None]:
# Convert date column to YYYY-MM.
# Dates are parsed day-first, then reduced to a year-month string so that
# all records of the same month share one key.
df_sales['date'] = pd.to_datetime(df_sales['date'], dayfirst=True)
# Vectorised formatting via the .dt accessor instead of a per-row Python lambda
df_sales['date'] = df_sales['date'].dt.strftime('%Y-%m')
print(df_sales.shape)
df_sales.head()

In [None]:
# Total the daily counts per month/shop/item (category and mean price ride
# along as grouping keys) and name the result item_sum_qty.
agg_item_cnt = {'item_cnt_day': 'item_sum_qty'}
group_keys = ['date', 'shop_id', 'item_id', 'item_category_id', 'item_mean_price']
df_sales = (
    df_sales
    .groupby(group_keys)
    .agg({'item_cnt_day': 'sum'})
    .rename(columns=agg_item_cnt)
)
print(df_sales.shape)
df_sales.head()

In [None]:
# Build the train dataframe: one row per shop/item, one column per month
# (the months become features). Shop/item/month combinations without sales
# are filled with 0.
df_train = df_sales.pivot_table(
    index=['shop_id', 'item_id', 'item_category_id', 'item_mean_price'],
    columns='date',
    values='item_sum_qty',
    fill_value=0,
)
# Turn the row keys back into ordinary columns
df_train.reset_index(inplace=True)
print(df_train.shape)
df_train.head()

In [None]:
# Relabel the month columns positionally as '1'..'34' after the four key
# columns. This assumes the pivot produced exactly 34 month columns; the
# assignment raises on a length mismatch.
df_train_cols1 = ['shop_id', 'item_id', 'item_category_id', 'item_mean_price']
df_train_cols2 = [str(month_no) for month_no in range(1, 35)]
df_train_cols = [*df_train_cols1, *df_train_cols2]
df_train.columns = df_train_cols
print(df_train.shape)
df_train.head()

In [None]:
# The last month ('34') is the prediction target; every other column is a feature
X_train = df_train.drop(columns=['34'])
Y_train = df_train['34'].to_numpy()
print(X_train.shape, Y_train.shape)

<br>
<br>
<br>

## Preprocessing (Sales test data)

In [None]:
# Give every test (shop_id, item_id) pair its row of train data; the left
# join keeps test pairs that have no match in the train dataframe.
df_test = df_test.merge(df_train, how='left', on=['shop_id', 'item_id'])

In [None]:
# Drop columns to match the features of train dataframe: ID is not a feature,
# and month '1' is dropped so the remaining 33 months line up with the 33
# month features the model is trained on.
df_test.drop(['ID', '1'], axis=1, inplace=True)

# Test pairs with no sales history come out of the left merge as NaN in every
# train column. Category and mean price belong to the item alone, so look them
# up by item_id rather than letting them default to 0, which would assign the
# row to category 0 with a price of 0.
test_item_ids = df_test['item_id']
# drop_duplicates guards the lookup: Series.map needs a uniquely indexed mapper
category_by_item = df_items.drop_duplicates('item_id').set_index('item_id')['item_category_id']
df_test['item_category_id'] = df_test['item_category_id'].fillna(test_item_ids.map(category_by_item))
df_test['item_mean_price'] = df_test['item_mean_price'].fillna(test_item_ids.map(df_prices['item_mean_price']))

# Whatever is still missing (monthly quantities, items absent from the lookup
# tables) is filled with 0
df_test = df_test.fillna(0)
print(df_test.shape)
df_test.head()

In [None]:
# Relabel the remaining month columns positionally as '1'..'33' so the test
# feature names match the train feature names.
df_test_cols1 = ['shop_id', 'item_id', 'item_category_id', 'item_mean_price']
df_test_cols2 = [str(month_no) for month_no in range(1, 34)]
df_test_cols = [*df_test_cols1, *df_test_cols2]
df_test.columns = df_test_cols
print(df_test.shape)
df_test.head()

In [None]:
# Declare features for test data.
# X_test is another name for the same DataFrame object as df_test (no copy is made).
X_test = df_test
print(X_test.shape)

<br>
<br>
<br>

## Pipeline Development & Evaluation

### One-hot encoding of item categories (an example of handling nominal, i.e. categorical, values)

In [None]:
# One-hot encode item_category_id and pass every other column through
# unchanged. Categories not seen during fit are ignored at transform time
# instead of raising.
category_encoder = OneHotEncoder(handle_unknown='ignore')
column_trans = make_column_transformer(
    (category_encoder, ['item_category_id']),
    remainder='passthrough',
)

In [None]:
# Select model: random forest regressor with 100 trees.
# No random_state is set, so results can differ from run to run.
rfr = RandomForestRegressor(n_estimators = 100)

In [None]:
# Create pipeline: column transformer (category encoding) followed by the regressor
pipeline = make_pipeline(column_trans, rfr)

In [None]:
# Hold out 20% of the train rows for evaluation; the seed is fixed so the
# split is reproducible.
x_train, x_train_test, y_train, y_train_test = train_test_split(
    X_train,
    Y_train,
    test_size=0.2,
    random_state=21,
)
print('Train set:', x_train.shape, y_train.shape)
print('Test set:', x_train_test.shape, y_train_test.shape)

In [None]:
# Fit pipeline on the training portion of the split only
pipeline.fit(x_train, y_train)

In [None]:
# Evaluate pipeline: MSE on the split it was fitted on and on the held-out
# split, then the estimator's score on the held-out split.
print('Train set mse:', mean_squared_error(y_train, pipeline.predict(x_train)))
print('Test set mse:', mean_squared_error(y_train_test, pipeline.predict(x_train_test)))
# Score the held-out rows so the number matches its 'Test set' label
# (previously this scored the training rows).
print('Test set score:', pipeline.score(x_train_test, y_train_test))

In [None]:
# Cross validate pipeline.
# Disabled: the call is wrapped in a string literal, so it is not executed.
'''cross_val_score(pipeline, X_train, Y_train, cv=5, scoring='neg_mean_squared_error').mean()'''

<br>
<br>
<br>

## Model Development & Evaluation

### Not used in Prediction 

In [None]:
# Linear regression baseline fitted directly on the raw feature columns
# (no one-hot encoding), evaluated like the pipeline.
lr = LinearRegression()
lr.fit(x_train, y_train)
print('Train set mse:', mean_squared_error(y_train, lr.predict(x_train)))
print('Test set mse:', mean_squared_error(y_train_test, lr.predict(x_train_test)))
# Score the held-out rows so the number matches its 'Test set' label
# (previously this scored the training rows).
print('Test set score:', lr.score(x_train_test, y_train_test))

In [None]:
# Random forest regression fitted directly on the raw feature columns
# (no one-hot encoding). This creates a new estimator object and rebinds the
# name rfr.
rfr = RandomForestRegressor(n_estimators = 100)
rfr.fit(x_train, y_train)
print('Train set mse:', mean_squared_error(y_train, rfr.predict(x_train)))
print('Test set mse:', mean_squared_error(y_train_test, rfr.predict(x_train_test)))
# Score the held-out rows so the number matches its 'Test set' label
# (previously this scored the training rows).
print('Test set score:', rfr.score(x_train_test, y_train_test))

In [None]:
# Cross validate model.
# Disabled: the call is wrapped in a string literal, so it is not executed.
'''cross_val_score(rfr, X_train, Y_train, cv=5, scoring='neg_mean_squared_error').mean()'''

<br>
<br>
<br>

## Prediction

In [None]:
# Predict with pipeline on the prepared test features
prediction = pipeline.predict(X_test)

In [None]:
# Limit every predicted value to the range [0, 20]
# NOTE(review): presumably the range the competition scores on; confirm.
prediction = np.clip(prediction, 0, 20)

In [None]:
# Check predictions: print the array shape, then display the values
print(prediction.shape)
prediction

<br>
<br>
<br>

## Submission

In [None]:
# Load the sample submission file to use as the template for the output
df_submission = pd.read_csv('/kaggle/input/competitive-data-science-predict-future-sales/sample_submission.csv')
print(df_submission.shape)
df_submission.head()

In [None]:
# Fill in the predictions by position and write the submission file without
# the index column.
# NOTE(review): assumes sample_submission.csv has the same row order and
# length as test.csv; confirm.
df_submission['item_cnt_month'] = prediction
df_submission.to_csv('submission.csv', index=False)