## Predict Future Sales

### Objective
We are asking you to predict total sales for every product and store in the next month. You are provided with daily historical sales data. Note that the list of shops and products slightly changes every month.

In [None]:
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import random
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [None]:
import os
# Walk the Kaggle input directory and print the full path of every file found.
# NOTE: the loop variable `dirname` stays bound after the loop (to the last
# directory visited by os.walk) and is read again by the data-loading cell.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

### Collect initial data
We import the datasets to be used in this project. They can be obtained from [Kaggle](https://www.kaggle.com/c/competitive-data-science-predict-future-sales/data)

In [None]:
# Locate the folder that holds the competition CSVs by searching the input tree
# for 'sales_train.csv', rather than trusting `dirname`, which is simply
# whichever directory the earlier os.walk loop happened to visit last.
# `dirname` is kept as the fallback so nothing changes when the search finds
# no match.
data_dir = next(
    (root for root, _dirs, files in os.walk('/kaggle/input')
     if 'sales_train.csv' in files),
    dirname,
)

# Load the five competition tables from that folder.
df_items = pd.read_csv(os.path.join(data_dir, 'items.csv'))
df_item_categories = pd.read_csv(os.path.join(data_dir, 'item_categories.csv'))
df_sales_train = pd.read_csv(os.path.join(data_dir, 'sales_train.csv'))
df_shops = pd.read_csv(os.path.join(data_dir, 'shops.csv'))
df_test = pd.read_csv(os.path.join(data_dir, 'test.csv'))

### Data Description
As we can see, the `info()` method shows that there are no null values. This is part of our data quality verification.

In [None]:
# Summarise df_items: column names, dtypes and non-null counts.
df_items.info()

In [None]:
# Summarise df_item_categories: column names, dtypes and non-null counts.
df_item_categories.info()

In [None]:
# Summarise df_test: column names, dtypes and non-null counts.
df_test.info()

In [None]:
# Summarise df_shops: column names, dtypes and non-null counts.
df_shops.info()

In [None]:
# Summarise df_sales_train: column names, dtypes and non-null counts.
df_sales_train.info()

We are going to focus on df_sales_train, which is the dataframe we are mainly going to use.

In [None]:
# describe() summary statistics for the sales table, transposed so that each
# described column becomes a row.
df_sales_train.describe().T

### Data Exploration

In [None]:
# Preview the first rows of the items table.
df_items.head()

In [None]:
# Preview the first rows of the item categories table.
df_item_categories.head()

In [None]:
# Preview the first rows of the shops table.
df_shops.head()

In [None]:
# Preview the first rows of the test table.
df_test.head()

In [None]:
# Display the raw sales dataframe as loaded from sales_train.csv.
df_sales_train

In [None]:
# Build one row per (shop_id, item_id) pair with one column per date_block_num.
# Each cell is the sum of item_cnt_day for that pair in that block; pairs with
# no sales in a block get 0.
# The aggregation is named by string: passing the callable np.sum is deprecated
# in pandas 2.x and emits a FutureWarning, while 'sum' gives the same result.
df_sales = df_sales_train.pivot_table(
    index=['shop_id', 'item_id'],
    columns='date_block_num',
    values='item_cnt_day',
    aggfunc='sum',
    fill_value=0,
)
# Turn shop_id / item_id back into ordinary columns so they can be merge keys.
df_sales.reset_index(inplace=True)

In [None]:
# Display the pivoted sales table.
df_sales

In [None]:
# Left-join the pivoted sales history onto the test rows so every test row is
# kept; cells with no matching history come back as NaN and are set to 0.
dataset = df_test.merge(
    right=df_sales,
    how='left',
    on=['shop_id', 'item_id'],
).fillna(0)

In [None]:
# Display the test rows joined with their sales history.
dataset

In [None]:
# Keep only the per-block sales columns: drop the three identifier columns.
df = dataset.drop(['ID', 'shop_id', 'item_id'], axis=1)

We use the last column of the dataframe as the label (y) and the remaining columns as the features (X).

In [None]:
# Features are every column except the last; the label is the last column,
# sliced with `-1:` so it stays two-dimensional (one column).
X, y = df.values[:, :-1], df.values[:, -1:]
print(y)

In [None]:
# Print the shapes of the feature matrix and the label array.
print(X.shape, y.shape)

We perform a 90/10 split of the data into training and validation sets.

In [None]:
# Hold out 10% of the rows for validation; the fixed seed keeps the split
# the same on every run.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

In [None]:
# Test features are every column except the first. X drops the last column
# instead, so X_test has the same width as X but is shifted forward by one
# column and now includes the column that served as the training label.
X_test = df.values[:, 1:]
print(X_test.shape)

In [None]:
# Fit the min-max scaling on the training rows only, then apply that same
# fitted scaling to the training, validation and test matrices.
scaler = MinMaxScaler().fit(X_train)
X_train, X_val, X_test = (
    scaler.transform(matrix) for matrix in (X_train, X_val, X_test)
)

print(X_test.shape)

In [None]:
# Show the shapes of the training features and training labels.
X_train.shape, y_train.shape 

In [None]:
# Show the shapes of the validation features and validation labels.
X_val.shape, y_val.shape

### Model training

We use various models to test which one has better performance

In [None]:
# Linear Regression model with default settings
model_LR = linear_model.LinearRegression()

In [None]:
# Decision Tree Regressor model; random_state is fixed so runs are repeatable
model_DTR = DecisionTreeRegressor(random_state = 101)

In [None]:
# Random Forest Regressor model; random_state is fixed so runs are repeatable
model_RF = RandomForestRegressor(random_state = 101)

In [None]:
# Function to find Mean Squared Error
def MSE(model, X_train, y_train, X_val, y_val):
    """Fit ``model`` on the training data and return its validation MSE.

    The estimator is fitted in place, so the caller can keep using the same
    object for further predictions afterwards.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Training features.
        y_train: Training targets.
        X_val: Validation features.
        y_val: Validation targets.

    Returns:
        The value of ``mean_squared_error(y_val, model.predict(X_val))``.
    """
    model.fit(X_train, y_train)
    return mean_squared_error(y_val, model.predict(X_val))

In [None]:
# Baseline: score Linear Regression first and treat it as the best model so far.
model = model_LR
best_mse = MSE(model, X_train, y_train, X_val, y_val)
print("MSE with Linear Regression \t:", best_mse)

In [None]:
# Score the decision tree and keep it only if it beats the best MSE so far.
mse_dtr = MSE(model_DTR, X_train, y_train, X_val, y_val)
print("MSE with Decision Tree Regressor:", mse_dtr)

if mse_dtr < best_mse:
    best_mse, model = mse_dtr, model_DTR

In [None]:
# Score the random forest (fitted on the flattened 1-D target) and keep it
# only if it beats the best MSE so far.
mse_rf = MSE(model_RF, X_train, y_train.ravel(), X_val, y_val)
print("MSE with Random Forest:", mse_rf)

if mse_rf < best_mse:
    best_mse, model = mse_rf, model_RF

In [None]:
# Make predictions on the validation set with the currently selected `model`,
# then show the shape of the resulting prediction array.
y_predict = model.predict(X_val)
y_predict.shape

In [None]:
# Predict the test set with the selected model. An estimator fitted on the
# two-dimensional target (e.g. LinearRegression) returns predictions of shape
# (n, 1); flatten to 1-D so the result can be used as a single dataframe
# column no matter which model was selected.
predictions = np.ravel(model.predict(X_test))

# Let's eliminate the negative numbers
predictions = predictions.clip(min = 0)

In [None]:
# Print the clipped test-set predictions.
print(predictions)

### Prediction results
Let's check how well our model predicts.

In [None]:
# Pair each test ID with its prediction rounded to the nearest whole number.
submission = pd.DataFrame(
    {
        'ID': dataset['ID'],
        'item_cnt_month': predictions.round(),
    }
)
submission

In [None]:
# Write the submission to 'submission.csv' without the dataframe index column.
submission.to_csv('submission.csv',index=False)