Importing libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

Loading datasets

In [None]:
# Directory holding the competition CSV files; defined once so the location
# only has to be changed in a single place.
DATA_DIR = '../input/competitive-data-science-predict-future-sales'

# Lookup tables (categories, shops, items), the sales history used for
# training, and the rows that predictions are produced for.
cats = pd.read_csv(f'{DATA_DIR}/item_categories.csv')
shops = pd.read_csv(f'{DATA_DIR}/shops.csv')
items = pd.read_csv(f'{DATA_DIR}/items.csv')
train = pd.read_csv(f'{DATA_DIR}/sales_train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')

Number of records in each dataset

In [None]:
# Report how many rows each lookup table contains.
print(f'There are {len(items)} different items')
print(f'There are {len(cats)} different item categories')
print(f'There are {len(shops)} different shops')

Display of each dataset

In [None]:
# Preview the first rows of the items table.
items.head()

In [None]:
# Preview the first rows of the item-categories table.
cats.head()

In [None]:
# Preview the first rows of the shops table.
shops.head()

In [None]:
# Preview the first rows of the frame loaded from test.csv.
test.head()

In [None]:
# Preview the first rows of the sales history loaded from sales_train.csv.
train.head()

Checking Missing values

In [None]:
# True if any cell anywhere in `train` is missing.
train.isnull().values.any()

In [None]:
# True if any cell anywhere in `items` is missing.
items.isnull().values.any()

In [None]:
# True if any cell anywhere in `cats` is missing.
cats.isnull().values.any()

In [None]:
# True if any cell anywhere in `shops` is missing.
shops.isnull().values.any()

In [None]:
# True if any cell anywhere in `test` is missing.
test.isnull().values.any()

Checking for duplicate values

In [None]:
# True if `train` contains at least one row that repeats an earlier row in every column.
train.duplicated().any()

In [None]:
# True if `items` contains at least one row that repeats an earlier row in every column.
items.duplicated().any()

In [None]:
# True if `cats` contains at least one row that repeats an earlier row in every column.
cats.duplicated().any()

In [None]:
# True if `shops` contains at least one row that repeats an earlier row in every column.
shops.duplicated().any()

In [None]:
# True if `test` contains at least one row that repeats an earlier row in every column.
test.duplicated().any()

Train data cleaning

In [None]:
# --- Size and a first look at the sales table ---
table_shape = train.shape
print("Shape of the data: ", table_shape)
print("\n")
display(train.head())

# --- Cardinality of the identifier columns ---
print("No of Unique date block num: ", train['date_block_num'].nunique())
print("No of Unique shop id: ", train['shop_id'].nunique())
print("No of Unique item id: ", train['item_id'].nunique())

# --- Missing values per column ---
print("Any Null Values?")
missing_per_column = train.isnull().sum()
print(missing_per_column)

# --- Summary statistics of price and daily count ---
print('\nBasic Stats of Item Price')
price_stats = train['item_price'].describe()
display(price_stats)

print('\nBasic Stats of Item_count_day')
count_stats = train['item_cnt_day'].describe()
display(count_stats)

# --- Negative prices / counts ---
print(
    "\nThere are few negative values too in the sale and count "
    "does they represent any return? How many such values are there?"
)

print('\nLets check the negative values')

# Boolean row masks, reused for the listings and the statistics below.
has_negative_price = train['item_price'] < 0
has_negative_count = train['item_cnt_day'] < 0

display(train[has_negative_price])
display(train[has_negative_count])

print("\nStats of negative item count day")
negative_count_stats = train.loc[has_negative_count, 'item_cnt_day'].describe()
display(negative_count_stats)

In [None]:
# Repair invalid values, touching ONLY the offending column.
# `train[row_mask] = value` assigns `value` to every column of the matching
# rows (date, shop_id, item_id, ...), which corrupts those rows; selecting the
# target column with .loc restricts the write to that one column.

# Negative prices are replaced with the median item price.
median_item_price = train['item_price'].median()
train.loc[train['item_price'] < 0, 'item_price'] = median_item_price

# Negative daily counts are replaced with zero.
train.loc[train['item_cnt_day'] < 0, 'item_cnt_day'] = 0

In [None]:
#splitting the date column
import datetime as dt  # kept from the original cell; `.dt` below is the pandas accessor, not this module

# Parse the day.month.year strings with an explicit format.
# `infer_datetime_format=True` was removed: it only applies when no `format`
# is given and is deprecated in pandas 2.x.
train['date'] = pd.to_datetime(train['date'], format='%d.%m.%Y')

# Expose the calendar parts as separate numeric columns.
train['day'] = train['date'].dt.day
train['month'] = train['date'].dt.month
train['year'] = train['date'].dt.year

# The parsed timestamp is dropped once its parts have been extracted.
train.drop('date', axis=1, inplace=True)

train.head()

In [None]:
# Remove the date_block_num and item_price columns from the sales frame,
# then preview what is left.
train.drop(columns=['date_block_num', 'item_price'], inplace=True)
train.head()

In [None]:
# Hold out part of the sales rows for evaluation.
from sklearn.model_selection import train_test_split

HOLDOUT_FRACTION = 0.2  # share of rows set aside as the evaluation split
SPLIT_SEED = 29         # fixed seed so the split is repeatable

train_set, test_set = train_test_split(
    train,
    test_size=HOLDOUT_FRACTION,
    random_state=SPLIT_SEED,
)

In [None]:
# Preview the first rows of the training split.
train_set.head()

In [None]:
# Preview the first rows of the held-out split.
test_set.head()

In [None]:
# (rows, columns) of the full cleaned sales frame.
train.shape

In [None]:
# (rows, columns) of the training split.
train_set.shape

In [None]:
# (rows, columns) of the held-out split.
test_set.shape

In [None]:
# Separate the training split into the target (item_cnt_day) and the
# remaining columns, which serve as the model inputs.
y = train_set['item_cnt_day'].copy()
x = train_set.drop(columns='item_cnt_day')

In [None]:
# (rows, columns) of the training feature frame.
x.shape

In [None]:
# Shape of the training target series.
y.shape

In [None]:
# Rescale the training features with a min-max scaler.
from sklearn.preprocessing import MinMaxScaler

min_max_scaler = MinMaxScaler()

# Fit on the training features first, then apply the fitted scaling to them;
# the fitted scaler is reused for other frames later in the notebook.
min_max_scaler.fit(x)
X_prepared = min_max_scaler.transform(x)

In [None]:
#Random forest regressor model building
from sklearn.ensemble import RandomForestRegressor

# A random forest is stochastic (bootstrap samples, feature sub-sampling).
# Seeding it makes the fitted model, and therefore every prediction and
# metric derived from it, reproducible across kernel restarts.
RF_model = RandomForestRegressor(random_state=29)
RF_model.fit(X_prepared, y)

In [None]:
# Target and feature columns of the held-out split.
y_test = test_set['item_cnt_day'].copy()
X_test = test_set.drop(columns='item_cnt_day')

In [None]:
# Rescale the held-out features with the existing scaler (transform only, no refit).
X_test_prepared = min_max_scaler.transform(X_test)

In [None]:
# Predict the target for the held-out rows.
y_predicted = RF_model.predict(X_test_prepared)

# Show the first ten predictions, one per line ...
print("Predictions:")
for predicted_value in y_predicted[:10]:
    print(predicted_value)

# ... followed by the first ten true values for comparison.
print("Orginal test:")
y_test.head(10)

In [None]:
# Mean absolute error between held-out targets and predictions.
from sklearn.metrics import mean_absolute_error

mae = mean_absolute_error(y_true=y_test, y_pred=y_predicted)
print("MAE=", mae)

In [None]:
# Root mean squared error between held-out targets and predictions.
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(y_true=y_test, y_pred=y_predicted)
rmse = np.sqrt(mse)
print("RMSE=", rmse)

In [None]:
# Work on a copy so the frame loaded from test.csv stays unmodified.
df_test = test.copy()

In [None]:
# Stamp every row with a fixed date (1 November 2015) using the same
# day / month / year columns that were derived for the sales history.
df_test = df_test.assign(day=1, month=11, year=2015)

In [None]:
# Remove the ID column in place, then display the resulting frame.
df_test.drop(columns='ID', inplace=True)
df_test

In [None]:
# Rescale the submission features with the existing scaler (transform only, no refit).
# NOTE(review): `df_test_prepared` is reassigned inside the per-day loop in the
# next cell before any visible use, so this call looks redundant — confirm.
df_test_prepared = min_max_scaler.transform(df_test)

In [None]:
# One array of predictions per day, collected in day order.
l = []

for day_of_month in range(1, 31):
    # Re-stamp every row with the current day, rescale, and predict.
    df_test['day'] = day_of_month
    df_test_prepared = min_max_scaler.transform(df_test)
    daily_prediction = RF_model.predict(df_test_prepared)
    l.append(daily_prediction)

In [None]:
# Load the submission template.
sample_subm = pd.read_csv('../input/competitive-data-science-predict-future-sales/sample_submission.csv')

# The monthly figure is the element-wise sum of the per-day prediction arrays;
# `assign` returns a new frame, leaving the template itself untouched.
# NOTE(review): confirm whether the competition expects clipped predictions.
RF_subm = sample_subm.assign(item_cnt_month=sum(l))

In [None]:
# Write the submission to disk without the DataFrame index column.
RF_subm.to_csv("submission.csv", index=False)