In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file under /kaggle/input, printing one full path per line.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Directory holding the competition CSV files; the three paths below share it.
_COMPETITION_DIR = '/kaggle/input/competitive-data-science-predict-future-sales'

# Paths consumed by the pd.read_csv calls in the following cells.
sales = f'{_COMPETITION_DIR}/sales_train.csv'
test = f'{_COMPETITION_DIR}/test.csv'
sample = f'{_COMPETITION_DIR}/sample_submission.csv'

In [None]:
# Load the sales training file, report its (rows, columns) shape, and preview
# the first rows as the cell output.
df_train = pd.read_csv(filepath_or_buffer=sales)
n_rows, n_cols = df_train.shape
print((n_rows, n_cols))
df_train.head()

In [None]:
# Discard the two columns that the monthly aggregation below does not use.
# Reassigning the result (instead of inplace=True) keeps the step explicit and
# leaves the previously loaded frame object untouched.
df_train = df_train.drop(columns=['date_block_num', 'item_price'])

# Parse the day-first date strings, then reduce each date to a 'YYYY-MM' month
# label. The vectorised .dt.strftime accessor replaces a Python-level lambda
# that was called once per row.
df_train['date'] = (
    pd.to_datetime(df_train['date'], dayfirst=True)
    .dt.strftime('%Y-%m')
)
df_train.head(3)

In [None]:
# Step 1: add up every remaining numeric column per (date, shop_id, item_id) group.
monthly_totals = df_train.groupby(['date','shop_id','item_id']).sum()

# Step 2: go wide - one row per (shop_id, item_id), one column per distinct
# 'date' value, holding item_cnt_day; combinations with no record get 0.
# reset_index() turns shop_id / item_id back into ordinary columns.
df = pd.pivot_table(
    monthly_totals,
    index=['shop_id','item_id'],
    columns='date',
    values='item_cnt_day',
    fill_value=0,
).reset_index()

In [None]:
# Load the competition test file into a DataFrame.
df_test = pd.read_csv(filepath_or_buffer=test)

In [None]:
# Attach the pivoted monthly columns to every test row. The left join keeps all
# test rows; rows whose (shop_id, item_id) is absent from `df` come back as NaN
# and are filled with 0. 'ID' and the '2013-01' column are then removed.
df_test = (
    pd.merge(df_test, df, on=['shop_id','item_id'], how='left')
    .drop(columns=['ID', '2013-01'])
    .fillna(0)
)

In [None]:
# Preview the first five rows of the merged test frame (head() defaults to 5).
df_test.head()

In [None]:
# The '2015-10' column is the regression target; every other column of the
# pivot (shop_id, item_id and the remaining month columns) is a feature.
Y_train = df['2015-10'].values
X_train = df.drop(columns=['2015-10'])

# df_test carries different column labels from X_train (it lacks '2013-01' but
# still has '2015-10'). Estimators fitted on a DataFrame validate feature names
# at predict time and recent scikit-learn releases raise on a mismatch, so the
# test features are relabelled positionally to match the training features.
# NOTE(review): assumes both frames list their columns in the same order
# (shop_id, item_id, then months ascending) - confirm.
X_test = df_test.copy()             # copy so df_test keeps its real month labels
X_test.columns = X_train.columns    # raises ValueError if the widths differ

In [None]:
# Print the shapes of the training features, the target vector and the test
# features inside a dashed banner.
shape_report = [
    '\n-------------------------------',
    "Our Dataframes dimensionalities",
    '-------------------------------',
    f"Data DataFrame: {X_train.shape}",
    f"Target Values:  {Y_train.shape}",
    f"Test Dataframe: {X_test.shape}",
    '-------------------------------\n',
]
print('\n'.join(shape_report))

In [None]:
from sklearn.model_selection import train_test_split

print('----------------------------------')
print("Starting the training phase")
print('----------------------------------')

# Hold out 20% of the rows for evaluation; random_state pins the shuffle so the
# same split is produced on every run.
split_parts = train_test_split(X_train, Y_train, test_size=0.2, random_state=101)
x_train, x_test, y_train, y_test = split_parts

# Show the feature / target shapes of each side of the split.
for label, features, target in (
    ('Train set: ', x_train, y_train),
    ('Test set:  ', x_test, y_test),
):
    print(label, features.shape, target.shape)
print('----------------------------------\n')

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


def _fit_and_score(model):
    """Fit ``model`` on the training split and evaluate it.

    Uses the notebook-level ``x_train`` / ``y_train`` for fitting and
    ``x_test`` / ``y_test`` for hold-out evaluation.

    Returns:
        tuple: ``(train_mse, test_mse, test_score)`` where ``test_score`` is
        the estimator's ``score`` (R^2 for regressors) on the hold-out split,
        so it matches the "Test set score" label it is printed under.
    """
    model.fit(x_train, y_train)
    train_mse = mean_squared_error(y_train, model.predict(x_train))
    test_mse = mean_squared_error(y_test, model.predict(x_test))
    test_score = model.score(x_test, y_test)  # hold-out data, not the training data
    return train_mse, test_mse, test_score


print('|vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv|')
print('|           --------------------------              |')
print("|           Linear Regression accuracy              |")
print('|           --------------------------              |')
LR = LinearRegression() # y=mx+b
LR_train_set, LR_test_set, LR_test_score = _fit_and_score(LR)
print('|      =====================================        |')
# One space less of padding when the integer part has two digits, so the
# right-hand border of the box stays aligned.
if LR_train_set >= 10.0:
    print('|     | Train set mse:  {:.14f}   |       |'.format(LR_train_set))
else:
    print('|     | Train set mse:  {:.14f}    |       |'.format(LR_train_set))
print('|     | Test set mse:   {:.14f}    |       |'.format(LR_test_set))
print('|     | Test set score: {:.14f}    |       |'.format(LR_test_score))
print('|      =====================================        |')

print('|       ---------------------------------           |')
print("|       Random Forest Regression accuracy           |")
print('|       ---------------------------------           |')
# random_state makes the forest (and therefore the reported numbers and any
# later predictions) reproducible across runs.
RFR = RandomForestRegressor(n_estimators = 10, random_state = 101)
RFR_train_set, RFR_test_set, RFR_test_score = _fit_and_score(RFR)
print('|      =====================================        |')
print('|     | Train set mse:  {:.14f}    |       |'.format(RFR_train_set))
print('|     | Test set mse:   {:.14f}    |       |'.format(RFR_test_set))
print('|     | Test set score: {:.14f}    |       |'.format(RFR_test_score))
print('|      =====================================        |')
print('|vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv|')
print('\n')

In [None]:
print('-------------------------------------------------')
print("Comparing LinearRegression with RandomForestRegressor.")
print('-------------------------------------------------')
# score() returns R^2 for regressors, so the HIGHER value is the better model.
# Compare on the held-out split: scores on the training data reward
# memorisation and would favour the forest regardless of how it generalises.
lr_holdout_r2 = LR.score(x_test, y_test)
rfr_holdout_r2 = RFR.score(x_test, y_test)
if lr_holdout_r2 > rfr_holdout_r2:
    print("LinearRegression is better than RandomForestRegressor")
elif rfr_holdout_r2 > lr_holdout_r2:
    print("RandomForestRegressor is better than LinearRegression")
else:
    # Exact tie: say so explicitly instead of printing nothing.
    print("LinearRegression and RandomForestRegressor score the same")
print('-------------------------------------------------')

In [None]:
# Predict with the fitted random forest, place the predictions in the
# 'item_cnt_month' column of the sample submission and save the result as
# prediction.csv (without the DataFrame index).
prediction = RFR.predict(X_test)
df_submission = pd.read_csv(sample).assign(item_cnt_month=prediction)
df_submission.to_csv('prediction.csv', index=False)
print("Work Done.")