In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd 
import seaborn as sns 
import matplotlib
%matplotlib inline
from matplotlib import pyplot as plt 
import numpy as np
# Show at most 50 rows when a DataFrame is displayed.
pd.set_option('display.max_rows',50)
# Use seaborn style defaults and set the default figure size to (11, 4).
sns.set(rc={'figure.figsize':(11, 4)})
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
import warnings
# Silence every warning for the rest of the run; note this also hides
# deprecation and data-conversion notices raised by the libraries used below.
warnings.filterwarnings("ignore")

In [None]:
# Read the competition tables from the read-only Kaggle input directory.
DATA_DIR = '/kaggle/input/competitive-data-science-predict-future-sales'

item_category = pd.read_csv(f'{DATA_DIR}/item_categories.csv')
items = pd.read_csv(f'{DATA_DIR}/items.csv')
# parse_dates converts the listed columns to Timestamps while the file is read.
# dayfirst=True is needed because the dates are written day-first (dd.mm.yyyy)
# -- NOTE(review): confirm against the raw CSV. Without it an ambiguous value
# such as 02.01.2013 is read month-first (or the column is left unparsed),
# which corrupts the `month` feature derived from this column later on.
train = pd.read_csv(f'{DATA_DIR}/sales_train.csv', parse_dates=['date'], dayfirst=True)
shops = pd.read_csv(f'{DATA_DIR}/shops.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')

In [None]:
# Print the per-column null counts of every input table, one table after another.
for table in (item_category, items, train, shops, test):
    print(table.isnull().sum())
print('No null items in all files')

In [None]:
# List each table's columns to spot the keys the tables share for merging.
labelled_tables = (
    ('item_category columns name', item_category),
    ('items columns name', items),
    ('train columns name', train),
    ('shops', shops),
    ('test', test),
)
for label, table in labelled_tables:
    print(label, table.columns)

### Merging the file 

In [None]:
## 1. Left join of 'train' and 'items' on the matching column 'item_id'. The merged frame is named df.
## For further analysis we don't require item_category_name, shop_name or item_name, as we have their respective ids.

In [None]:
# Left-join the item metadata onto the daily sales rows via the shared item_id key.
df = train.merge(items, how='left', on='item_id')
# Show the merged frame.
print(df)

In [None]:
# Descriptive summary of the merged frame (pandas' default column selection).
df.describe()

#### item_price and item_cnt_day have negative values

In [None]:
# Per-column count of missing values in the merged frame.
df.isna().sum()
# no missing values

In [None]:
# Print the column dtypes and non-null counts of the merged frame.
df.info()

In [None]:
# Number of distinct values held by each column.
for column_name in df.columns:
    distinct_count = df[column_name].nunique()
    print(column_name,' ',distinct_count)

In [None]:
# total shops : 60
# total item id and count : 21807
# total item category: 84 

In [None]:
# The distinct values themselves, column by column.
for column_name in df.columns:
    distinct_values = df[column_name].unique()
    print(column_name,'',distinct_values)

In [None]:
# Summary of every column, including the non-numeric ones (include='all').
df.describe(include='all')
# The item_price mean exceeds its median, which suggests a positively (right-) skewed distribution.

### 1. Univariate Analysis

#### 1.1 univariate analysis of item price 

In [None]:
# Histogram of item_price. Author's reading of the plot: prices sit below 50000,
# and anything beyond that is treated as an outlier.
df['item_price'].hist()

In [None]:
# Non-null count per column among the rows priced above 40000.
df.loc[df['item_price'].gt(40000)].count()

In [None]:
# Horizontal box plot of item_price to expose its outliers.
sns.boxplot(data=df, x='item_price')

In [None]:
# Kernel density estimate of the item_price distribution.
sns.displot(data=df, x='item_price', kind='kde')

### 1.2 univariate analysis of 'item_cnt_day'

In [None]:

# Check whether outliers exist in the target variable.
# The series is passed as x= explicitly (matching the item_price box plot):
# a bare positional argument is interpreted as `x` by older seaborn releases and
# as `data` by newer ones, which changes the orientation of the plot.
sns.boxplot(x=df['item_cnt_day'])

In [None]:
# Non-null count per column among the rows with more than 150 units sold in a day.
df.loc[df['item_cnt_day'].gt(150)].count()

In [None]:
# Kernel density estimate of the item_cnt_day distribution.
sns.displot(data=df, x='item_cnt_day', kind='kde')

## 2. Bivariate analysis

#### 2. bivariate analysis of item_price and Item_count

In [None]:
# Scatter of daily units sold against item price.
sns.scatterplot(data=df, x='item_price', y='item_cnt_day')

##### item_price and item_cnt_day have a negative correlation
Items with a lower sales price have more demand

In [None]:
# Pairwise correlation of the numeric columns only. At this point df still
# carries the `date` and `item_name` columns; restricting to numeric dtypes
# keeps the call working on pandas versions where DataFrame.corr() no longer
# drops non-numeric columns silently and raises instead.
corr = df.select_dtypes(include='number').corr()
corr
### No strong correlation between any pair of variables

#### no multicollinearity between variables 

In [None]:
# Correlation of every column with item_cnt_day, strongest positive first.
target_corr = corr.loc[:, 'item_cnt_day']
print(target_corr.sort_values(ascending=False))

### 3. Feature engineering

#### 3.1 Extracting the month from the date as a new column

In [None]:
## For better analysis we create a separate month column from the date

In [None]:
# Calendar month number taken from the parsed `date` column
# (requires `date` to have a datetime dtype).
df["month"] = df["date"].dt.month

In [None]:
# The date column is no longer needed now that the month has been extracted from it.
df.drop(columns='date', inplace=True)

#### 3.2 Removing outliers from item price 

In [None]:
# Remove, in place, the rows whose item_price is above 40000 (treated as outliers).
price_outlier_rows = df.index[df['item_price'] > 40000]
df.drop(index=price_outlier_rows, inplace=True)

In [None]:
# (rows, columns) of df after the price outliers were removed.
df.shape

In [None]:
# item_name is redundant with item_id, so drop it in place.
df.drop(columns='item_name', inplace=True)

#### 3.4 removing outliers from item cnt day

In [None]:
# Remove, in place, the rows with more than 150 units sold in a day (treated as outliers).
count_outlier_rows = df.index[df['item_cnt_day'] > 150]
df.drop(index=count_outlier_rows, inplace=True)

### Converting the train data to a monthly basis

In [None]:
# Collapse the daily sales rows to one row per (month block, shop, item):
# the monthly price is the mean of the daily prices and the monthly count is
# the sum of the daily counts.
df_2 = (
    df.groupby(['date_block_num', 'shop_id', 'item_id', 'item_category_id', 'month'])
      .agg({'item_price': 'mean', 'item_cnt_day': 'sum'})
      .reset_index()
)
# Rename the summed column on the AGGREGATED frame. The rename used to be
# applied to `df`, which threw the aggregation away and left df_2 holding the
# daily rows under a "monthly" column name.
df_2 = df_2.rename(columns={'item_cnt_day': 'item_cnt_month'})

### Preparing the test data

In [None]:
# Tag every test row with the period being predicted: calendar month 11, month block 34.
for column, value in (('month', 11), ('date_block_num', 34)):
    test[column] = value
test.head()

In [None]:
# Take the last item_price row of each (shop, item) pair, in df's current row order.
last_price = df.groupby(['shop_id', 'item_id'])['item_price'].last()
df_3 = last_price.reset_index()
# Left-join so that every test row is kept; pairs absent from df get a NaN price.
test = test.merge(df_3, how='left', on=['shop_id', 'item_id'])
# Show the result.
print(test)

#### Mapped the item price column in test from train file using the same shop id and item id column. Some test ids which are not present in train will hold null values in item price

In [None]:
# Kernel density estimate of the prices mapped onto the test rows.
sns.displot(data=test, x='item_price', kind='kde')

#### Since the item price data in test is skewed, we fill the missing values with the median. The mean is generally used when the data is normally distributed

In [None]:
# Count the missing values per column of the test frame.
print(test.isna().sum())


In [None]:
# Fill the prices that could not be mapped from train with the median test price.
median_price = test['item_price'].median()
test['item_price'] = test['item_price'].fillna(median_price)
test['item_price']

In [None]:
## Adding item category column in test 

In [None]:
# Bring the item metadata (including item_category_id) onto the test rows.
test = test.merge(items, how='left', on=['item_id'])
# Show the first rows of the result.
test.head()

In [None]:
# item_name came along with the merge and is not a model feature; drop it in place.
test.drop(columns='item_name', inplace=True)

In [None]:
# Columns now present on the test frame.
test.columns

In [None]:
# Re-check the per-column missing-value counts after the fill and the merge.
test.isna().sum()

In [None]:
# Feature columns of the test frame that are fed to the model.
test_X = test.loc[:, ['shop_id', 'item_id', 'month', 'date_block_num', 'item_price', 'item_category_id']]

## Linear Regression

In [None]:
# Features are every df_2 column except the target; the target stays a one-column frame.
x = df_2.drop(columns=['item_cnt_month'])
y = df_2.loc[:, ['item_cnt_month']]

In [None]:
### scaling the data

from sklearn.preprocessing import StandardScaler

# Fit the scaler on the feature frame and keep it in `sc` so the same
# transformation can be applied to the test features later.
sc = StandardScaler().fit(x)
x = sc.transform(x)

In [None]:
# Random 60/40 split into training and validation parts (fixed seed for repeatability).
x_train, x_valid, y_train, y_valid = train_test_split(
    x,
    y,
    train_size=0.6,
    random_state=100,
)

In [None]:
# Report the shape of each part of the split.
for label, part in (
    ('x_train size', x_train),
    ('y_train size', y_train),
    ('x_valid size', x_valid),
    ('y_valid size', y_valid),
):
    print(label, part.shape)

In [None]:
## Fit an ordinary least-squares model on the training part
from sklearn.linear_model import LinearRegression

# fit() returns the estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(x_train, y_train)
model

In [None]:
## Check the accuracy of the linear model via the mean squared error

from math import sqrt
from sklearn import metrics

y_pred = model.predict(x_valid)
mse = metrics.mean_squared_error(y_true=y_valid, y_pred=y_pred)
print(mse)

In [None]:
# Root mean squared error: square root of the most recently computed mse.
rmse=sqrt(mse)
print(rmse)

## Random forest

In [None]:
# Import the one estimator that is needed rather than `from sklearn import *`,
# which pulls an unpredictable set of names into the notebook namespace.
from sklearn.ensemble import RandomForestRegressor

# Small forest: 50 trees with at most 12 leaves each, seeded for repeatability.
rf_model = RandomForestRegressor(n_estimators=50,
                                 max_leaf_nodes=12,
                                 random_state=15)
# y_train is a one-column DataFrame; the estimator expects a 1-D target, so
# flatten it instead of relying on the (silenced) conversion warning.
rf_model.fit(x_train, np.ravel(y_train))

# Predictions on the validation part and on the training part.
Y_pred_test = rf_model.predict(x_valid)
Y_pred_train = rf_model.predict(x_train)

In [None]:
# Mean squared error of the random forest on the validation part.
mse = metrics.mean_squared_error(y_true=y_valid, y_pred=Y_pred_test)
print(mse)

In [None]:
# Root mean squared error: square root of the most recently computed mse.
rmse=sqrt(mse)
print(rmse)

### Prediction on test

In [None]:
# The scaler and the model were fitted on the columns of df_2 (minus the
# target) in df_2's order, while test_X lists the same columns in a different
# order. Reorder test_X to the fitted order before transforming; otherwise the
# features are misaligned (recent scikit-learn raises on the name mismatch,
# older versions silently scale and predict on the wrong columns).
fitted_columns = getattr(sc, 'feature_names_in_', df_2.columns.drop('item_cnt_month'))
z = sc.transform(test_X[list(fitted_columns)])
prediction_nov2015 = rf_model.predict(z)

In [None]:
# Build the submission frame from the sample file, replacing its item_cnt_month
# column with the predicted monthly counts. The values come from rf_model even
# though the frame is exposed under the name `linear_result`.
sample_submission = pd.read_csv("/kaggle/input/competitive-data-science-predict-future-sales/sample_submission.csv")
sample_submission['item_cnt_month'] = prediction_nov2015
linear_result = sample_submission
print(linear_result)

In [None]:
# Write the submission to the working directory without the index column,
# then print a completion marker.
linear_result.to_csv("Sales_Prediction.csv",index=False)
print("Completed")