In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV files I/O
import statsmodels.api as sm

#For visualizations
import plotly.offline as ply
import plotly.graph_objs as go
from plotly.tools import make_subplots
import colorlover as cl


import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Uniform font sizes shared by every matplotlib figure in the notebook.
SMALL_SIZE, MEDIUM_SIZE, BIGGER_SIZE, BIGGEST_SIZE = 10, 11, 14, 18

# (rc group, option, size) triples, applied in order through plt.rc.
font_settings = [
    ('font', 'size', SMALL_SIZE),            # default text size
    ('axes', 'titlesize', BIGGER_SIZE),      # axes title
    ('axes', 'labelsize', BIGGER_SIZE),      # x and y axis labels
    ('xtick', 'labelsize', MEDIUM_SIZE),     # x tick labels
    ('ytick', 'labelsize', MEDIUM_SIZE),     # y tick labels
    ('legend', 'fontsize', SMALL_SIZE),      # legend entries
    ('figure', 'titlesize', BIGGEST_SIZE),   # figure title
]
for rc_group, rc_option, rc_size in font_settings:
    plt.rc(rc_group, **{rc_option: rc_size})

In [None]:
#Importing all the datasets

# Daily sales records: date of purchase, shop ID, item ID, price and quantity (bought or returned)
train_data = pd.read_csv('../input/competitive-data-science-predict-future-sales/sales_train.csv')
# Item category name for each item category ID
categories_data = pd.read_csv("../input/competitive-data-science-predict-future-sales/item_categories.csv")
# Item name for each item ID, together with the item category ID the item belongs to
items_data = pd.read_csv("../input/competitive-data-science-predict-future-sales/items.csv")
# Shop name for each shop ID
shops_data = pd.read_csv("../input/competitive-data-science-predict-future-sales/shops.csv")
# (shop ID, item ID) pairs to predict; the file's ID column is used as the DataFrame index
test_data = pd.read_csv("../input/competitive-data-science-predict-future-sales/test.csv").set_index('ID')

In [None]:
#dimensions of datasets
print('train:', train_data.shape, 'test:', test_data.shape)
print('items:', items_data.shape, 'item_cats:', categories_data.shape, 'shops:', shops_data.shape)

In [None]:
train_data.head()
# date_block_num is a consecutive month number, used for convenience. January 2013 is 0, February 2013 is 1,..., 
# October 2015 is 33 and so on

In [None]:
(train_data["shop_id"].min(), train_data["shop_id"].max()) #To get the range of values

In [None]:
(categories_data["item_category_id"].min(), categories_data["item_category_id"].max()) #To get the range of values 

In [None]:
test_data.head()

In [None]:
categories_data.head()

In [None]:
items_data.head()

In [None]:
shops_data.head()

In [None]:
#To suppress scientific notation in pandas
pd.options.display.float_format = '{:.5f}'.format

In [None]:
#Explore the statistics of the prices of all items in the training dataset
#Some statistics include max, min, avg and deviation
print(train_data['item_price'].describe())
# NOTE: This includes double calculation for the returned items as well

In [None]:
#Explore the statistics of the count of all items in the training dataset
#Some statistics include max, min, avg and deviation
print(train_data['item_cnt_day'].describe())
# NOTE: This is a proper count as returned quantity will be subtracted appropriately

In [None]:
# Rows with a negative price (treated below as a data entry error).
negative_item_price_entries = train_data.loc[train_data['item_price'].lt(0)]

# Rows with a negative daily count (returned items); the old index is kept as a column.
negative_item_cnt_day_entries = (
    train_data
    .loc[train_data['item_cnt_day'].lt(0)]
    .reset_index()
)

negative_item_price_entries

In [None]:
negative_item_cnt_day_entries

In [None]:
# Mean of the positive prices recorded for item 2973 in shop 32 during month block 4.
mean = train_data[(train_data.shop_id==32) & (train_data.item_id==2973) & (train_data.date_block_num==4) & (train_data.item_price > 0)].item_price.mean()
# Every negative price in the training data is overwritten with that single mean.
# NOTE(review): this assumes the only negative-price rows belong to that shop/item/month;
# confirm against the negative_item_price_entries listing before reusing on other data.
train_data.loc[train_data.item_price < 0, 'item_price'] = mean

# Display the positive-price rows of that shop/item/month to check the replacement.
train_data.loc[(train_data.shop_id==32) & (train_data.item_id==2973) & (train_data.date_block_num==4) & (train_data.item_price > 0)]

In [None]:
# Build one denormalised table for visualisation: every sale row is enriched with its
# item, item-category and shop attributes. Inner joins keep only rows whose ID is present
# on both sides of each join.
train_merged = (
    train_data
    .merge(items_data, on='item_id', how='inner')
    .merge(categories_data, on='item_category_id', how='inner')
    .merge(shops_data, on='shop_id', how='inner')
)

In [None]:
#Snippet of merged dataset
train_merged.head()

In [None]:
#Extract and add some more features

# Revenue of each row: unit price times the quantity sold that day.
train_merged['total_sales'] = train_merged['item_price'] * train_merged['item_cnt_day']

# The competition's sales_train.csv stores dates as day.month.year (02.01.2013 is 2 January 2013).
# Without an explicit format pandas guesses month-first for ambiguous values, which either
# swaps day and month silently or raises on the first day > 12, depending on the pandas version.
train_merged['date'] = pd.to_datetime(train_merged['date'], format='%d.%m.%Y')

# Calendar features derived from the parsed date.
train_merged['Month'] = train_merged['date'].dt.month
train_merged['Year'] = train_merged['date'].dt.year
train_merged['day_of_week'] = train_merged['date'].dt.day_name()

In [None]:
#snippet of train_merged
print(train_merged.head())

In [None]:
# Keep only rows whose daily count is zero or positive, i.e. discard returned items.
non_negative_count = train_merged['item_cnt_day'].ge(0)
train_merged = train_merged.loc[non_negative_count]

In [None]:
# Design specifications for the plot
chosen_colors=cl.scales['5']['qual']['Paired']
ply.init_notebook_mode(connected=True)

# Total revenue per month block (date_block_num).
temp_df = train_merged.groupby('date_block_num')[['total_sales']].sum().reset_index()
# astype returns a new frame; the result has to be assigned for the cast to take effect.
temp_df = temp_df.astype('float')

# Plot of revenue generated per month
fig = go.Figure()
fig.add_trace(go.Scatter(x=temp_df.date_block_num, y=temp_df.total_sales,
                    mode='lines',
                    line= dict(color='green', width=2)))

fig.update_layout(title='Monthly Revenue',
                   xaxis_title='Month',
                   yaxis_title='Total sales')

fig.show()

In [None]:
# Total number of items sold per month block (date_block_num).
temp_df2 = train_merged.groupby('date_block_num')[['item_cnt_day']].sum().reset_index()
# astype returns a new frame; the result has to be assigned for the cast to take effect.
temp_df2 = temp_df2.astype('float')

# Plot of items bought per month
fig = go.Figure()
fig.add_trace(go.Scatter(x=temp_df2.date_block_num, y=temp_df2.item_cnt_day,
                    mode='lines',
                    line= dict(color='green', width=2)))

fig.update_layout(title='Monthly Item Count',
                   xaxis_title='Month',
                   yaxis_title='Item Count')

fig.show()

In [None]:
# groupby sorts the day names alphabetically (Friday, Monday, Saturday, ...), which makes a
# weekday chart hard to read; reindex to calendar order before plotting.
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
temp_df3 = (
    train_merged.groupby('day_of_week')[['total_sales']].sum()
    .reindex(day_order)              # a day with no rows becomes NaN instead of vanishing
    .rename_axis('day_of_week')
    .reset_index()
)
fig = plt.figure(figsize = (10, 5))

# Plot of revenue generated per day of the week
plt.bar(temp_df3['day_of_week'], temp_df3['total_sales'], color ='red',
        width = 0.4)
plt.xlabel("Day of week")
plt.ylabel("Revenue Generated")
plt.title("Sales on each day of the week")
plt.show()
#Friday, Saturday, Sunday highest in terms of sales

In [None]:
# Revenue summed per calendar month (1-12) over all years.
temp_df4 = train_merged.groupby('Month')[['total_sales']].sum().reset_index()

# Plot of revenue generated per month of the year
fig, month_ax = plt.subplots(figsize=(10, 5))
month_ax.bar(temp_df4['Month'], temp_df4['total_sales'], color='red', width=0.4)
month_ax.set_xlabel("Month of year")
month_ax.set_ylabel("Revenue Generated")
month_ax.set_title("Sales for every month of the year")
month_ax.set_xticks(list(range(1, 13)))
plt.show()
#Most sales during december festive season

In [None]:
# Revenue summed per shop.
temp_df5 = train_merged.groupby('shop_id')[['total_sales']].sum().reset_index()

# Plot of revenue generated per shop
fig, shop_ax = plt.subplots(figsize=(30, 15))
shop_ax.bar(temp_df5['shop_id'], temp_df5['total_sales'], color='red', width=0.4)
shop_ax.set_xlabel("Shop Id")
shop_ax.set_ylabel("Revenue Generated")
shop_ax.set_title("Revenue generated per shop")
shop_ax.set_xticks(list(range(60)))   # one tick per shop id 0..59
plt.show()
# Compare bar heights to see which shops generate the most revenue

In [None]:
# Number of items sold, summed per item category.
temp_df6 = train_merged.groupby('item_category_id')[['item_cnt_day']].sum().reset_index()

# Plot of items sold per category
fig, category_ax = plt.subplots(figsize=(30, 15))
category_ax.bar(temp_df6['item_category_id'], temp_df6['item_cnt_day'], color='red', width=0.5)
category_ax.set_xlabel("Item Category ID")
category_ax.set_ylabel("Total Number Of Items Sold")
category_ax.set_title("Comparison Between Sales Of Item Categories")
category_ax.set_xticks(list(range(84)))   # one tick per category id 0..83
plt.show()
# Maximum Sales in Item Category 40, followed by 30 and then 55

In [None]:
# Outlier Plot for showing outliers in the item price as well as item count
joint_grid = sns.jointplot(x="item_cnt_day", y="item_price", data=train_merged, height=5)
# jointplot draws a joint axes plus two marginal axes; plt.title only titles whichever axes is
# current, so the title is attached to the figure instead. y > 1 keeps it clear of the top marginal.
joint_grid.fig.suptitle("Outliers In Terms Of Count And Price", y=1.02)
plt.show()
# Thus outliers are price greater than 50,000 and count greater than 500 and can cause significant
# amount of noise in the computation for prediction

In [None]:
# Row count per category. count() counts non-null rows of train_merged, so 'item_id' here is the
# number of sale records in the category, not the number of distinct items.
x=train_merged.groupby(['item_category_id']).count()
x=x.sort_values(by='item_id',ascending=False) # Sort in descending order of the count
x=x.iloc[0:10].reset_index() # Select the top 10 for the graph

#Plot of the count per item category
plt.figure(figsize=(10,5))
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
y= sns.barplot(x=x.item_category_id, y=x.item_id, color='red')
plt.title("Number Of Items Per Category")
plt.ylabel('Total Number Of Items')
plt.xlabel('Item Category ID')
plt.show()
#Thus total number of sales that were high for item_categories 40,30 and 55 were due to proportionately
# high number of items in those categories

In [None]:
# Trend analysis through rolling mean and std deviation
plt.figure(figsize=(16,6))
# Roll over the monthly item count only. Rolling over the whole frame would also smooth and
# plot the date_block_num column, adding a meaningless second line under the same label.
monthly_item_cnt = temp_df2['item_cnt_day']
plt.plot(monthly_item_cnt.rolling(window=12,center=False).mean(),label='Rolling Mean');
plt.plot(monthly_item_cnt.rolling(window=12,center=False).std(),label='Rolling sd');
plt.legend();

In [None]:
#Trend, seasonal and residual analysis
import statsmodels.api as sm
# Additive decomposition with a 12-month period. Only the monthly item count is decomposed;
# passing temp_df2.values would hand the date_block_num column to the decomposition as well.
res = sm.tsa.seasonal_decompose(temp_df2['item_cnt_day'].values,period=12,model="additive")
fig = res.plot()

In [None]:
train_data.head()

In [None]:
# Keep only rows with a strictly positive daily count (returns and zero-count rows are removed).
positive_count = train_data['item_cnt_day'].gt(0)
train_data = train_data.loc[positive_count]

In [None]:
train_data.head()

In [None]:
# Attach item, shop and category attributes to every sale row.
# DataFrame.join(on=...) matches the key column against the *index* of the joined table, and
# rsuffix='_' renames that table's own id column (item_id_, shop_id_, item_category_id_).
# NOTE(review): presumably each lookup table's default row index equals its id column - confirm.
train_final = (
    train_data
    .join(items_data, on='item_id', rsuffix='_')
    .join(shops_data, on='shop_id', rsuffix='_')
    .join(categories_data, on='item_category_id', rsuffix='_')
)

In [None]:
#Since the dataset consumes a large amount of memory, we shrink some column dtypes by downcasting
def downcast_dtypes(df):
    """Shrink the numeric columns of ``df`` in place to reduce memory use.

    float64 columns are cast to float32. int64/int32 columns are cast to the
    smallest of int16/int32 that can hold every value in the column; a column
    whose values fit neither is left unchanged. (An unconditional int16 cast
    would silently wrap any value outside -32768..32767.)

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenient reassignment.
    """
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols = [c for c in df if df[c].dtype in ["int64", "int32"]]

    for col in float_cols:
        df[col] = df[col].astype(np.float32)

    for col in int_cols:
        series = df[col]
        for candidate in (np.int16, np.int32):
            limits = np.iinfo(candidate)
            # An empty column has no values to overflow, so the smallest type is safe.
            if series.empty or (series.min() >= limits.min and series.max() <= limits.max):
                if series.dtype != candidate:
                    df[col] = series.astype(candidate)
                break
    return df

train_final = downcast_dtypes(train_final)
# DataFrame.info() prints its report itself and returns None, so wrapping it in print()
# only adds a stray "None" line to the output.
train_final.info()

In [None]:
# Shops and items that appear in the test set.
test_shop_ids = test_data['shop_id'].unique()
test_item_ids = test_data['item_id'].unique()

# Keep a training row only when both its shop and its item occur in the test set.
in_test_shops = train_final['shop_id'].isin(test_shop_ids)
in_test_items = train_final['item_id'].isin(test_item_ids)
leak_df = train_final[in_test_shops & in_test_items]

print('Data set size before leaking:', len(train_final))
print('Data set size after leaking:', len(leak_df))
train_final = leak_df

In [None]:
#Drop these string based columns, that are not required for training purposes
# train_final is a filtered slice of a larger frame at this point; dropping in place on such
# a slice triggers pandas' SettingWithCopyWarning, so a new frame is built and rebound instead.
train_final = train_final.drop(
    columns=['item_name','shop_name','item_category_name','item_category_id']
)

In [None]:
train_final.head()

In [None]:
# shop_id_ and item_id_ are the suffixed copies of shop_id and item_id created by the joins,
# so they carry no extra information. Rebinding instead of dropping in place avoids a
# SettingWithCopyWarning when train_final is still a filtered slice.
train_final = train_final.drop(columns=['shop_id_','item_id_'])

In [None]:
train_final.head()

In [None]:
# Keep only rows whose item price is strictly positive.
positive_price = train_final['item_price'].gt(0)
train_final = train_final.loc[positive_price]

In [None]:
# Collapse the daily rows to one row per (month block, shop, item), the granularity at which
# predictions are made. For each group: sum and mean of item_price, and sum, mean and row
# count of item_cnt_day.
train_final = (
    train_final
    .sort_values('date')
    .groupby(['date_block_num', 'shop_id', 'item_id'], as_index=False)
    .agg({'item_price': ['sum', 'mean'], 'item_cnt_day': ['sum', 'mean', 'count']})
)
# Flatten the two-level column index into plain feature names, in the order produced above.
train_final.columns = [
    'date_block_num', 'shop_id', 'item_id',
    'item_price', 'mitem_price',
    'item_cnt', 'mitem_cnt', 'transactions',
]

In [None]:
# Derive calendar features from the month block number: block 0 maps to year 2013, month 0,
# and every 12 blocks advance the year by one (month is therefore 0-based, 0..11).
block_numbers = train_final['date_block_num']
train_final['year'] = block_numbers.map(lambda block: 2013 + (block // 12))
train_final['month'] = block_numbers.map(lambda block: block % 12)

In [None]:
# Treat everything outside these bounds as an outlier (see the outlier plot):
# item_cnt must lie in [0, 1500] and item_price must be below 400000.
count_in_range = train_final['item_cnt'].between(0, 1500)
price_in_range = train_final['item_price'] < 400000
train_final = train_final[count_in_range & price_in_range]

In [None]:
# Target column: for every (shop_id, item_id) pair the rows are ordered by date_block_num and
# cnt_m receives the item_cnt of the pair's following row. The last row of each pair has no
# following row and gets NaN. The result is aligned back to train_final by index.
# NOTE(review): "following row" is the next month that has a row for the pair; it is the next
# calendar month only if no month is missing for that pair at this point - confirm.
train_final['cnt_m'] = train_final.sort_values('date_block_num').groupby(['shop_id','item_id'])['item_cnt'].shift(-1)

In [None]:
train_final.head()

In [None]:
#We need to have all combinations of date_block_num,shop_id,item_id
#So we add the ones that are not already present
#Then we fill them up with zeros
import time
ts = time.time()
shop_ids = train_final['shop_id'].unique()
item_ids = train_final['item_id'].unique()
# Cartesian product of the 34 month blocks, the shops and the items, built in one vectorised
# call instead of appending millions of rows from three nested Python loops. Row order is the
# same as the loops produced (month slowest, item fastest) and the columns are int64.
empty_df = (
    pd.MultiIndex.from_product(
        [np.arange(34), shop_ids, item_ids],
        names=['date_block_num','shop_id','item_id'],
    )
    .to_frame(index=False)
    .astype('int64')
)
print(time.time()-ts)

In [None]:
#Left join the monthly aggregates onto the full (month, shop, item) grid
key_cols = ['date_block_num','shop_id','item_id']
train_final = pd.merge(empty_df, train_final, on=key_cols, how='left')

# Combinations without any sale get 0 for the sales aggregates only. A blanket fillna(0)
# would also turn year/month into 0 and replace the NaN targets with 0, which makes the
# later dropna on cnt_m a no-op and trains on fabricated labels.
derived_cols = ['year', 'month', 'cnt_m']
sales_cols = [c for c in train_final.columns if c not in key_cols + derived_cols]
train_final[sales_cols] = train_final[sales_cols].fillna(0)

# Calendar features follow directly from the month block number (block 0 = 2013, month 0).
train_final['year'] = train_final['date_block_num'] // 12 + 2013
train_final['month'] = train_final['date_block_num'] % 12

# Recompute the target on the dense grid so that cnt_m is the count of the next *calendar*
# month for every (shop, item). The last month block has no successor and stays NaN.
train_final['cnt_m'] = (
    train_final
    .sort_values('date_block_num')
    .groupby(['shop_id','item_id'])['item_cnt']
    .shift(-1)
)

In [None]:
# Time-based split on the month block number:
#   blocks 0..25  -> train set
#   blocks 26..32 -> validation set
#   block 33      -> test set
block_number = train_final['date_block_num']
train_set = train_final[(block_number >= 0) & (block_number < 26)].copy()
validation_set = train_final[(block_number >= 26) & (block_number < 33)].copy()
test_set = train_final[block_number == 33].copy()

print('Train set records:', train_set.shape[0])
print('Validation set records:', validation_set.shape[0])
print('Test set records:', test_set.shape[0])

# Share of all rows that landed in each split.
total_records = train_final.shape[0]
print('Percent of train_set:',(train_set.shape[0]/total_records)*100,'%')
print('Percent of validation_set:',(validation_set.shape[0]/total_records)*100,'%')
print('Percent of test_set:',(test_set.shape[0]/total_records)*100,'%')

In [None]:
# Rows without a target value (cnt_m) cannot be used for fitting or validation.
train_set = train_set.dropna(subset=['cnt_m'])
validation_set = validation_set.dropna(subset=['cnt_m'])

In [None]:
# Columns that are not model inputs: the target itself and the month block number.
target_col = 'cnt_m'
non_feature_cols = [target_col, 'date_block_num']

# Training features and integer target
x_train = train_set.drop(columns=non_feature_cols)
y_train = train_set[target_col].astype(int)

# Validation features and integer target
x_val = validation_set.drop(columns=non_feature_cols)
y_val = validation_set[target_col].astype(int)

In [None]:
# Most recent row of every (shop, item) pair across train and validation.
latest_records = (
    pd.concat([train_set, validation_set])
    .drop_duplicates(subset=['shop_id', 'item_id'], keep='last')
)

# Attach those features to each test pair (pairs never seen in training get NaN), overwrite
# the calendar features with fixed values, and keep exactly the columns the models were
# trained on, in the same order (this also leaves cnt_m out).
x_test = (
    pd.merge(test_data, latest_records, on=['shop_id', 'item_id'], how='left', suffixes=['', '_'])
    .assign(year=2015, month=9)
    .drop(columns='cnt_m')
)
x_test = x_test[x_train.columns]

In [None]:
# Replace each null by the median of that column within the same shop.
# A groupby/transform computes all shop medians in one pass, instead of re-scanning the whole
# frame with boolean masks for every (shop, column) combination.
sets = [x_train, x_val, x_test]
for dataset in sets:
    if not dataset.isnull().values.any():
        continue  # nothing to impute in this frame
    # Same shape as dataset (minus shop_id); every row holds its shop's median of the column.
    shop_medians = dataset.groupby('shop_id').transform('median')
    for column in shop_medians.columns:
        # Column assignment keeps mutating the original frame, so x_train/x_val/x_test update.
        dataset[column] = dataset[column].fillna(shop_medians[column])

In [None]:
# Any value still missing in the test features is replaced by that column's mean.
test_column_means = x_test.mean()
x_test.fillna(test_column_means, inplace=True)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from scipy import stats

In [None]:
#base model is m1->Linear Regression
#Another model is m2->Random Forest Regression
m1 = LinearRegression()
# A fixed random_state makes the forest (bootstrap samples and feature choices) and therefore
# the reported RMSE values and submission file reproducible between runs.
m2 = RandomForestRegressor(n_estimators=10, random_state=42)

In [None]:
m1.fit(x_train,y_train)

In [None]:
m2.fit(x_train,y_train)

In [None]:
train_pred1=m1.predict(x_train)
train_pred2=m2.predict(x_train)

In [None]:
from sklearn.metrics import mean_squared_error
print('Linear Regression RMSE error on training set:',np.sqrt(mean_squared_error(y_train, train_pred1)))
print('Random Forest Regression RMSE error on training set:',np.sqrt(mean_squared_error(y_train, train_pred2)))

In [None]:
val_pred1=m1.predict(x_val)
val_pred2=m2.predict(x_val)

In [None]:
print('Linear Regression RMSE error on validation set:',np.sqrt(mean_squared_error(y_val, val_pred1)))
print('Random Forest Regression RMSE error on validation set:',np.sqrt(mean_squared_error(y_val, val_pred2)))

In [None]:
#predictions on test set
test_pred1=m1.predict(x_test)
test_pred2=m2.predict(x_test)

In [None]:
# Submission IDs: consecutive integers, one per prediction row.
indices = list(range(test_pred1.shape[0]))

# Clip predictions to the [0, 20] range as per submission specifications.
test_pred1 = np.clip(test_pred1, 0, 20)
test_pred2 = np.clip(test_pred2, 0, 20)

# CSV file with the linear regression predictions for the Kaggle leaderboard.
submission = pd.DataFrame({"ID": indices, "item_cnt_month": test_pred1})
submission.to_csv('lin_reg.csv', index=False)


In [None]:
# CSV file with the random forest predictions for the Kaggle leaderboard.
random_forest_columns = {"ID": indices, "item_cnt_month": test_pred2}
submission2 = pd.DataFrame(random_forest_columns)
submission2.to_csv('random_forest.csv', index=False)

In [None]:
from sklearn.tree import DecisionTreeRegressor
# A fixed random_state makes tie-breaking between equally good splits, and therefore the
# fitted tree and its RMSE values, reproducible between runs.
m3 = DecisionTreeRegressor(random_state=42)
m3.fit(x_train,y_train)
train_pred3=m3.predict(x_train)
val_pred3=m3.predict(x_val)
print('Decision Tree Regression RMSE error on training set:',np.sqrt(mean_squared_error(y_train, train_pred3)))
print('Decision Tree RMSE error on validation set:',np.sqrt(mean_squared_error(y_val, val_pred3)))

In [None]:
# Decision tree predictions on the test features, clipped to the [0, 20] range as per
# submission specifications.
test_pred3 = np.clip(m3.predict(x_test), 0, 20)

# CSV file with the decision tree predictions for the Kaggle leaderboard.
decision_tree_columns = {"ID": indices, "item_cnt_month": test_pred3}
submission3 = pd.DataFrame(decision_tree_columns)
submission3.to_csv('dec_tree.csv', index=False)

In [None]:
#Ensemble model: VotingRegressor averages the predictions of the three base regressors
from sklearn.ensemble import VotingRegressor
# Combine the three regressors; VotingRegressor fits its own copies of them.
base_estimators = [('lr', m1), ('rf', m2), ('dt', m3)]
maj_vot_model = VotingRegressor(base_estimators)
maj_vot_model.fit(x_train, y_train)

# Ensemble predictions and RMSE on the training and validation sets.
train_pred_ens1 = maj_vot_model.predict(x_train)
val_pred_ens1 = maj_vot_model.predict(x_val)
ens_train_rmse = np.sqrt(mean_squared_error(y_train, train_pred_ens1))
ens_val_rmse = np.sqrt(mean_squared_error(y_val, val_pred_ens1))
print('Majority Voting Ensemble Model RMSE error on training set:',ens_train_rmse)
print('Majority Voting Ensemble Model RMSE error on validation set:',ens_val_rmse)

In [None]:
test_pred4=maj_vot_model.predict(x_test)
# Clip predictions to the [0, 20] range as per submission specifications
test_pred4=test_pred4.clip(0,20)
#Creating CSV file for submission to Kaggle Leaderboard
submission4 = pd.DataFrame({
    "ID": indices, 
    "item_cnt_month": test_pred4
})
# Write the ensemble's own frame; the earlier code saved submission3 (the decision tree
# predictions) under the ensemble's file name.
submission4.to_csv('maj_vot.csv', index=False)

In [None]:
# Weighted average ensemble of the validation predictions:
# 0.75 linear regression, 0.25 random forest, 0 decision tree.
ensemble_weights = (0.75, 0.25, 0)
ensemble_members = (val_pred1, val_pred2, val_pred3)
weighted_val = sum(weight * prediction for weight, prediction in zip(ensemble_weights, ensemble_members))
weighted_rmse = np.sqrt(mean_squared_error(y_val, weighted_val))
print('Weighted Averaging Ensemble Model RMSE error on validation set:',weighted_rmse)

In [None]:

#Experimenting with dimension reduction through PCA and checking RMSE for the same
from sklearn.decomposition import PCA
from sklearn import preprocessing

# Features are every column of train_set except the target. Selecting cnt_m by name keeps
# this correct even if the column order of train_set changes.
x_pca_train=train_set.drop(columns=['cnt_m'])
y_pca_train=train_set['cnt_m']

# Project the features onto the first `comp` principal components.
# NOTE(review): the features are not standardised first, so the components are dominated by
# the columns with the largest numeric range - consider scaling before PCA.
comp=8
pca=PCA(n_components=comp)
principalComponent=pca.fit_transform(x_pca_train)
cols=['principal_comp_'+str(i+1) for i in range(comp)]
principalDf=pd.DataFrame(data=principalComponent,columns=cols)
# Use all requested components. principalDf holds no target column, so slicing off its last
# column (as done before) silently discarded one of the components.
x_pca_train=principalDf

# Linear regression on the principal components, scored on the same target it was fitted on.
pca_lr_model=LinearRegression()
pca_lr_model.fit(x_pca_train,y_pca_train)
pca_pred=pca_lr_model.predict(x_pca_train)
print('PCA Model RMSE error on training set:',np.sqrt(mean_squared_error(y_pca_train, pca_pred)))