## Data Explanation ##

**Stores**
* Store: The store number. Range from 1-45.
* Type: Three types of stores ‘A’, ‘B’ or ‘C’.
* Size: The size of the store, measured by the number of products available in that store; ranges from 34,000 to 210,000.

**Features**
* Store: The store in which the observation was recorded (1-45).
* Date: The date of the week where this observation was taken. 
* Temperature: Temperature of the region during that week.
* Fuel_Price: Fuel Price in that region during that week.
* MarkDown1-5: Five columns, one per type of markdown, giving the quantity that was available during that week.
* CPI: Consumer Price Index during that week.
* Unemployment: The unemployment rate during that week in the region of the store.
* IsHoliday: Boolean value representing a holiday week or not.

**Sales**
* Store: The store in which the observation was recorded (1-45).
* Dept: The department number, one of 1-99.
* Date: The date of the week where this observation was taken. 
* Weekly_Sales: The sales recorded during that Week.
* IsHoliday: Boolean value representing a holiday week or not.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from prettytable import PrettyTable
import seaborn as sns

import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV
from sklearn.ensemble import ExtraTreesRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, make_scorer, mean_absolute_error, r2_score
from scipy.stats import skew

from subprocess import check_output
print(check_output(["ls", "../input"]).decode("utf8"))

%matplotlib inline

stores = pd.read_csv('../input/stores data-set.csv')
features = pd.read_csv('../input/Features data set.csv')
sales = pd.read_csv('../input/sales data-set.csv')

pt = PrettyTable(['data', 'dimension'])
pt.add_row(['stores', stores.shape])
pt.add_row(['features', features.shape])
pt.add_row(['sales', sales.shape])

print(pt, '\n')


In [None]:
# Denormalize data for easy processing: attach the weekly features and the
# per-store attributes to every sales row.
denorm = pd.merge(sales, features, how='left', on=['Store', 'Date', 'IsHoliday'])
denorm = pd.merge(denorm, stores, how='left', on=['Store'])

# A left join must never multiply sales rows; duplicate keys in `features`
# or `stores` would silently inflate every downstream sales total.
assert len(denorm) == len(sales), "merge duplicated sales rows; check for duplicate join keys"

# Negative weekly sales are flagged as returns.
denorm['IsReturn'] = denorm['Weekly_Sales'] < 0
denorm = denorm.drop(['MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5'], axis=1)

# Dates are parsed as day/month/year after the merge (the join above is on
# the raw date strings).
denorm['Date'] = pd.to_datetime(denorm['Date'], format="%d/%m/%Y")

# Returns are currently kept in the working frame. To analyse sales only,
# filter instead:
#denormSales = denorm.loc[denorm['IsReturn'] == 0]
denormSales = denorm.copy()
# Vectorized bool -> int conversion (replaces a per-row Python loop).
denormSales = denormSales.assign(IsHolidayInt=denormSales['IsHoliday'].astype(int))

print(denormSales.columns.tolist())

In [None]:
# Summary statistics, one row per column for easier reading.
denormSales.describe().T

In [None]:
#NOTE: the pair plot below is disabled because running it causes the kernel to fail
#scatterplot
#sns.set()
#cols = ['Weekly_Sales', 'Temperature', 'Fuel_Price', 'CPI', 'Size', 'IsHoliday', 'Type', 'Dept']
#sns.pairplot(denormSales[cols], size = 5)
#plt.show()


In [None]:
# Pairwise correlation of the numeric columns.
# 'Type' holds the store-type letters ('A', 'B', 'C'), so it cannot take part
# in a Pearson correlation: older pandas dropped it silently, newer pandas
# raises on non-numeric columns. It is therefore excluded explicitly.
corr_cols = ['Weekly_Sales', 'Temperature', 'Fuel_Price', 'CPI', 'Unemployment', 'Size', 'Dept']
corrmat = denormSales[corr_cols].corr()

f, ax = plt.subplots(figsize=(12, 9))
# Draw on the axes created above; the trailing ';' hides the Axes repr.
sns.heatmap(corrmat, vmax=.8, square=True, annot=True, ax=ax);

In [None]:
# Derive calendar fields used for grouping and for the train/test split.
# `Series.dt.week` was deprecated and later removed from pandas;
# `dt.isocalendar()` is its replacement and returns ISO year/week/day columns.
iso = denormSales.Date.dt.isocalendar()
denormSales = denormSales.assign(
    year=denormSales.Date.dt.year,        # calendar year (used for the split)
    week=iso.week.astype(int),            # ISO week number, plain int dtype
)
# Pair the ISO week with the ISO year: around New Year the ISO week can belong
# to the neighbouring calendar year, and mixing the two would produce keys such
# as <new year>52 for a date in early January.
denormSales = denormSales.assign(yearweek=iso.year.astype(int) * 100 + denormSales.week)
denormSales.head()

In [None]:
def plot_dept_sales(dept_rows, palette):
    """Draw a grouped bar chart of yearly sales totals per department.

    Parameters
    ----------
    dept_rows : DataFrame with 'Dept', 'year' and 'Weekly_Sales' columns.
    palette : seaborn palette name used to colour the departments.

    Returns
    -------
    The seaborn grid object holding the figure.
    """
    sns.set(style="whitegrid")
    # `catplot`/`height` are the current names of the former
    # `factorplot`/`size`, which are no longer available in recent seaborn.
    grid = sns.catplot(x="year", y="Weekly_Sales", hue="Dept", data=dept_rows,
                       height=10, kind="bar", palette=palette)
    grid.despine(left=True)
    grid.set_ylabels("department sale contribution")
    return grid


# Total sales per (department, year).
dept_sales = denormSales.groupby(by=['Dept', 'year'], as_index=False)['Weekly_Sales'].sum()

# The 30 largest (department, year) totals.
top_depts = dept_sales.nlargest(30, 'Weekly_Sales')
g = plot_dept_sales(top_depts, "Set1")

# The 30 smallest (department, year) totals.
bottom_depts = dept_sales.nsmallest(30, 'Weekly_Sales')
g = plot_dept_sales(bottom_depts, "muted")

In [None]:
# Time-based split: every year except 2012 is used for training,
# 2012 is held out for validation.
train1 = denormSales.loc[denormSales['year'] != 2012]
test1 = denormSales.loc[denormSales['year'] == 2012]

featureset1 = ['Store', 'Dept', 'Temperature', 'Fuel_Price', 'CPI', 'Unemployment', 'IsHolidayInt', 'Size']

x_train1 = train1[featureset1]
y_train1 = train1['Weekly_Sales']

x_test1 = test1[featureset1]
y_test1 = test1['Weekly_Sales']

lr = LinearRegression()
lr.fit(x_train1, y_train1)

y_pred1 = lr.predict(x_test1)
y_train_pred1 = lr.predict(x_train1)

# The coefficients (one per entry of featureset1, in the same order)
print('Coefficients: \n', lr.coef_)
# The mean squared error on the held-out year
print("Mean squared error: %.2f"
      % mean_squared_error(y_test1, y_pred1))
# r2_score is the coefficient of determination (R^2): 1 is perfect prediction
print('Variance score: %.2f' % r2_score(y_test1, y_pred1))

# Plot residuals (prediction minus actual) against the predicted value
plt.figure(figsize=(12,6))
plt.scatter(y_train_pred1, y_train_pred1 - y_train1, c = "blue", marker = "s", label = "Training data")
plt.scatter(y_pred1, y_pred1 - y_test1, c = "lightgreen", marker = "s", label = "Validation data")
plt.title("Linear regression")
plt.xlabel("Predicted values")
plt.ylabel("Residuals")
plt.legend(loc = "upper left")
# Zero-residual reference line spanning the actual range of predictions
# (previously hard-coded to 0..30000, which need not match the data).
pred_min = min(y_train_pred1.min(), y_pred1.min())
pred_max = max(y_train_pred1.max(), y_pred1.max())
plt.hlines(y = 0, xmin = pred_min, xmax = pred_max, color = "red")
plt.show()

# Plot predictions against the real values
plt.figure(figsize=(12,6))
plt.scatter(y_train_pred1, y_train1, c = "blue", marker = "s", label = "Training data")
plt.scatter(y_pred1, y_test1, c = "lightgreen", marker = "s", label = "Validation data")
plt.title("Linear regression")
plt.xlabel("Predicted values")
plt.ylabel("Real values")
plt.legend(loc = "upper left")
plt.show()

In [None]:
# Same time-based split as the linear model: train on every year except 2012,
# validate on 2012.
train2 = denormSales.loc[denormSales['year'] != 2012]
test2 = denormSales.loc[denormSales['year'] == 2012]

featureset2 = ['Store', 'Dept', 'Temperature', 'Fuel_Price', 'CPI', 'Unemployment', 'IsHolidayInt', 'Size']

x_train2 = train2[featureset2]
y_train2 = train2['Weekly_Sales']

x_test2 = test2[featureset2]
y_test2 = test2['Weekly_Sales']

# Fixed seed so that re-running the notebook reproduces the same scores;
# the ensemble is randomized and was previously unseeded.
etr = ExtraTreesRegressor(n_estimators=200, random_state=42)
etr.fit(x_train2, y_train2)
y_pred2 = etr.predict(x_test2)

y_train_pred2 = etr.predict(x_train2)

print("ExtraTreeRegressor Score:", etr.score(x_test2, y_test2))
print("Mean squared error: %.2f" %
       mean_squared_error(y_test2, y_pred2))

# Plot residuals (prediction minus actual) against the predicted value.
# Both series are scatter plots: a line plot would join the points in row
# order and bury the figure under connecting lines.
plt.figure(figsize=(12,6))
plt.scatter(y_train_pred2, y_train_pred2 - y_train2, c = "blue", marker = "s", label = "Training data")
plt.scatter(y_pred2, y_pred2 - y_test2, c = "lightgreen", marker = "s", label = "Validation data")
plt.title("ExtraTreesRegressor")
plt.xlabel("Predicted values")
plt.ylabel("Residuals")
plt.legend(loc = "upper left")
plt.show()

# Plot predictions against the real values
plt.figure(figsize=(12,6))
plt.scatter(y_train_pred2, y_train2, c = "blue", marker = "s", label = "Training data")
plt.scatter(y_pred2, y_test2, c = "lightgreen", marker = "s", label = "Validation data")
plt.title("ExtraTreesRegressor")
plt.xlabel("Predicted values")
plt.ylabel("Real values")
plt.legend(loc = "upper left")
plt.show()