## Importing Libraries

In [2]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from dask import dataframe as dd

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import gc

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import cross_val_score
import xgboost as xgb


## Loading Data through Dask 

In [None]:
# Open the transactions file and the location, product and calendar master
# files as Dask dataframes, in that order.
data, loc_data, prod_data, date_data = (
    dd.read_csv(csv_path)
    for csv_path in (
        '../input/09-project3/09_Project3.csv',
        '../input/03-locationmaster/03_LocationMaster.csv',
        '../input/02-productmaster/02_ProductMaster.csv',
        '../input/01-calendarmaster/01_CalendarMaster.csv',
    )
)

## Merging Dataset based on Unique Key Variables

In [None]:
# Left join: keep every row of `data` and attach the calendar columns,
# matching the transaction 'Date' against the calendar's 'DateKey'.
main_df = data.merge(date_data, how='left', left_on='Date', right_on='DateKey')

In [None]:
# Left join on the shared 'ProductKey' column to attach the product master columns.
main_df = main_df.merge(prod_data, how='left', on='ProductKey')

In [None]:
# Left join on the shared 'LocationKey' column to attach the location master columns.
main_df = main_df.merge(loc_data, how='left', on='LocationKey')

In [None]:
# Columns that no later cell in this notebook reads; "DateKey" duplicates "Date"
# after the calendar join.
unused_cols = [
    "DateKey", "CatEdition", "Supplier", "DIorDOM",
    "Region", "UpstreamLocKey", "StockPolicy", "ShopFormat",
]
main_df = main_df.drop(columns=unused_cols)

## Calculating Propensities 

In [None]:
# Share of demanded units that were ordered (row-wise ratio).
main_df["SimpleOrderPropensity"] = main_df["OrderedUnits"] / main_df["DemandUnits"]

In [None]:
# Share of demanded units that were collected (row-wise ratio).
main_df["CollectOrderPropensity"] = main_df["CollectedUnits"] / main_df["DemandUnits"]

In [None]:
# Execute the Dask graph built so far (the merges, the column drop and the two
# propensity columns) and bring the result into memory; the following cells
# work on the computed frame.
# NOTE(review): Dask CSV partitions usually each start their index at 0, so the
# index labels are presumably not unique after compute() - confirm before
# relying on label-based row operations.
main_df = main_df.compute()

In [None]:
# The four source frames are not referenced again after main_df has been
# computed; unbind them and run a garbage collection pass.
del data, loc_data, prod_data, date_data
gc.collect()

## Removing Outliers 

In [None]:
# Trim the upper tail of each unit count: rows at or above mean + 3 standard
# deviations are removed. The columns are processed one after another, so each
# threshold is computed on the rows that survived the previous filter.
#
# Rows are selected with a boolean mask rather than dropped by index label:
#   * a label-based drop removes every row sharing a label, which discards
#     unrelated rows whenever the index is not unique;
#   * DataFrame.drop(labels, 0) passes `axis` positionally, which pandas >= 2.0
#     rejects.
# `~(col >= limit)` keeps rows where the comparison is False, including NaN,
# which is exactly what the label-based drop kept.
for unit_col in ("DemandUnits", "OrderedUnits", "CollectedUnits"):
    outlier = main_df[unit_col].mean() + (3 * main_df[unit_col].std())
    main_df = main_df[~(main_df[unit_col] >= outlier)]

# A propensity is a share of demand, so values above 1 (including +inf from a
# zero DemandUnits denominator) are treated as invalid and removed.
for propensity_col in ("SimpleOrderPropensity", "CollectOrderPropensity"):
    main_df = main_df[~(main_df[propensity_col] > 1)]

In [None]:
# from sklearn.base import BaseEstimator, TransformerMixin

# class AttrTransf(BaseEstimator, TransformerMixin):
    
#     def __init__(self, removExtraCols = True, allCatStr = True, mergCols = True):
#         self.removExtraCols = removExtraCols
#         self.allCatStr = allCatStr
#         self.mergCols = mergCols
        
#     def fit(self, X, y=None):
#         return self
        
#     def transform(self, X):
        
#     ### calculating outlier thresholds for Demand, Ordered and Collected units (mean plus 3 times s.d.)
#         outD = X['DemandUnits'].mean() + (3*X['DemandUnits'].std())
#         outO = X['OrderedUnits'].mean() + (3*X['OrderedUnits'].std())
#         outC = X['CollectedUnits'].mean() + (3*X['CollectedUnits'].std())

#     ### dropping rows of data where Demand, Ordered or Collected is greater than or equal to its outlier threshold
#         X.drop((X[(X['DemandUnits'] >= outD) | 
#                   (X['OrderedUnits'] >= outO) | 
#                   (X['CollectedUnits'] >= outC) |
#                   (X['SimpleOrderPropensity'] > 1) | 
#                   (X['CollectOrderPropensity'] > 1)]).index, 0)
        
#     ### removing extra or unnecessary columns to reduce model complexity (if True)
#         if self.removExtraCols:
#             X.drop(X[["Date", "ProductKey", "LocationKey", 
#                       "DemandUnits", "OrderedUnits", "CollectedUnits", 
#                       "IsBankHoliday", "Seasonal", "IsHub", 
#                       "Latitude", "Longitude"]], 1)
#         else:
#             X.drop(X[["Date", "ProductKey", "LocationKey", 
#                       "DemandUnits", "OrderedUnits", "CollectedUnits"]], 1)
        
#     ### merging categorical columns to reduce datasize (and complexity)   
#         if self.mergCols:
#             X['prodHierarchy'] = X["HierarchyLevel2"].astype(str) + "-" + X["HierarchyLevel1"].astype(str)
#             X['locType'] = X["LocationType2"].astype(str) + "-" + X["LocationType1"].astype(str)
#             X.drop(["HierarchyLevel1", "HierarchyLevel2", "LocationType1", "LocationType2"], 1)
        
#     ### changing dtypes for categorical variables to string from integer/boolean
#         if self.allCatStr:
            
            

## Selecting random Sample of Data

In [None]:
# Number of rows in the cleaned frame; used to size the random sample below.
lines = len(main_df)

In [None]:
# Number of data rows to keep in the random sample.
SAMPLE_ROWS = 1000000

# Seeded generator so the same rows are selected on every run.
rng = np.random.default_rng(0)

# Line 0 of the CSV is the header, so candidate lines to skip are 1..lines
# (inclusive - the last data row must be skippable too, otherwise it is always
# part of the sample). Assumes the file holds `lines` data rows - TODO confirm.
# When there are no more than SAMPLE_ROWS rows, nothing is skipped instead of
# asking for a negative sample size.
n_skip = max(lines - SAMPLE_ROWS, 0)
skiplines = rng.choice(np.arange(1, lines + 1), size=n_skip, replace=False)

# sort the line numbers
skiplines = np.sort(skiplines)

## Loading only one million sample rows that are required

In [None]:
# Re-read the data from disk, skipping the line numbers in `skiplines` and
# reading at most 1,000,000 rows. This replaces the in-memory main_df.
# NOTE(review): main1m.csv is not written anywhere in this notebook, while
# `skiplines` is sized from the in-memory main_df - presumably the file is a
# saved copy of that frame; confirm the row counts match.
main_df = pd.read_csv("../input/main1m/main1m.csv", skiprows=skiplines, nrows=1000000)

## String conversion of categorical variable for one hot encoding

In [None]:
# Columns that hold codes/flags rather than quantities. Casting them to str
# makes the later one-hot encoding treat every distinct value as a category.
categorical_cols = [
    'YearWeek', 'DayOfWeek', 'IsBankHoliday', 'IsWorkingDay',
    'HierarchyLevel1', 'HierarchyLevel2', 'Seasonal', 'IsHub',
    'LocationType1', 'LocationType2',
]
for col in categorical_cols:
    main_df[col] = main_df[col].astype(str)

## Separating feature and target variables

In [None]:
# Feature columns: ten categorical columns followed by the two coordinates.
feature_cols = [
    "YearWeek", "DayOfWeek", "IsWorkingDay", "IsBankHoliday",
    "HierarchyLevel1", "HierarchyLevel2", "Seasonal", "IsHub",
    "LocationType1", "LocationType2", "Latitude", "Longitude",
]

# Copy so the feature frame is independent of main_df. (The empty-DataFrame
# placeholder that used to precede this assignment was dead code.)
Xall = main_df[feature_cols].copy()

# Alternative, smaller feature set (kept for reference, not used):
# Xless = pd.DataFrame()
# Xless = main_df[["YearWeek", "DayOfWeek", "IsWorkingDay"]].copy()
# Xless['prodHierarchy'] = main_df["HierarchyLevel2"] + "-" + main_df["HierarchyLevel1"]
# Xless['locType'] = main_df["LocationType2"] + "-" + main_df["LocationType1"]

# Targets: one series per propensity.
yS = main_df["SimpleOrderPropensity"]
yC = main_df["CollectOrderPropensity"]

In [None]:
# Xall, yS and yC carry everything used from here on; unbind main_df and run a
# garbage collection pass.
del main_df
gc.collect()

## Transforming feature variables 

In [None]:
# Column groups: coordinates are scaled, everything else is one-hot encoded.
numFeat = ["Latitude", "Longitude"]
catFeat = [
    "YearWeek", "DayOfWeek", "IsWorkingDay", "IsBankHoliday",
    "HierarchyLevel1", "HierarchyLevel2",
    "Seasonal", "IsHub", "LocationType1", "LocationType2",
]

# Min-max scale the numeric columns (the scaler is fitted on all rows).
numTransf = MinMaxScaler()
numTransf.fit(Xall[numFeat])
Xnum = numTransf.transform(Xall[numFeat])

# One indicator column per distinct value of each categorical column.
Xcat = pd.get_dummies(Xall[catFeat], sparse=False)

# Final design matrix: scaled numeric columns first, then the indicators.
Xall = np.concatenate([Xnum, Xcat], axis=1)

## Train test split 80/20 ratio

In [None]:
# Hold out 20% of the rows for testing; random_state=0 fixes the shuffle.
# Only the SimpleOrderPropensity target (yS) is split here.
X_train, X_test, yS_train, yS_test = train_test_split(Xall, yS, test_size = 0.2, random_state = 0)

## Linear Regression Model

In [None]:
# Ordinary least squares baseline
linReg = LinearRegression()
linReg.fit(X_train, yS_train)

train_score = linReg.score(X_train, yS_train)
test_score = linReg.score(X_test, yS_test)
print("Training set score: {:.4f}".format(train_score))
print("Test set score: {:.4f}".format(test_score))

In [None]:
# Error metrics of the linear model on the held-out rows
y_pred = linReg.predict(X_test)

mse = mean_squared_error(yS_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(yS_test, y_pred)

print("Root Mean Squared Error: {:.4f}".format(rmse))
print('Mean Absolute Error: ', mae)

## Ridge Regression Model

In [None]:
# Ridge regression with the default regularisation strength
ridge = Ridge()
ridge.fit(X_train, yS_train)

train_score = ridge.score(X_train, yS_train)
test_score = ridge.score(X_test, yS_test)
n_nonzero = np.sum(ridge.coef_ != 0)

print("Training set score: {:.4f}".format(train_score))
print("Test set score: {:.4f}".format(test_score))
print("Number of features used: {}".format(n_nonzero))

In [None]:
# Pick the Ridge regularisation strength on a validation split carved out of
# the training data. Selecting alpha by its test-set score would leak the test
# set into model selection and make the later test metrics optimistic.
X_fit, X_val, yS_fit, yS_val = train_test_split(
    X_train, yS_train, test_size=0.2, random_state=0
)

# Start from -inf / None so the first candidate is always accepted; starting
# from 0 would leave alpha at 0 (no regularisation) if no score exceeded 0.
best_score = -np.inf
best_alpha = None
for alpha in [0.01, 0.1, 1, 10, 100]:
    ridge = Ridge(alpha=alpha).fit(X_fit, yS_fit)
    val_score = ridge.score(X_val, yS_val)
    if val_score > best_score:
        best_score = val_score
        best_alpha = alpha

# The validation copies are only needed for the search; free them.
del X_fit, X_val, yS_fit, yS_val

print('Best score: {:.4f}'.format(best_score))
print('Best alpha: {:.2f}'.format(best_alpha))

In [None]:
# Refit on the full training split using the alpha chosen by the search.
ridge = Ridge(alpha=best_alpha).fit(X_train, yS_train)

In [None]:
# Error metrics of the tuned ridge model on the held-out rows
y_pred = ridge.predict(X_test)

mse = mean_squared_error(yS_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(yS_test, y_pred)

print("Root Mean Squared Error: {:.4f}".format(rmse))
print('Mean Absolute Error: ', mae)

## LASSO Regression Model

In [None]:
# Lasso with the stronger penalty (alpha=0.01)
lasso1 = Lasso(alpha=0.01, max_iter=100000)
lasso1.fit(X_train, yS_train)

train_score = lasso1.score(X_train, yS_train)
test_score = lasso1.score(X_test, yS_test)
n_nonzero = np.sum(lasso1.coef_ != 0)

print("Training set score: {:.4f}".format(train_score))
print("Test set score: {:.4f}".format(test_score))
print("Number of features used: {}".format(n_nonzero))

In [None]:
# Lasso with the weaker penalty (alpha=0.0001)
lasso2 = Lasso(alpha=0.0001, max_iter=100000)
lasso2.fit(X_train, yS_train)

train_score = lasso2.score(X_train, yS_train)
test_score = lasso2.score(X_test, yS_test)
n_nonzero = np.sum(lasso2.coef_ != 0)

print("Training set score: {:.4f}".format(train_score))
print("Test set score: {:.4f}".format(test_score))
print("Number of features used: {}".format(n_nonzero))

In [None]:
# Error metrics of the second lasso model on the held-out rows
y_pred = lasso2.predict(X_test)

mse = mean_squared_error(yS_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(yS_test, y_pred)

print("Root Mean Squared Error: {:.4f}".format(rmse))
print("Mean Absolute Error: ", mae)

## Random Forest Regression Model

In [None]:
# Random forest with 10 trees, depth capped at 10. random_state pins the
# bootstrap sampling so the reported scores are repeatable between runs.
forest = RandomForestRegressor(n_estimators=10, max_depth=10, bootstrap=True,
                               n_jobs=-1, random_state=0)
forest.fit(X_train, yS_train)

# score() on a regressor is the coefficient of determination (R^2), not an
# accuracy, so the output is labelled accordingly.
print("R^2 on training set: {:.3f}".format(forest.score(X_train, yS_train)))
print("R^2 on test set: {:.3f}".format(forest.score(X_test, yS_test)))

In [None]:
# Error metrics of the first random forest on the held-out rows
y_pred = forest.predict(X_test)

mse = mean_squared_error(yS_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(yS_test, y_pred)

print("Root Mean Squared Error: {:.4f}".format(rmse))
print('Mean Absolute Error: ', mae)

In [None]:
# Second random forest: fewer (5) but deeper (max_depth=20) trees.
# random_state pins the bootstrap sampling so the scores are repeatable.
forest1 = RandomForestRegressor(n_estimators=5, max_depth=20, bootstrap=True,
                                n_jobs=-1, random_state=0)
forest1.fit(X_train, yS_train)

# Score the model fitted in this cell (forest1). The previous version scored
# `forest`, so it re-printed the first forest's numbers.
# score() on a regressor is R^2, not an accuracy.
print("R^2 on training set: {:.3f}".format(forest1.score(X_train, yS_train)))
print("R^2 on test set: {:.3f}".format(forest1.score(X_test, yS_test)))

In [None]:
# Error metrics of the second random forest on the held-out rows
y_pred = forest1.predict(X_test)

mse = mean_squared_error(yS_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(yS_test, y_pred)

print("Root Mean Squared Error: {:.4f}".format(rmse))
print('Mean Absolute Error: ', mae)

## XGBoost Regression Model

In [None]:
# Gradient-boosted trees with the library's default hyper-parameters:
# fit on the training split, then predict the held-out rows.
xgb_reg = xgb.XGBRegressor().fit(X_train, yS_train)
y_pred = xgb_reg.predict(X_test)

In [None]:
# score() on a regressor is the coefficient of determination (R^2), not an
# accuracy, so the output is labelled accordingly.
print("R^2 on test set: {:.3f}".format(xgb_reg.score(X_test, yS_test)))

In [None]:
# Error metrics of the XGBoost predictions (y_pred) on the held-out rows
mse = mean_squared_error(yS_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(yS_test, y_pred)

print("Root Mean Squared Error: {:.4f}".format(rmse))
print('Mean Absolute Error: ', mae)