In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style = 'whitegrid')
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LinearRegression
import scipy.stats as stats
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor
from scipy.stats import randint as sp_randint, uniform as sp_uniform
import lightgbm as lgb
from xgboost import XGBRFRegressor, XGBRegressor
from catboost import CatBoostRegressor
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.decomposition import PCA

In [2]:
pd.set_option('display.max_columns', None)

In [59]:
train = pd.read_csv('train_events.csv')
train.head()

Unnamed: 0,Year,Month,ProductCategory,Sales,Avg_Sales,Event,Federal Holiday
0,2009,1,WomenClothing,1755.0,1215.466667,0.0,3.0
1,2009,1,MenClothing,524.0,1215.466667,0.0,3.0
2,2009,1,OtherClothing,936.0,1215.466667,0.0,3.0
3,2009,2,WomenClothing,1729.0,1308.433333,1.0,1.0
4,2009,2,MenClothing,496.0,1308.433333,1.0,1.0


In [60]:
test = pd.read_csv('test_events.csv')
test.head()

Unnamed: 0,Year,Month,ProductCategory,Avg_Sales,Event,Federal Holiday
0,2014,1,WomenClothing,1215.466667,0.0,3.0
1,2014,1,MenClothing,1215.466667,0.0,3.0
2,2014,1,OtherClothing,1215.466667,0.0,3.0
3,2014,2,WomenClothing,1308.433333,1.0,1.0
4,2014,2,MenClothing,1308.433333,1.0,1.0


In [61]:
train_dum = pd.get_dummies(train, drop_first = True)
test_dum = pd.get_dummies(test, drop_first = True)
X_train = train_dum.drop('Sales', 1)
y = train_dum['Sales']
X_test = test_dum

In [62]:
X_train.head()

Unnamed: 0,Year,Month,Avg_Sales,Event,Federal Holiday,ProductCategory_OtherClothing,ProductCategory_WomenClothing
0,2009,1,1215.466667,0.0,3.0,0,1
1,2009,1,1215.466667,0.0,3.0,0,0
2,2009,1,1215.466667,0.0,3.0,1,0
3,2009,2,1308.433333,1.0,1.0,0,1
4,2009,2,1308.433333,1.0,1.0,0,0


#### RF

In [51]:
rfr = RandomForestRegressor(random_state = 0)
rfr.fit(X_train, y)
imp = list(zip(rfr.feature_importances_, X_train.columns))
sorted(imp)

[(7.860868018786031e-07, 'Thunderstorm'),
 (3.320796514791274e-05, 'Fog , Rain , Snow'),
 (7.644313438445564e-05, 'Snow'),
 (0.00016808532102975546, 'Fog'),
 (0.00027061177273521466, 'Year'),
 (0.0002845584107518643, 'Visibility_High'),
 (0.0003353436099591242, 'Rain , Snow'),
 (0.00037647740488739836, 'Fog , Snow'),
 (0.0004024559786027238, 'Precip'),
 (0.0004152710114675468, 'Fog , Rain'),
 (0.00045453907686247586, 'Mill_Use'),
 (0.0004812277287255418, 'Wind_High'),
 (0.0004886764373068641, 'Normal'),
 (0.0005208762688843206, 'Humidity_Avg'),
 (0.0005721252981574184, 'Upland_Planted'),
 (0.0005875229073685211, 'Dew_High'),
 (0.0006348928533320882, 'Visibility_Low'),
 (0.0006384989132439447, 'Temp_Avg'),
 (0.0006683246586416982, 'Wind_Low'),
 (0.0006963818058038136, 'Sea_Avg'),
 (0.0007086510965173477, 'Yield_Harvested'),
 (0.0007503995402160682, 'Sea_High'),
 (0.0007565222364006875, 'Federal_Holidays'),
 (0.0009200945291257584, 'Visibility_Avg'),
 (0.0009755559370322316, 'Exports'),


In [52]:
X_train = X_train[['ProductCategory_WomenClothing', 'Month', 'ProductCategory_OtherClothing', 'Events', 'Real_GDP',
                  'Cotton_Price', 'CPI']]
X_test = X_test[['ProductCategory_WomenClothing', 'Month', 'ProductCategory_OtherClothing', 'Events', 'Real_GDP',
                  'Cotton_Price', 'CPI']]

### Reducing Columns

In [53]:
combined = [train, test]

train.columns

for i in combined:
    i['Holidays'] = i['Events'] + i['Federal_Holidays']
    i['GDP'] = (i['Nominal_GDP'] + i['Real_GDP']) / 2
    i['Interest_Rate'] = (i['Rate_CreditCard'] + i['Rate_PersonalLoan']) / 2
    i['Bad_Weather'] = i['Fog'] + i['Fog , Rain'] + i['Fog , Rain , Snow'] + i['Fog , Snow'] + i['Rain'] + i['Rain , Snow'] + i['Snow'] + i['Thunderstorm']

train = train.drop(['Events', 'Federal_Holidays', 'Nominal_GDP', 'Real_GDP', 'Rate_CreditCard', 'Rate_PersonalLoan', 
                'Upland_Planted', 'Upland_Harvested', 'Temp_High', 'Temp_Low', 'Dew_High', 'Dew_Low', 
                'Humidity_High', 'Humidity_Low', 'Sea_High', 'Sea_Low', 'Visibility_High', 'Visibility_Low',
               'Wind_Low', 'Wind_High', 'Fog , Rain', 'Fog , Rain , Snow', 'Fog , Snow', 'Rain', 'Rain , Snow', 'Snow',
               'Thunderstorm', 'Fog', 'Change'], axis = 1)

test = test.drop(['Events', 'Federal_Holidays', 'Nominal_GDP', 'Real_GDP', 'Rate_CreditCard', 'Rate_PersonalLoan', 
                'Upland_Planted', 'Upland_Harvested', 'Temp_High', 'Temp_Low', 'Dew_High', 'Dew_Low', 
                'Humidity_High', 'Humidity_Low', 'Sea_High', 'Sea_Low', 'Visibility_High', 'Visibility_Low',
               'Wind_Low', 'Wind_High', 'Fog , Rain', 'Fog , Rain , Snow', 'Fog , Snow', 'Rain', 'Rain , Snow', 'Snow',
               'Thunderstorm', 'Fog', 'Change'], axis = 1)

In [54]:
train.head()

Unnamed: 0,Year,Month,ProductCategory,Sales,CPI,Unemp_Rate,Wages,Cotton_Price,Yield_Harvested,Production,Mill_Use,Exports,Temp_Avg,Dew_Avg,Humidity_Avg,Sea_Avg,Visibility_Avg,Wind_Avg,Precip,Normal,Holidays,GDP,Interest_Rate,Bad_Weather
0,2009,1,WomenClothing,1755.0,233.402,7.8,22.05,0.577,799,12.589,4.17,11.55,-2.096774,-9.903226,57.354839,1015.967742,14.0,12.0,2.445806,21.0,3.0,14414.403119,11.735,10.0
1,2009,1,MenClothing,524.0,233.402,7.8,22.05,0.577,799,12.589,4.17,11.55,-2.096774,-9.903226,57.354839,1015.967742,14.0,12.0,2.445806,21.0,3.0,14414.403119,11.735,10.0
2,2009,1,OtherClothing,936.0,233.402,7.8,22.05,0.577,799,12.589,4.17,11.55,-2.096774,-9.903226,57.354839,1015.967742,14.0,12.0,2.445806,21.0,3.0,14414.403119,11.735,10.0
3,2009,2,WomenClothing,1729.0,234.663,8.3,22.22,0.5521,799,12.589,3.87,11.1,2.785714,-6.392857,54.0,1017.071429,15.178571,12.214286,0.7825,22.0,2.0,14377.688519,12.01,6.0
4,2009,2,MenClothing,496.0,234.663,8.3,22.22,0.5521,799,12.589,3.87,11.1,2.785714,-6.392857,54.0,1017.071429,15.178571,12.214286,0.7825,22.0,2.0,14377.688519,12.01,6.0


In [55]:
test.head()

Unnamed: 0,Year,Month,ProductCategory,CPI,Unemp_Rate,Wages,Cotton_Price,Yield_Harvested,Production,Mill_Use,Exports,Temp_Avg,Dew_Avg,Humidity_Avg,Sea_Avg,Visibility_Avg,Wind_Avg,Precip,Normal,Holidays,GDP,Interest_Rate,Bad_Weather
0,2014,1,WomenClothing,259.596,6.6,24.35,0.9096,807,12.551,3.58,9.75,-1.766667,-9.4,59.433333,1019.2,13.866667,10.1,2.000333,20.0,3.0,16330.790646,11.035,10.0
1,2014,1,MenClothing,259.596,6.6,24.35,0.9096,807,12.551,3.58,9.75,-1.766667,-9.4,59.433333,1019.2,13.866667,10.1,2.000333,20.0,3.0,16330.790646,11.035,10.0
2,2014,1,OtherClothing,259.596,6.6,24.35,0.9096,807,12.551,3.58,9.75,-1.766667,-9.4,59.433333,1019.2,13.866667,10.1,2.000333,20.0,3.0,16330.790646,11.035,10.0
3,2014,2,WomenClothing,259.019,6.7,24.58,0.9405,807,12.551,3.58,9.75,0.285714,-7.821429,58.714286,1016.964286,12.357143,10.25,4.972857,17.0,2.0,16395.592634,10.96,11.0
4,2014,2,MenClothing,259.019,6.7,24.58,0.9405,807,12.551,3.58,9.75,0.285714,-7.821429,58.714286,1016.964286,12.357143,10.25,4.972857,17.0,2.0,16395.592634,10.96,11.0


In [56]:
train_dum = pd.get_dummies(train, drop_first = True)
test_dum = pd.get_dummies(test, drop_first = True)
X_train = train_dum.drop('Sales', 1)
y = train_dum['Sales']
X_test = test

In [57]:
X_train.head()

Unnamed: 0,Year,Month,CPI,Unemp_Rate,Wages,Cotton_Price,Yield_Harvested,Production,Mill_Use,Exports,Temp_Avg,Dew_Avg,Humidity_Avg,Sea_Avg,Visibility_Avg,Wind_Avg,Precip,Normal,Holidays,GDP,Interest_Rate,Bad_Weather,ProductCategory_OtherClothing,ProductCategory_WomenClothing
0,2009,1,233.402,7.8,22.05,0.577,799,12.589,4.17,11.55,-2.096774,-9.903226,57.354839,1015.967742,14.0,12.0,2.445806,21.0,3.0,14414.403119,11.735,10.0,0,1
1,2009,1,233.402,7.8,22.05,0.577,799,12.589,4.17,11.55,-2.096774,-9.903226,57.354839,1015.967742,14.0,12.0,2.445806,21.0,3.0,14414.403119,11.735,10.0,0,0
2,2009,1,233.402,7.8,22.05,0.577,799,12.589,4.17,11.55,-2.096774,-9.903226,57.354839,1015.967742,14.0,12.0,2.445806,21.0,3.0,14414.403119,11.735,10.0,1,0
3,2009,2,234.663,8.3,22.22,0.5521,799,12.589,3.87,11.1,2.785714,-6.392857,54.0,1017.071429,15.178571,12.214286,0.7825,22.0,2.0,14377.688519,12.01,6.0,0,1
4,2009,2,234.663,8.3,22.22,0.5521,799,12.589,3.87,11.1,2.785714,-6.392857,54.0,1017.071429,15.178571,12.214286,0.7825,22.0,2.0,14377.688519,12.01,6.0,0,0


#### RF

In [58]:
rfr = RandomForestRegressor(random_state = 0)
rfr.fit(X_train, y)
imp = list(zip(rfr.feature_importances_, X_train.columns))
sorted(imp)

[(0.0005742170282335855, 'Bad_Weather'),
 (0.0006940089847722433, 'Normal'),
 (0.0007795165895839045, 'Year'),
 (0.0010137328727602805, 'Precip'),
 (0.0011736462993858117, 'Mill_Use'),
 (0.0013396501250802783, 'Visibility_Avg'),
 (0.0014424146993200178, 'Exports'),
 (0.0016282693806000399, 'Yield_Harvested'),
 (0.0016787635920028413, 'Wind_Avg'),
 (0.0020458911191043065, 'Interest_Rate'),
 (0.002098326477740676, 'Sea_Avg'),
 (0.0021206117861465486, 'Production'),
 (0.0022091793575647286, 'Humidity_Avg'),
 (0.004232203726486151, 'Temp_Avg'),
 (0.004296066145336905, 'Dew_Avg'),
 (0.004757717486275387, 'Unemp_Rate'),
 (0.006853105676826772, 'Wages'),
 (0.00962266595153729, 'Cotton_Price'),
 (0.01062884676414824, 'CPI'),
 (0.01695446492774541, 'GDP'),
 (0.020055213957737773, 'Holidays'),
 (0.02137335533088042, 'ProductCategory_OtherClothing'),
 (0.031703044801386526, 'Month'),
 (0.8507250869193438, 'ProductCategory_WomenClothing')]

### Modeling

#### GBR

In [117]:
gbr = GradientBoostingRegressor(random_state = 0)
gbr.fit(X_train, y)
gbr.score(X_train, y)

0.9987631267566738

##### Tuning GBR

In [108]:
gbr = GradientBoostingRegressor(random_state = 0)

params = {'n_estimators' : sp_randint(50, 200),
          'max_features' : sp_randint(1, 5),
          'max_depth' : sp_randint(2, 20),
          'min_samples_leaf' : sp_randint(1, 10),
          'min_samples_split' : sp_randint(2, 10),
          'learning_rate' : sp_uniform(0.01, 0.5)}

rsearch_gbr = RandomizedSearchCV(gbr, param_distributions = params, n_jobs = -1,
                                 cv = 5, n_iter = 100, random_state = 0)

rsearch_gbr.fit(X_train, y)
print(rsearch_gbr.best_params_)

{'learning_rate': 0.2844067519636624, 'max_depth': 2, 'max_features': 4, 'min_samples_leaf': 4, 'min_samples_split': 9, 'n_estimators': 59}


###### Tuned GBR

In [109]:
gbr = GradientBoostingRegressor(**rsearch_gbr.best_params_, random_state = 0)
gbr.fit(X_train, y)
gbr.score(X_train, y)

0.9783609661980899

#### LGBMR

In [10]:
lgbmr = lgb.LGBMRegressor(random_state = 0)
lgbmr.fit(X_train, y)
lgbmr.score(X_train, y)

0.9830653549383642

##### Tuning LGBMR

In [13]:
lgbmr = lgb.LGBMRegressor(random_state = 0)

params = {'n_estimators' : sp_randint(50, 200),
         'max_depth' : sp_randint(1, 25),
         'learning_rate' : sp_uniform(0.1, 1)}

rsearch_lgbm = RandomizedSearchCV(lgbmr, param_distributions = params,
                                 cv = 5, n_iter = 100, n_jobs = -1, random_state = 0)

rsearch_lgbm.fit(X_train, y)
print(rsearch_lgbm.best_params_)

{'learning_rate': 0.533288061982044, 'max_depth': 9, 'n_estimators': 69}


###### Tuned LGBMR

In [14]:
lgbmr = lgb.LGBMRegressor(**rsearch_lgbm.best_params_, random_state = 0)
lgbmr.fit(X_train, y)
lgbmr.score(X_train, y)

0.996808946051753

In [19]:
imp = list(zip(lgbmr.feature_importances_, X_train.columns))
sorted(imp)

[(21, 'ProductCategory_OtherClothing'),
 (42, 'Events'),
 (52, 'CPI'),
 (59, 'ProductCategory_WomenClothing'),
 (59, 'Real_GDP'),
 (88, 'Cotton_Price'),
 (97, 'Month')]

#### Submission

In [15]:
pred = lgbmr.predict(test)

In [16]:
kaggle = pd.read_csv('Kaggle_Submission_Format.csv')

In [17]:
kaggle['Sales(In ThousandDollars)'] = pred

In [18]:
kaggle.to_csv('Iteration_8_lgbmr_feature_selection.csv', index = False)
kaggle.head()

Unnamed: 0,Year,Sales(In ThousandDollars)
0,1,2736.024818
1,2,563.621608
2,3,1085.865911
3,4,3067.681284
4,5,619.737687
