In [351]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style = 'whitegrid')
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LinearRegression
import scipy.stats as stats
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor
from scipy.stats import randint as sp_randint, uniform as sp_uniform
import lightgbm as lgb

In [352]:
# Load the training data and preview the first rows.
train = pd.read_csv('train.csv')
train.head()

Unnamed: 0,Year,Month,ProductCategory,Sales(In ThousandDollars)
0,2009,1,WomenClothing,1755.0
1,2009,1,MenClothing,524.0
2,2009,1,OtherClothing,936.0
3,2009,2,WomenClothing,1729.0
4,2009,2,MenClothing,496.0


In [353]:
# Load the submission template; it is used as the test set from here on.
test = pd.read_csv('submission.csv')
test.head()

Unnamed: 0,Year,Month,ProductCategory,Unnamed: 3,Sales(In ThousandDollars)
0,2014,1,WomenClothing,,
1,2014,1,MenClothing,,
2,2014,1,OtherClothing,,
3,2014,2,WomenClothing,,
4,2014,2,MenClothing,,


In [354]:
# Remove the two template columns that are empty in the preview, leaving only
# the Year / Month / ProductCategory features.
test = test.drop(columns=['Unnamed: 3', 'Sales(In ThousandDollars)'])
test.head()

Unnamed: 0,Year,Month,ProductCategory
0,2014,1,WomenClothing
1,2014,1,MenClothing
2,2014,1,OtherClothing
3,2014,2,WomenClothing
4,2014,2,MenClothing


In [355]:
# Count missing values per column.
train.isna().sum()

Year                          0
Month                         0
ProductCategory               0
Sales(In ThousandDollars)    10
dtype: int64

In [356]:
# Preview the last rows of the training data.
train.tail()

Unnamed: 0,Year,Month,ProductCategory,Sales(In ThousandDollars)
175,2013,11,MenClothing,798.0
176,2013,11,OtherClothing,1209.0
177,2013,12,WomenClothing,4865.0
178,2013,12,MenClothing,1085.0
179,2013,12,OtherClothing,1566.0


In [357]:
# (rows, columns) of the training data.
train.shape

(180, 4)

In [358]:
# Column dtypes and non-null counts.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 4 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Year                       180 non-null    int64  
 1   Month                      180 non-null    int64  
 2   ProductCategory            180 non-null    object 
 3   Sales(In ThousandDollars)  170 non-null    float64
dtypes: float64(1), int64(2), object(1)
memory usage: 5.8+ KB


In [359]:
# Summary statistics for the numeric columns.
train.describe()

Unnamed: 0,Year,Month,Sales(In ThousandDollars)
count,180.0,180.0,170.0
mean,2011.0,6.5,1616.729412
std,1.418158,3.461682,1158.224405
min,2009.0,1.0,471.0
25%,2010.0,3.75,704.0
50%,2011.0,6.5,1041.0
75%,2012.0,9.25,2609.75
max,2013.0,12.0,4865.0


In [360]:
# Distinct product categories present in the training data.
train['ProductCategory'].unique()

array(['WomenClothing', 'MenClothing', 'OtherClothing'], dtype=object)

In [361]:
# Shorten the target column name to 'Sales'.
# rename() keeps the column in its original position and, unlike copying the
# column and dropping the old one, does not raise if this cell is run twice.
train = train.rename(columns={'Sales(In ThousandDollars)': 'Sales'})
train.head()

Unnamed: 0,Year,Month,ProductCategory,Sales
0,2009,1,WomenClothing,1755.0
1,2009,1,MenClothing,524.0
2,2009,1,OtherClothing,936.0
3,2009,2,WomenClothing,1729.0
4,2009,2,MenClothing,496.0


In [362]:
# Fill each missing Sales value with the median of its (Month, ProductCategory)
# group. transform() returns a Series carrying train's own index, so the
# assignment lines up row for row; groupby.apply may prepend the group keys to
# the index, which breaks that alignment.
train['Sales'] = (
    train.groupby(['Month', 'ProductCategory'])['Sales']
         .transform(lambda x: x.fillna(x.median()))
)
train.isna().sum()

Year               0
Month              0
ProductCategory    0
Sales              0
dtype: int64

In [363]:
# Mean sales per calendar month, pooled over every row of the training data.
# NOTE(review): this feature is built from the target on the full training set,
# so the cross-validation folds used later have all seen it — confirm that is intended.
df_avgsales = (
    train.groupby('Month')['Sales']
         .mean()
         .rename('Avg_Sales')
         .reset_index()
)
df_avgsales

Unnamed: 0,Month,Avg_Sales
0,1,1215.466667
1,2,1308.433333
2,3,1626.133333
3,4,1730.233333
4,5,1736.466667
5,6,1557.266667
6,7,1464.733333
7,8,1540.533333
8,9,1541.433333
9,10,1596.366667


In [364]:
# Attach the per-month average to both frames; the left join keeps every
# original row of train and test.
train, test = (
    frame.merge(df_avgsales, how='left', on='Month')
    for frame in (train, test)
)
train.head()

Unnamed: 0,Year,Month,ProductCategory,Sales,Avg_Sales
0,2009,1,WomenClothing,1755.0,1215.466667
1,2009,1,MenClothing,524.0,1215.466667
2,2009,1,OtherClothing,936.0,1215.466667
3,2009,2,WomenClothing,1729.0,1308.433333
4,2009,2,MenClothing,496.0,1308.433333


In [365]:
# Stack train on top of test with a fresh 0..n-1 index so lag features can be
# computed across the train/test boundary. Test rows have no Sales value.
df = pd.concat([train, test], ignore_index=True)
df

Unnamed: 0,Year,Month,ProductCategory,Sales,Avg_Sales
0,2009,1,WomenClothing,1755.0,1215.466667
1,2009,1,MenClothing,524.0,1215.466667
2,2009,1,OtherClothing,936.0,1215.466667
3,2009,2,WomenClothing,1729.0,1308.433333
4,2009,2,MenClothing,496.0,1308.433333
...,...,...,...,...,...
211,2014,11,MenClothing,,1720.733333
212,2014,11,OtherClothing,,1720.733333
213,2014,12,WomenClothing,,2317.600000
214,2014,12,MenClothing,,2317.600000


In [366]:
# Previous month's sales for the same product category.
# The shift is done per category on chronologically ordered rows instead of a
# positional shift(3), which is only right while every month holds exactly
# three rows in a fixed category order. The result is assigned by index, so
# df's own row order is left unchanged.
df['Sales_Lag1'] = (
    df.sort_values(['Year', 'Month'])
      .groupby('ProductCategory')['Sales']
      .shift(1)
)
#df['Sales_Lag2'] = df.sort_values(['Year', 'Month']).groupby('ProductCategory')['Sales'].shift(2)
#df['Sales_Lag3'] = df.sort_values(['Year', 'Month']).groupby('ProductCategory')['Sales'].shift(3)
df.head(12)

Unnamed: 0,Year,Month,ProductCategory,Sales,Avg_Sales,Sales_Lag1
0,2009,1,WomenClothing,1755.0,1215.466667,
1,2009,1,MenClothing,524.0,1215.466667,
2,2009,1,OtherClothing,936.0,1215.466667,
3,2009,2,WomenClothing,1729.0,1308.433333,1755.0
4,2009,2,MenClothing,496.0,1308.433333,524.0
5,2009,2,OtherClothing,859.0,1308.433333,936.0
6,2009,3,WomenClothing,2256.0,1626.133333,1729.0
7,2009,3,MenClothing,542.0,1626.133333,496.0
8,2009,3,OtherClothing,921.0,1626.133333,859.0
9,2009,4,WomenClothing,2662.0,1730.233333,2256.0


In [367]:
# Inspect the last 36 rows, i.e. the appended test rows (index 180 onwards).
df.tail(36)

Unnamed: 0,Year,Month,ProductCategory,Sales,Avg_Sales,Sales_Lag1
180,2014,1,WomenClothing,,1215.466667,4865.0
181,2014,1,MenClothing,,1215.466667,1085.0
182,2014,1,OtherClothing,,1215.466667,1566.0
183,2014,2,WomenClothing,,1308.433333,
184,2014,2,MenClothing,,1308.433333,
185,2014,2,OtherClothing,,1308.433333,
186,2014,3,WomenClothing,,1626.133333,
187,2014,3,MenClothing,,1626.133333,
188,2014,3,OtherClothing,,1626.133333,
189,2014,4,WomenClothing,,1730.233333,


In [368]:
# Rows with no known lag (the first month of the series and every test month
# after the first) receive the mean lag of their (Month, ProductCategory) group.
# transform() keeps df's index, so the assignment aligns row for row;
# groupby.apply may return a group-keyed index that does not.
df['Sales_Lag1'] = (
    df.groupby(['Month', 'ProductCategory'])['Sales_Lag1']
      .transform(lambda x: x.fillna(x.mean()))
)
#df['Sales_Lag2'] = df.groupby(['Month', 'ProductCategory'])['Sales_Lag2'].transform(lambda x: x.fillna(x.mean()))
#df['Sales_Lag3'] = df.groupby(['Month', 'ProductCategory'])['Sales_Lag3'].transform(lambda x: x.fillna(x.mean()))

In [369]:
# Recover the test rows from the combined frame. The row count comes from the
# existing test frame instead of a hard-coded 36, and .copy() detaches the
# result from df so later edits cannot touch (or warn about) the parent frame.
test = df.tail(len(test)).copy()

In [370]:
# Test rows have no target; drop the all-NaN Sales column.
# The axis is given by keyword (columns=) because drop() no longer accepts it
# positionally in current pandas.
test = test.drop(columns='Sales')
test

Unnamed: 0,Year,Month,ProductCategory,Avg_Sales,Sales_Lag1
180,2014,1,WomenClothing,1215.466667,4865.0
181,2014,1,MenClothing,1215.466667,1085.0
182,2014,1,OtherClothing,1215.466667,1566.0
183,2014,2,WomenClothing,1308.433333,2139.4
184,2014,2,MenClothing,1308.433333,522.2
185,2014,2,OtherClothing,1308.433333,984.8
186,2014,3,WomenClothing,1626.133333,2486.8
187,2014,3,MenClothing,1626.133333,513.8
188,2014,3,OtherClothing,1626.133333,924.7
189,2014,4,WomenClothing,1730.233333,3175.4


In [371]:
# Total number of missing values left in the test frame.
test.isna().sum().sum()

0

In [372]:
# Recover the training rows (now carrying Sales_Lag1) from the combined frame.
# The row count comes from the existing train frame instead of a hard-coded
# 180, and .copy() detaches the result from df.
train = df.iloc[:len(train)].copy()
train

Unnamed: 0,Year,Month,ProductCategory,Sales,Avg_Sales,Sales_Lag1
0,2009,1,WomenClothing,1755.0,1215.466667,4442.8
1,2009,1,MenClothing,524.0,1215.466667,1032.8
2,2009,1,OtherClothing,936.0,1215.466667,1477.2
3,2009,2,WomenClothing,1729.0,1308.433333,1755.0
4,2009,2,MenClothing,496.0,1308.433333,524.0
...,...,...,...,...,...,...
175,2013,11,MenClothing,798.0,1720.733333,785.0
176,2013,11,OtherClothing,1209.0,1720.733333,975.0
177,2013,12,WomenClothing,4865.0,2317.600000,3834.0
178,2013,12,MenClothing,1085.0,2317.600000,798.0


In [373]:
# Total number of missing values left in the training frame.
train.isna().sum().sum()

0

In [374]:
# Load the holiday / event calendar.
events = pd.read_excel('Events_HolidaysData.xlsx')
events

Unnamed: 0,Year,MonthDate,Event,DayCategory
0,2009,2001-01-01,New Year's Day,Federal Holiday
1,2009,2019-01-01,Martin Luther King Jr. Day,Federal Holiday
2,2009,2014-02-01,Valentine's Day,Event
3,2009,2016-02-01,Presidents' Day,Federal Holiday
4,2009,2012-04-01,Easter Sunday,Event
...,...,...,...,...
145,2016,2024-11-01,Thanksgiving Day,Federal Holiday
146,2016,2024-12-01,Christmas Eve,Event
147,2016,2025-12-01,Christmas Day,Federal Holiday
148,2016,2026-12-01,'Christmas Day' observed,Federal Holiday


In [375]:
# Which calendar months contain at least one event (sorted ascending).
sorted(events['MonthDate'].dt.month.unique())

[1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12]

In [376]:
# Column dtypes and non-null counts for the events table.
events.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   Year         150 non-null    int64         
 1   MonthDate    150 non-null    datetime64[ns]
 2   Event        150 non-null    object        
 3   DayCategory  150 non-null    object        
dtypes: datetime64[ns](1), int64(1), object(2)
memory usage: 4.8+ KB


In [377]:
# (rows, columns) of the events table.
events.shape

(150, 4)

In [378]:
# Distinct day categories in the events table.
events['DayCategory'].unique()

array(['Federal Holiday', 'Event'], dtype=object)

In [379]:
# Distinct event names in the events table.
events['Event'].unique()

array(["New Year's Day", 'Martin Luther King Jr. Day', "Valentine's Day",
       "Presidents' Day", 'Easter Sunday', "Mother's Day", 'Memorial Day',
       "Father's Day", "'Independence Day' observed", 'Independence Day',
       'Labor Day', 'Columbus Day (Most regions)', 'Halloween',
       'Veterans Day', 'Thanksgiving Day', 'Christmas Eve',
       'Christmas Day', "New Year's Eve", "'Christmas Day' observed",
       "'New Year's Day' observed", 'Election Day',
       "Thomas Jefferson's Birthday", 'Day After Christmas Day'],
      dtype=object)

In [380]:
# Split the parsed MonthDate timestamp into month and day-of-month columns.
# NOTE(review): in the displayed data every Day comes out as 1, and the actual
# day-of-month appears to sit in the year part of MonthDate (e.g. 2019-01-01 for
# Martin Luther King Jr. Day, 2014-02-01 for Valentine's Day) — confirm how the
# spreadsheet dates are being parsed. Day is only used as a count column below,
# but the de-duplication step depends on its (constant) value.
events['Month'] = events['MonthDate'].apply(lambda x: x.month)
events['Day'] = events['MonthDate'].apply(lambda x: x.day)

In [381]:
# Year and the raw timestamp are no longer needed once Month/Day are extracted.
events = events.drop(columns=['Year', 'MonthDate'])

In [382]:
# Preview the first 15 rows of the reduced events table.
events.head(15)

Unnamed: 0,Event,DayCategory,Month,Day
0,New Year's Day,Federal Holiday,1,1
1,Martin Luther King Jr. Day,Federal Holiday,1,1
2,Valentine's Day,Event,2,1
3,Presidents' Day,Federal Holiday,2,1
4,Easter Sunday,Event,4,1
5,Mother's Day,Event,5,1
6,Memorial Day,Federal Holiday,5,1
7,Father's Day,Event,6,1
8,'Independence Day' observed,Federal Holiday,7,1
9,Independence Day,Federal Holiday,7,1


In [383]:
# Collapse identical (Event, DayCategory, Month, Day) rows — the same event
# repeated across years — and renumber the index; then view by month.
events = events.drop_duplicates(ignore_index=True)
events.sort_values(by='Month')

Unnamed: 0,Event,DayCategory,Month,Day
0,New Year's Day,Federal Holiday,1,1
1,Martin Luther King Jr. Day,Federal Holiday,1,1
20,'New Year's Day' observed,Federal Holiday,1,1
2,Valentine's Day,Event,2,1
3,Presidents' Day,Federal Holiday,2,1
22,Easter Sunday,Event,3,1
4,Easter Sunday,Event,4,1
23,Thomas Jefferson's Birthday,Event,4,1
5,Mother's Day,Event,5,1
6,Memorial Day,Federal Holiday,5,1


In [384]:
# Number of distinct events per (Month, DayCategory); the count is stored in
# the 'Day' column.
df_events = (
    events.groupby(['Month', 'DayCategory'], as_index=False)['Day']
          .count()
)
df_events

Unnamed: 0,Month,DayCategory,Day
0,1,Federal Holiday,3
1,2,Event,1
2,2,Federal Holiday,1
3,3,Event,1
4,4,Event,2
5,5,Event,1
6,5,Federal Holiday,1
7,6,Event,1
8,7,Federal Holiday,2
9,9,Federal Holiday,1


In [385]:
# Reshape to one row per month with one count column per day category.
# Month/category combinations with no events come out as NaN here.
df_events = (
    df_events
    .pivot_table(values='Day', index='Month', columns='DayCategory')
    .reset_index()
)
df_events

DayCategory,Month,Event,Federal Holiday
0,1,,3.0
1,2,1.0,1.0
2,3,1.0,
3,4,2.0,
4,5,1.0,1.0
5,6,1.0,
6,7,,2.0
7,9,,1.0
8,10,1.0,1.0
9,11,1.0,2.0


In [386]:
# Preview the training frame before the event counts are joined on.
train.head()

Unnamed: 0,Year,Month,ProductCategory,Sales,Avg_Sales,Sales_Lag1
0,2009,1,WomenClothing,1755.0,1215.466667,4442.8
1,2009,1,MenClothing,524.0,1215.466667,1032.8
2,2009,1,OtherClothing,936.0,1215.466667,1477.2
3,2009,2,WomenClothing,1729.0,1308.433333,1755.0
4,2009,2,MenClothing,496.0,1308.433333,524.0


In [387]:
# Join the per-month event counts onto the training rows. The left join keeps
# every training row; months missing from df_events get NaN counts.
train = train.merge(df_events, on= 'Month', how = 'left')
train.head()

Unnamed: 0,Year,Month,ProductCategory,Sales,Avg_Sales,Sales_Lag1,Event,Federal Holiday
0,2009,1,WomenClothing,1755.0,1215.466667,4442.8,,3.0
1,2009,1,MenClothing,524.0,1215.466667,1032.8,,3.0
2,2009,1,OtherClothing,936.0,1215.466667,1477.2,,3.0
3,2009,2,WomenClothing,1729.0,1308.433333,1755.0,1.0,1.0
4,2009,2,MenClothing,496.0,1308.433333,524.0,1.0,1.0


In [388]:
# Join the same per-month event counts onto the test rows (left join).
test = test.merge(df_events, on= 'Month', how = 'left')
test.head()

Unnamed: 0,Year,Month,ProductCategory,Avg_Sales,Sales_Lag1,Event,Federal Holiday
0,2014,1,WomenClothing,1215.466667,4865.0,,3.0
1,2014,1,MenClothing,1215.466667,1085.0,,3.0
2,2014,1,OtherClothing,1215.466667,1566.0,,3.0
3,2014,2,WomenClothing,1308.433333,2139.4,1.0,1.0
4,2014,2,MenClothing,1308.433333,522.2,1.0,1.0


In [389]:
# Replace every remaining NaN with 0 in both frames; the NaNs visible at this
# point are event counts for months with no event of that category.
train, test = train.fillna(0), test.fillna(0)

In [390]:
# Confirm that neither frame has missing values left.
train.isna().sum().sum(), test.isna().sum().sum()

(0, 0)

In [391]:
# Final column list and dtypes of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 180 entries, 0 to 179
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Year             180 non-null    int64  
 1   Month            180 non-null    int64  
 2   ProductCategory  180 non-null    object 
 3   Sales            180 non-null    float64
 4   Avg_Sales        180 non-null    float64
 5   Sales_Lag1       180 non-null    float64
 6   Event            180 non-null    float64
 7   Federal Holiday  180 non-null    float64
dtypes: float64(5), int64(2), object(1)
memory usage: 12.7+ KB


In [392]:
# Persist the engineered train/test frames, without the index column.
train.to_csv('train_events.csv', index = False)
test.to_csv('test_events.csv', index = False)

In [393]:
# Feature matrix = every column except the target.
# The axis is given by keyword (columns=) because drop() no longer accepts it
# positionally in current pandas.
X_train = train.drop(columns='Sales')
X_train.head()

Unnamed: 0,Year,Month,ProductCategory,Avg_Sales,Sales_Lag1,Event,Federal Holiday
0,2009,1,WomenClothing,1215.466667,4442.8,0.0,3.0
1,2009,1,MenClothing,1215.466667,1032.8,0.0,3.0
2,2009,1,OtherClothing,1215.466667,1477.2,0.0,3.0
3,2009,2,WomenClothing,1308.433333,1755.0,1.0,1.0
4,2009,2,MenClothing,1308.433333,524.0,1.0,1.0


In [394]:
# Target vector.
y_train = train['Sales']

In [395]:
# Check that the feature matrix and the target have the same number of rows.
X_train.shape, y_train.shape

((180, 7), (180,))

In [396]:
# The test frame already holds only feature columns.
# Note: this binds a second name to the same object; it is not a copy.
X_test = test
X_test.head()

Unnamed: 0,Year,Month,ProductCategory,Avg_Sales,Sales_Lag1,Event,Federal Holiday
0,2014,1,WomenClothing,1215.466667,4865.0,0.0,3.0
1,2014,1,MenClothing,1215.466667,1085.0,0.0,3.0
2,2014,1,OtherClothing,1215.466667,1566.0,0.0,3.0
3,2014,2,WomenClothing,1308.433333,2139.4,1.0,1.0
4,2014,2,MenClothing,1308.433333,522.2,1.0,1.0


In [397]:
# One-hot encode ProductCategory (the first level is dropped as the baseline).
X_train = pd.get_dummies(X_train, drop_first = True)
# Encode the test frame the same way, then force it onto the training column
# layout: a category missing from the test data gets an all-zero column, any
# column the models were not trained on is dropped, and the column order
# matches X_train. Encoding the two frames independently gives no such guarantee.
X_test = (
    pd.get_dummies(X_test, drop_first = True)
      .reindex(columns = X_train.columns, fill_value = 0)
)
display(X_train.head())
display(X_test.head())

Unnamed: 0,Year,Month,Avg_Sales,Sales_Lag1,Event,Federal Holiday,ProductCategory_OtherClothing,ProductCategory_WomenClothing
0,2009,1,1215.466667,4442.8,0.0,3.0,0,1
1,2009,1,1215.466667,1032.8,0.0,3.0,0,0
2,2009,1,1215.466667,1477.2,0.0,3.0,1,0
3,2009,2,1308.433333,1755.0,1.0,1.0,0,1
4,2009,2,1308.433333,524.0,1.0,1.0,0,0


Unnamed: 0,Year,Month,Avg_Sales,Sales_Lag1,Event,Federal Holiday,ProductCategory_OtherClothing,ProductCategory_WomenClothing
0,2014,1,1215.466667,4865.0,0.0,3.0,0,1
1,2014,1,1215.466667,1085.0,0.0,3.0,0,0
2,2014,1,1215.466667,1566.0,0.0,3.0,1,0
3,2014,2,1308.433333,2139.4,1.0,1.0,0,1
4,2014,2,1308.433333,522.2,1.0,1.0,0,0


In [349]:
#X_train['Year'] = X_train['Year'] - 2014
#X_test['Year'] = X_test['Year'] - 2014

In [350]:
#display(X_train.head())
#display(X_test.head())

Unnamed: 0,Year,Month,Avg_Sales,Sales_Lag1,Event,Federal Holiday,ProductCategory_OtherClothing,ProductCategory_WomenClothing
0,-5,1,1215.466667,4442.8,0.0,3.0,0,1
1,-5,1,1215.466667,1032.8,0.0,3.0,0,0
2,-5,1,1215.466667,1477.2,0.0,3.0,1,0
3,-5,2,1308.433333,1755.0,1.0,1.0,0,1
4,-5,2,1308.433333,524.0,1.0,1.0,0,0


Unnamed: 0,Year,Month,Avg_Sales,Sales_Lag1,Event,Federal Holiday,ProductCategory_OtherClothing,ProductCategory_WomenClothing
0,0,1,1215.466667,4865.0,0.0,3.0,0,1
1,0,1,1215.466667,1085.0,0.0,3.0,0,0
2,0,1,1215.466667,1566.0,0.0,3.0,1,0
3,0,2,1308.433333,2139.4,1.0,1.0,0,1
4,0,2,1308.433333,522.2,1.0,1.0,0,0


### RF

In [398]:
# Baseline random forest with default hyper-parameters.
# The value shown is R^2 on the training data itself, not a held-out estimate.
rf = RandomForestRegressor(random_state=0).fit(X_train, y_train)
rf.score(X_train, y_train)

0.9976782682094658

#### Tuning RF

In [399]:
# Randomised hyper-parameter search for the random forest: 100 sampled
# configurations, each scored with 5-fold cross-validation.
# NOTE(review): scipy's randint excludes its upper bound, so max_features is
# drawn from 1..7 while X_train has 8 columns — confirm that is intended.
# NOTE(review): oob_score looks like a diagnostic switch rather than a tuning
# knob, and cv=5 presumably uses unshuffled folds on time-ordered rows — verify.
rf = RandomForestRegressor(random_state=0)
params = dict(
    n_estimators=sp_randint(50, 200),
    max_features=sp_randint(1, 8),
    max_depth=sp_randint(2, 20),
    min_samples_leaf=sp_randint(1, 10),
    min_samples_split=sp_randint(2, 10),
    oob_score=[False, True],
)
rsearch = RandomizedSearchCV(
    rf,
    param_distributions=params,
    n_iter=100,
    cv=5,
    n_jobs=-1,
    random_state=0,
)
rsearch.fit(X_train, y_train)
rsearch.best_params_

{'max_depth': 10,
 'max_features': 4,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 88,
 'oob_score': True}

#### Tuned RF

In [400]:
# Refit the random forest on all training rows with the best sampled settings.
# The value shown is the training-set R^2.
rf = RandomForestRegressor(random_state=0, **rsearch.best_params_).fit(X_train, y_train)
rf.score(X_train, y_train)

0.9978279221100703

In [401]:
# (importance, feature) pairs for the tuned forest, least important first.
sorted(zip(rf.feature_importances_, X_train.columns))

[(0.0046303961639799305, 'Federal Holiday'),
 (0.01256891050355818, 'Event'),
 (0.02389839529671127, 'Year'),
 (0.02446121097172031, 'ProductCategory_OtherClothing'),
 (0.026077453167536097, 'Month'),
 (0.041119694991882894, 'Avg_Sales'),
 (0.4061393909769849, 'ProductCategory_WomenClothing'),
 (0.4611045479276265, 'Sales_Lag1')]

### GBM

In [402]:
# Baseline gradient boosting model with default hyper-parameters.
# The value shown is the training-set R^2.
gbm = GradientBoostingRegressor(random_state=0).fit(X_train, y_train)
gbm.score(X_train, y_train)

0.9985352891865837

#### Tuning GBM

In [403]:
# Randomised hyper-parameter search for gradient boosting: 100 sampled
# configurations, each scored with 5-fold cross-validation.
# NOTE(review): randint's upper bound is exclusive, so max_features never
# reaches the full 8 columns — confirm that is intended.
gbm = GradientBoostingRegressor(random_state=0)

params = dict(
    n_estimators=sp_randint(50, 200),
    max_features=sp_randint(1, 8),
    max_depth=sp_randint(2, 20),
    min_samples_leaf=sp_randint(1, 10),
    min_samples_split=sp_randint(2, 10),
    learning_rate=sp_uniform(0, 1),
)

rsearch_gbm = RandomizedSearchCV(
    gbm,
    param_distributions=params,
    n_iter=100,
    cv=5,
    n_jobs=-1,
    random_state=0,
)

rsearch_gbm.fit(X_train, y_train)
print(rsearch_gbm.best_params_)

{'learning_rate': 0.48340861659875267, 'max_depth': 6, 'max_features': 2, 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 64}


#### Tuned GBM

In [404]:
# Refit gradient boosting on all training rows with the best sampled settings.
# The value shown is the training-set R^2.
gbm = GradientBoostingRegressor(random_state=0, **rsearch_gbm.best_params_).fit(X_train, y_train)
gbm.score(X_train, y_train)

0.9999999667985698

### LightGBM

In [405]:
# Baseline LightGBM regressor with default hyper-parameters.
# The value shown is the training-set R^2.
lgbr = lgb.LGBMRegressor(random_state=0).fit(X_train, y_train)
lgbr.score(X_train, y_train)

0.9892158060412123

#### Tuning LightGBM

In [406]:
# Randomised hyper-parameter search for LightGBM: 100 sampled configurations,
# each scored with 5-fold cross-validation.
lgbr = lgb.LGBMRegressor(random_state=0)
params = dict(
    n_estimators=sp_randint(50, 200),
    max_depth=sp_randint(1, 20),
    learning_rate=sp_uniform(0, 1),
)
rsearch_lgbm = RandomizedSearchCV(
    lgbr,
    param_distributions=params,
    n_iter=100,
    cv=5,
    n_jobs=-1,
    random_state=0,
)
rsearch_lgbm.fit(X_train, y_train)
print(rsearch_lgbm.best_params_)

{'learning_rate': 0.3732907507832548, 'max_depth': 15, 'n_estimators': 158}


#### Tuned LightGBM

In [407]:
# Refit LightGBM on all training rows with the best sampled settings.
# The value shown is the training-set R^2.
lgbr = lgb.LGBMRegressor(random_state=0, **rsearch_lgbm.best_params_).fit(X_train, y_train)
lgbr.score(X_train, y_train)

0.9983882908962997

In [408]:
# (importance, feature) pairs for the tuned LightGBM model, least important first.
sorted(zip(lgbr.feature_importances_, X_train.columns))

[(3, 'ProductCategory_WomenClothing'),
 (34, 'ProductCategory_OtherClothing'),
 (38, 'Event'),
 (40, 'Federal Holiday'),
 (132, 'Month'),
 (141, 'Avg_Sales'),
 (173, 'Year'),
 (377, 'Sales_Lag1')]

### Ensembling (averaged predictions)

#### Voting Regressor

In [409]:
# Combine the three tuned models in a voting ensemble and refit on the
# training data. The value shown is the training-set R^2.
base_models = [('rf', rf), ('lgbr', lgbr), ('gbm', gbm)]
votreg = VotingRegressor(estimators=base_models).fit(X_train, y_train)
votreg.score(X_train, y_train)

0.9993829597703939

### Submission

In [410]:
# Predict the test rows with the tuned LightGBM model (not the voting ensemble).
pred = lgbr.predict(X_test)

In [411]:
# Load the submission template that the predictions are written into.
kaggle = pd.read_csv('Kaggle_Submission_Format.csv')

In [412]:
# Column names expected by the submission format.
kaggle.columns

Index(['Year', 'Sales(In ThousandDollars)'], dtype='object')

In [413]:
# Write the predictions into the template by position.
# NOTE(review): assumes the template rows are in the same order as X_test — confirm.
kaggle['Sales(In ThousandDollars)'] = pred

In [414]:
# Save the submission without the index column and preview it.
# NOTE(review): the output file name has no '.csv' extension — confirm that is intended.
kaggle.to_csv('Iter5_AvgSales_LagSales1_Year0MonthNum_LGBR', index = False)
kaggle.head()

Unnamed: 0,Year,Sales(In ThousandDollars)
0,1,2451.649293
1,2,573.681626
2,3,1104.54247
3,4,2233.279952
4,5,572.020243
