In [14]:
import pandas as pd

In [15]:
df = pd.read_csv('/Users/amir.ziai/Downloads/train2.csv')

# Convert object to datetime
df['Date'] = pd.to_datetime(df['Date'])

In [16]:
df.head()

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday
0,1,1,2010-02-05,24924.5,False
1,1,1,2010-02-12,46039.49,True
2,1,1,2010-02-19,41595.55,False
3,1,1,2010-02-26,19403.54,False
4,1,1,2010-03-05,21827.9,False


In [17]:
# One-hot-encode "Store" and "Dept"
df = pd.get_dummies(df, columns=['Store', 'Dept'])

In [19]:
df.head()

Unnamed: 0,Date,Weekly_Sales,IsHoliday,Store_1,Store_2,Store_3,Store_4,Store_5,Store_6,Store_7,...,Dept_90,Dept_91,Dept_92,Dept_93,Dept_94,Dept_95,Dept_96,Dept_97,Dept_98,Dept_99
0,2010-02-05,24924.5,False,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2010-02-12,46039.49,True,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2010-02-19,41595.55,False,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2010-02-26,19403.54,False,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2010-03-05,21827.9,False,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
# Extract date features
df['Date_dayofweek'] = df['Date'].dt.dayofweek
df['Date_month'] = df['Date'].dt.month
df['Date_year'] = df['Date'].dt.year
df['Date_day'] = df['Date'].dt.day

In [26]:
# Extract time-lag features for 1 day, 2 day, 3 day, 5 day, 1 week, 2 week, and a month ago
for days_to_lag in [1, 2, 3, 5, 7, 14, 30]:
    df['Weekly_sales_lag_{}'.format(days_to_lag)] = df.Weekly_Sales.shift(days_to_lag)

In [27]:
df.head()

Unnamed: 0,Date,Weekly_Sales,IsHoliday,Store_1,Store_2,Store_3,Store_4,Store_5,Store_6,Store_7,...,Date_month,Date_year,Date_day,Weekly_sales_lag_1,Weekly_sales_lag_2,Weekly_sales_lag_3,Weekly_sales_lag_5,Weekly_sales_lag_7,Weekly_sales_lag_14,Weekly_sales_lag_30
0,2010-02-05,24924.5,False,1,0,0,0,0,0,0,...,2,2010,5,,,,,,,
1,2010-02-12,46039.49,True,1,0,0,0,0,0,0,...,2,2010,12,24924.5,,,,,,
2,2010-02-19,41595.55,False,1,0,0,0,0,0,0,...,2,2010,19,46039.49,24924.5,,,,,
3,2010-02-26,19403.54,False,1,0,0,0,0,0,0,...,2,2010,26,41595.55,46039.49,24924.5,,,,
4,2010-03-05,21827.9,False,1,0,0,0,0,0,0,...,3,2010,5,19403.54,41595.55,46039.49,,,,


In [42]:
# Replace all NaN values with 0
df = df.fillna(0)

In [44]:
df.IsHoliday = df.IsHoliday.astype(int)

In [46]:
# Grab features and target
# Remove date from features because it's overly-unique
# Remove weekly_sales from features since it's the target and
# we don't have access to it at the time of prediction
x = df[df.columns.difference(['Date', 'Weekly_Sales'])]  
y = df.Weekly_Sales

In [47]:
x.head()

Unnamed: 0,Date_day,Date_dayofweek,Date_month,Date_year,Dept_1,Dept_10,Dept_11,Dept_12,Dept_13,Dept_14,...,Store_7,Store_8,Store_9,Weekly_sales_lag_1,Weekly_sales_lag_14,Weekly_sales_lag_2,Weekly_sales_lag_3,Weekly_sales_lag_30,Weekly_sales_lag_5,Weekly_sales_lag_7
0,5,4,2,2010,1,0,0,0,0,0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,12,4,2,2010,1,0,0,0,0,0,...,0,0,0,24924.5,0.0,0.0,0.0,0.0,0.0,0.0
2,19,4,2,2010,1,0,0,0,0,0,...,0,0,0,46039.49,0.0,24924.5,0.0,0.0,0.0,0.0
3,26,4,2,2010,1,0,0,0,0,0,...,0,0,0,41595.55,0.0,46039.49,24924.5,0.0,0.0,0.0
4,5,4,3,2010,1,0,0,0,0,0,...,0,0,0,19403.54,0.0,41595.55,46039.49,0.0,0.0,0.0


In [48]:
y[:3]

0    24924.50
1    46039.49
2    41595.55
Name: Weekly_Sales, dtype: float64

In [49]:
from sklearn.model_selection import train_test_split

In [50]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=1/3)

In [51]:
x_train.shape

(281046, 138)

In [52]:
x_test.shape

(140524, 138)

In [34]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [53]:
clf = LinearRegression()
clf.fit(x_train, y_train)



LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [54]:
# Calculate R2
clf.score(x_test, y_test)

0.91004060811981491

In [57]:
%%time
clf = RandomForestRegressor(n_jobs=-1)  # use all cores
clf.fit(x_train, y_train)

CPU times: user 4min 41s, sys: 1.75 s, total: 4min 43s
Wall time: 45.7 s


In [58]:
# Better R2 with random forest
# You can probably do hyper-parameter grid/random search to improve
clf.score(x_test, y_test)

0.95018888743499019

In [60]:
# Other regression metrics
# http://scikit-learn.org/stable/modules/model_evaluation.html#regression-metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [63]:
# Mean absolute error
predicted = clf.predict(x_test)
mean_absolute_error(y_test, predicted)

1687.739176638866

In [64]:
# MSE
mean_squared_error(y_test, predicted)

25711761.750341382