In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    full_paths = [os.path.join(folder, name) for name in names_in_folder]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

---
# Data Processing

In [None]:
# Load the training set; pandas infers zip compression from the file extension.
df_train = pd.read_csv('../input/walmart-recruiting-store-sales-forecasting/train.csv.zip')
print(df_train.isnull().sum())
# DataFrame.info() prints its report itself and returns None, so it is called
# directly: wrapping it in print() appended a stray "None" line to the output.
df_train.info()
display(df_train.head(5))

In [None]:
# Load the test set; pandas infers zip compression from the file extension.
df_test = pd.read_csv('../input/walmart-recruiting-store-sales-forecasting/test.csv.zip')
print(df_test.isnull().sum())
# DataFrame.info() prints its report itself and returns None, so it is called
# directly: wrapping it in print() appended a stray "None" line to the output.
df_test.info()
display(df_test.head(5))

In [None]:
# Load the per-store metadata table.
df_stores = pd.read_csv('../input/walmart-recruiting-store-sales-forecasting/stores.csv')
print(df_stores.isnull().sum())
# DataFrame.info() prints its report itself and returns None, so it is called
# directly: wrapping it in print() appended a stray "None" line to the output.
df_stores.info()
display(df_stores.head(5))

In [None]:
# Show the distinct values of the store Type column (first occurrence of each).
display(df_stores['Type'].drop_duplicates())

In [None]:
# Load the features table; pandas infers zip compression from the file extension.
df_features = pd.read_csv('../input/walmart-recruiting-store-sales-forecasting/features.csv.zip')
print(df_features.isnull().sum())
# DataFrame.info() prints its report itself and returns None, so it is called
# directly: wrapping it in print() appended a stray "None" line to the output.
df_features.info()
display(df_features.head(5))

In [None]:
# Load the sample submission file; its Weekly_Sales column is overwritten with
# the model predictions at the end of the notebook.
df_sample = pd.read_csv('../input/walmart-recruiting-store-sales-forecasting/sampleSubmission.csv.zip')
print(df_sample.isnull().sum())
df_sample.info()
# Last expression of the cell: rendered as the cell output.
df_sample.head(5)

In [None]:
# Report the (rows, columns) shape of every loaded frame, one per line.
loaded_frames = (
    ("df_sample", df_sample),
    ("df_features", df_features),
    ("df_stores", df_stores),
    ("df_test", df_test),
    ("df_train", df_train),
)
for frame_name, frame in loaded_frames:
    print("{}:\t{}".format(frame_name, frame.shape))

In [None]:
# Build the frame used for training by combining the features and stores tables
# (it is joined with the train/test rows in a later cell).
# No `on=` is given, so pandas joins on every column name the two frames share.
df_features_stores = pd.merge(df_features, df_stores, how = "left")
df_features_stores.info()

In [None]:
# Count the missing values in each column of the merged frame.
df_features_stores.isnull().sum()

In [None]:
# Print the percentage of missing values in each of the five MarkDown columns.
total_rows = len(df_features_stores)
for markdown_col in ('MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5'):
    missing_count = df_features_stores[markdown_col].isnull().sum()
    print("{} null 비율\t{:.2f}%".format(markdown_col, (missing_count / total_rows) * 100))


In [None]:
# Remove the five MarkDown columns from the merged frame, in place and in order.
for markdown_col in ['MarkDown{}'.format(number) for number in range(1, 6)]:
    del df_features_stores[markdown_col]

# Preview the frame without the MarkDown columns.
df_features_stores.head()

In [None]:
# Replace the store Type labels with integer codes, then mark the column as categorical.
# A fourth mapping ("D" -> 3) was left disabled by the original author and stays disabled.
for type_label, type_code in (("A", 0), ("B", 1), ("C", 2)):
    has_label = df_features_stores.Type == type_label
    df_features_stores.loc[has_label, "Type"] = type_code

df_features_stores["Type"] = pd.Categorical(df_features_stores["Type"])
# Store is converted to a categorical column as well.
df_features_stores["Store"] = pd.Categorical(df_features_stores["Store"])


In [None]:
# IsHoliday is included in the join keys: per the original author's note, leaving it
# out produces duplicated IsHoliday_x / IsHoliday_y columns in the result.
merge_keys = ['Store', 'Date', 'IsHoliday']
sort_keys = ['Store', 'Dept', 'Date']

# Inner-join the train rows onto the feature/store frame, then order the rows
# by store, department and date with a fresh 0..n-1 index.
train_merged = pd.merge(df_features_stores, df_train, how="inner", on=merge_keys)
train_total = train_merged.sort_values(by=sort_keys).reset_index(drop=True)

# Same join and ordering for the test rows.
test_merged = pd.merge(df_features_stores, df_test, how="inner", on=merge_keys)
test_total = test_merged.sort_values(by=sort_keys).reset_index(drop=True)

In [None]:
# DataFrame.info() prints its own report and returns None; calling it inside
# print() emitted a stray "None" after each summary. The explicit print("\n")
# keeps the blank separator between the two reports.
train_total.info()
print("\n")
test_total.info()
print("\n")

In [None]:
# Parse the Date column so the .dt accessor is available.
train_total.Date = pd.to_datetime(train_total.Date)
test_total.Date = pd.to_datetime(test_total.Date)

# The target is weekly sales, so derive the ISO week number of every date.
# Series.dt.week was deprecated in pandas 1.1 and removed in pandas 2.0;
# dt.isocalendar().week is its replacement. It returns a UInt32 column, so the
# result is cast to int to keep the plain integer week numbers dt.week produced.
train_total['Week'] = train_total.Date.dt.isocalendar().week.astype(int)
test_total['Week'] = test_total.Date.dt.isocalendar().week.astype(int)


# Convert Week to a categorical column.
train_total.Week = pd.Categorical(train_total.Week)
test_total.Week = pd.Categorical(test_total.Week)

In [None]:
# Summary statistics of the merged training frame.
train_total.describe()

In [None]:
# Summary statistics of the merged test frame.
test_total.describe()

---
# Data visualization

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import rcParams

In [None]:
# df_visualization = train_total.copy()
# df_visualization.info()

In [None]:
# # store, Dept는 범주형 데이터임!
# df_visualization.Store = pd.Categorical(df_visualization.Store)
# df_visualization.Dept = pd.Categorical(df_visualization.Dept)
# df_visualization.Type = pd.Categorical(df_visualization.Type)
# df_visualization.Date = pd.to_datetime(df_visualization.Date)

# df_visualization.set_index(keys = "Date", inplace = True)
# df_visualization.info()

In [None]:
# df_visualization.head(3)

In [None]:
# rcParams['figure.figsize'] = 10,6.0
# sns.lineplot(x = df_visualization.index, y = "Weekly_Sales", data = df_visualization)

# # 매해 연말에 치솟는 패턴을 볼 수 있음.
# # 0, 1, 2 => A, B, C

In [None]:
# rcParams['figure.figsize'] = 10,6.0
# sns.lineplot(x = df_visualization.index, y = "Weekly_Sales", data = df_visualization, hue = 'Type')

# # 매해 연말에 치솟는 패턴을 볼 수 있음.
# # 0, 1, 2 => A, B, C

In [None]:
# rcParams['figure.figsize'] = 10,6.0
# sns.lineplot(x = df_visualization.index, y = "Weekly_Sales", data = df_visualization, hue = 'Dept')

# # 대부분의 부서도 이를 따르는 것을 알 수 있음.

In [None]:
# rcParams['figure.figsize'] = 10,6.0
# sns.lineplot(x = df_visualization.index, y = "Weekly_Sales", data = df_visualization, hue = 'IsHoliday')

# ##### False일때는 앞의 시각화 결과처럼 연말에 튀지만, True일 때에는 Thanksgiving날 팍 튐 => False로 바꿔서 일반화하는게 좋을 것 같다.
# ###### **Super Bowl**: 12-Feb-10, 11-Feb-11, 10-Feb-12, 8-Feb-13
# ###### **Labor Day**: 10-Sep-10, 9-Sep-11, 7-Sep-12, 6-Sep-13
# ###### **Thanksgiving**: 26-Nov-10, 25-Nov-11, 23-Nov-12, 29-Nov-13
# ###### **Christmas**: 31-Dec-10, 30-Dec-11, 28-Dec-12, 27-Dec-13

In [None]:
# rcParams['figure.figsize'] = 10,6.0
# sns.lineplot(x = "Temperature", y = "Weekly_Sales", data = df_visualization)
# # 아무런 패턴이 안보임

In [None]:
# rcParams['figure.figsize'] = 10,6.0
# sns.lineplot(x = "Fuel_Price", y = "Weekly_Sales", data = df_visualization)

# # 아무런 패턴이 안보임

In [None]:
# rcParams['figure.figsize'] = 10,6.0
# sns.lineplot(x = "CPI", y = "Weekly_Sales", data = df_visualization);

# # 아무런 패턴이 안보임

In [None]:
# rcParams['figure.figsize'] = 10,6.0
# sns.lineplot(x = "Unemployment", y = "Weekly_Sales", data = df_visualization);

# # 아무런 패턴이 안보임

In [None]:
# rcParams['figure.figsize'] = 10,6.0
# sns.lineplot(x = "Size", y = "Weekly_Sales", data = df_visualization);

# # 대체적으로 사이즈가 크면 매출이 높지만 절대적이진 않다. -> 일관성이 없다

In [None]:
# # 상관계수
# fig, ax = plt.subplots( figsize=(10,10) )

# # Build a triangular mask (True on the upper triangle, False on the lower triangle)
# mask = np.zeros_like(df_visualization.corr(), dtype=bool)
# mask[np.triu_indices_from(mask)] = True

# sns.heatmap(data = df_visualization.corr(), annot=True, fmt = '.4f', mask = mask, linewidths=.5, cmap='Blues')
# plt.show()

# # 상관 계수는 -1과 +1 사이의 값이 됩니다. 
# # 절대 상관이 1에 더 가까울수록 데이터 점이 더 밀접하게 선을 형성합니다. 0에 가까운 상관 값은 선형 관계가 없음을 나타냅니다.

---
# Define model

In [None]:
from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,StackingClassifier
import lightgbm as lgb
import time

In [None]:
# 관련없는 feature 제외한 데이터 셋
x1 = train_total[['Store','Dept','IsHoliday','Size','Week','Type']]
y1 = train_total['Weekly_Sales']
x_train1, x_test1, y_train1, y_test1 = train_test_split(x1, y1,test_size=0.2,random_state = 42)

In [None]:
# # 관련없는 feature 포함한 데이터 셋
# x2 = train_total.drop(['Weekly_Sales', 'Date'],axis=1)
# y2 = train_total['Weekly_Sales']
# x_train2, x_test2, y_train2, y_test2 = train_test_split(x2, y2,test_size=0.2,random_state = 42)

In [None]:
# Two variants of the test inputs: the selected-feature subset, and every
# column except Date.
selected_test_features = ['Store', 'Dept', 'IsHoliday', 'Size', 'Week', 'Type']
test_total1 = test_total.loc[:, selected_test_features]
test_total2 = test_total.drop(columns=['Date'])

In [None]:
# model = []

# model.append(('LinearRegression', LinearRegression()))  # LinearRegression 모델 
# model.append(('DecisionTreeRegressor', DecisionTreeRegressor()))  # DecisionTreeRegressor 모델
# model.append(('RandomForestRegressor', RandomForestRegressor()))  # RandomForestRegressor
# model.append(('lgb.LGBMRegressor', lgb.LGBMRegressor()))  # lgb.LGBMRegressor()

In [None]:
# print('case1: 관련 없는 featuue 제외한 데이터셋으로 학습시키기')
# for name, m in model:
#     m.fit(x_train1, y_train1)
#     print("{}\ttrain_score : {} \ttest_score: {}".format(name, m.score(x_train1, y_train1), m.score(x_test1, y_test1)))

In [None]:
# print('case2: 관련 없는 featuue 포함한 데이터셋으로 학습시키기')
# for name, m in model:
#     m.fit(x_train2, y_train2)
#     print("{}\ttrain_score : {} \ttest_score: {}".format(name, m.score(x_train2, y_train2), m.score(x_test2, y_test2)))

---
# Train models

In [None]:
# start = time.time()  # 시작 시간 저장
# print("=====train start=======\n")
# # param: 아무렇게나

# model_rf1 = RandomForestRegressor(n_estimators=100, max_samples = 0.4)
# model_rf1.fit(x_train1, y_train1)

# print("done!\ntime :", time.time() - start)  # 현재시각 - 시작시간 = 실행 시간

In [None]:
# print("훈련 세트 정확도: {:.3f}".format(model_rf1.score(x_train1, y_train1)))
# print("테스트 세트 정확도: {:.3f}".format(model_rf1.score(x_test1, y_test1)))

In [None]:
# features = x_train1.columns
# importances = model_rf1.feature_importances_
# indices = np.argsort(importances)

# plt.title('Feature Importances')
# plt.barh(range(len(indices)), importances[indices], color='b', align='center')
# plt.yticks(range(len(indices)), [x_train1.columns[i] for i in indices])
# plt.xlabel('Relative Importance')
# plt.show()

---

In [None]:
# start = time.time()  # 시작 시간 저장
# print("=====train start=======\n")
# # param: 아무렇게나

# model_rf2 = RandomForestRegressor(n_estimators=100, max_samples = 0.4)
# model_rf2.fit(x_train2, y_train2)

# print("done!\ntime :", time.time() - start)  # 현재시각 - 시작시간 = 실행 시간

In [None]:
# print("훈련 세트 정확도: {:.3f}".format(model_rf2.score(x_train2, y_train2)))
# print("테스트 세트 정확도: {:.3f}".format(model_rf2.score(x_test2, y_test2)))

In [None]:
# features = x_train2.columns
# importances = model_rf2.feature_importances_
# indices = np.argsort(importances)

# plt.title('Feature Importances')
# plt.barh(range(len(indices)), importances[indices], color='b', align='center')
# plt.yticks(range(len(indices)), [x_train2.columns[i] for i in indices])
# plt.xlabel('Relative Importance')
# plt.show()

---
# Predict

In [None]:
# # x_TEST = test_total.drop(['Date'],axis=1)
# # x_TEST

# Work on copies so the prepared test feature frames themselves stay unmodified.
test_pred1 = test_total1.copy()
test_pred2 = test_total2.copy()

In [None]:
# # x_Test data processing 
# # CPI, Unemployment의 nan 값이 있어서 예측이 안된다 randomforest...
# # 여러 방법이 있겠지만 이전 값으로 대체하는 방법 선택
# # 이유는 급격하게 변할 수치가 아니라고 판다.

# test_pred1 = test_pred1.fillna(method='ffill')
# test_pred2 = test_pred2.fillna(method='ffill')
# test_pred2.info()

In [None]:
# # model_rf
# start = time.time()  # 시작 시간 저장

# print("===== predict start! =======\n")
# predict_sales1 = model_rf1.predict(test_pred1)

# print("========== done! ===========\n")
# print("time :", time.time() - start)  # 현재시각 - 시작시간 = 실행 시간

In [None]:
# df_sample1 = df_sample.copy()
# df_sample1['Weekly_Sales'] = predict_sales1
# df_sample1

---
# Predict visualization

In [None]:
# # model_rf
# start = time.time()  # 시작 시간 저장

# print("===== predict start! =======\n")
# predict_sales2 = model_rf2.predict(test_pred2)

# print("========== done! ===========\n")
# print("time :", time.time() - start)  # 현재시각 - 시작시간 = 실행 시간

In [None]:
# Observed weekly sales with their dates, taken from the training frame.
# Week and Year columns are added in a later cell for the per-year plot.
pred_df = train_total[['Date', 'Weekly_Sales']].copy()
pred_df

In [None]:
# pred_df2 = df_test.copy()
# pred_df2['Weekly_Sales'] = predict_sales2
# pred_df2 = pred_df2[['Date', 'Weekly_Sales']]
# pred_df2

In [None]:
# pred_df1 = df_test.copy()
# pred_df1['Weekly_Sales'] = predict_sales1
# pred_df1 = pred_df1[['Date', 'Weekly_Sales']]
# pred_df1

In [None]:
# df_sample2 = df_sample.copy()
# df_sample2['Weekly_Sales'] = predict_sales2
# df_sample2


# pred_df2 = df_test.copy()
# pred_df2['Weekly_Sales'] = predict_sales2
# pred_df2 = pred_df2[['Date', 'Weekly_Sales']]
# pred_df2

# pred_df2.Date = pd.to_datetime(pred_df2.Date)
# pred_df2['Week'] = pred_df2.Date.dt.week
# pred_df2['Year'] = pred_df2.Date.dt.year

In [None]:
# Add the week number and calendar year of each observation for the per-year plot.
# Series.dt.week was removed in pandas 2.0; dt.isocalendar().week replaces it and
# is cast to int to keep the plain integer week numbers dt.week returned.
pred_df['Week'] = pred_df.Date.dt.isocalendar().week.astype(int)
pred_df['Year'] = pred_df.Date.dt.year

# pred_df2.Date = pd.to_datetime(pred_df2.Date)
# pred_df2['Week'] = pred_df2.Date.dt.week
# pred_df2['Year'] = pred_df2.Date.dt.year

In [None]:
# pred_df['Week'] = pred_df.Date.dt.week
# pred_df['Year'] = pred_df.Date.dt.year

# pred_df1.Date = pd.to_datetime(pred_df1.Date)
# pred_df1['Week'] = pred_df1.Date.dt.week
# pred_df1['Year'] = pred_df1.Date.dt.year

In [None]:
# pred_df_2010 = pred_df[pred_df.Year==2010]['Weekly_Sales'].groupby(pred_df['Week']).mean()
# pred_df_2011 = pred_df[pred_df.Year==2011]['Weekly_Sales'].groupby(pred_df['Week']).mean()
# pred_df_2012 = pred_df[pred_df.Year==2012]['Weekly_Sales'].groupby(pred_df['Week']).mean()

# pred_df1_2012 = pred_df1[pred_df1.Year==2012]['Weekly_Sales'].groupby(pred_df1['Week']).mean()
# pred_df1_2013 = pred_df1[pred_df1.Year==2013]['Weekly_Sales'].groupby(pred_df1['Week']).mean()


# plt.figure(figsize=(20,8))
# sns.lineplot(pred_df_2010.index, pred_df_2010.values)
# sns.lineplot(pred_df_2011.index, pred_df_2011.values)
# sns.lineplot(pred_df_2012.index, pred_df_2012.values)

# sns.lineplot(pred_df1_2012.index, pred_df1_2012.values)
# sns.lineplot(pred_df1_2013.index, pred_df1_2013.values)


# plt.grid()
# plt.xticks(np.arange(1, 53, step=1))
# plt.legend(['2010', '2011', '2012', '2012_pred', '2013_pred'], loc='best', fontsize=20)
# # plt.legend(['2010', '2011', '2012_y', '2013'], loc='best', fontsize=18)
# plt.title('Weekly Sales per year (average)', fontsize=15)
# plt.ylabel('Sales', fontsize=16)
# plt.xlabel('Week', fontsize=16)
# plt.show()

In [None]:
# pred_df_2010 = pred_df[pred_df.Year==2010]['Weekly_Sales'].groupby(pred_df['Week']).mean()
# pred_df_2011 = pred_df[pred_df.Year==2011]['Weekly_Sales'].groupby(pred_df['Week']).mean()
# pred_df_2012 = pred_df[pred_df.Year==2012]['Weekly_Sales'].groupby(pred_df['Week']).mean()

# pred_df2_2012 = pred_df2[pred_df2.Year==2012]['Weekly_Sales'].groupby(pred_df2['Week']).mean()
# pred_df2_2013 = pred_df2[pred_df2.Year==2013]['Weekly_Sales'].groupby(pred_df2['Week']).mean()


# plt.figure(figsize=(20,8))
# sns.lineplot(pred_df_2010.index, pred_df_2010.values)
# sns.lineplot(pred_df_2011.index, pred_df_2011.values)
# sns.lineplot(pred_df_2012.index, pred_df_2012.values)

# sns.lineplot(pred_df2_2012.index, pred_df2_2012.values)
# sns.lineplot(pred_df2_2013.index, pred_df2_2013.values)


# plt.grid()
# plt.xticks(np.arange(1, 53, step=1))
# plt.legend(['2010', '2011', '2012', '2012_pred', '2013_pred'], loc='best', fontsize=20)
# # plt.legend(['2010', '2011', '2012_y', '2013'], loc='best', fontsize=18)
# plt.title('Weekly Sales per year (average)', fontsize=15)
# plt.ylabel('Sales', fontsize=16)
# plt.xlabel('Week', fontsize=16)
# plt.show()

In [None]:
# train_total[train_total.Week == 51]

---
# 최적화
[설명](https://datascienceschool.net/03%20machine%20learning/14.01%20%EB%AA%A8%ED%98%95%20%EC%B5%9C%EC%A0%81%ED%99%94.html)

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the random-forest search:
# 1 x 4 x 3 x 4 = 48 parameter combinations.
params ={
    'n_estimators':[50],
    'max_depth':[15,20,25,30],
    'min_samples_leaf':[1,2,3],
    'min_samples_split':[3,4,5,6]
}

In [None]:
# Exhaustive grid search over `params` with 2-fold cross-validation (cv=2).
# No `scoring` is passed, so candidates are ranked by the estimator's own score method.
# NOTE(review): n_jobs=-1 is set on both the forest and the search — confirm
# that this nested parallelism is intended.
rf_optimize = RandomForestRegressor(random_state=0, n_jobs=-1)
grid_cv = GridSearchCV(rf_optimize, param_grid=params, cv=2, n_jobs=-1)
grid_cv.fit(x_train1, y_train1)

In [None]:
# Parameter combination with the best mean cross-validation score.
grid_cv.best_params_

In [None]:
# Mean cross-validation score achieved by the best parameter combination.
print(grid_cv.best_score_)


In [None]:
start = time.time()  # record the start time
print("=====train start=======\n")

# The hyper-parameters are hard-coded (presumably copied from an earlier grid-search
# run — TODO confirm they still match grid_cv.best_params_).
# random_state is fixed to the same seed as the grid-search estimator: a random
# forest is stochastic, so without a seed every run yielded different scores and
# a different submission. n_jobs=-1 only parallelises tree building.
model_rf_final = RandomForestRegressor(n_estimators=50, max_depth=25,
                                       min_samples_leaf=1, min_samples_split=6,
                                       random_state=0, n_jobs=-1)
model_rf_final.fit(x_train1, y_train1)

print("done!\ntime :", time.time() - start)  # elapsed time = current time - start time

In [None]:
# Score the final model on the training split and on the held-out split.
# NOTE: the printed Korean labels read "training/test set accuracy", but score()
# on a scikit-learn regressor returns the R^2 coefficient of determination.
print("훈련 세트 정확도: {:.3f}".format(model_rf_final.score(x_train1, y_train1)))
print("테스트 세트 정확도: {:.3f}".format(model_rf_final.score(x_test1, y_test1)))

In [None]:
# Copy of the selected-feature test frame used as input for the final prediction.
test_pred3 = test_total1.copy()

In [None]:
# Predict weekly sales for the test rows with the final random-forest model
# and report how long the call took.
start = time.time()  # record the start time

print("===== predict start! =======\n")
predict_sales3 = model_rf_final.predict(test_pred3)

print("========== done! ===========\n")
print("time :", time.time() - start)  # elapsed time = current time - start time

In [None]:
# predict_sales3 was computed from the rows of test_total (merged and sorted), so
# the dates are taken from test_total too: each prediction is then guaranteed to
# sit next to the date of the row it was predicted for. The previous code attached
# the predictions positionally to df_test, which is only correct when df_test
# happens to share test_total's row order.
# The length check keeps the earlier fail-fast behaviour if the inner merge
# dropped or duplicated test rows.
assert len(test_total) == len(df_test), "test_total and df_test differ in length"
pred_df3 = test_total[['Date']].copy()
pred_df3['Weekly_Sales'] = predict_sales3

pred_df3.Date = pd.to_datetime(pred_df3.Date)
# Series.dt.week was removed in pandas 2.0; dt.isocalendar().week replaces it and
# is cast to int to keep the plain integer week numbers dt.week returned.
pred_df3['Week'] = pred_df3.Date.dt.isocalendar().week.astype(int)
pred_df3['Year'] = pred_df3.Date.dt.year

In [None]:
# Mean weekly sales per week number, one series per year:
# observed values (pred_df) for 2010-2012 and predictions (pred_df3) for 2012-2013.
pred_df_2010 = pred_df[pred_df.Year==2010]['Weekly_Sales'].groupby(pred_df['Week']).mean()
pred_df_2011 = pred_df[pred_df.Year==2011]['Weekly_Sales'].groupby(pred_df['Week']).mean()
pred_df_2012 = pred_df[pred_df.Year==2012]['Weekly_Sales'].groupby(pred_df['Week']).mean()

pred_df3_2012 = pred_df3[pred_df3.Year==2012]['Weekly_Sales'].groupby(pred_df3['Week']).mean()
pred_df3_2013 = pred_df3[pred_df3.Year==2013]['Weekly_Sales'].groupby(pred_df3['Week']).mean()

labelled_series = [
    ('2010', pred_df_2010),
    ('2011', pred_df_2011),
    ('2012', pred_df_2012),
    ('2012_pred', pred_df3_2012),
    ('2013_pred', pred_df3_2013),
]

fig, ax = plt.subplots(figsize=(20, 8))
for series_label, weekly_mean in labelled_series:
    # x/y must be passed by keyword: seaborn >= 0.12 no longer accepts them
    # positionally. Each line carries its own label, so the legend cannot drift
    # out of sync with the drawing order.
    sns.lineplot(x=weekly_mean.index, y=weekly_mean.values, label=series_label, ax=ax)

# Switch the grid on explicitly; a bare plt.grid() only toggles the current state.
ax.grid(True)
ax.set_xticks(np.arange(1, 53, step=1))
ax.legend(loc='best', fontsize=20)
ax.set_title('Weekly Sales per year (average)', fontsize=15)
ax.set_ylabel('Sales', fontsize=16)
ax.set_xlabel('Week', fontsize=16)
plt.show()

In [None]:
# Keep only the date and the predicted sales for the submission step.
pred_final = pred_df3[["Date", "Weekly_Sales"]]
pred_final

In [None]:
# Copy the predictions into the submission frame row by row.
# A plain Series assignment aligns on the index and would silently leave NaN in
# the submission if the two frames differed in length, so fail loudly instead
# and assign by position.
# NOTE(review): assumes the rows of df_sample are in the same order as
# pred_final — TODO confirm against the Id column.
if len(pred_final) != len(df_sample):
    raise ValueError(
        "prediction count ({}) does not match submission rows ({})".format(
            len(pred_final), len(df_sample)))
df_sample['Weekly_Sales'] = pred_final['Weekly_Sales'].to_numpy()
df_sample

In [None]:
# Write the submission file to the working directory, without the index column.
df_sample.to_csv('submission.csv',index=False)
df_sample