In [None]:
import warnings
warnings.filterwarnings("ignore")

# loading packages

import numpy as np
import pandas as pd
from pandas import datetime as dt
from pandas import Series,DataFrame

# data visualization
import matplotlib.pyplot as plt
import seaborn as sns # advanced vizs
%matplotlib inline


from sklearn.model_selection import train_test_split


# machine learning
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeRegressor
from sklearn import model_selection
from sklearn.tree import DecisionTreeRegressor, plot_tree, export_graphviz, export_text
from sklearn.model_selection import GridSearchCV
from IPython.display import Image 
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from xgboost.sklearn import XGBRegressor
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


#결과 살펴보기
from sklearn.metrics import r2_score as r2, mean_squared_error as mse
import math

In [None]:
# Load the Rossmann competition tables: train and test sets plus store metadata.
train = pd.read_csv('../input/rossmann-store-sales/train.csv')
test = pd.read_csv('../input/rossmann-store-sales/test.csv')
store = pd.read_csv('../input/rossmann-store-sales/store.csv')

# Load the supplementary tables (store/state lookup, state names, weather).
state = pd.read_csv("../input/rossmann-store-extra/store_states.csv")
state_name = pd.read_csv("../input/rossmann-store-extra/state_names.csv")
weathers = pd.read_csv("../input/rossmann-store-extra/weather.csv")

### Data preprocessing

In [None]:
# Rename the weather table's `file` column to `StateName`, the key used to join `state_name`.
weathers = weathers.rename(columns={"file": "StateName"})

In [None]:
# Attach the state-name lookup columns to each weather row (default inner join on `StateName`).
weathers = weathers.merge(state_name, on="StateName")
weathers

In [None]:
# Collapse `Events` into a binary flag: 0 = no event recorded, 1 = some event.
# A missing value never compares equal to 0, so testing `== 0` alone would flag
# every row as 1; missing entries are therefore treated as "no event" as well.
# NOTE(review): assumes an empty `Events` cell means no weather event — confirm against the data.
events = weathers["Events"]
weathers["Events"] = np.where(events.isna() | (events == 0), 0, 1)
# Parse the dates so the table can later be joined on a datetime `Date` key.
weathers.Date = pd.to_datetime(weathers.Date)
# Keep only the join keys and the event flag.
weather_new = weathers[['Date', 'State', 'Events']]
weather_new

In [None]:
# Preview the first rows of the training table.
train.head()

In [None]:
# Preview the first rows of the store metadata table.
store.head()

In [None]:
# Add each store's state columns; stores missing from `state` are dropped by the inner join.
store = pd.merge(store, state, how="inner", on="Store")
store

In [None]:
# Impute missing competition distances with the column median.
# The result is assigned back explicitly: an in-place fillna on a column obtained via
# attribute access is a chained operation that may not update `store` under copy-on-write pandas.
store["CompetitionDistance"] = store["CompetitionDistance"].fillna(store["CompetitionDistance"].median())

In [None]:
# Impute the remaining gaps in the store table. Each column is assigned back explicitly,
# because a chained in-place fillna on an attribute-accessed column may not update `store`
# under copy-on-write pandas.

# Competition opening month/year: fill with the column median.
for median_col in ("CompetitionOpenSinceMonth", "CompetitionOpenSinceYear"):
    store[median_col] = store[median_col].fillna(store[median_col].median())

# Promo2 fields: fill with 0 (later cells test `Promo2SinceYear == 0`).
for zero_col in ("Promo2SinceWeek", "Promo2SinceYear", "PromoInterval"):
    store[zero_col] = store[zero_col].fillna(0)

In [None]:
# Combine store metadata with the training rows (inner join on `Store`).
df = pd.merge(store, train, how="inner", on="Store")
df.head()

In [None]:
# Parse the date column and derive calendar features from it.
df["Date"] = pd.to_datetime(df["Date"])
date_parts = df["Date"].dt
# NOTE(review): `.dt.week` is deprecated in newer pandas releases; kept to preserve behaviour.
week_of_year = date_parts.week
df["Year"] = date_parts.year
df["Month"] = date_parts.month
df["Day"] = date_parts.day
# Week number modulo 4.
df["Week"] = week_of_year % 4
df["WeekOfYear"] = week_of_year


In [None]:
# Binary holiday flag: 0 (as int or str) means no holiday, codes a/b/c all map to 1.
holiday_flags = {0: 0, "0": 0, "a": 1, "b": 1, "c": 1}
df["StateHoliday"] = df["StateHoliday"].map(holiday_flags)

In [None]:
# Display the merged store/train frame in its current state.
df

In [None]:
# Numeric assortment code: 'a' -> 1, 'b' -> 2, any other value -> 3.
assortment_codes = df['Assortment'].map({'a': 1, 'b': 2})
df['Assortment'] = assortment_codes.fillna(3).astype("int64")

In [None]:
# Months elapsed since the competitor opened; forced to 0 where the opening year is 0.
competition_months = 12 * (df['Year'] - df['CompetitionOpenSinceYear']) + (df['Month'] - df['CompetitionOpenSinceMonth'])
df['CompetitionOpen'] = competition_months.mask(df['CompetitionOpenSinceYear'] == 0, 0)

# Approximate months elapsed since Promo2 started (week difference divided by 4);
# forced to 0 where the Promo2 start year is 0.
promo_months = 12 * (df['Year'] - df['Promo2SinceYear']) + (df['WeekOfYear'] - df['Promo2SinceWeek']) / 4
promo_months = promo_months.mask(df['Promo2SinceYear'] == 0, 0)
# Anything that is not strictly positive is reset to 0.
df['PromoOpen'] = promo_months.where(promo_months > 0, 0)


In [None]:
# Drop the store id and the raw columns already folded into CompetitionOpen / PromoOpen.
consumed_cols = [
    'Store',
    'CompetitionOpenSinceMonth',
    'CompetitionOpenSinceYear',
    'Promo2SinceWeek',
    'Promo2SinceYear',
    'WeekOfYear',
]
df = df.drop(columns=consumed_cols)

In [None]:
# Attach the weather event flag for each (Date, State) pair; unmatched rows are dropped.
df = df.merge(weather_new, how='inner', on=["Date", "State"])

In [None]:
# Preview the frame after the weather merge.
df.head()

In [None]:
# One-hot encode the categorical columns, dropping the first level of each.
categorical_cols = ['StoreType', 'PromoInterval', 'State']
df2 = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

df2

In [None]:
def _log_or_zero(value):
    """Return the natural log of `value`, or 0 when `value` equals 0."""
    return np.log(value) if value != 0 else 0


# Add a log-transformed copy (prefixed `ln_`) of each skewed numeric column.
for source_col in ('Sales', 'Customers', 'CompetitionDistance'):
    df2['ln_' + source_col] = df2[source_col].map(_log_or_zero)

In [None]:
from sklearn.preprocessing import RobustScaler
# Scaler instance used below for the PromoOpen / CompetitionOpen columns.
roscaler= RobustScaler()

In [None]:
# Robust-scale the two "months open" features.
data = df2[['PromoOpen', 'CompetitionOpen']]
data_scaled = roscaler.fit_transform(data)
# The scaler returns a bare array; reuse the source index so the later column-wise
# concat with df2 aligns row for row even if df2 does not have a default RangeIndex.
data_final = pd.DataFrame(
    data_scaled,
    columns=['scaled_PromoOpen', 'scaled_CompetitionOpen'],
    index=data.index,
)

In [None]:
# Display the scaled PromoOpen / CompetitionOpen columns.
data_final

In [None]:
# Append the scaled columns to df2 side by side (rows are aligned on the index).
df3 = pd.concat([df2,data_final], axis=1)

In [None]:
# Print df3's column summary.
df3.info()

In [None]:
# Remove the raw columns that now have scaled or log-transformed counterparts, plus the date.
superseded_cols = [
    'PromoOpen',
    'CompetitionOpen',
    'CompetitionDistance',
    'Sales',
    'Customers',
    'Date',
]
df3 = df3.drop(columns=superseded_cols)

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize the log-transformed columns.
std = StandardScaler()
data = df3[['ln_CompetitionDistance', 'ln_Customers', 'ln_Sales']]
std_data = std.fit_transform(data)
# The scaler returns a bare array; reuse the source index so the later column-wise
# concat with df3 aligns row for row even if df3 does not have a default RangeIndex.
std_data = pd.DataFrame(std_data, columns='scaled_' + data.columns, index=data.index)
std_data.head()

In [None]:
# Join the standardized columns, then drop the unscaled logs and the scaled customer count.
redundant_cols = ['ln_Customers', 'ln_CompetitionDistance', 'ln_Sales', 'scaled_ln_Customers']
df4 = pd.concat([df3, std_data], axis=1).drop(columns=redundant_cols)
df4.tail()

In [None]:
# Keep only rows where the store was open and actually recorded sales.
# `scaled_ln_Sales` is standardized, so comparing it with 0 would test for
# "sales equal to the mean" rather than "no sales". The raw `Sales` column still
# lives in `df2`, which shares its row index with df4, so the check uses that.
raw_sales = df2.loc[df4.index, "Sales"]
df4 = df4[(df4["Open"] != 0) & (raw_sales != 0)]
df4

In [None]:
# Separate the target column from the feature matrix.
target_col = 'scaled_ln_Sales'
y = df4[target_col]
x = df4.drop(columns=[target_col])

In [None]:
# Hold out 20% of the rows for evaluation; the seed is fixed for a repeatable split.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Multiple Linear Regression

In [None]:
# Multiple linear regression: instantiate the model and fit it on the training split.
m_lr = LinearRegression().fit(x_train, y_train)
# Predict the target for the held-out test split.
y_pred = m_lr.predict(x_test)

In [None]:
# Explanatory power (R^2) on the test split.
lr_r2 = r2(y_test, y_pred)
print('m_lr_R^2: ', lr_r2)

# Predictive error (RMSE), reported because it is easier to interpret.
lr_rmse = math.sqrt(mse(y_test, y_pred))
print('m_lr_RMSE: ', lr_rmse)

In [None]:
# Print the fitted intercept and coefficients.
print(m_lr.intercept_)
print(m_lr.coef_)

In [None]:
# Lay the coefficients out as a single-row frame with one column per feature.
c1 = np.reshape(m_lr.coef_, (1, -1))
c2 = pd.DataFrame(c1, columns=x_test.columns.tolist())

In [None]:
# Bar chart with one bar per feature coefficient.
c2.T.plot(kind="bar", figsize=(20, 6))

# Ridge and Lasso models (hands-on)

In [None]:
# Ridge regression; alpha is the hyperparameter controlling regularization strength.
ridge_model = Ridge(alpha=1).fit(x_train, y_train)
y_pred = ridge_model.predict(x_test)
ridge_r2 = r2(y_test, y_pred)
ridge_rmse = math.sqrt(mse(y_test, y_pred))
print('ridge R^2: ', ridge_r2)
print('ridge RMSE: ', ridge_rmse)

In [None]:
# Lasso regression with a small regularization strength.
lasso_model = Lasso(alpha=0.01).fit(x_train, y_train)
y_pred = lasso_model.predict(x_test)
lasso_r2 = r2(y_test, y_pred)
lasso_rmse = math.sqrt(mse(y_test, y_pred))
print('Lasso R^2: ', lasso_r2)
print('Lasso RMSE: ', lasso_rmse)

In [None]:
# Lay the Lasso coefficients out as a single-row frame with one column per feature.
c1 = np.reshape(lasso_model.coef_, (1, -1))
c2 = pd.DataFrame(c1, columns=x_test.columns.tolist())

In [None]:
# Bar chart of the coefficient frame, one bar per feature.
# DataFrame.plot opens its own figure, so a separate plt.figure() call beforehand
# would only leave an empty figure behind; the size is given to the plot call instead.
c2.T.plot.bar(figsize=(20,6))

# Polynomial Features

In [None]:
# Expand the features with polynomial combinations up to degree 2.
# The transformer is fitted on the training split only and then applied to both splits.
poly = PolynomialFeatures(degree=2)
poly_x_train = poly.fit(x_train).transform(x_train)
poly_x_test = poly.transform(x_test)

In [None]:
# Linear regression on the polynomial feature expansion.
p_lr = LinearRegression().fit(poly_x_train, y_train)
y_pred = p_lr.predict(poly_x_test)

In [None]:
# Test-split R^2 and RMSE for the polynomial linear model.
p_lr_r2 = r2(y_test, y_pred)
p_lr_rmse = math.sqrt(mse(y_test, y_pred))
print('p_lr R^2: ', p_lr_r2)
print('p_lr RMSE: ', p_lr_rmse)

In [None]:
# Lasso on the polynomial feature expansion.
p_lasso = Lasso(alpha=0.1).fit(poly_x_train, y_train)
y_pred = p_lasso.predict(poly_x_test)
p_lasso_r2 = r2(y_test, y_pred)
p_lasso_rmse = math.sqrt(mse(y_test, y_pred))
print('p_lasso R^2: ', p_lasso_r2)
print('p_lasso RMSE: ', p_lasso_rmse)

In [None]:
# List the feature columns of the test split.
x_test.columns

# Decision Tree and Random Forest

In [None]:
# Decision tree regressor limited to depth 5.
# random_state is fixed because the tree permutes features at each split, so ties
# between equally good splits could otherwise be broken differently on each run.
dt_regressor = DecisionTreeRegressor(max_depth=5, random_state=42)
dt_regressor.fit(x_train, y_train)
y_pred = dt_regressor.predict(x_test)
# Test-split R^2 and RMSE.
print('dt_regressor R^2: ', r2(y_test,y_pred))
print('dt_regressor RMSE: ', math.sqrt(mse(y_test, y_pred)))

In [None]:
# Feature names, kept for labelling the importance table in the next cell.
columns=list(x_test.columns)
# Display the fitted tree's raw importance array.
dt_regressor.feature_importances_ 

In [None]:
# Single-row frame of tree importances labelled by feature name, shown transposed.
feature_importance = pd.DataFrame(
    [dt_regressor.feature_importances_],
    columns=columns,
    index=['feature_importance'],
)
feature_importance.T

In [None]:
# Bar chart of the decision tree's feature importances.
feature_importance.T.plot(kind="bar", figsize=(20, 6))

In [None]:
# Random forest regressor: 500 trees, each limited to depth 5.
# random_state makes the bootstrap and feature sampling repeatable, so the reported
# scores are reproducible. n_jobs=-1 builds the trees on all available cores and
# does not affect the fitted model.
rf_regressor = RandomForestRegressor(
    n_estimators=500,
    max_depth=5,
    random_state=42,
    n_jobs=-1,
)
rf_regressor.fit(x_train, y_train)
y_pred = rf_regressor.predict(x_test)
# Test-split R^2 and RMSE.
print('rf_regressor R^2: ', r2(y_test,y_pred))
print('rf_regressor RMSE: ', math.sqrt(mse(y_test, y_pred)))

In [None]:
# Horizontal bar chart of the ten most important random forest features, largest at the top.
feature_importances = pd.Series(rf_regressor.feature_importances_, index=x_train.columns)
top_features = feature_importances.nlargest(10).sort_values(ascending=True)
ax = top_features.plot(kind='barh')
ax.set_xlabel('importance')
ax.set_title('Feature Importance')