Import Libraries

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import matplotlib.pyplot as plt # plotting (plt is used by the charts below)
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        input_path = os.path.join(root_dir, file_name)
        print(input_path)

# Any results you write to the current directory are saved as output.

Import Data

In [None]:
# StateHoliday holds both digits and letters; reading it as text keeps the
# column a single type instead of a str/int mix (the encoding step further
# down maps both '0' and 0, which is what a mixed-type parse produces).
train = pd.read_csv('/kaggle/input/rossmann-store-sales/train.csv', dtype={'StateHoliday': str})
store = pd.read_csv('/kaggle/input/rossmann-store-sales/store.csv')
test = pd.read_csv('/kaggle/input/rossmann-store-sales/test.csv', dtype={'StateHoliday': str})

Number of rows and columns

In [None]:
# (rows, columns) of each raw table, in the order train, test, store.
for frame in (train, test, store):
    print(frame.shape)

---

In [None]:
# Preview the first rows of the training table.
train.head()

In [None]:
# Preview the first rows of the per-store metadata table.
store.head()

In [None]:
# Preview the first rows of the test table.
test.head()

In [None]:
# Column dtypes of the training table as parsed by read_csv.
train.dtypes

The Date column has the object dtype, so we need to convert it to datetime.

In [None]:
# Column dtypes of the store metadata table as parsed by read_csv.
store.dtypes

In [None]:
# Summary (count / unique / top / freq) of the object-typed training columns only.
train.describe(include='object')

In [None]:
# Summary statistics restricted to the Sales and Customers columns.
train.describe()[['Sales','Customers']]

The average number of customers per store per day is about 633 <br>
The average sales per store per day is about 5,774 units

In [None]:
# Number of distinct store ids present in the training table.
train.Store.nunique()

There are 1115 distinct stores in the training data

In [None]:
# Record count per DayOfWeek value, ordered from the least to the most frequent.
train.DayOfWeek.value_counts().sort_values()

In [None]:
# Frequency of each Open flag value and each Promo flag value in the training data.
open_counts = train['Open'].value_counts()
promo_counts = train['Promo'].value_counts()
print(open_counts, '\n', promo_counts)

---

Missing Values

In [None]:
# Per-column NaN counts for train, store and test, separated by a dashed rule.
separator = '-'*20
print(train.isna().sum())
for frame in (store, test):
    print(separator)
    print(frame.isna().sum())

---

Exploring a particular Store

#### Store 1 Analysis

In [None]:
# Take an independent copy of the Store 1 rows: columns are added to store1
# in a later cell, and assigning into a filtered slice of `train` triggers
# pandas' SettingWithCopyWarning.
store1 = train[train['Store']==1].copy()
store1.head()

In [None]:
# (rows, columns) of the Store 1 subset.
print(store1.shape)

In [None]:
# Build new frames with assign() rather than writing columns into store1,
# which was created by filtering `train`; this avoids SettingWithCopyWarning
# and makes the cell safe to re-run.
store1 = store1.assign(Date=pd.to_datetime(store1['Date']))

# First and last date covered by the Store 1 records.
print(store1['Date'].min())
print(store1['Date'].max())

store1 = store1.assign(
    Year=store1['Date'].dt.year,
    Month=store1['Date'].dt.month,
)

The data for Store 1 is available from 2013-01-01 to 2015-07-31

In [None]:
# Total Store 1 sales per calendar day, drawn as a wide line chart.
daily_sales = store1.resample('1D', on='Date')['Sales'].sum()
daily_sales.plot.line(figsize=(14,4))
plt.show()

The gaps in the above plot show that there are missing records for those dates

In [None]:
import seaborn as sns
# Distribution of Store 1 daily sales, using 10 histogram bins.
# NOTE(review): distplot appears to be deprecated in newer seaborn releases
# (histplot/displot are the replacements) — confirm against the installed version.
sns.distplot(store1.Sales , bins=10)
plt.show()

In [None]:
# Distribution of daily sales over every record in the training table.
# NOTE(review): distplot appears to be deprecated in newer seaborn releases — confirm.
sns.distplot(train.Sales)
plt.show()

Sales are 0 for many records, which is probably because the stores were closed on those days

---

### Treating Missing Values

In [None]:
# NaN count per column of the store table, before any filling.
store.isna().sum()

In [None]:
# Metadata row for store id 1, transposed so each field is shown on its own line.
store[store['Store']==1].T

In [None]:
# First store row whose Promo2 flag is not 0 — presumably to inspect what the
# Promo2* columns look like when they are populated.
store[~(store['Promo2']==0)].iloc[0]

Fill Promo2SinceWeek with 0 <br>
Fill Promo2SinceYear and PromoInterval with their mode <br>
Fill CompetitionDistance with its maximum value, which treats stores with unknown competition as having a far-away competitor <br>
Fill CompetitionOpenSinceMonth and CompetitionOpenSinceYear with their mode

In [None]:
# Replacement value for each store column that contains NaN:
#   0 for Promo2SinceWeek, the column maximum for CompetitionDistance,
#   and the most frequent value (mode) for the remaining columns.
store_fill_values = {
    'Promo2SinceWeek': 0,
    'Promo2SinceYear': store['Promo2SinceYear'].mode().iloc[0],
    'PromoInterval': store['PromoInterval'].mode().iloc[0],
    'CompetitionDistance': store['CompetitionDistance'].max(),
    'CompetitionOpenSinceMonth': store['CompetitionOpenSinceMonth'].mode().iloc[0],
    'CompetitionOpenSinceYear': store['CompetitionOpenSinceYear'].mode().iloc[0],
}
store = store.fillna(value=store_fill_values)

In [None]:
# Re-check the NaN count per store column after the fills above.
store.isna().sum()

---

### Merging Data

In [None]:
# Attach the store metadata to every training record (left join on Store),
# then show the shapes of both inputs and of the merged frame.
df = pd.merge(train, store, how='left', on='Store')
for frame in (train, store, df):
    print(frame.shape)

In [None]:
# First three rows of the merged train + store frame.
df.head(3)

In [None]:
# NaN count per column of the merged frame.
df.isna().sum()

---

### Encoding

In [None]:
# Parse the Date column into pandas datetimes so the .dt accessors can be used on it.
df['Date'] = pd.to_datetime(df['Date'])

In [None]:
# Split the parsed Date into separate Day / Month / Year columns.
date_parts = df['Date'].dt
df['Day'] = date_parts.day
df['Month'] = date_parts.month
df['Year'] = date_parts.year

In [None]:
# Column dtypes of the merged frame after the date handling above.
df.dtypes

### Categorical Cols

In [None]:
# Object-typed columns of the merged frame; print each column's name and its
# value frequencies, followed by a dashed rule.
cat_cols = df.select_dtypes(include=['object']).columns

for col_name in cat_cols:
    print(col_name, df[col_name].value_counts(), '-'*20, sep='\n')

In [None]:
# Integer code for each StateHoliday value; both the text '0' and the number 0
# map to code 0.
state_holiday_codes = {'0':0 , 0:0 , 'a':1 , 'b':2 , 'c':3}
df['StateHoliday'] = df['StateHoliday'].map(state_holiday_codes).astype(int)

In [None]:
# Integer code for each StoreType letter.
store_type_codes = {'a':1 , 'b':2 , 'c':3 , 'd':4}
df['StoreType'] = df['StoreType'].map(store_type_codes).astype(int)

In [None]:
# Integer code for each Assortment letter.
assortment_codes = {'a':1 , 'b':2 , 'c':3}
df['Assortment'] = df['Assortment'].map(assortment_codes).astype(int)

In [None]:
# Integer code for each of the three PromoInterval month lists.
promo_interval_codes = {'Jan,Apr,Jul,Oct':1 , 'Feb,May,Aug,Nov':2 , 'Mar,Jun,Sept,Dec':3}
df['PromoInterval'] = df['PromoInterval'].map(promo_interval_codes).astype(int)

In [None]:
 # Re-check the column dtypes after the categorical encoding above.
 df.dtypes

---

### Train & Validate Split

Applying Log Transformation of the Target Variable

In [None]:
from sklearn.model_selection import train_test_split

# Features: everything except the target (Sales), the raw Date (already split
# into Day/Month/Year) and Customers. `columns=` is used because passing the
# axis positionally (`df.drop(cols, 1)`) is rejected by pandas 2.x.
X = df.drop(columns=['Sales','Date','Customers'])

# Transform Target Variable: log(Sales + 1), so that zero sales map to 0.
y = np.log(df['Sales']+1)

# 70 / 30 random split with a fixed seed.
X_train , X_val , y_train , y_val = train_test_split(X , y , test_size=0.30 , random_state = 1 )

X_train.shape , X_val.shape , y_train.shape , y_val.shape

---

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Fix random_state so that repeated runs build the same tree (scikit-learn
# permutes the candidate features at each split).
dt = DecisionTreeRegressor(max_depth=11, random_state=1)
dt.fit(X_train , y_train)

# Predictions are on the log(Sales + 1) scale used for y_train.
y_pred_dt = dt.predict(X_val)

Reverse the Transformation

In [None]:
# Undo the log(x + 1) transform so the metrics below are computed on actual sales.
# NOTE(review): both names are overwritten in place, so re-running this cell
# applies exp() a second time; later cells use y_val on the original sales scale.
y_pred_dt = np.exp(y_pred_dt)-1
y_val = np.exp(y_val)-1

In [None]:
from sklearn.metrics import r2_score , mean_squared_error

# R^2 and RMSE of the decision tree on the validation set.
dt_r2 = r2_score(y_val, y_pred_dt)
dt_rmse = np.sqrt(mean_squared_error(y_val, y_pred_dt))
print(dt_r2)
print(dt_rmse)

RMSPE - Root Mean Square Percentage Error

In [None]:
def ToWeight(y):
    """Return per-element weights 1 / y**2, with weight 0 wherever y is 0.

    The result is a float numpy array with the same shape as ``y``.
    """
    nonzero = y != 0
    weights = np.zeros(y.shape, dtype=float)
    # Only the non-zero entries get a weight; the rest stay at 0.
    weights[nonzero] = 1. / (y[nonzero] ** 2)
    return weights

def rmspe(y, yhat):
    """Root Mean Square Percentage Error between actuals ``y`` and predictions ``yhat``.

    Records whose actual value is 0 are excluded: a percentage error is
    undefined for them. They are left out of the mean entirely, rather than
    being counted as zero-error rows, which would deflate the score.

    Parameters
    ----------
    y : array-like of actual values.
    yhat : array-like of predictions, same length and order as ``y``.

    Returns
    -------
    float
        sqrt(mean(((y - yhat) / y) ** 2)) over the records with y != 0,
        or 0.0 when there is no such record.
    """
    y = np.asarray(y, dtype=float)
    yhat = np.asarray(yhat, dtype=float)

    nonzero = y != 0
    if not nonzero.any():
        return 0.0

    pct_error = (y[nonzero] - yhat[nonzero]) / y[nonzero]
    return np.sqrt(np.mean(pct_error ** 2))

ROOT MEAN SQUARE PERCENTAGE ERROR

In [None]:
# RMSPE of the decision tree on the validation set (both arguments on the sales scale).
rmspe(y_val,y_pred_dt)

---

### Hyperparameter Tuning

#### Customized Metric 

In [None]:
def get_rmspe_score(model, input_values, y_actual):
    """RMSPE of ``model`` on ``input_values`` when the target is log(x + 1)-transformed.

    Both the model's predictions and ``y_actual`` are mapped back with
    exp(v) - 1 before the error is computed, so the score is on the original
    scale. Lower is better.
    """
    sales_actual = np.exp(y_actual) - 1
    sales_predicted = np.exp(model.predict(input_values)) - 1
    return rmspe(sales_actual, sales_predicted)


In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate tree depths: 5 to 24 inclusive.
params = {
    'max_depth' : list(range(5,25))
}

# Seed both the tree and the search so that the sampled depths and the
# resulting scores are the same on every run.
base  = DecisionTreeRegressor(random_state=1)

# No `scoring` is given, so candidates are ranked by the estimator's own
# score method; train scores are kept for the over/underfitting plot below.
model_tuned = RandomizedSearchCV(
    base , params , return_train_score=True , random_state=1
).fit(X_train , y_train)

In [None]:
# Cross-validation results as a table, best mean test score first.
model_cv_results = pd.DataFrame(model_tuned.cv_results_)
model_cv_results = model_cv_results.sort_values(by='mean_test_score', ascending=False)
model_cv_results

In [None]:
# model_cv_results is ordered by score, not by depth. A line plot needs the
# x-axis in increasing max_depth order, otherwise the line jumps back and
# forth between depths.
scores_by_depth = (
    model_cv_results
    .astype({'param_max_depth': int})
    .set_index('param_max_depth')
    .sort_index()
)

# Green: mean validation score; red: mean training score.
scores_by_depth['mean_test_score'].plot(color='g', legend=True)
scores_by_depth['mean_train_score'].plot(color='r', legend=True)
plt.grid(True)
plt.show()

We can see that the model underfits when max_depth is < 10 and overfits when max_depth is > 12<br>
So we can choose max_depth as 11

---

### XGBOOST

In [None]:
import xgboost as xgb

In [None]:
# Training matrix: features plus the log(Sales + 1) target.
dtrain = xgb.DMatrix(X_train, label=y_train)

# The validation matrix is only used for predict(), so no label is attached.
# y_val has already been converted back to the sales scale at this point and
# would not match the log-scale training labels.
dvalidate = xgb.DMatrix(X_val[X_train.columns])

params = {
    'eta' : 1,
    'max_depth' : 5,
    # The key was misspelled 'objecive', so the setting was never picked up;
    # 'reg:squarederror' is the current name of the former 'reg:linear'.
    'objective' : 'reg:squarederror'
}

# Five boosting rounds.
model_xg = xgb.train(params, dtrain , num_boost_round=5)

y_pred_xg = model_xg.predict(dvalidate)

# Back from log(x + 1) to the sales scale before scoring.
y_pred_xg = np.exp(y_pred_xg)-1


rmspe(y_val , y_pred_xg)

---


Feature Importance

In [None]:
# Horizontal bar chart of the fitted decision tree's importance for each feature.
feature_names = X_train.columns
importances = dt.feature_importances_
plt.barh(feature_names, importances)
plt.show()

---

### Process Test Data

In [None]:
# (rows, columns) of the raw test table.
test.shape

In [None]:
# Preview the raw test table before it is processed.
test.head()

In [None]:
# Mean number of customers per store over the training data, cast to int,
# with Store turned back into a regular column.
mean_customers = train.groupby(['Store'])[['Customers']].mean()
test_cust = mean_customers.reset_index().astype(int)

In [None]:
# Add each store's average customer count to the test records (left join on Store).
test_1 = pd.merge(test, test_cust, how='left', on='Store')
test_1.head()

In [None]:
# Attach the store metadata to every test record (left join on Store).
test_m = test_1.merge(store , on='Store' , how='left')

In [None]:
# (rows, columns) of the merged test frame.
test_m.shape

In [None]:
# Treat records with a missing Open flag as open (1). The result is assigned
# back instead of calling fillna(inplace=True) on the selected column, which
# is not guaranteed to update test_m under pandas' copy-on-write behaviour.
test_m['Open'] = test_m['Open'].fillna(1)

# Same date handling as for the training frame: parse, then split into parts.
test_m['Date'] = pd.to_datetime(test_m['Date'])

test_m['Day'] = test_m['Date'].dt.day
test_m['Month'] = test_m['Date'].dt.month
test_m['Year'] = test_m['Date'].dt.year

# `columns=` instead of a positional axis argument, which pandas 2.x rejects.
test_m = test_m.drop(columns='Date')

In [None]:
# Object-typed columns of the merged test frame; print each column's name and
# its value frequencies, followed by a dashed rule.
cat_cols = test_m.select_dtypes(include=['object']).columns

for col_name in cat_cols:
    print(col_name, test_m[col_name].value_counts(), '-'*20, sep='\n')

In [None]:
# Encode the test frame with the same code tables as the training frame.
# The StateHoliday table covers every value handled for training ('0', 0,
# 'a', 'b', 'c'): any value missing from the table would become NaN and make
# astype(int) fail.
test_m['StateHoliday'] = test_m['StateHoliday'].map({'0':0 , 0:0 , 'a':1 , 'b':2 , 'c':3})
test_m['StateHoliday'] = test_m['StateHoliday'].astype(int)

test_m['StoreType'] = test_m['StoreType'].map({'a':1 , 'b':2 , 'c':3 , 'd':4})
test_m['StoreType'] = test_m['StoreType'].astype(int)

test_m['Assortment'] = test_m['Assortment'].map({'a':1 , 'b':2 , 'c':3})
test_m['Assortment'] = test_m['Assortment'].astype(int)

test_m['PromoInterval'] = test_m['PromoInterval'].map({'Jan,Apr,Jul,Oct':1 , 'Feb,May,Aug,Nov':2 , 'Mar,Jun,Sept,Dec':3})
test_m['PromoInterval'] = test_m['PromoInterval'].astype(int)

In [None]:
# Column dtypes of the processed test frame.
test_m.dtypes

In [None]:
# Column dtypes of the training features, for comparison with test_m above.
X_train.dtypes

In [None]:
# NaN count per column of the processed test frame.
test_m.isna().sum()

### Prediction

In [None]:
# Select the test columns in the same order as the training features, predict
# with the decision tree, then map the log(x + 1) predictions back to sales.
test_features = test_m[X_train.columns]
test_pred = dt.predict(test_features)
test_pred_inv = np.exp(test_pred)-1

In [None]:
# Predicted sales for the test records, on the original scale.
test_pred_inv

### Submission

In [None]:
# One row per test record: the record's own Id from the test file and the
# predicted sales. The Id is kept as read from the data rather than being
# regenerated from the row position, which is only correct when the rows
# happen to be in Id order starting at 1.
submission = pd.DataFrame({'Id' : test_m['Id'] , 'Sales' : test_pred_inv})

# Sales are written as whole numbers (the cast truncates the fraction).
submission['Sales'] = submission['Sales'].astype(int)
submission.head()

In [None]:
# (rows, columns) of the submission table.
submission.shape

In [None]:
# Display the submission table.
submission

In [None]:
# Write the submission to the working directory without the index column.
# NOTE(review): the file name is spelled 'sumbission.csv' — confirm whether
# 'submission.csv' was intended before relying on the output path.
submission.to_csv('sumbission.csv',index=False)