In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
train_df = pd.read_csv("../input/rossmann-store-sales/train.csv", low_memory=False)
store_df = pd.read_csv("../input/rossmann-store-sales/store.csv")


In [None]:
test_df = pd.read_csv("../input/rossmann-store-sales/test.csv")
submission_df = pd.read_csv("../input/rossmann-store-sales/sample_submission.csv")

In [None]:
train_df

In [None]:
store_df

In [None]:
test_df

In [None]:
submission_df

## Merge store data with train and test data according to store number

In [None]:

# Left-join the per-store attributes onto every train/test row.
# validate='many_to_one' makes pandas raise MergeError if store_df ever holds
# a duplicated Store id, instead of silently duplicating train/test rows.
merged_df = train_df.merge(store_df, how='left', on='Store', validate='many_to_one')
merged_test_df = test_df.merge(store_df, how='left', on='Store', validate='many_to_one')

In [None]:
merged_df

In [None]:
merged_test_df

## Preprocessing and Feature Engineering


In [None]:
merged_df.info()

### Convert `Date` to a datetime type and split out year, month, day, and week number

In [None]:
def split_date(df):
    """Parse ``Date`` and add calendar-part columns to ``df`` in place.

    After the call ``df['Date']`` holds datetimes and the frame has the extra
    columns ``Year``, ``Month``, ``Day`` and ``WeekOfYear_x`` (ISO week number).

    Args:
        df: DataFrame with a ``Date`` column that ``pd.to_datetime`` can parse.

    Returns:
        None; ``df`` is modified in place.
    """
    df['Date'] = pd.to_datetime(df['Date'])
    stamps = df['Date'].dt
    # Plain calendar attributes map one-to-one onto new columns.
    for column, part in (('Year', 'year'), ('Month', 'month'), ('Day', 'day')):
        df[column] = getattr(stamps, part)
    # The ISO week comes from isocalendar(), not from a direct attribute.
    df['WeekOfYear_x'] = stamps.isocalendar().week

In [None]:
split_date(merged_df)
split_date(merged_test_df)

In [None]:
merged_df

In [None]:
merged_df.info()

In [None]:
merged_test_df

### Keep only the rows where the store was open when training the model

In [None]:
merged_df.Open.value_counts()

In [None]:
# Keep only the training rows where the store was open (Open == 1); .copy()
# yields an independent frame so later column assignments do not target a
# filtered view of the original.
merged_df=merged_df[merged_df['Open']==1].copy()
# The test frame is not filtered here: its predictions are later assigned
# straight into submission_df — presumably one row per test row (TODO confirm).

In [None]:
merged_df

In [None]:
merged_test_df

## Competition
Next, we can use the `CompetitionOpenSince[Month/Year]` columns from `store_df` to compute the number of months for which a competitor has been open near the store.

In [None]:
def comp_months(df):
    """Add a ``CompetitionOpen`` column: months the competitor has been open.

    The value is the month difference between the row's ``Year``/``Month`` and
    ``CompetitionOpenSinceYear``/``CompetitionOpenSinceMonth``. Negative
    differences and missing opening dates are both stored as 0.

    Args:
        df: DataFrame containing the four columns named above.

    Returns:
        None; ``df`` is modified in place.
    """
    months_open = (12 * (df['Year'] - df['CompetitionOpenSinceYear'])
                   + (df['Month'] - df['CompetitionOpenSinceMonth']))
    # clip(lower=0) is the vectorised form of "0 if x < 0 else x". NaN passes
    # through clip unchanged and is then replaced by fillna(0).
    df['CompetitionOpen'] = months_open.clip(lower=0).fillna(0)

In [None]:
comp_months(merged_df)
comp_months(merged_test_df)

In [None]:
merged_df[['Date', 'CompetitionDistance', 'CompetitionOpenSinceYear', 'CompetitionOpenSinceMonth', 'CompetitionOpen']].sample(20)

## Additional Promotion
We can also add some additional columns to indicate how long a store has been running Promo2 and whether a new round of Promo2 starts in the current month.

In [None]:
merged_df=merged_df.rename(columns={"WeekOfYear_x":"WeekOfYear"})
merged_test_df=merged_test_df.rename(columns={"WeekOfYear_x":"WeekOfYear"})

merged_df.info()

In [None]:
# Month number -> abbreviation used to match entries of PromoInterval.
# The 'Sept' spelling is kept from the original mapping — presumably it matches
# the dataset's PromoInterval values (TODO confirm). Built once, not per row.
_PROMO_MONTH_ABBR = {1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'Jun',
                     7: 'Jul', 8: 'Aug', 9: 'Sept', 10: 'Oct', 11: 'Nov', 12: 'Dec'}


def check_promo_month(row):
    """Return 1 if the row's month is listed in its ``PromoInterval``, else 0.

    Args:
        row: Mapping-like row (pandas Series or dict) with the keys
            ``PromoInterval`` (comma-separated month abbreviations, or a
            missing value), ``Promo2Open`` and ``Month`` (1-12).

    Returns:
        1 when ``Promo2Open`` is truthy and the abbreviation for ``Month``
        appears in ``PromoInterval``; 0 otherwise, including when
        ``PromoInterval`` is missing/NaN or ``Month`` is not a known month.
    """
    interval = row.get('PromoInterval')
    # Missing intervals are NaN floats (or None) rather than strings; check the
    # type explicitly instead of letting .split() raise and swallowing it.
    if not isinstance(interval, str):
        return 0
    month_abbr = _PROMO_MONTH_ABBR.get(row.get('Month'))
    if month_abbr is None:
        return 0
    if row.get('Promo2Open') and month_abbr in interval.split(','):
        return 1
    return 0

def promo_cols(df):
    """Add ``Promo2Open`` and ``IsPromo2Month`` columns to ``df`` in place.

    ``Promo2Open`` is the approximate number of months since Promo2 started
    (years converted at 12 months each, weeks at 7/30.5 months each), floored
    at 0, with missing values as 0, and multiplied by the ``Promo2`` column.
    ``IsPromo2Month`` is ``check_promo_month`` applied row-wise, also
    multiplied by ``Promo2``.

    Args:
        df: DataFrame with ``Year``, ``WeekOfYear``, ``Month``, ``Promo2``,
            ``Promo2SinceYear``, ``Promo2SinceWeek`` and ``PromoInterval``.

    Returns:
        None; ``df`` is modified in place.
    """
    def _floor_at_zero(months):
        return 0 if months < 0 else months

    # Months since Promo2 was open
    years_running = df['Year'] - df['Promo2SinceYear']
    weeks_running = df['WeekOfYear'] - df['Promo2SinceWeek']
    df['Promo2Open'] = 12 * years_running + weeks_running * 7 / 30.5
    floored = df['Promo2Open'].map(_floor_at_zero).fillna(0)
    df['Promo2Open'] = floored * df['Promo2']

    # Whether a new round of promotions was started in the current month.
    # check_promo_month reads the Promo2Open value assigned just above.
    starts_round = df.apply(check_promo_month, axis=1)
    df['IsPromo2Month'] = starts_round * df['Promo2']

In [None]:
promo_cols(merged_df)
promo_cols(merged_test_df)

In [None]:
merged_df.columns

In [None]:
input_cols=['Store', 'DayOfWeek', 'Promo', 'StateHoliday', 'SchoolHoliday', 
              'StoreType', 'Assortment', 'CompetitionDistance', 'CompetitionOpen', 
              'Day', 'Month', 'Year', 'WeekOfYear',  'Promo2', 
              'Promo2Open', 'IsPromo2Month']
target_col=['Sales']

In [None]:
input_x=merged_df[input_cols].copy()
target_x=merged_df[target_col].copy()

In [None]:
input_test=merged_test_df[input_cols].copy()

In [None]:
input_x

## Check which columns are numeric and which are categorical

In [None]:
input_x.info()

In [None]:
numeric_cols = ['Store', 'Promo', 'SchoolHoliday', 
              'CompetitionDistance', 'CompetitionOpen', 'Promo2', 'Promo2Open', 'IsPromo2Month',
              'Day', 'Month', 'Year', 'WeekOfYear',  ]
categorical_cols = ['DayOfWeek', 'StateHoliday', 'StoreType', 'Assortment']

### Impute missing numerical data

In [None]:
input_x[numeric_cols].isna().sum()

In [None]:
input_test[numeric_cols].isna().sum()

### Only `CompetitionDistance` has nulls; we will fill them with the maximum distance, treating a missing value as a competitor that is very far away

In [None]:
max_distance=input_x['CompetitionDistance'].max()
max_distance

### Try to fill Null with max distance

In [None]:
# Assign the filled column back rather than calling fillna(inplace=True) on a
# column selection: the chained in-place form is deprecated in pandas 2.x and
# leaves the frame unchanged once Copy-on-Write is enabled.
input_x['CompetitionDistance'] = input_x['CompetitionDistance'].fillna(max_distance)
input_test['CompetitionDistance'] = input_test['CompetitionDistance'].fillna(max_distance)

### Scale Numeric Values

Let's scale numeric values to the 0 to 1 range.

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
scaler = MinMaxScaler().fit(input_x[numeric_cols])

In [None]:
input_x[numeric_cols] = scaler.transform(input_x[numeric_cols])
input_test[numeric_cols] = scaler.transform(input_test[numeric_cols])

### Encode Categorical Columns


In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# scikit-learn 1.2 renamed `sparse` to `sparse_output` (the old keyword was
# later removed); fall back to the old keyword only on older releases.
try:
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
encoder.fit(input_x[categorical_cols])

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# get_feature_names_out() and only use the old method where the new one is absent.
_feature_names = getattr(encoder, 'get_feature_names_out', None) or encoder.get_feature_names
encoded_cols = list(_feature_names(categorical_cols))

In [None]:
input_x[encoded_cols] = encoder.transform(input_x[categorical_cols])
input_test[encoded_cols] = encoder.transform(input_test[categorical_cols])

Let's put all the numeric and one-hot encoded columns into a single data frame.

In [None]:
X = input_x[numeric_cols + encoded_cols]
X_test = input_test[numeric_cols + encoded_cols]

In [None]:
X_test

# Training Model
## Gradient Boosting


In [None]:
from xgboost import XGBRegressor

In [None]:
model = XGBRegressor(random_state=42, n_jobs=-1, n_estimators=1500, max_depth=5)

In [None]:
%%time
model.fit(X, target_x)

### Prediction


In [None]:
pred=model.predict(X)
pred

In [None]:
test_preds = model.predict(X_test)
test_preds

In [None]:
submission_df['Sales']  = test_preds

In [None]:
submission_df.to_csv('submission.csv', index=None)

In [None]:
model

### Evaluation

Let's evaluate the predictions using RMSE error.

In [None]:
from sklearn.metrics import mean_squared_error

def rmse(a, b):
    """Root-mean-squared error between two aligned collections of values.

    Args:
        a, b: Array-likes accepted by ``sklearn.metrics.mean_squared_error``.

    Returns:
        The RMSE as a float; for multi-column inputs, the mean of the
        per-column RMSEs (the same result ``squared=False`` used to give).
    """
    # `squared=False` was deprecated in scikit-learn 1.4 and then removed.
    # Taking the root of the per-output MSEs and averaging reproduces it on
    # every version.
    per_output_mse = mean_squared_error(a, b, multioutput='raw_values')
    return np.mean(np.sqrt(per_output_mse))

In [None]:
rmse(pred, target_x)

### Feature importance

Just like decision trees and random forests, XGBoost also provides a feature importance score for each column in the input.

In [None]:
importance_df = pd.DataFrame({
    'feature': X.columns,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

In [None]:
importance_df.head()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(10,6))
plt.title('Feature Importance')
sns.barplot(data=importance_df.head(10), x='importance', y='feature');

# Enhancement Model By Hyperparameter Tuning and Regularization

In [None]:
from sklearn.model_selection import KFold

Let's define a helper function `train_and_evaluate` which trains a model with the given parameters and returns the trained model, the training error and the validation error.

In [None]:
def train_and_evaluate(X_train, train_targets, X_val, val_targets, **params):
    """Fit an XGBRegressor on the training split and score it on both splits.

    Args:
        X_train, train_targets: Features and targets used for fitting.
        X_val, val_targets: Features and targets used only for scoring.
        **params: Extra keyword arguments forwarded to ``XGBRegressor``
            (``random_state=42`` and ``n_jobs=-1`` are always set).

    Returns:
        Tuple ``(model, train_rmse, val_rmse)``.
    """
    regressor = XGBRegressor(random_state=42, n_jobs=-1, **params)
    regressor.fit(X_train, train_targets)
    # Score in the same order as before: training split first, then validation.
    splits = ((X_train, train_targets), (X_val, val_targets))
    fit_error, holdout_error = [rmse(regressor.predict(features), truth)
                                for features, truth in splits]
    return regressor, fit_error, holdout_error

In [None]:
kfold = KFold(n_splits=5)

In [None]:
models = []

# Fold-local names (fold_*) keep this loop from rebinding the notebook-level
# `model` (the fitted full model displayed in later cells) and, on re-runs,
# the X_train/X_val hold-out split that test_params reads.
for train_idxs, val_idxs in kfold.split(X):
    fold_model, fold_train_rmse, fold_val_rmse = train_and_evaluate(
        X.iloc[train_idxs],
        target_x.iloc[train_idxs],
        X.iloc[val_idxs],
        target_x.iloc[val_idxs],
        max_depth=5,
        n_estimators=15)
    models.append(fold_model)
    print('Train RMSE: {}, Validation RMSE: {}'.format(fold_train_rmse, fold_val_rmse))

Here's a helper function to test hyperparameters with K-fold cross validation.

In [None]:
def test_params_kfold(n_splits, **params):
    """Cross-validate one hyperparameter setting and print the mean RMSEs.

    Splits the notebook-level ``X`` / ``target_x`` into ``n_splits`` K-folds,
    trains one model per fold via ``train_and_evaluate`` and prints the mean
    training and validation RMSE across folds.

    Args:
        n_splits: Number of folds passed to ``KFold``.
        **params: Keyword arguments forwarded to ``train_and_evaluate``.

    Returns:
        List with the fitted model of every fold, in fold order.
    """
    train_rmses, val_rmses, models = [], [], []
    kfold = KFold(n_splits)
    for train_idxs, val_idxs in kfold.split(X):
        # The target frame in this notebook is `target_x`; the name `targets`
        # used here before was never defined and raised NameError.
        X_train, train_targets = X.iloc[train_idxs], target_x.iloc[train_idxs]
        X_val, val_targets = X.iloc[val_idxs], target_x.iloc[val_idxs]
        model, train_rmse, val_rmse = train_and_evaluate(X_train, train_targets, X_val, val_targets, **params)
        models.append(model)
        train_rmses.append(train_rmse)
        val_rmses.append(val_rmse)
    print('Train RMSE: {}, Validation RMSE: {}'.format(np.mean(train_rmses), np.mean(val_rmses)))
    return models

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Fix the seed so the 90/10 hold-out split, and therefore every RMSE printed by
# the test_params(...) cells below, is reproducible across kernel restarts.
X_train, X_val, train_targets, val_targets = train_test_split(
    X, target_x, test_size=0.1, random_state=42)

In [None]:
def test_params(**params):
    """Train on the notebook's hold-out split and print train/validation RMSE.

    Args:
        **params: Keyword arguments forwarded to ``XGBRegressor`` (on top of
            ``n_jobs=-1`` and ``random_state=42``).

    Returns:
        None; the two RMSE values are printed.
    """
    # train_and_evaluate builds the same XGBRegressor(random_state=42,
    # n_jobs=-1, **params), fits it and scores both splits.
    _, fit_error, holdout_error = train_and_evaluate(
        X_train, train_targets, X_val, val_targets, **params)
    print('Train RMSE: {}, Validation RMSE: {}'.format(fit_error, holdout_error))

In [None]:
?XGBRegressor

In [None]:
model

In [None]:
test_params(max_depth=12)

In [None]:
test_params(max_depth=8)

In [None]:
test_params(max_depth=4)

In [None]:
test_params(max_depth=14)

In [None]:
test_params(max_depth=13)

In [None]:
test_params(max_depth=5)

In [None]:
test_params(learning_rate=0.4)

In [None]:
test_params(learning_rate=0.8)

In [None]:
test_params(learning_rate=0.2)

In [None]:
test_params(learning_rate=0.99)

In [None]:
test_params (n_estimators=15)

In [None]:
test_params(n_estimators=10)

In [None]:
test_params (n_estimators=505)

In [None]:
test_params (n_estimators=1000)

In [None]:
test_params (n_estimators=1500)

In [None]:
model

In [None]:
model = XGBRegressor(n_jobs=-1, random_state=42, n_estimators=1500, 
                     learning_rate=0.99, max_depth=5, subsample=0.9, 
                     colsample_bytree=0.7)

In [None]:
%%time
model.fit(X, target_x)

In [None]:
pred=model.predict(X)

In [None]:
rmse(pred,target_x)

### Predict on the test set after fitting the model with the chosen parameters

In [None]:
test_preds = model.predict(X_test)
test_preds

## Let's add the predictions into `submission_df`.

In [None]:
submission_df['Sales']  = test_preds

In [None]:
submission_df

In [None]:
##Save as csv
submission_df.to_csv('submission.csv', index=None)

In [None]:
model