# Rossmann Store Sales Prediction
# By Mohamed Eltayeb

# Import libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

from sklearn.preprocessing import LabelEncoder
from pandas.plotting import scatter_matrix


from lightgbm import LGBMRegressor
from sklearn.compose import TransformedTargetRegressor

import warnings
# Notebook-wide warning and display configuration.
warnings.simplefilter('ignore', FutureWarning)  # hide FutureWarning messages
pd.options.display.float_format = lambda value: '%.3f' % value  # floats shown with 3 decimals
pd.options.display.max_columns = None  # never truncate the column list
pd.options.mode.chained_assignment = None  # default='warn'
plt.rcParams.update({"figure.figsize": (12, 8)})  # default figure size

In [None]:
#The Evaluation Metric
def RMSPE(y_true, y_pred):
    '''
    Compute the Root Mean Square Percentage Error between two arrays.

    Entries whose true value is 0 are left out of the score, because the
    percentage error is undefined there (dividing by them would turn the
    whole result into inf/NaN).

    Parameters
    ----------
    y_true : array-like
        Ground-truth values (list, ndarray or pandas Series).
    y_pred : array-like
        Predicted values, broadcastable against ``y_true``. The two inputs
        are compared by position.

    Returns
    -------
    The RMSPE reduced along axis 0 (a scalar for 1-D inputs). The result is
    NaN when no entry has a non-zero true value.
    '''
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # Mask of the entries that take part in the score.
    scored = np.broadcast_to(y_true != 0, np.broadcast(y_true, y_pred).shape)

    # The division is still evaluated on the masked entries, so silence the
    # divide-by-zero warnings; those values are replaced by 0 right after.
    with np.errstate(divide='ignore', invalid='ignore'):
        pct_error = (y_true - y_pred) / y_true
    squared = np.where(scored, np.square(pct_error), 0.0)

    # Mean over the scored entries only (0/0 -> NaN when nothing is scored).
    with np.errstate(divide='ignore', invalid='ignore'):
        loss = np.sqrt(squared.sum(axis=0) / scored.sum(axis=0))

    return loss

In [None]:
#Plot the LGBM Features Importances
def plotImp(model, X , num = 20, fig_size = (40, 20)):
    """Plot, save and show the ``num`` largest feature importances of ``model``.

    model    : fitted estimator exposing ``feature_importances_``.
    X        : DataFrame whose columns name the features, in model order.
    num      : number of top-ranked features to draw.
    fig_size : matplotlib figure size.

    The chart is written to 'lgbm_importances-01.png' and then displayed.
    Note that ``sns.set(font_scale=5)`` changes the seaborn settings globally.
    """
    importance_table = pd.DataFrame({'Value': model.feature_importances_,
                                     'Feature': X.columns})
    top_features = importance_table.sort_values(by="Value", ascending=False).iloc[:num]

    plt.figure(figsize=fig_size)
    sns.set(font_scale = 5)
    sns.barplot(x="Value", y="Feature", data=top_features)
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('lgbm_importances-01.png')
    plt.show()

# Read the training and testing data


In [None]:
# Load the three competition files: per-store metadata, the test rows and the
# training rows. 'StateHoliday' is forced to object dtype in the training file.
store_df = pd.read_csv("../input/rossmann-store-sales/store.csv")
test_df = pd.read_csv("../input/rossmann-store-sales/test.csv")
train_df = pd.read_csv(
    "../input/rossmann-store-sales/train.csv",
    dtype={'StateHoliday': object},
)

In [None]:
# Attach the store metadata to every train/test row (left join on 'Store').
train_df = train_df.merge(store_df, on='Store', how='left')
test_df = test_df.merge(store_df, on='Store', how='left')

# Convert the Tabular Data to Time Series Data

In [None]:
# Keep the submission ids aside (in the file's original row order) and remove
# them from the feature frame.
ID = test_df['Id']
test_df.drop('Id',inplace=True,axis=1)

# Order every store's rows chronologically.
train_df.sort_values(["Store","Date"], ignore_index=True, inplace=True)
test_df.sort_values(["Store","Date"], ignore_index=True, inplace=True)

# Derive calendar features from 'Date', then move 'Date' into the index.
for dataset in (train_df,test_df):
    dataset['Date'] = pd.to_datetime(dataset['Date'])
    dataset['Day'] = dataset.Date.dt.day
    dataset['Month'] = dataset.Date.dt.month
    dataset['Year'] = dataset.Date.dt.year
    dataset['DayOfYear'] = dataset.Date.dt.dayofyear
    # `Series.dt.weekofyear` is deprecated and no longer exists in pandas 2.x;
    # `isocalendar().week` is its replacement. It returns a nullable UInt32
    # column, so cast back to a plain int64 like the other calendar features.
    dataset['WeekOfYear'] = dataset.Date.dt.isocalendar().week.astype('int64')
    dataset.set_index('Date', inplace=True)

# Exploratory data analysis

# Features:

* Id - an Id that represents a (Store, Date) duple within the test set
* Store - a unique Id for each store
* Sales - the turnover for any given day (this is what you are predicting)
* Customers - the number of customers on a given day
* Open - an indicator for whether the store was open: 0 = closed, 1 = open
* StateHoliday - indicates a state holiday. Normally all stores, with few exceptions, are closed on state holidays. Note that all schools are closed on public holidays and weekends. a = public holiday, b = Easter holiday, c = Christmas, 0 = None
* SchoolHoliday - indicates if the (Store, Date) was affected by the closure of public schools
* StoreType - differentiates between 4 different store models: a, b, c, d
* Assortment - describes an assortment level: a = basic, b = extra, c = extended
* CompetitionDistance - distance in meters to the nearest competitor store
* CompetitionOpenSince[Month/Year] - gives the approximate year and month of the time the nearest competitor was opened
* Promo - indicates whether a store is running a promo on that day
* Promo2 - Promo2 is a continuing and consecutive promotion for some stores: 0 = store is not participating, 1 = store is participating
* Promo2Since[Year/Week] - describes the year and calendar week when the store started participating in Promo2
* PromoInterval - describes the consecutive intervals Promo2 is started, naming the months the promotion is started anew. E.g. "Feb,May,Aug,Nov" means each round starts in February, May, August, November of any given year for that store

In [None]:
# (rows, columns) of the training frame after the store merge and date features.
train_df.shape

In [None]:
# (rows, columns) of the test frame after the store merge and date features.
test_df.shape

In [None]:
# Column dtypes and non-null counts of the training frame.
train_df.info()

In [None]:
# Column dtypes and non-null counts of the test frame.
test_df.info()

In [None]:
# First rows of the training frame.
train_df.head()

In [None]:
# First rows of the test frame.
test_df.head()

In [None]:
# Summary statistics of the numeric training columns.
train_df.describe()

In [None]:
# Summary statistics of the numeric test columns.
test_df.describe()

# Plots

In [None]:
# Histograms of every int64 / float64 / int32 column of the training frame
numeric_train = train_df.select_dtypes(include=['int64', 'float64', 'int32'])
num_feats = numeric_train.columns.tolist()
train_df[num_feats].hist(figsize=(20,15));

In [None]:
# Histograms of every int64 / float64 / int32 column of the test frame
numeric_test = test_df.select_dtypes(include=['int64', 'float64', 'int32'])
num_feats = numeric_test.columns.tolist()
test_df[num_feats].hist(figsize=(20,15));

#### We can notice several things. First, the test set does not have the 'Customers' feature, so we cannot use it directly for prediction, but we may still be able to use it to cluster the stores.
#### Second, each feature's distribution in the training set closely matches its distribution in the test set, except for 'SchoolHoliday', which has a much larger share of 1.0 values in the test set.

In [None]:
# Scatter of daily sales against the store id
plt.figure(figsize=(12, 8))
plt.scatter(x=train_df['Store'], y=train_df['Sales'], alpha=0.1)
plt.plot()

#### We can see that only a few stores have sales above 20000, and there are quite a lot of zero-sales days. These probably correspond to days when the stores were closed. Removing these points later should improve the forecasts.

In [None]:
# Pairwise scatter matrix of Sales and Customers
attributes = ["Sales", "Customers"]
sales_vs_customers = train_df[attributes]
scatter_matrix(sales_vs_customers, alpha=0.1);

#### Obviously, this feature is highly correlated to the target and may be a powerful predictor, but it is not in the test set, so we cannot use it except in making clusters or aggregations. 

In [None]:
# Pairwise scatter matrix of Sales and CompetitionDistance
attributes = ["Sales", "CompetitionDistance"]
sales_vs_distance = train_df[attributes]
scatter_matrix(sales_vs_distance, alpha=0.1);

#### It looks like the farther a store is from its nearest competitor, the lower its sales. This may be because people tend to shop where several similar stores sit next to each other, so that they can get the best prices.
#### Also, several similar stores located close to each other may indicate a large market area within the city, whereas isolated stores may just be small retail shops.


In [None]:
# Overlaid Sales histograms: open store-days (green) vs. closed ones (red)
sns.set()
for flag, colour, tag in ((1, 'green', 'YES'), (0, 'red', 'NO')):
    subset_sales = train_df.loc[train_df['Open'] == flag, 'Sales']
    plt.hist(subset_sales, color=colour, alpha=0.3, label=tag)
plt.legend()
plt.plot()

#### Nothing is strange here. No red bars appear, which means that a store has no sales when it is closed, which is reasonable.
#### We can use this piece of information to fill in some of the test set predictions manually.

In [None]:
# Overlaid Sales histograms: promo days (green) vs. non-promo days (red)
sns.set()
for flag, colour, tag in ((1, 'green', 'YES'), (0, 'red', 'NO')):
    subset_sales = train_df.loc[train_df['Promo'] == flag, 'Sales']
    plt.hist(subset_sales, color=colour, alpha=0.3, label=tag)
plt.legend()
plt.plot()

In [None]:
# Overlaid Sales histograms: Promo2 participants (green) vs. the rest (red)
sns.set()
for flag, colour, tag in ((1, 'green', 'YES'), (0, 'red', 'NO')):
    subset_sales = train_df.loc[train_df['Promo2'] == flag, 'Sales']
    plt.hist(subset_sales, color=colour, alpha=0.3, label=tag)
plt.legend()
plt.plot()

#### It looks like stores with continuing and consecutive promotions (Promo2) do not get significantly better sales than stores with individual promotions. In fact, the opposite appears to be true.

In [None]:
# Scatter of daily sales against the day of the week
plt.figure(figsize=(12, 8))
plt.scatter(x=train_df["DayOfWeek"], y=train_df["Sales"])
plt.plot()

In [None]:
# Scatter of daily sales against the calendar month
plt.figure(figsize=(12, 8))
plt.scatter(x=train_df["Month"], y=train_df["Sales"])
plt.plot()

#### High sales in April, May, June and December
#### Low sales in January and September

In [None]:
# Overlaid Sales histograms, one per store type (label = store type letter)
sns.set()
store_type_colours = (('a', 'green'), ('b', 'red'), ('c', 'yellow'), ('d', 'blue'))
for store_type, colour in store_type_colours:
    subset_sales = train_df.loc[train_df['StoreType'] == store_type, 'Sales']
    plt.hist(subset_sales, color=colour, alpha=0.3, label=store_type)
plt.legend()
plt.plot()

#### Stores of type 'b' appear to have higher mean sales than the others.

In [None]:
# Overlaid Sales histograms, one per assortment level (label = level letter)
sns.set()
assortment_colours = (('a', 'green'), ('b', 'red'), ('c', 'yellow'))
for level, colour in assortment_colours:
    subset_sales = train_df.loc[train_df['Assortment'] == level, 'Sales']
    plt.hist(subset_sales, color=colour, alpha=0.3, label=level)
plt.legend()
plt.plot()

#### Stores with assortment 'b' appear to have a much lower maximum sales value than the others.

In [None]:
# Count of open / closed records for each day of the week
sns.countplot(data=train_df, x='DayOfWeek', hue="Open", palette="Set1");

#### Almost all the stores are closed at the weekend.
#### Since the data comes from European countries, it is safe to assume that day number 7 represents Sunday.

In [None]:
# Frequency of each PromoInterval value in the training frame
promo_intervals = train_df['PromoInterval']
promo_intervals.hist()

#### Clearly, most of the stores prefer the Jan,Apr,Jul,Oct Promo interval. 

# Feature Engineering

# Adding Aggregations

In [None]:
# Sales_per_day, Customers_per_day, avg_sales_per_customer and Sales_per_customers_per_day
#
# Per-store aggregates are computed on the training frame and then mapped onto
# both frames through the 'Store' id.
# NOTE(review): they are computed before the validation split, so the
# validation rows contribute to their own features - confirm this is intended.

# Get total sales, customers and record counts per store
store_groups = train_df.groupby('Store')
store_data_sales = store_groups['Sales'].sum()
store_data_customers = store_groups['Customers'].sum()
store_data_avg_sales = store_groups['Sales'].mean()
store_data_avg_customers = store_groups['Customers'].mean()
# NOTE(review): count() counts every non-null 'Open' record, not only the days
# with Open == 1 - confirm whether "open days" was the intended denominator.
store_data_open = store_groups['Open'].count()

# Calculate sales per day, customers per day and sales per customers per day
store_data_sales_per_day = store_data_sales / store_data_open
store_data_customers_per_day = store_data_customers / store_data_open
store_data_avg_sales_per_customer = store_data_avg_sales / store_data_avg_customers
store_data_sales_per_customer_per_day = store_data_sales_per_day / store_data_customers_per_day

#Saving the above values in a dictionary so that they can be mapped to the dataframe.
sales_per_day_dict = dict(store_data_sales_per_day)
customers_per_day_dict = dict(store_data_customers_per_day)
avg_sales_per_customer_dict = dict(store_data_avg_sales_per_customer)
sales_per_customers_per_day_dict = dict(store_data_sales_per_customer_per_day)

# One shared column -> mapping table, so train and test always receive the
# same column names in the same order (the training column used to be called
# 'SalesPerDay' while the test column was 'Sales_per_day').
store_aggregate_columns = {
    'Sales_per_day': sales_per_day_dict,
    'Customers_per_day': customers_per_day_dict,
    'Avg_Sales_per_Customer': avg_sales_per_customer_dict,
    'Sales_Per_Customers_Per_Day': sales_per_customers_per_day_dict,
}

for frame in (train_df, test_df):
    for column, mapping in store_aggregate_columns.items():
        frame[column] = frame['Store'].map(mapping)

## Fourier Frequencies and Amplitudes

In [None]:
def _top_fourier_peaks(values, n_peaks=3):
    """Return the ``n_peaks`` strongest (frequency, amplitude) pairs of ``values``.

    The magnitude spectrum of ``values`` is computed with a unit sample
    spacing. Its single largest component is discarded first (it is treated
    as the intercept); the peaks are then taken one at a time, largest
    first, each being removed before the next one is searched.

    Raises ValueError when ``values`` holds fewer than ``n_peaks + 1`` samples.
    """
    amplitudes = np.abs(np.fft.fft(values))
    freqs = np.fft.fftfreq(len(amplitudes), 1)

    # Drop the dominant component before looking for the peaks.
    strongest = np.argmax(amplitudes)
    amplitudes = np.delete(amplitudes, strongest)
    freqs = np.delete(freqs, strongest)

    peaks = []
    for _ in range(n_peaks):
        strongest = np.argmax(amplitudes)
        peaks.append((freqs[strongest], amplitudes[strongest]))
        amplitudes = np.delete(amplitudes, strongest)
        freqs = np.delete(freqs, strongest)
    return peaks


# For every year and every month, take the 2nd and 3rd strongest Fourier peaks
# of that period's sales and attach them to both frames as constant features.
for feat_1 in ('Year','Month'):
    freq2_dict_no_log = dict()
    freq3_dict_no_log = dict()
    amp2_dict_no_log = dict()
    amp3_dict_no_log = dict()

    first_value = int(train_df[feat_1].min())
    last_value = int(train_df[feat_1].max())
    for i in range(first_value, last_value + 1):
        period_sales = train_df.loc[train_df[feat_1] == i, 'Sales'].values
        if len(period_sales) == 0:
            # No training rows for this value: nothing to transform.
            continue

        #Freq_1 is not included because it seems as it is always 0
        _, (freq_2, amplitude_2), (freq_3, amplitude_3) = _top_fourier_peaks(period_sales)

        freq2_dict_no_log[i] = freq_2
        freq3_dict_no_log[i] = freq_3
        amp2_dict_no_log[i] = amplitude_2
        amp3_dict_no_log[i] = amplitude_3

    fourier_columns = {
        f'Frequency_2_{feat_1}_Sales': freq2_dict_no_log,
        f'Frequency_3_{feat_1}_Sales': freq3_dict_no_log,
        f'Amplitude_2_{feat_1}_Sales': amp2_dict_no_log,
        f'Amplitude_3_{feat_1}_Sales': amp3_dict_no_log,
    }

    # Group the training rows by the feature value with a stable sort: this is
    # the row order the previous group-by-group concatenation produced.
    train_df = train_df.sort_values(feat_1, kind='mergesort')

    # Map the per-value features onto whole columns instead of writing into
    # per-group slices and re-concatenating the frame on every iteration.
    for frame in (train_df, test_df):
        for column, mapping in fourier_columns.items():
            frame[column] = frame[feat_1].map(mapping)


## Converting "CompetitionOpenSinceYear/Month" to an Integer Timestamp (Nanoseconds Since Epoch)

In [None]:
feats = ['CompetitionOpenSinceMonth','CompetitionOpenSinceYear']
modes = train_df[feats].mode()

# Fill the gaps of both frames with the most frequent training value.
for f in feats:
        train_df[f] = train_df[f].fillna(modes[f][0])
        test_df[f] = test_df[f].fillna(modes[f][0])

#---------------------------------------------------------------------------------------------------------------- 
def convertCompetitionOpen(df):
    """Return the competition opening date of one row as a Timestamp.

    ``df`` is a single row (or any mapping) holding
    'CompetitionOpenSinceYear' and 'CompetitionOpenSinceMonth'. NaN is
    returned when the two values cannot be turned into a date. Only
    conversion failures are caught, so a missing column is reported
    instead of being silently turned into NaN.

    Kept as the row-wise variant; the frames below use the vectorised helper.
    """
    try:
        date = '{}-{}'.format(int(df['CompetitionOpenSinceYear']), int(df['CompetitionOpenSinceMonth']))
        return pd.to_datetime(date)
    except (ValueError, TypeError):
        return np.nan


def _competition_open_ns(df):
    """Return the competition opening month of every row as int64 nanoseconds since the epoch.

    The date is the first day of 'CompetitionOpenSinceYear' /
    'CompetitionOpenSinceMonth'. Both columns must be free of missing values.
    Building the dates column-wise avoids a Python-level call per row.
    """
    parts = pd.DataFrame({
        'year': df['CompetitionOpenSinceYear'].to_numpy(dtype='int64'),
        'month': df['CompetitionOpenSinceMonth'].to_numpy(dtype='int64'),
        'day': 1,
    })
    stamps = pd.to_datetime(parts).astype('datetime64[ns]')
    # A plain array is returned so the assignment below is positional.
    return stamps.astype(np.int64).to_numpy()


train_df['CompetitionOpenInt'] = _competition_open_ns(train_df)
test_df['CompetitionOpenInt'] = _competition_open_ns(test_df)

## Drop 'Customers' and 'StateHoliday' Columns

In [None]:
# 'Customers' is dropped because it is not in the test set
train_df.drop(columns='Customers', inplace=True)

In [None]:
# 'StateHoliday' is dropped from both frames because it reduces the performance
for frame in (train_df, test_df):
    frame.drop(columns='StateHoliday', inplace=True)

## Return to the original order

In [None]:
# Re-sort both frames in place: first by store, then by date, newest first.
for frame in (train_df, test_df):
    frame.sort_values(["Store"], ignore_index=True, inplace=True)
    frame.sort_values(["Year","Month","Day"], ascending=False,
                      ignore_index=True, inplace=True)

# Data preprocessing

## Missing Values

In [None]:
# Missing values per training column: absolute count and percentage
null_flags = train_df.isnull()
total = null_flags.sum().sort_values(ascending=False)
percent_1 = null_flags.sum() / null_flags.count() * 100
percent_2 = percent_1.round(1).sort_values(ascending=False)
missing_data = pd.concat([total, percent_2], axis=1, keys=['Total', '%'])
missing_data

In [None]:
# Missing values per test column: absolute count and percentage
null_flags = test_df.isnull()
total = null_flags.sum().sort_values(ascending=False)
percent_1 = null_flags.sum() / null_flags.count() * 100
percent_2 = percent_1.round(1).sort_values(ascending=False)
missing_data = pd.concat([total, percent_2], axis=1, keys=['Total', '%'])
missing_data

In [None]:
# Impute the remaining gaps: each listed column gets its most frequent
# training value in both frames; a missing 'Open' flag becomes 0.
feats = ['Promo2SinceYear','Promo2SinceWeek','CompetitionDistance', 'PromoInterval']
modes = train_df[feats].mode()

for f in feats:
    most_frequent = modes[f][0]
    train_df[f] = train_df[f].fillna(most_frequent)
    test_df[f] = test_df[f].fillna(most_frequent)

for dataset in (train_df,test_df):
    dataset['Open'] = dataset['Open'].fillna(0)

# Remove Outliers

In [None]:
# Drop the samples which have sales of 25000 or more, then renumber the rows.
# reset_index() returns a new frame, so its result has to be assigned back;
# calling it without the assignment left the index untouched.
train_df = train_df[train_df['Sales'] < 25000].reset_index(drop=True)

# Encoding

## Label Encoder

In [None]:
# Integer-encode the categorical columns.
# The codes are learned from the training frame only and then applied to the
# test frame, so a category always receives the same code in both frames.
# Factorizing each frame on its own ties the codes to that frame's row order.
attributes = ['StoreType','Assortment','PromoInterval']
for f in attributes:
    codes, categories = pd.factorize(train_df[f])
    code_of = {category: code for code, category in enumerate(categories)}

    train_df[f] = codes
    # Categories that never occur in the training frame (and missing values)
    # are encoded as -1, the code factorize itself uses for missing values.
    test_df[f] = test_df[f].map(code_of).fillna(-1).astype('int64')

# Only Use non-zero Sales Samples For Training

In [None]:
# Keep only the rows of open stores that actually sold something.
is_open = train_df['Open'] == 1
has_sales = train_df['Sales'] > 0.0
train_df = train_df[is_open & has_sales]

# The Correlation with The Target 

In [None]:
# Correlation of every column with the target, strongest positive first.
corr_matrix = train_df.corr()
sales_correlation = corr_matrix["Sales"]
sales_correlation.sort_values(ascending=False)

# ML Modeling

## Initialize The Model 

In [None]:
# Hyperparameters shared by every LGBMRegressor built in this notebook.
params = dict(
    n_estimators=1742,
    min_child_samples=89,
    n_jobs=-1,
    learning_rate=0.2723,
    max_depth=-1,
    subsample=1,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=1,
    verbosity=-1,
)

In [None]:
# LightGBM regressor used for validation: shared hyperparameters, fixed seed.
lgbm = LGBMRegressor(random_state=42, **params)

## Validation

In [None]:
# Chronological copy of the training frame; its last rows become the validation set.
temp = train_df.sort_values(["Year","Month","Day"], ignore_index=True).copy()

# Hold out the last 47000 rows (the same time period as the test set).
train, vali = temp.iloc[:-47000].copy(), temp.iloc[-47000:].copy()

# Put both parts back in the dataset's order: by store, then by date, newest first.
for part in (train, vali):
    part.sort_values(["Store"], ignore_index=True, inplace=True)
    part.sort_values(["Year","Month","Day"], ascending=False,
                     ignore_index=True, inplace=True)

# Split the target off the validation features.
y_test = vali.pop('Sales')

In [None]:
#Fit the Model
# Validate the same pipeline the final models use: the target is transformed
# with log(x+1) for training and the predictions are mapped back with
# exp(x)-1. The correction factor below is only justified by that transform,
# which this cell did not apply before.
llgbm = TransformedTargetRegressor(lgbm, func = np.log1p, inverse_func = np.expm1)
llgbm.fit(train.drop('Sales',axis=1),train['Sales'])

# The wrapper fits a clone of `lgbm`; rebind the name to the fitted clone so
# code that inspects `lgbm` afterwards (e.g. feature importances) sees a
# trained model.
lgbm = llgbm.regressor_

y_pred = llgbm.predict(vali)

#Use a Correction Factor Because we transformed the target with log(x+1) then reversed it
y_pred = y_pred*0.995
score = RMSPE(y_test,y_pred)
score

## Show the Features Importances

In [None]:
# Bar chart of the validation model's feature importances; the feature names
# come from the training columns without the target.
plotImp(lgbm,train.drop('Sales',axis=1))

# The Final Model 

## Prepare the Datasets

### We will fit the models with two datasets:

### 1- The Full Training Set 

In [None]:
# Full training set: target first, then every remaining column as a feature.
y = train_df['Sales']
X = train_df.drop(columns='Sales')

### 2- A Dataset Consisting of Only the Data Between May and September from All The Years

In [None]:
# Training rows from May to September (both months included), all years.
# reset_index() returns a new frame, so it is chained into the assignment;
# calling it on its own line discarded the result.
in_may_to_sept = train_df['Month'].between(5, 9)
X_MaySept = train_df[in_may_to_sept].reset_index(drop=True)
y_MaySept = X_MaySept['Sales']
X_MaySept = X_MaySept.drop('Sales',axis=1)

## Models: Layer 1

In [None]:
# Empty frame that collects one column of test-set predictions per model.
Predictions = pd.DataFrame()

### 1- Averaging 16 Models (Seeds 30 to 45) With The Same Hyperparameters but Changing The Seed. 

### Feed Them with The Full Dataset and Get the Mean, Harmonic Mean and Geometric Mean of The Predictions.

In [None]:
# One model per seed, trained on the full training set.
full_data_columns = []
for seed in range(30,46):
    lgbm = LGBMRegressor(**params, random_state=seed)
    #Transform the target with log(x+1) to make the model able to optimize the loss function properly
    llgbm = TransformedTargetRegressor(lgbm, func = np.log1p, inverse_func = np.expm1)
    llgbm.fit(X, y)

    column = f'Sales_{seed}'
    Predictions[column] = llgbm.predict(test_df) * 0.995 #Multiply by a Correction Factor
    full_data_columns.append(column)

# Aggregate the per-seed predictions only. Each aggregate is computed from the
# same set of columns; averaging the whole frame would fold 'Mean' into
# 'HMean', and both of them into 'GMean'.
full_data_preds = Predictions[full_data_columns].to_numpy()
Predictions['Mean'] = full_data_preds.mean(axis=1)
Predictions['HMean'] = stats.hmean(full_data_preds, axis=1)
Predictions['GMean'] = stats.gmean(full_data_preds, axis=1)

### 2- Averaging 16 Models (Seeds 30 to 45) With The Same Hyperparameters but Changing The Seed. 

### Feed Them with The MaySeptember Dataset and Get the Mean, Harmonic Mean and Geometric Mean of The Predictions.

In [None]:
# One model per seed, trained on the May-September subset.
may_sept_columns = []
for seed in range(30,46):
    lgbm = LGBMRegressor(**params, random_state=seed)
    #Transform the target with log(x+1) to make the model able to optimize the loss function properly
    llgbm = TransformedTargetRegressor(lgbm, func = np.log1p, inverse_func = np.expm1)
    llgbm.fit(X_MaySept, y_MaySept)

    column = f'Sales_2_{seed}'
    Predictions[column] = llgbm.predict(test_df) * 0.995 #Multiply by a Correction Factor
    may_sept_columns.append(column)

# Aggregate the May-September models only. Averaging the whole frame would
# also mix in every column that is already stored in `Predictions`.
may_sept_preds = Predictions[may_sept_columns].to_numpy()
Predictions['Mean_2'] = may_sept_preds.mean(axis=1)
Predictions['HMean_2'] = stats.hmean(may_sept_preds, axis=1)
Predictions['GMean_2'] = stats.gmean(may_sept_preds, axis=1)

## Models: Layer 2

### Get The Harmonic Mean of The Six Predictions 

In [None]:
# Row-wise harmonic mean of the six aggregated prediction columns.
layer_one_outputs = ['Mean','HMean','GMean','Mean_2','HMean_2','GMean_2']
FinalPred = Predictions[layer_one_outputs].apply(stats.hmean, axis=1)

## Models: Make The Submission File

In [None]:
# Pair the saved ids with the final predictions by position and write the file.
# NOTE(review): `ID` was saved before test_df was re-sorted, so this relies on
# test_df being back in the file's original row order - confirm.
submission = pd.DataFrame({"Id": ID})
submission["Sales"] = FinalPred.values
submission.to_csv('FinalSubmission.csv', index=False)

# Public Leaderboard: 0.10448
# Private Leaderboard: 0.11323