## 1. Libraries and data

In [154]:
# Import libraries
import pickle
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

In [155]:
# Import the trained model
model = pickle.load(open("trained_model.pkl", "rb"))

In [156]:
# Import the real data
original_data = pd.read_csv("real_data/REAL_DATA.csv")
data = original_data

## 2. EDA

In [157]:
display(data)
display(data.info())

Unnamed: 0,index,store_ID,day_of_week,date,nb_customers_on_day,open,promotion,state_holiday,school_holiday
0,272371,415,7,01/03/2015,0,0,0,0,0
1,558468,27,7,29/12/2013,0,0,0,0,0
2,76950,404,3,19/03/2014,657,1,1,0,0
3,77556,683,2,29/01/2013,862,1,0,0,0
4,456344,920,3,19/03/2014,591,1,1,0,0
...,...,...,...,...,...,...,...,...,...
71200,59062,441,7,26/10/2014,0,0,0,0,0
71201,687449,377,7,18/08/2013,0,0,0,0,0
71202,207393,15,3,11/06/2014,648,1,0,0,0
71203,233378,950,2,23/04/2013,626,1,1,0,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71205 entries, 0 to 71204
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   index                71205 non-null  int64 
 1   store_ID             71205 non-null  int64 
 2   day_of_week          71205 non-null  int64 
 3   date                 71205 non-null  object
 4   nb_customers_on_day  71205 non-null  int64 
 5   open                 71205 non-null  int64 
 6   promotion            71205 non-null  int64 
 7   state_holiday        71205 non-null  object
 8   school_holiday       71205 non-null  int64 
dtypes: int64(7), object(2)
memory usage: 4.9+ MB


None

In [158]:
def data_eda(df):
    # There are three different state holidays, so we'll one-hot-encode them
    df = pd.get_dummies(df, columns=["state_holiday"], prefix="state_holiday", drop_first=True)

    # We'll one-hot-encode day_of_week too so every day is equally important
    df = pd.get_dummies(df, columns=["day_of_week"], prefix="day_of_week", drop_first=False)

    # We'll turn date into an integer too
    df['date'] = df['date'].apply(lambda x: '/'.join(x.split('/')[::-1])).str.replace('/', '').astype('int64')

    df = df.replace({True:1,False:0})

    # We don't need "index"
    df = df.drop("index", axis=1)
    
    return df

In [159]:
data = data_eda(data)

  df = df.replace({True:1,False:0})


In [160]:
display(data.sample(10))
display(data.info())

Unnamed: 0,store_ID,date,nb_customers_on_day,open,promotion,school_holiday,state_holiday_a,state_holiday_b,state_holiday_c,day_of_week_1,day_of_week_2,day_of_week_3,day_of_week_4,day_of_week_5,day_of_week_6,day_of_week_7
13711,712,20140429,730,1,1,0,0,0,0,0,1,0,0,0,0,0
2163,833,20141221,0,0,0,0,0,0,0,0,0,0,0,0,0,1
6473,123,20150110,416,1,0,0,0,0,0,0,0,0,0,0,1,0
27066,673,20141231,402,1,0,1,0,0,0,0,0,1,0,0,0,0
7763,391,20130116,531,1,0,0,0,0,0,0,0,1,0,0,0,0
8408,1064,20131102,1114,1,0,0,0,0,0,0,0,0,0,0,1,0
38815,604,20150720,488,1,0,0,0,0,0,1,0,0,0,0,0,0
8198,1071,20150303,825,1,1,0,0,0,0,0,1,0,0,0,0,0
63634,251,20140115,2113,1,0,0,0,0,0,0,0,1,0,0,0,0
340,1017,20130320,1176,1,1,0,0,0,0,0,0,1,0,0,0,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71205 entries, 0 to 71204
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype
---  ------               --------------  -----
 0   store_ID             71205 non-null  int64
 1   date                 71205 non-null  int64
 2   nb_customers_on_day  71205 non-null  int64
 3   open                 71205 non-null  int64
 4   promotion            71205 non-null  int64
 5   school_holiday       71205 non-null  int64
 6   state_holiday_a      71205 non-null  int64
 7   state_holiday_b      71205 non-null  int64
 8   state_holiday_c      71205 non-null  int64
 9   day_of_week_1        71205 non-null  int64
 10  day_of_week_2        71205 non-null  int64
 11  day_of_week_3        71205 non-null  int64
 12  day_of_week_4        71205 non-null  int64
 13  day_of_week_5        71205 non-null  int64
 14  day_of_week_6        71205 non-null  int64
 15  day_of_week_7        71205 non-null  int64
dtypes: int64(16)
memory us

None

## 3. Feature Scaling (Standardization)

In [161]:
# Standardize the data
scaler = pickle.load(open("scaler.pkl", "rb")) # We use the model that we trained with the known data

X = pd.DataFrame(scaler.fit_transform(data), columns=data.columns)

## 4. Model Predictions

In [162]:
y_pred = pd.Series(model.predict(X))

## 5. Post-processing

In [163]:
original_data["sales"] = np.clip(a=y_pred, a_min=0, a_max=None) # Set all negative values to 0 (to improve: set sales=0 when open=0)

## 6. Data Export

In [164]:
original_data.to_csv("G2.csv", index=False)

In [165]:
check = pd.read_csv("G2.csv")
check

Unnamed: 0,index,store_ID,day_of_week,date,nb_customers_on_day,open,promotion,state_holiday,school_holiday,sales
0,272371,415,7,01/03/2015,0,0,0,0,0,190.589140
1,558468,27,7,29/12/2013,0,0,0,0,0,507.527599
2,76950,404,3,19/03/2014,657,1,1,0,0,6788.836202
3,77556,683,2,29/01/2013,862,1,0,0,0,5518.998093
4,456344,920,3,19/03/2014,591,1,1,0,0,6130.208475
...,...,...,...,...,...,...,...,...,...,...
71200,59062,441,7,26/10/2014,0,0,0,0,0,0.000000
71201,687449,377,7,18/08/2013,0,0,0,0,0,0.000000
71202,207393,15,3,11/06/2014,648,1,0,0,0,5565.443914
71203,233378,950,2,23/04/2013,626,1,1,0,0,6685.373974
