# Store Date Preparation

In [1]:
import pandas as pd
from IPython.display import display, HTML
from sklearn import preprocessing

stores_file="../data/rossmann-store-sales/store.csv"
# read in file
df_stores=pd.read_csv(stores_file)

## Data Preparation

In [2]:
## TODO rename PromoInterval to Promo2Interval
df_stores.rename(columns={"PromoInterval": "Promo2Interval"}, inplace=True)
## TODO ajust Data Types of Promo2 to Boolean
df_stores["Promo2"]=df_stores['Promo2'].astype('bool')
print(df_stores.info())
display(df_stores)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1115 entries, 0 to 1114
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Store                      1115 non-null   int64  
 1   StoreType                  1115 non-null   object 
 2   Assortment                 1115 non-null   object 
 3   CompetitionDistance        1112 non-null   float64
 4   CompetitionOpenSinceMonth  761 non-null    float64
 5   CompetitionOpenSinceYear   761 non-null    float64
 6   Promo2                     1115 non-null   bool   
 7   Promo2SinceWeek            571 non-null    float64
 8   Promo2SinceYear            571 non-null    float64
 9   Promo2Interval             571 non-null    object 
dtypes: bool(1), float64(5), int64(1), object(3)
memory usage: 79.6+ KB
None


Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,Promo2Interval
0,1,c,a,1270.0,9.0,2008.0,False,,,
1,2,a,a,570.0,11.0,2007.0,True,13.0,2010.0,"Jan,Apr,Jul,Oct"
2,3,a,a,14130.0,12.0,2006.0,True,14.0,2011.0,"Jan,Apr,Jul,Oct"
3,4,c,c,620.0,9.0,2009.0,False,,,
4,5,a,a,29910.0,4.0,2015.0,False,,,
...,...,...,...,...,...,...,...,...,...,...
1110,1111,a,a,1900.0,6.0,2014.0,True,31.0,2013.0,"Jan,Apr,Jul,Oct"
1111,1112,c,c,1880.0,4.0,2006.0,False,,,
1112,1113,a,c,9260.0,,,False,,,
1113,1114,a,c,870.0,,,False,,,


### Promo2 data cleaning

In [8]:
# handel promo2
# locate the problem
print ("Value Count",  df_stores["Promo2"].value_counts())
print("Number of lines without promo2: ", df_stores[(df_stores["Promo2"] == False) & (df_stores["Promo2SinceWeek"].isna())].shape[0])
print("Number of lines with promo2 and Null values in Promo2Since: ", df_stores[(df_stores["Promo2"] == True) & (df_stores["Promo2SinceWeek"].isna())].shape[0])
# Set all to 0 when no Promo2

df_stores.fillna({"Promo2Interval":0}, inplace=True)
df_stores.fillna({"Promo2SinceWeek":0}, inplace=True)
df_stores[].fillna({"Promo2SinceYear":0}, inplace=True)
df_stores.isna().mean()
## change data types
df_stores["Promo2SinceWeek"] = df_stores["Promo2SinceWeek"].astype(int)
df_stores["Promo2SinceYear"] = df_stores["Promo2SinceYear"].astype(int)
df_stores.dtypes

Value Count Promo2
True     570
False    542
Name: count, dtype: int64
Number of lines without promo2:  0
Number of lines with promo2 and Null values in Promo2Since:  0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_stores["Promo2SinceWeek"].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_stores["Promo2SinceYear"].fillna(0, inplace=True)


Store                          int64
StoreType                     object
Assortment                    object
CompetitionDistance_x        float64
CompetitionOpenSinceMonth    float64
CompetitionOpenSinceYear     float64
Promo2                          bool
Promo2SinceWeek                int64
Promo2SinceYear                int64
Promo2Interval                object
CompetitionDistanceBin       float64
CompetitionDistance_y        float64
dtype: object

### Null values in Competition Distance cleaning

In [4]:
# handel Null values in CompetitionDistance
print (df_stores.shape[0])
df_stores.dropna(subset=["CompetitionDistance"], inplace=True)
print (df_stores.shape[0])


1115
1112


## Binning the Competition Distance

In [5]:

est = preprocessing.KBinsDiscretizer(n_bins=25, encode='ordinal').fit(df_stores[["CompetitionDistance"]])
df_stores["CompetitionDistanceBin"] = est.transform(df_stores[["CompetitionDistance"]])    
display(df_stores)

# calc mean for each bin and assign it to each instance
mean_dist = df_stores[["CompetitionDistance","CompetitionDistanceBin"]].groupby("CompetitionDistanceBin").mean().reset_index()
display(mean_dist)
df_stores = pd.merge(df_stores, mean_dist, how="inner", on="CompetitionDistanceBin")
display(df_stores)

Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,Promo2Interval,CompetitionDistanceBin
0,1,c,a,1270.0,9.0,2008.0,False,0,0,0,8.0
1,2,a,a,570.0,11.0,2007.0,True,13,2010,"Jan,Apr,Jul,Oct",5.0
2,3,a,a,14130.0,12.0,2006.0,True,14,2011,"Jan,Apr,Jul,Oct",22.0
3,4,c,c,620.0,9.0,2009.0,False,0,0,0,5.0
4,5,a,a,29910.0,4.0,2015.0,False,0,0,0,24.0
...,...,...,...,...,...,...,...,...,...,...,...
1110,1111,a,a,1900.0,6.0,2014.0,True,31,2013,"Jan,Apr,Jul,Oct",11.0
1111,1112,c,c,1880.0,4.0,2006.0,False,0,0,0,10.0
1112,1113,a,c,9260.0,,,False,0,0,0,20.0
1113,1114,a,c,870.0,,,False,0,0,0,6.0


Unnamed: 0,CompetitionDistanceBin,CompetitionDistance
0,0.0,64.390244
1,1.0,152.5
2,2.0,240.454545
3,3.0,328.372093
4,4.0,442.826087
5,5.0,584.666667
6,6.0,768.139535
7,7.0,1017.826087
8,8.0,1201.363636
9,9.0,1449.545455


Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance_x,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,Promo2Interval,CompetitionDistanceBin,CompetitionDistance_y
0,1,c,a,1270.0,9.0,2008.0,False,0,0,0,8.0,1201.363636
1,2,a,a,570.0,11.0,2007.0,True,13,2010,"Jan,Apr,Jul,Oct",5.0,584.666667
2,3,a,a,14130.0,12.0,2006.0,True,14,2011,"Jan,Apr,Jul,Oct",22.0,15728.888889
3,4,c,c,620.0,9.0,2009.0,False,0,0,0,5.0,584.666667
4,5,a,a,29910.0,4.0,2015.0,False,0,0,0,24.0,31696.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
1107,1111,a,a,1900.0,6.0,2014.0,True,31,2013,"Jan,Apr,Jul,Oct",11.0,2049.782609
1108,1112,c,c,1880.0,4.0,2006.0,False,0,0,0,10.0,1726.590909
1109,1113,a,c,9260.0,,,False,0,0,0,20.0,9531.136364
1110,1114,a,c,870.0,,,False,0,0,0,6.0,768.139535
