# Feature building

## Import and set up

In [1]:
# import relevant packages
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [2]:
# set up paths
data_dir = "../data"

In [3]:
# import data 
df = pd.read_csv(os.path.join(data_dir, 'df_clean.csv'))
print(df.shape)
df.head()

(497376, 20)


  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,Date,Store,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,Month,Year,Weekday
0,2013-01-01,353.0,3139.0,820.0,1.0,0.0,a,1.0,b,b,900.0,,,1.0,14.0,2013.0,"Feb,May,Aug,Nov",1,2013,1
1,2013-01-01,335.0,2401.0,482.0,1.0,0.0,a,1.0,b,a,90.0,,,1.0,31.0,2013.0,"Jan,Apr,Jul,Oct",1,2013,1
2,2013-01-01,512.0,2646.0,625.0,1.0,0.0,a,1.0,b,b,590.0,,,1.0,5.0,2013.0,"Mar,Jun,Sept,Dec",1,2013,1
3,2013-01-01,494.0,3113.0,527.0,1.0,0.0,a,1.0,b,a,1260.0,6.0,2011.0,0.0,,,,1,2013,1
4,2013-01-01,530.0,2907.0,532.0,1.0,0.0,a,1.0,a,c,18160.0,,,0.0,,,,1,2013,1


## Check duplicates

In [4]:
df.duplicated().sum()

0

## Check and fill missing values

In [5]:
df.isnull().sum()

Date                              0
Store                             0
Sales                             0
Customers                     14984
Open                          15010
Promo                         15012
StateHoliday                  15031
SchoolHoliday                 15093
StoreType                         0
Assortment                        0
CompetitionDistance            1305
CompetitionOpenSinceMonth    158067
CompetitionOpenSinceYear     158067
Promo2                            0
Promo2SinceWeek              244960
Promo2SinceYear              244960
PromoInterval                244960
Month                             0
Year                              0
Weekday                           0
dtype: int64

In [6]:
# fill missing "open" data with 0 if sales is 0 else with mode
mask = (df["Sales"] == 0) & (df["Open"].isna())
df.loc[mask, "Open"] = df.loc[mask, "Open"].fillna(value=0)

# fill remaining missing "open" data with mode
mode = df["Open"].mode()[0]
df["Open"].fillna(value=mode, inplace=True)

In [7]:
# fill missing "promo" data with mode
mode = df["Promo"].mode()[0]
df["Promo"].fillna(value=mode, inplace=True)

In [8]:
# fill missing "SchoolHoliday" data with mode
mode = df["SchoolHoliday"].mode()[0]
df["SchoolHoliday"].fillna(value=mode, inplace=True)

In [9]:
# Create new feature Holiday, initially by default no holiday (0)
df["Holiday"] = 0

# Whenever StateHoliday indicates a holiday make it 1
mask = (df["StateHoliday"].isin(["a", "b", "c"]))
df.loc[mask, "Holiday"] = 1

# Whenever StateHoliday is missing, but sales is 0 and store is closed, make it a holiday
mask2 = ((df["StateHoliday"].isna()) & (df["Sales"] == 0) & (df["Open"] == 0))
df.loc[mask2, "Holiday"] = 1

## Train test split

In [10]:
# Get X and y
y = df["Sales"]
X = df.copy()

print(y.shape)
print(X.shape)

(497376,)
(497376, 21)


In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
print(y_train.shape)
print(X_train.shape)
print(y_test.shape)
print(X_test.shape)

(397900,)
(397900, 21)
(99476,)
(99476, 21)


## Mean encode categorical variables

In [12]:
# Get mean of target variable for StoreType - mean encoding
storetype_dict = X_train.groupby("StoreType").mean().loc[:, "Sales"].to_dict()
#storetype_dict
X_train['StoreType_enc'] = X_train['StoreType'].map(storetype_dict).fillna(X_train["Sales"].mean())
X_test['StoreType_enc'] = X_test['StoreType'].map(storetype_dict).fillna(X_train["Sales"].mean())  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['StoreType_enc'] = X_train['StoreType'].map(storetype_dict).fillna(X_train["Sales"].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['StoreType_enc'] = X_test['StoreType'].map(storetype_dict).fillna(X_train["Sales"].mean())


In [13]:
# Get mean of target variable for Assortment - mean encoding
assortment_dict = X_train.groupby("Assortment").mean().loc[:, "Sales"].to_dict()
X_train['Assortment_enc'] = X_train.loc[:,'Assortment'].map(assortment_dict).fillna(X_train["Sales"].mean())
X_test['Assortment_enc'] = X_test.loc[:,'Assortment'].map(assortment_dict).fillna(X_train["Sales"].mean()) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['Assortment_enc'] = X_train.loc[:,'Assortment'].map(assortment_dict).fillna(X_train["Sales"].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['Assortment_enc'] = X_test.loc[:,'Assortment'].map(assortment_dict).fillna(X_train["Sales"].mean())


In [14]:
# Get mean of target variable for Assortment - mean encoding
storeid_dict = X_train.groupby("Store").mean().loc[:, "Sales"].to_dict()
X_train['Store_enc'] = X_train.loc[:,'Store'].map(storeid_dict).fillna(X_train["Sales"].mean())
X_test['Store_enc'] = X_test.loc[:,'Store'].map(storeid_dict).fillna(X_train["Sales"].mean()) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['Store_enc'] = X_train.loc[:,'Store'].map(storeid_dict).fillna(X_train["Sales"].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['Store_enc'] = X_test.loc[:,'Store'].map(storeid_dict).fillna(X_train["Sales"].mean())


## Save files

In [15]:
X_train.to_csv(os.path.join(data_dir, 'X_train.csv'), index=False)
X_test.to_csv(os.path.join(data_dir, 'X_test.csv'), index=False)
y_train.to_csv(os.path.join(data_dir, 'y_train.csv'), index=False)
y_test.to_csv(os.path.join(data_dir, 'y_test.csv'), index=False)