# Group 7 Project

In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt
import datetime
import tensorflow as tf

## Retrieving Data

In [2]:
df = pd.read_csv("test.csv")
store_df = pd.read_csv("store.csv")

## Data Processing

Convert for time series:

In [3]:
df['Date'] = df['Date'].astype('datetime64[ns]')

Merge the two dataframes:

In [4]:
df = pd.merge(df, store_df, how='inner', on='Store')
df.head()

Unnamed: 0,Id,Store,DayOfWeek,Date,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,1,4,2015-09-17,1.0,1,0,0,c,a,1270.0,9.0,2008.0,0,,,
1,857,1,3,2015-09-16,1.0,1,0,0,c,a,1270.0,9.0,2008.0,0,,,
2,1713,1,2,2015-09-15,1.0,1,0,0,c,a,1270.0,9.0,2008.0,0,,,
3,2569,1,1,2015-09-14,1.0,1,0,0,c,a,1270.0,9.0,2008.0,0,,,
4,3425,1,7,2015-09-13,0.0,0,0,0,c,a,1270.0,9.0,2008.0,0,,,


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41088 entries, 0 to 41087
Data columns (total 17 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   Id                         41088 non-null  int64         
 1   Store                      41088 non-null  int64         
 2   DayOfWeek                  41088 non-null  int64         
 3   Date                       41088 non-null  datetime64[ns]
 4   Open                       41077 non-null  float64       
 5   Promo                      41088 non-null  int64         
 6   StateHoliday               41088 non-null  object        
 7   SchoolHoliday              41088 non-null  int64         
 8   StoreType                  41088 non-null  object        
 9   Assortment                 41088 non-null  object        
 10  CompetitionDistance        40992 non-null  float64       
 11  CompetitionOpenSinceMonth  25872 non-null  float64       
 12  Comp

### Handling Missing Values

In [6]:
df.isnull().sum()

Id                               0
Store                            0
DayOfWeek                        0
Date                             0
Open                            11
Promo                            0
StateHoliday                     0
SchoolHoliday                    0
StoreType                        0
Assortment                       0
CompetitionDistance             96
CompetitionOpenSinceMonth    15216
CompetitionOpenSinceYear     15216
Promo2                           0
Promo2SinceWeek              17232
Promo2SinceYear              17232
PromoInterval                17232
dtype: int64

In [7]:
#Since there are only 3 missing values replace with median
df['CompetitionDistance'].fillna(df['CompetitionDistance'].median(), inplace = True)

#fill the rest with 0
df.fillna(0, inplace = True)

## Feature Engineering

Replace '0' with na values:

In [8]:
df['StateHoliday'] = df.StateHoliday.replace([0, '0'], np.nan)
df.StateHoliday.unique()

array([nan, 'a'], dtype=object)

### Dummy Encoding

In [9]:
dummy_columns = ['StoreType', 'Assortment', 'StateHoliday']
df = pd.get_dummies(df, columns=dummy_columns)

In [10]:
df.head()

Unnamed: 0,Id,Store,DayOfWeek,Date,Open,Promo,SchoolHoliday,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,...,Promo2SinceYear,PromoInterval,StoreType_a,StoreType_b,StoreType_c,StoreType_d,Assortment_a,Assortment_b,Assortment_c,StateHoliday_a
0,1,1,4,2015-09-17,1.0,1,0,1270.0,9.0,2008.0,...,0.0,0,0,0,1,0,1,0,0,0
1,857,1,3,2015-09-16,1.0,1,0,1270.0,9.0,2008.0,...,0.0,0,0,0,1,0,1,0,0,0
2,1713,1,2,2015-09-15,1.0,1,0,1270.0,9.0,2008.0,...,0.0,0,0,0,1,0,1,0,0,0
3,2569,1,1,2015-09-14,1.0,1,0,1270.0,9.0,2008.0,...,0.0,0,0,0,1,0,1,0,0,0
4,3425,1,7,2015-09-13,0.0,0,0,1270.0,9.0,2008.0,...,0.0,0,0,0,1,0,1,0,0,0


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41088 entries, 0 to 41087
Data columns (total 22 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   Id                         41088 non-null  int64         
 1   Store                      41088 non-null  int64         
 2   DayOfWeek                  41088 non-null  int64         
 3   Date                       41088 non-null  datetime64[ns]
 4   Open                       41088 non-null  float64       
 5   Promo                      41088 non-null  int64         
 6   SchoolHoliday              41088 non-null  int64         
 7   CompetitionDistance        41088 non-null  float64       
 8   CompetitionOpenSinceMonth  41088 non-null  float64       
 9   CompetitionOpenSinceYear   41088 non-null  float64       
 10  Promo2                     41088 non-null  int64         
 11  Promo2SinceWeek            41088 non-null  float64       
 12  Prom

### Working with Time

In [12]:
#Splitting the Date as Day, month and year and adding 3 new columns
df['Day'] = df['Date'].dt.day
df['Month'] = df['Date'].dt.month
df['Year'] = df['Date'].dt.year
df['Weekofyear'] = df['Date'].dt.weekofyear

  df['Weekofyear'] = df['Date'].dt.weekofyear


In [13]:
#How long the competition has been open
df['CompetitionOpen'] = 12 * (df.Year - df.CompetitionOpenSinceYear) + (df.Month - df.CompetitionOpenSinceMonth)

In [14]:
#How long the promo has been running
df['PromoOpen'] = 12 * (df.Year - df.Promo2SinceYear) + (df.Weekofyear - df.Promo2SinceWeek) / 4.0

In [15]:
df = df.drop(columns=['Weekofyear','Promo2SinceWeek', 'CompetitionOpenSinceMonth', 'Promo2SinceYear', 'CompetitionOpenSinceYear', 'Open', 'PromoInterval'])

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41088 entries, 0 to 41087
Data columns (total 21 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   Id                   41088 non-null  int64         
 1   Store                41088 non-null  int64         
 2   DayOfWeek            41088 non-null  int64         
 3   Date                 41088 non-null  datetime64[ns]
 4   Promo                41088 non-null  int64         
 5   SchoolHoliday        41088 non-null  int64         
 6   CompetitionDistance  41088 non-null  float64       
 7   Promo2               41088 non-null  int64         
 8   StoreType_a          41088 non-null  uint8         
 9   StoreType_b          41088 non-null  uint8         
 10  StoreType_c          41088 non-null  uint8         
 11  StoreType_d          41088 non-null  uint8         
 12  Assortment_a         41088 non-null  uint8         
 13  Assortment_b         41088 non-

In [17]:
#replace competitiondistance with furthest variable
df['CompetitionDistance'] = df.CompetitionDistance.replace(np.nan, df['CompetitionDistance'].max())
                                                           
#Competittion Open - use mean
df['CompetitionOpen'] = df.CompetitionOpen.replace(np.nan, df['CompetitionOpen'].max())

#PromoOpen - recode missing as 0
df['PromoOpen'] = df.PromoOpen.replace(np.nan, 0)

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41088 entries, 0 to 41087
Data columns (total 21 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   Id                   41088 non-null  int64         
 1   Store                41088 non-null  int64         
 2   DayOfWeek            41088 non-null  int64         
 3   Date                 41088 non-null  datetime64[ns]
 4   Promo                41088 non-null  int64         
 5   SchoolHoliday        41088 non-null  int64         
 6   CompetitionDistance  41088 non-null  float64       
 7   Promo2               41088 non-null  int64         
 8   StoreType_a          41088 non-null  uint8         
 9   StoreType_b          41088 non-null  uint8         
 10  StoreType_c          41088 non-null  uint8         
 11  StoreType_d          41088 non-null  uint8         
 12  Assortment_a         41088 non-null  uint8         
 13  Assortment_b         41088 non-

## Splitting Data

In [19]:
X = df.drop(columns=['Date','Store', 'Id'])

In [20]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X)
scaled_X = scaler.transform(X)
scaled_X

array([[ 0.01033678,  1.23544154, -0.89269463, ...,  0.        ,
        -0.76633345,  1.17667518],
       [-0.48582866,  1.23544154, -0.89269463, ...,  0.        ,
        -0.76633345,  1.17667518],
       [-0.98199411,  1.23544154, -0.89269463, ...,  0.        ,
        -0.76633345,  1.17667518],
       ...,
       [-1.47815955,  1.23544154,  1.1202039 , ...,  0.        ,
         1.30391053, -0.8500866 ],
       [ 1.49883311, -0.80942721,  1.1202039 , ...,  0.        ,
         1.30391053, -0.85010758],
       [ 1.00266767, -0.80942721,  1.1202039 , ...,  0.        ,
         1.30391053, -0.85010758]])

In [21]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41088 entries, 0 to 41087
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   DayOfWeek            41088 non-null  int64  
 1   Promo                41088 non-null  int64  
 2   SchoolHoliday        41088 non-null  int64  
 3   CompetitionDistance  41088 non-null  float64
 4   Promo2               41088 non-null  int64  
 5   StoreType_a          41088 non-null  uint8  
 6   StoreType_b          41088 non-null  uint8  
 7   StoreType_c          41088 non-null  uint8  
 8   StoreType_d          41088 non-null  uint8  
 9   Assortment_a         41088 non-null  uint8  
 10  Assortment_b         41088 non-null  uint8  
 11  Assortment_c         41088 non-null  uint8  
 12  StateHoliday_a       41088 non-null  uint8  
 13  Day                  41088 non-null  int64  
 14  Month                41088 non-null  int64  
 15  Year                 41088 non-null 

#### Store the data for the models

In [22]:
from numpy import savetxt, save

# Save as txt if needed in other csv format
# savetxt('csvs/x_test_dataset.csv', x_train, delimiter=',')

# Save as binary for fastest and more efficient saving
save('npys/x_test_dataset.npy', X)

## Save the submission with predicted sales

#### Predict the sales

In [23]:
# In this case it will be 0 just for simplicity
sales = np.zeros(len(df))

len(sales) == len(df)

True

#### Add to the df

In [24]:
df['Sales'] = sales
df.head()

Unnamed: 0,Id,Store,DayOfWeek,Date,Promo,SchoolHoliday,CompetitionDistance,Promo2,StoreType_a,StoreType_b,...,Assortment_a,Assortment_b,Assortment_c,StateHoliday_a,Day,Month,Year,CompetitionOpen,PromoOpen,Sales
0,1,1,4,2015-09-17,1,0,1270.0,0,0,0,...,1,0,0,0,17,9,2015,84.0,24189.5,0.0
1,857,1,3,2015-09-16,1,0,1270.0,0,0,0,...,1,0,0,0,16,9,2015,84.0,24189.5,0.0
2,1713,1,2,2015-09-15,1,0,1270.0,0,0,0,...,1,0,0,0,15,9,2015,84.0,24189.5,0.0
3,2569,1,1,2015-09-14,1,0,1270.0,0,0,0,...,1,0,0,0,14,9,2015,84.0,24189.5,0.0
4,3425,1,7,2015-09-13,0,0,1270.0,0,0,0,...,1,0,0,0,13,9,2015,84.0,24189.25,0.0


#### Remove everything except Id and Sales

In [25]:
df = df[['Id', 'Sales']]
df.head()

Unnamed: 0,Id,Sales
0,1,0.0
1,857,0.0
2,1713,0.0
3,2569,0.0
4,3425,0.0


#### Save the submission

In [26]:
df.to_csv('submission.csv', index=False)