In [1]:
import warnings
import pandas as pd 
import numpy as np
import sys
sys.path.append("..")
warnings.simplefilter("ignore")
from Scripts.Data_cleaning import load_data,clean_data
from Scripts.preprocessing import preprocess

It is important to process the data into a format where it can be fed to a machine learning model. This typically means converting all non-numeric columns to numeric, handling NaN values and generating new features from already existing features. 

###### Load datasets 

In [2]:
train_df = load_data('../data/train-data.csv')
test_df = load_data('../data/test-data.csv')

###### missing values 

In [3]:
train_df.isna().sum()

Store                        0
StoreType                    0
Assortment                   0
CompetitionDistance          0
CompetitionOpenSinceMonth    0
CompetitionOpenSinceYear     0
Promo2                       0
Promo2SinceWeek              0
Promo2SinceYear              0
PromoInterval                0
DayOfWeek                    0
Date                         0
Sales                        0
Customers                    0
Open                         0
Promo                        0
StateHoliday                 0
SchoolHoliday                0
Days                         0
months                       0
Years                        0
DayOfYear                    0
WeekOfYear                   0
dtype: int64

In [4]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1014567 entries, 0 to 1014566
Data columns (total 23 columns):
 #   Column                     Non-Null Count    Dtype  
---  ------                     --------------    -----  
 0   Store                      1014567 non-null  int64  
 1   StoreType                  1014567 non-null  object 
 2   Assortment                 1014567 non-null  object 
 3   CompetitionDistance        1014567 non-null  float64
 4   CompetitionOpenSinceMonth  1014567 non-null  int64  
 5   CompetitionOpenSinceYear   1014567 non-null  int64  
 6   Promo2                     1014567 non-null  int64  
 7   Promo2SinceWeek            1014567 non-null  int64  
 8   Promo2SinceYear            1014567 non-null  int64  
 9   PromoInterval              1014567 non-null  object 
 10  DayOfWeek                  1014567 non-null  int64  
 11  Date                       1014567 non-null  object 
 12  Sales                      1014567 non-null  float64
 13  Customers   

In [5]:
test_df.isna().sum()

Store                        0
StoreType                    0
Assortment                   0
CompetitionDistance          0
CompetitionOpenSinceMonth    0
CompetitionOpenSinceYear     0
Promo2                       0
Promo2SinceWeek              0
Promo2SinceYear              0
PromoInterval                0
Id                           0
DayOfWeek                    0
Date                         0
Open                         0
Promo                        0
StateHoliday                 0
SchoolHoliday                0
Days                         0
months                       0
Years                        0
DayOfYear                    0
WeekOfYear                   0
dtype: int64

##### converting date column to datetime 

In [6]:
c = clean_data(train_df)
c.convert_to_datetime(train_df,['Date'])

In [7]:
d = clean_data(test_df)
d.convert_to_datetime(test_df,['Date'])

###### convert to int 

In [8]:
clean_data(test_df).convert_to_int(test_df,['Days','months','Years','DayOfYear','WeekOfYear'])

###### converting non-numeric to numeric

In [9]:
r = preprocess(train_df)
r.label_encoder(train_df)

In [10]:
train_df.info == 'object'

False

In [11]:
train_df['StoreType'].unique()

array([0, 1, 2, 3], dtype=int64)

all columns in train data have been converted to numeric 

In [12]:
t = preprocess(test_df)
t.label_encoder(test_df)

In [13]:
test_df.info == 'object'

False

all columns in test data have been converted to numeric 

###### feature engineering 

In [14]:
def feat_eng(df):
    df['weekends'] = (df['DayOfWeek']//5 ==1).astype(int)
    df['weekdays'] = (df['DayOfWeek']//5 !=1).astype(int)

In [15]:
feat_eng(train_df)
feat_eng(test_df)

In [16]:
train_df.head()

Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,...,Promo,StateHoliday,SchoolHoliday,Days,months,Years,DayOfYear,WeekOfYear,weekends,weekdays
0,1,0,0,1270.0,9,2008,0,0,0,0,...,1,0,1,31,7,2015,212,31,1,0
1,1,0,0,1270.0,9,2008,0,0,0,0,...,1,0,1,30,7,2015,211,31,0,1
2,1,0,0,1270.0,9,2008,0,0,0,0,...,1,0,1,29,7,2015,210,31,0,1
3,1,0,0,1270.0,9,2008,0,0,0,0,...,1,0,1,28,7,2015,209,31,0,1
4,1,0,0,1270.0,9,2008,0,0,0,0,...,1,0,1,27,7,2015,208,31,0,1


In [17]:
test_df.head()

Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,...,Promo,StateHoliday,SchoolHoliday,Days,months,Years,DayOfYear,WeekOfYear,weekends,weekdays
0,1,0,0,1270.0,9,2008,0,0,0,0,...,1.0,0,0.0,17,9,2015,260,38,0,1
1,1,0,0,1270.0,9,2008,0,0,0,0,...,1.0,0,0.0,16,9,2015,259,38,0,1
2,1,0,0,1270.0,9,2008,0,0,0,0,...,1.0,0,0.0,15,9,2015,258,38,0,1
3,1,0,0,1270.0,9,2008,0,0,0,0,...,1.0,0,0.0,14,9,2015,257,38,0,1
4,1,0,0,1270.0,9,2008,0,0,0,0,...,0.0,0,0.0,13,9,2015,256,37,1,0


###### feature scaling

datetime columns have to be ignored before doing feature scaling 

In [18]:
def separate_columns(df):
    datetime_columm =['Date','DayOfWeek'] + df.iloc[:,-7:].columns.tolist()
    no_datetime = df.columns.difference(datetime_columm).tolist()
    df_ = df[no_datetime] 
    return df_,datetime_columm

In [19]:
train_df_no_datetime = separate_columns(train_df)[0]
test_df_no_datetime = separate_columns(test_df)[0]


###### scale data 

In [20]:
r.scalling_data(train_df_no_datetime)
t.scalling_data(test_df_no_datetime)

###### concat scaled data and unscaled data 

In [21]:
train_transformed  = pd.concat([train_df_no_datetime,train_df[separate_columns(train_df)[1]]],axis=1)
test_transformed = pd.concat([test_df_no_datetime,train_df[separate_columns(test_df)[1]]],axis=1)

In [25]:
len(train_transformed.columns) == len(train_df.columns)

True

In [26]:
len(test_transformed.columns) == len(test_df.columns)

True

In [28]:
train_transformed.to_csv('../data/train_preprocessed.csv')
test_transformed.to_csv('../data/test_preprocessed.csv')