In [1]:
import warnings
import pandas as pd 
import numpy as np
import sys
sys.path.append("..")
warnings.simplefilter("ignore")
from Scripts.Data_cleaning import load_data,clean_data
from Scripts.preprocessing import preprocess
from Scripts.modelling import modeler
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

It is important to process the data into a format where it can be fed to a machine learning model. This typically means converting all non-numeric columns to numeric, handling NaN values and generating new features from already existing features. 

###### Load datasets 

In [2]:
train_df = load_data('../data/train-data.csv')
test_df = load_data('../data/test-data.csv')

###### missing values 

In [3]:
train_df.isna().sum()

Store                        0
StoreType                    0
Assortment                   0
CompetitionDistance          0
CompetitionOpenSinceMonth    0
CompetitionOpenSinceYear     0
Promo2                       0
Promo2SinceWeek              0
Promo2SinceYear              0
PromoInterval                0
DayOfWeek                    0
Date                         0
Sales                        0
Customers                    0
Open                         0
Promo                        0
StateHoliday                 0
SchoolHoliday                0
Days                         0
months                       0
Years                        0
DayOfYear                    0
WeekOfYear                   0
dtype: int64

In [4]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1014567 entries, 0 to 1014566
Data columns (total 23 columns):
 #   Column                     Non-Null Count    Dtype  
---  ------                     --------------    -----  
 0   Store                      1014567 non-null  int64  
 1   StoreType                  1014567 non-null  object 
 2   Assortment                 1014567 non-null  object 
 3   CompetitionDistance        1014567 non-null  float64
 4   CompetitionOpenSinceMonth  1014567 non-null  int64  
 5   CompetitionOpenSinceYear   1014567 non-null  int64  
 6   Promo2                     1014567 non-null  int64  
 7   Promo2SinceWeek            1014567 non-null  int64  
 8   Promo2SinceYear            1014567 non-null  int64  
 9   PromoInterval              1014567 non-null  object 
 10  DayOfWeek                  1014567 non-null  int64  
 11  Date                       1014567 non-null  object 
 12  Sales                      1014567 non-null  float64
 13  Customers   

In [5]:
test_df.isna().sum()

Store                        0
StoreType                    0
Assortment                   0
CompetitionDistance          0
CompetitionOpenSinceMonth    0
CompetitionOpenSinceYear     0
Promo2                       0
Promo2SinceWeek              0
Promo2SinceYear              0
PromoInterval                0
Id                           0
DayOfWeek                    0
Date                         0
Open                         0
Promo                        0
StateHoliday                 0
SchoolHoliday                0
Days                         0
months                       0
Years                        0
DayOfYear                    0
WeekOfYear                   0
dtype: int64

##### converting date column to datetime 

In [6]:
c = clean_data(train_df)
c.convert_to_datetime(train_df,['Date'])

In [7]:
d = clean_data(test_df)
d.convert_to_datetime(test_df,['Date'])

###### convert to int 

In [8]:
clean_data(test_df).convert_to_int(test_df,['Days','months','Years','DayOfYear','WeekOfYear'])

###### converting non-numeric to numeric

In [9]:
r = preprocess(train_df)
r.label_encoder(train_df)

In [10]:
train_df.info == 'object'

False

In [11]:
train_df['StoreType'].unique()

array([0, 1, 2, 3], dtype=int64)

all columns in train data have been converted to numeric 

In [12]:
t = preprocess(test_df)
t.label_encoder(test_df)

In [13]:
test_df.info == 'object'

False

all columns in test data have been converted to numeric 

###### feature engineering 

In [14]:
def feat_eng(df):
    df['weekends'] = (df['DayOfWeek']//5 ==1).astype(int)
    df['weekdays'] = (df['DayOfWeek']//5 !=1).astype(int)

In [15]:
feat_eng(train_df)
feat_eng(test_df)

In [16]:
train_df.head()

Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,...,Promo,StateHoliday,SchoolHoliday,Days,months,Years,DayOfYear,WeekOfYear,weekends,weekdays
0,1,0,0,1270.0,9,2008,0,0,0,0,...,1,0,1,31,7,2015,212,31,1,0
1,1,0,0,1270.0,9,2008,0,0,0,0,...,1,0,1,30,7,2015,211,31,0,1
2,1,0,0,1270.0,9,2008,0,0,0,0,...,1,0,1,29,7,2015,210,31,0,1
3,1,0,0,1270.0,9,2008,0,0,0,0,...,1,0,1,28,7,2015,209,31,0,1
4,1,0,0,1270.0,9,2008,0,0,0,0,...,1,0,1,27,7,2015,208,31,0,1


In [17]:
test_df.head()

Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,...,Promo,StateHoliday,SchoolHoliday,Days,months,Years,DayOfYear,WeekOfYear,weekends,weekdays
0,1,0,0,1270.0,9,2008,0,0,0,0,...,1.0,0,0.0,17,9,2015,260,38,0,1
1,1,0,0,1270.0,9,2008,0,0,0,0,...,1.0,0,0.0,16,9,2015,259,38,0,1
2,1,0,0,1270.0,9,2008,0,0,0,0,...,1.0,0,0.0,15,9,2015,258,38,0,1
3,1,0,0,1270.0,9,2008,0,0,0,0,...,1.0,0,0.0,14,9,2015,257,38,0,1
4,1,0,0,1270.0,9,2008,0,0,0,0,...,0.0,0,0.0,13,9,2015,256,37,1,0


###### feature scaling

datetime columns have to be ignored before doing feature scaling 

In [18]:
def separate_columns(df):
    datetime_columm =['Date','DayOfWeek'] + df.iloc[:,-7:].columns.tolist() + ['Sales']
    no_datetime = df.columns.difference(datetime_columm).tolist()
    df_ = df[no_datetime] 
    return df_,datetime_columm

In [19]:
train_df_no_datetime = separate_columns(train_df)[0]
test_df_no_datetime = separate_columns(test_df)[0]


###### scale data 

In [20]:
r.scalling_data(train_df_no_datetime)
t.scalling_data(test_df_no_datetime)

###### concat scaled data and unscaled data 

In [21]:
train_transformed  = pd.concat([train_df_no_datetime,train_df[separate_columns(train_df)[1]]],axis=1)
test_transformed = pd.concat([test_df_no_datetime,train_df[separate_columns(test_df)[1]]],axis=1)

In [22]:
len(train_transformed.columns) == len(train_df.columns)

True

In [23]:
len(test_transformed.columns) == len(test_df.columns)

False

# Modelling 

In [24]:
clean_data(train_transformed).drop_cols(train_transformed,['Date'])

Unnamed: 0,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Customers,Open,Promo,Promo2,Promo2SinceWeek,Promo2SinceYear,...,StoreType,DayOfWeek,Days,months,Years,DayOfYear,WeekOfYear,weekends,weekdays,Sales
0,0.0,0.062406,0.727273,0.947368,0.407489,1.0,1.0,0.0,0.00,0.000000,...,0.000000,5,31,7,2015,212,31,1,0,5263.0
1,0.0,0.062406,0.727273,0.947368,0.400881,1.0,1.0,0.0,0.00,0.000000,...,0.000000,4,30,7,2015,211,31,0,1,5020.0
2,0.0,0.062406,0.727273,0.947368,0.383994,1.0,1.0,0.0,0.00,0.000000,...,0.000000,3,29,7,2015,210,31,0,1,4782.0
3,0.0,0.062406,0.727273,0.947368,0.411160,1.0,1.0,0.0,0.00,0.000000,...,0.000000,2,28,7,2015,209,31,0,1,5011.0
4,0.0,0.062406,0.727273,0.947368,0.449339,1.0,1.0,0.0,0.00,0.000000,...,0.000000,1,27,7,2015,208,31,0,1,6102.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1014562,0.5,0.266101,0.000000,0.000000,0.248899,1.0,0.0,1.0,0.44,0.998511,...,0.666667,6,5,1,2013,5,1,1,0,4771.0
1014563,0.5,0.266101,0.000000,0.000000,0.239354,1.0,0.0,1.0,0.44,0.998511,...,0.666667,5,4,1,2013,4,1,1,0,4540.0
1014564,0.5,0.266101,0.000000,0.000000,0.220264,1.0,0.0,1.0,0.44,0.998511,...,0.666667,4,3,1,2013,3,1,0,1,4297.0
1014565,0.5,0.266101,0.000000,0.000000,0.223935,1.0,0.0,1.0,0.44,0.998511,...,0.666667,3,2,1,2013,2,1,0,1,3697.0


###### Random forest  

In [25]:
#### split the data
X_train, X_test, X_val, y_train, y_test,y_val = modeler().split_data(train_transformed,'Sales')

In [26]:
RandomForest_pipeline=Pipeline([
                     ('imputer', SimpleImputer(strategy='mean')),
                    ('scaler',StandardScaler()),
                     ('rf_regressor',RandomForestRegressor(n_jobs=-1, n_estimators=15,))])

In [27]:
rf_model = RandomForest_pipeline.fit(X_train,y_train)