In [1]:
# Import Libraries
import sys
sys.path.append('C:/Users/rohan/Documents/Projects/Food_Demand_Forecasting_Challenge/Food_Demand_Forecasting_Challenge')

'''Standard libraries'''
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotnine as p9
%matplotlib inline
color = sns.color_palette()

'''Data Pre-Processing'''
from sklearn import model_selection
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline

'''Local Libraries'''
from src.features import preprocess_ml_data

'''Display Settings'''
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load data files: the built feature tables first, then the target tables.
# Files are read in the listed order and bound to the names on the left.
(
    train_features,
    val_features,
    test_features,
    full_original_train_features,
    full_original_test_features,
) = [
    pd.read_csv(csv_path)
    for csv_path in (
        r'../data/processed/built_features/train_features.csv',
        r'../data/processed/built_features/val_features.csv',
        r'../data/processed/built_features/test_features.csv',
        r'../data/processed/built_features/full_original_train_features.csv',
        r'../data/processed/built_features/full_original_test_features.csv',
    )
]

# No target file is loaded for the full original test set.
(
    train_target,
    val_target,
    test_target,
    full_original_train_target,
) = [
    pd.read_csv(csv_path)
    for csv_path in (
        r'../data/processed/target/train_target.csv',
        r'../data/processed/target/val_target.csv',
        r'../data/processed/target/test_target.csv',
        r'../data/processed/target/full_original_train_target.csv',
    )
]

In [3]:
# Summarise the full original training features: index range, column names,
# non-null counts and dtypes.
full_original_train_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 456548 entries, 0 to 456547
Data columns (total 75 columns):
level_0                                                                456548 non-null int64
index                                                                  456548 non-null int64
id                                                                     456548 non-null int64
week                                                                   456548 non-null int64
center_id                                                              456548 non-null int64
meal_id                                                                456548 non-null int64
checkout_price                                                         456548 non-null float64
base_price                                                             456548 non-null float64
emailer_for_promotion                                                  456548 non-null int64
homepage_featured                               

In [4]:
# Summarise the full original test features: index range, column names,
# non-null counts and dtypes.
full_original_test_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32573 entries, 0 to 32572
Data columns (total 75 columns):
level_0                                                                32573 non-null int64
index                                                                  32573 non-null int64
id                                                                     32573 non-null int64
week                                                                   32573 non-null int64
center_id                                                              32573 non-null int64
meal_id                                                                32573 non-null int64
checkout_price                                                         32573 non-null float64
base_price                                                             32573 non-null float64
emailer_for_promotion                                                  32573 non-null int64
homepage_featured                                          

In [5]:
# Remove the irrelevant columns from every feature set; each result is bound to
# a new `*_1` name so the frames loaded from disk stay referenced as-is.
drop_irrelevant = preprocess_ml_data.drop_irrelevant_features

train_features_1 = drop_irrelevant(train_features)
val_features_1 = drop_irrelevant(val_features)
test_features_1 = drop_irrelevant(test_features)

full_original_train_features_1 = drop_irrelevant(full_original_train_features)
full_original_test_features_1 = drop_irrelevant(full_original_test_features)

In [8]:
# Separate numerical features from categorical features.
# Each dataset is split exactly once and both parts are taken from that single
# result, rather than re-running the split a second time for the categorical part.
def _split_num_cat(features):
    """Split one feature set with ``separate_num_cat_features`` in a single call.

    Returns a 2-tuple made of elements 0 and 1 of the helper's result, which
    this notebook binds to the ``*_num_features`` and ``*_cat_features`` names
    respectively.
    """
    separated = preprocess_ml_data.separate_num_cat_features(features)
    return separated[0], separated[1]


train_num_features, train_cat_features = _split_num_cat(train_features_1)
val_num_features, val_cat_features = _split_num_cat(val_features_1)
test_num_features, test_cat_features = _split_num_cat(test_features_1)

full_original_train_num_features, full_original_train_cat_features = _split_num_cat(full_original_train_features_1)
full_original_test_num_features, full_original_test_cat_features = _split_num_cat(full_original_test_features_1)

In [9]:
# Transform the numerical features. The helper is described as imputing missing
# values with the column median and rescaling each column to the [0, 1] range.
# NOTE(review): every split is passed to the helper on its own. If the helper
# fits its imputer/scaler on the frame it receives, validation and test data are
# scaled with their own statistics rather than the training set's -- confirm.
(
    train_num_features_transformed,
    val_num_features_transformed,
    test_num_features_transformed,
    full_original_train_num_features_transformed,
    full_original_test_num_features_transformed,
) = [
    preprocess_ml_data.transform_num_features(num_features)
    for num_features in (
        train_num_features,
        val_num_features,
        test_num_features,
        full_original_train_num_features,
        full_original_test_num_features,
    )
]

In [10]:
# Transform the categorical features. The helper is described as imputing
# missing values with the most frequent value and then one-hot encoding.
# NOTE(review): every split is encoded on its own. If the helper fits a fresh
# encoder per call, splits with different category sets may end up with
# different output columns -- confirm.
(
    train_cat_features_transformed,
    val_cat_features_transformed,
    test_cat_features_transformed,
    full_original_train_cat_features_transformed,
    full_original_test_cat_features_transformed,
) = map(
    preprocess_ml_data.transform_cat_features,
    (
        train_cat_features,
        val_cat_features,
        test_cat_features,
        full_original_train_cat_features,
        full_original_test_cat_features,
    ),
)

In [11]:
# NOTE(review): this whole cell is commented out, so it does nothing when run.
# The outputs of transform_cat_features are therefore handed to
# transform_all_features without this sparse-matrix -> DataFrame conversion.
# Delete the cell if the conversion is no longer needed.

# # Takes a sparse matrix and converts it to a dataframe

# train_cat_features_transformed = preprocess_ml_data.convert_spmatrix_to_dataframe(train_cat_features_transformed)
# val_cat_features_transformed   = preprocess_ml_data.convert_spmatrix_to_dataframe(val_cat_features_transformed)
# test_cat_features_transformed  = preprocess_ml_data.convert_spmatrix_to_dataframe(test_cat_features_transformed)

# full_original_train_cat_features_transformed = preprocess_ml_data.convert_spmatrix_to_dataframe(full_original_train_cat_features_transformed)
# full_original_test_cat_features_transformed = preprocess_ml_data.convert_spmatrix_to_dataframe(full_original_test_cat_features_transformed)

In [12]:
# Build the final model-ready feature set of each split by handing its
# transformed numerical and categorical features to transform_all_features.
build_all_features = preprocess_ml_data.transform_all_features

train_all_features_transformed = build_all_features(
    train_num_features_transformed,
    train_cat_features_transformed,
)
val_all_features_transformed = build_all_features(
    val_num_features_transformed,
    val_cat_features_transformed,
)
test_all_features_transformed = build_all_features(
    test_num_features_transformed,
    test_cat_features_transformed,
)

full_original_train_all_features_transformed = build_all_features(
    full_original_train_num_features_transformed,
    full_original_train_cat_features_transformed,
)
full_original_test_all_features_transformed = build_all_features(
    full_original_test_num_features_transformed,
    full_original_test_cat_features_transformed,
)

In [13]:
# Save the pre-processed feature sets to disk as CSV, without the index column.
# Files are written in the listed order.
for final_features, csv_path in (
    (train_all_features_transformed, r'../data/processed/final_features/train_features.csv'),
    (val_all_features_transformed, r'../data/processed/final_features/val_features.csv'),
    (test_all_features_transformed, r'../data/processed/final_features/test_features.csv'),
    (full_original_train_all_features_transformed, r'../data/processed/final_features/full_original_train_features.csv'),
    (full_original_test_all_features_transformed, r'../data/processed/final_features/full_original_test_features.csv'),
):
    final_features.to_csv(csv_path, index=False)