In [None]:
### Import required libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import seaborn as sns

import os

In [None]:
### Make sure that 'ggplot' style is used for all plots
# Set once here, before any figure is created further down in the notebook.
plt.style.use('ggplot')
# plt.style.available ### To view all other available styles

In [None]:
### Set Working Directory (WD)
# All relative paths used later ('dataset/...', 'figures/...') are resolved against this folder.
# The location is machine specific, so it can be overridden with the CAPSTONE_PROJECT_DIR
# environment variable; when the variable is not set the original path is used.
os.chdir(os.environ.get(
    'CAPSTONE_PROJECT_DIR',
    '/Volumes/GoogleDrive/My Drive/CEMEX/Data Translators/GitHub/rgamerosl/capstone-project',
))

In [None]:
# ### How to import RDS (equivalent to RData) into pandas

# import rpy2.robjects as robjects
# from rpy2.robjects.packages import importr
# from rpy2.robjects import pandas2ri

# from rpy2.robjects.conversion import localconverter

# readRDS = robjects.r['readRDS']
# rdata = readRDS('dataset/Fuel_Data.RDS')

# with localconverter(robjects.default_converter + pandas2ri.converter):
#   pdata = robjects.conversion.rpy2py(rdata)

# print(pdata.info())
# display(pdata.head(5))

In [None]:
### Read the data
# The path is relative, so it is resolved against the current working directory.
df = pd.read_csv("dataset/Fuel_Data.csv")
# Rich display of the loaded frame for a first visual check.
display(df)

In [None]:
# Column dtypes and non-null counts of the raw data.
df.info()

In [None]:
### Fill with 0 the NA for the different events
# The columns are selected by position (16 up to and including 32), not by name.
# NOTE(review): presumably these are the event columns — confirm against the CSV header.
df.iloc[:,16:33] = df.iloc[:,16:33].fillna(0)
# Re-check the non-null counts after the fill.
df.info()

In [None]:
# First 10 rows of the frame after the NA fill.
display(df.head(10))

In [None]:
# Build df0 as a copy of df without the columns listed below.
df0 = df.drop(
    columns=[
        'Date',
        'Plate',
        'Zone',
        'Hrs_eff',
        'Engine_hrs',
        'Fuel_used',
        'km_per_liter',
    ]
)
display(df0.head(10))

In [None]:
# Column dtypes and non-null counts after removing the unused columns.
df0.info()

In [None]:
### Correlation heatmap of the first 11 columns of df0
# plt.subplots returns a (figure, axes) pair: unpack it and draw on that axes explicitly
# instead of relying on the implicit "current axes".
fig, ax = plt.subplots(figsize=(10,10))
sns.heatmap(df0.iloc[:,0:11].corr(), annot=True, fmt='0.2f', cmap='Blues', ax=ax)
# Keep the row labels horizontal so they stay readable.
plt.yticks(rotation=0)
# Create the output folder if needed so that saving works on a fresh checkout.
os.makedirs('figures', exist_ok=True)
fig.savefig('figures/correlations1.png')

In [None]:
### Correlation heatmap of a positional subset of df0 columns
# Column positions to correlate; the same numbers are used as tick labels to keep the
# large figure compact.
subset = [7,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25]
# plt.subplots returns a (figure, axes) pair: unpack it and draw on that axes explicitly
# instead of relying on the implicit "current axes".
fig, ax = plt.subplots(figsize=(16,16))
sns.heatmap(df0.iloc[:,subset].corr(), annot=True, fmt='0.2f', cmap='Blues',
            xticklabels=subset, yticklabels=subset, ax=ax)
# Keep the row labels horizontal so they stay readable.
plt.yticks(rotation=0)
# Create the output folder if needed so that saving works on a fresh checkout.
os.makedirs('figures', exist_ok=True)
fig.savefig('figures/correlations2.png')

In [None]:
# Keep only the rows where 'liters_per_hour' is present and renumber them 0..n-1.
# reset_index(drop=True) discards the old index directly, which avoids the in-place
# reset/drop pair on the frame returned by dropna. The fresh 0..n-1 index is what the
# later index-based joins with the one-hot frames rely on.
df1 = df0.dropna(subset=['liters_per_hour']).reset_index(drop=True)
df1.info()

In [None]:
# df1.to_excel('dataset/data_v2.xlsx')

In [None]:
### One-hot encode the 'Manufacturer' column
oe_manufacturer = OneHotEncoder()
oe_results_m = oe_manufacturer.fit_transform(df1[['Manufacturer']])
# categories_ holds one array per encoded feature. Only 'Manufacturer' was encoded, so its
# first entry is the flat list of labels; passing the whole list would build a MultiIndex.
manufacturer_ohe = pd.DataFrame(oe_results_m.toarray(), columns=oe_manufacturer.categories_[0])
# display() returns None, so it must not be wrapped in print().
display(manufacturer_ohe.head(10))
manufacturer_ohe.info()

In [None]:
# Append the manufacturer indicator columns; the join aligns on the row index.
df2 = df1.join(manufacturer_ohe)
### Drop column for Kenworth; before dropping it, column 29 corresponded to the manufacturer Kenworth
# NOTE(review): the column is picked by position, so this silently targets a different
# column if the column layout or the set of manufacturers changes.
df2 = df2.drop(columns=df2.columns[29])
# display() returns None, so it must not be wrapped in print().
display(df2.head(10))
df2.info()

In [None]:
### Weekdays (0: Monday to 6: Sunday)
oe_weekday = OneHotEncoder()
oe_results_w = oe_weekday.fit_transform(df2[['Weekday']])
# The encoder sorts its categories, so with codes 0..6 the columns line up with
# Monday..Sunday. NOTE(review): this assumes all seven weekdays occur in the data; with
# fewer, the DataFrame construction fails on a column-count mismatch.
weekday_ohe = pd.DataFrame(
    oe_results_w.toarray(),
    columns=['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday'],
)
# display() returns None, so it must not be wrapped in print().
display(weekday_ohe.head(10))
weekday_ohe.info()

In [None]:
# Append the weekday indicator columns; the join aligns on the row index.
df2 = df2.join(weekday_ohe)
### Drop column for Friday; before dropping it, column 34 corresponded to the label Friday
# NOTE(review): the column is picked by position, so this silently targets a different
# column if the column layout changes.
df2 = df2.drop(columns=df2.columns[34])
# display() returns None, so it must not be wrapped in print().
display(df2.head(10))
df2.info()

In [None]:
### One-hot encode the 'City' column
oe_city = OneHotEncoder()
oe_results_c = oe_city.fit_transform(df2[['City']])
# categories_ holds one array per encoded feature. Only 'City' was encoded, so its first
# entry is the flat list of labels; passing the whole list would build a MultiIndex.
city_ohe = pd.DataFrame(oe_results_c.toarray(), columns=oe_city.categories_[0])
# display() returns None, so it must not be wrapped in print().
display(city_ohe.head(10))
city_ohe.info()

In [None]:
# Append the city indicator columns; the join aligns on the row index.
df2 = df2.join(city_ohe)
### Drop column for MEXICO DF; before dropping it, column 80 corresponded to the label MEXICO DF
# NOTE(review): the column is picked by position, so this silently targets a different
# column if the column layout or the set of cities changes.
df2 = df2.drop(columns=df2.columns[80])
# display() returns None, so it must not be wrapped in print().
display(df2.head(10))
df2.info()

In [None]:
# ### Another approach to categorical/indicator variables, using get_dummies properly
# import pandas as pd

# from pandas.api.types import CategoricalDtype 

# # say you want a column for "japan" too (it'll be always zero, of course)
# df["country"] = train_df["country"].astype(CategoricalDtype(["australia","germany","korea","russia","japan"]))

# # now call .get_dummies() as usual
# pd.get_dummies(df["country"],prefix='country')

In [None]:
# df2.to_excel("dataset/final_data.xlsx")

In [None]:
# The original categorical columns are removed now that df2 carries their indicator columns.
data = df2.drop(columns=['Manufacturer', 'City', 'Weekday'])
data.info()

In [None]:
### Next: do the train/test split and afterwards standardize (StandardScaler) all the numerical variables in each set separately

In [None]:
### Train Test split
# 25% of the rows go to the test set; rows are shuffled with a fixed seed (42) so the
# partition is the same on every run.
data_train, data_test = train_test_split(data, test_size=0.25, random_state=42, shuffle=True)

In [None]:
# Labels of the first 23 columns of `data`; the scaling cells standardize exactly these.
# NOTE(review): presumably these are all the numerical (non-indicator) columns — confirm.
col_indexes = data.columns[0:23]

In [None]:
### Standardize numerical variables in Train Set
# Work on a deep copy so the unscaled training frame stays available.
data_train_scale = data_train.copy(deep=True)
scaler = StandardScaler()
# Fit on the selected training columns and write the scaled values back in place of them.
train_values = data_train[col_indexes].to_numpy()
data_train_scale[col_indexes] = scaler.fit_transform(train_values)
display(data_train_scale.head(10))

In [None]:
### Standardize numerical variables in Test Set
# Reuse the scaler that was fitted on the training set instead of fitting a new one here.
# Fitting on the test set would scale train and test with different means/variances and
# would let test-set statistics leak into the preprocessing.
# Work on a deep copy so the unscaled test frame stays available.
data_test_scale = data_test.copy(deep=True)
data_test_scale[col_indexes] = scaler.transform(data_test[col_indexes].to_numpy())
display(data_test_scale.head(10))