In [None]:
### Import required libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

import os

In [None]:
### Use the 'ggplot' style for every plot drawn after this cell
plt.style.use('ggplot')
# Tip: evaluate `plt.style.available` to list all other available styles

In [None]:
### Set Working Directory (WD)
# The project root defaults to the original author's Google Drive location, but it can be
# overridden with the CAPSTONE_PROJECT_DIR environment variable so that the notebook also
# runs on other machines without editing this cell.
project_dir = os.environ.get(
    'CAPSTONE_PROJECT_DIR',
    '/Volumes/GoogleDrive/My Drive/CEMEX/Data Translators/GitHub/rgamerosl/capstone-project',
)
# Relative paths used later (e.g. when reading the dataset) are resolved from here
os.chdir(project_dir)

In [None]:
### Read the data
# The path is relative, so it is resolved against the current working directory
df = pd.read_excel("dataset/data_v0.xlsx")

In [None]:
### Overview of the loaded data: columns, dtypes and non-null counts
df.info()

In [None]:
### Look at the very last row of the loaded sheet
df.iloc[-1]

In [None]:
### For some reason, reading the Excel file yields 1,021,336 entries. However, only the
### first 466,786 rows are different (real) entries; everything after them is just empty rows.
### Keep only those rows and the first 18 columns.
n_valid_rows = 466786
n_valid_cols = 18
df = df.iloc[:n_valid_rows, :n_valid_cols]
df.info()

In [None]:
### Keep an independent (deep) copy of the trimmed data before any cleaning is applied
df_backup = df.copy()

In [None]:
### Show the first 7 rows
display(df.iloc[:7])

In [None]:
### Summary of the target column exactly as it was loaded
df.liters_per_hour.describe()

In [None]:
### Convert the target column to float in two steps:
###   1. empty / whitespace-only strings become nan
###   2. the remaining values are cast to float
liters_without_blanks = df['liters_per_hour'].replace(r'^\s*$', np.nan, regex=True)
df['liters_per_hour'] = liters_without_blanks.astype(float)

In [None]:
### Summary of the target column (same call as before the float conversion, for comparison)
df.liters_per_hour.describe()

In [None]:
### Histogram of the target column
# Missing values are dropped explicitly so the plot never depends on how the
# installed matplotlib / numpy versions treat nan inside the data.
fig, ax = plt.subplots()
ax.hist(df['liters_per_hour'].dropna())
# Label the figure so it can be understood on its own when skimming the notebook
ax.set_xlabel('liters_per_hour')
ax.set_ylabel('Number of observations')
ax.set_title('Distribution of liters_per_hour')
plt.show()

In [None]:
### Only values inside the closed range [0, 20] are kept:
### everything bigger than 20 and every negative value becomes nan
liters = df['liters_per_hour']
df['liters_per_hour'] = liters.where(liters.between(0, 20))

In [None]:
### Histogram of the target column
# Missing values are dropped explicitly so the plot never depends on how the
# installed matplotlib / numpy versions treat nan inside the data.
fig, ax = plt.subplots()
ax.hist(df['liters_per_hour'].dropna())
# Label the figure so it can be understood on its own when skimming the notebook
ax.set_xlabel('liters_per_hour')
ax.set_ylabel('Number of observations')
ax.set_title('Distribution of liters_per_hour')
plt.show()

In [None]:
### Summary statistics of the target column at this point of the cleaning
df.liters_per_hour.describe()

In [None]:
### Share of nan in liters_per_hour (target value), as a fraction of all rows
df['liters_per_hour'].isna().mean()

In [None]:
### List every distinct spelling found in the Manufacturer column
df.Manufacturer.unique()

In [None]:
### Number of observations (rows) available for each manufacturer
df['Manufacturer'].value_counts()

In [None]:
### Group different spellings of the same manufacturer under one canonical name

# Exact-match mapping: spelling found in the data -> canonical spelling
manufacturer_spellings = {
    "INTERNATIONAL": "International",
    "FREIGHTLINER": "Freightliner",
    "freightliner": "Freightliner",
    "MAN": "Man",
}
df['Manufacturer'] = df['Manufacturer'].replace(manufacturer_spellings)

In [None]:
### Count the number of different trucks (plates) analysed per manufacturer
# Total number of distinct plates in the backup copy (a missing plate, if any, counts as one value)
n_distinct_plates = df_backup['Plate'].nunique(dropna=False)
print(n_distinct_plates)
df.groupby('Manufacturer')['Plate'].nunique()

### Something could be wrong: there are only 2330 different plate numbers, but the
### per-manufacturer counts add up to 2592 plates.

In [None]:
### For every plate, count how many different manufacturers it is listed under
Plates = df.groupby('Plate')['Manufacturer'].nunique()
display(Plates.iloc[:10])

In [None]:
### Plates that are listed under more than one manufacturer
multi_manufacturer_plates = Plates[Plates > 1]
# Print the count explicitly: a bare expression that is not the last statement
# of a cell is evaluated but its value is never displayed.
print(len(multi_manufacturer_plates))
display(multi_manufacturer_plates)

### There are 262 plate numbers that are listed with 2 different manufacturers; what should we do with those?
### Maybe look for an extra dataset containing only plate number and manufacturer, to validate the real manufacturer of these 262 trucks

In [None]:
### Example: manufacturers recorded for the plate "CR3535"
is_cr3535 = df["Plate"] == "CR3535"
df.loc[is_cr3535, "Manufacturer"].unique()

In [None]:
### Considering that there is only 1 truck from each of the following manufacturers
### (Astra, Scania and Volvo), these 3 trucks are left out of the analysis
is_rare_manufacturer = df['Manufacturer'].isin(["Astra", "Scania", "Volvo"])
df2 = df[~is_rare_manufacturer]
# Rows per manufacturer, then distinct plates per manufacturer, after the filter
print(df2['Manufacturer'].value_counts())
print(df2.groupby('Manufacturer')['Plate'].nunique())

In [None]:
### Overview of the filtered data: columns, dtypes and non-null counts
df2.info()