# Deploy de Modelos de Machine Learning

## App Web Para Previsão de Preço de Carros

### Fonte de Dados

Usaremos uma versão modificada do dataset MPG Cars:
    
https://archive.ics.uci.edu/ml/datasets/auto+mpg

### Instalando e Carregando Pacotes

In [1]:
# Versão da Linguagem Python
from platform import python_version
print('Versão da Linguagem Python Usada Neste Jupyter Notebook:', python_version())

Versão da Linguagem Python Usada Neste Jupyter Notebook: 3.9.7


In [2]:
# Para atualizar um pacote, execute o comando abaixo no terminal ou prompt de comando:
# pip install -U nome_pacote

# Para instalar a versão exata de um pacote, execute o comando abaixo no terminal ou prompt de comando:
# pip install nome_pacote==versão_desejada

# Depois de instalar ou atualizar o pacote, reinicie o jupyter notebook.

# Instala o pacote watermark.
# Esse pacote é usado para gravar as versões de outros pacotes usados neste jupyter notebook.
#!pip install -q -U watermark

In [3]:
# Imports
import joblib
import sklearn
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score

In [4]:
# Versões dos pacotes usados neste jupyter notebook
%reload_ext watermark
%watermark --iversions

pandas : 1.3.4
joblib : 1.1.0
sklearn: 0.24.2
numpy  : 1.20.3



## Carregando e Explorando os Dados

In [5]:
# Carrega os dados
df = pd.read_csv('dados/carros.csv')

In [6]:
# Visualiza
df.head()

Unnamed: 0,Make,Model,Type,Origin,DriveTrain,MSRP,Invoice,EngineSize,Cylinders,Horsepower,MPG_City,MPG_Highway,Weight,Wheelbase,Length
0,Acura,MDX,SUV,Asia,All,"$36,945","$33,337",3.5,6.0,265,17,23,4451,106,189
1,Acura,RSX Type S 2dr,Sedan,Asia,Front,"$23,820","$21,761",2.0,4.0,200,24,31,2778,101,172
2,Acura,TSX 4dr,Sedan,Asia,Front,"$26,990","$24,647",2.4,4.0,200,22,29,3230,105,183
3,Acura,TL 4dr,Sedan,Asia,Front,"$33,195","$30,299",3.2,6.0,270,20,28,3575,108,186
4,Acura,3.5 RL 4dr,Sedan,Asia,Front,"$43,755","$39,014",3.5,6.0,225,18,24,3880,115,197


In [7]:
# Shape
df.shape

(428, 15)

In [8]:
# Colunas
df.columns

Index(['Make', 'Model', 'Type', 'Origin', 'DriveTrain', 'MSRP', 'Invoice',
       'EngineSize', 'Cylinders', 'Horsepower', 'MPG_City', 'MPG_Highway',
       'Weight', 'Wheelbase', 'Length'],
      dtype='object')

In [9]:
# Info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 428 entries, 0 to 427
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Make         428 non-null    object 
 1   Model        428 non-null    object 
 2   Type         428 non-null    object 
 3   Origin       428 non-null    object 
 4   DriveTrain   428 non-null    object 
 5   MSRP         428 non-null    object 
 6   Invoice      428 non-null    object 
 7   EngineSize   428 non-null    float64
 8   Cylinders    426 non-null    float64
 9   Horsepower   428 non-null    int64  
 10  MPG_City     428 non-null    int64  
 11  MPG_Highway  428 non-null    int64  
 12  Weight       428 non-null    int64  
 13  Wheelbase    428 non-null    int64  
 14  Length       428 non-null    int64  
dtypes: float64(2), int64(6), object(7)
memory usage: 50.3+ KB


In [10]:
# Verifica valores nulos
df.isnull().any()

Make           False
Model          False
Type           False
Origin         False
DriveTrain     False
MSRP           False
Invoice        False
EngineSize     False
Cylinders       True
Horsepower     False
MPG_City       False
MPG_Highway    False
Weight         False
Wheelbase      False
Length         False
dtype: bool

In [11]:
# Verifica valores nulos
df[df['Cylinders'].isnull()]

Unnamed: 0,Make,Model,Type,Origin,DriveTrain,MSRP,Invoice,EngineSize,Cylinders,Horsepower,MPG_City,MPG_Highway,Weight,Wheelbase,Length
247,Mazda,RX-8 4dr automatic,Sports,Asia,Rear,"$25,700","$23,794",1.3,,197,18,25,3053,106,174
248,Mazda,RX-8 4dr manual,Sports,Asia,Rear,"$27,200","$25,179",1.3,,238,18,24,3029,106,174


In [12]:
# Remove valores nulos
df = df.dropna()

In [13]:
# Reset do índice
df.reset_index(drop=True)

Unnamed: 0,Make,Model,Type,Origin,DriveTrain,MSRP,Invoice,EngineSize,Cylinders,Horsepower,MPG_City,MPG_Highway,Weight,Wheelbase,Length
0,Acura,MDX,SUV,Asia,All,"$36,945","$33,337",3.5,6.0,265,17,23,4451,106,189
1,Acura,RSX Type S 2dr,Sedan,Asia,Front,"$23,820","$21,761",2.0,4.0,200,24,31,2778,101,172
2,Acura,TSX 4dr,Sedan,Asia,Front,"$26,990","$24,647",2.4,4.0,200,22,29,3230,105,183
3,Acura,TL 4dr,Sedan,Asia,Front,"$33,195","$30,299",3.2,6.0,270,20,28,3575,108,186
4,Acura,3.5 RL 4dr,Sedan,Asia,Front,"$43,755","$39,014",3.5,6.0,225,18,24,3880,115,197
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
421,Volvo,C70 LPT convertible 2dr,Sedan,Europe,Front,"$40,565","$38,203",2.4,5.0,197,21,28,3450,105,186
422,Volvo,C70 HPT convertible 2dr,Sedan,Europe,Front,"$42,565","$40,083",2.3,5.0,242,20,26,3450,105,186
423,Volvo,S80 T6 4dr,Sedan,Europe,Front,"$45,210","$42,573",2.9,6.0,268,19,26,3653,110,190
424,Volvo,V40,Wagon,Europe,Front,"$26,135","$24,641",1.9,4.0,170,22,29,2822,101,180


In [14]:
# Verifica valores nulos
df.isnull().any()

Make           False
Model          False
Type           False
Origin         False
DriveTrain     False
MSRP           False
Invoice        False
EngineSize     False
Cylinders      False
Horsepower     False
MPG_City       False
MPG_Highway    False
Weight         False
Wheelbase      False
Length         False
dtype: bool

### Engenharia de Atributos

In [15]:
# Removemos colunas que não nos interessam
#df = df.drop(['Model', 'Origin', 'Invoice', 'EngineSize', 'Cylinders', 'Weight', 'Wheelbase', 'Length'], axis = 1)
df = df.drop(['Model', 'Origin', 'Invoice'], axis = 1)

In [16]:
# Colunas
df.columns

Index(['Make', 'Type', 'DriveTrain', 'MSRP', 'EngineSize', 'Cylinders',
       'Horsepower', 'MPG_City', 'MPG_Highway', 'Weight', 'Wheelbase',
       'Length'],
      dtype='object')

In [17]:
# Ajustamos a coluna target
df['MSRP'] = df['MSRP'].map(lambda x: x.lstrip('$').replace(',',''))

In [18]:
# Convertemos para tipo numérico
df['MSRP'] = pd.to_numeric(df['MSRP'])

In [19]:
# Visualiza os dados
df.head()

Unnamed: 0,Make,Type,DriveTrain,MSRP,EngineSize,Cylinders,Horsepower,MPG_City,MPG_Highway,Weight,Wheelbase,Length
0,Acura,SUV,All,36945,3.5,6.0,265,17,23,4451,106,189
1,Acura,Sedan,Front,23820,2.0,4.0,200,24,31,2778,101,172
2,Acura,Sedan,Front,26990,2.4,4.0,200,22,29,3230,105,183
3,Acura,Sedan,Front,33195,3.2,6.0,270,20,28,3575,108,186
4,Acura,Sedan,Front,43755,3.5,6.0,225,18,24,3880,115,197


### Pré-Processamento 

Aplicando One-Hot Encoding às variáveis categóricas.

In [20]:
# Info
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 426 entries, 0 to 427
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Make         426 non-null    object 
 1   Type         426 non-null    object 
 2   DriveTrain   426 non-null    object 
 3   MSRP         426 non-null    int64  
 4   EngineSize   426 non-null    float64
 5   Cylinders    426 non-null    float64
 6   Horsepower   426 non-null    int64  
 7   MPG_City     426 non-null    int64  
 8   MPG_Highway  426 non-null    int64  
 9   Weight       426 non-null    int64  
 10  Wheelbase    426 non-null    int64  
 11  Length       426 non-null    int64  
dtypes: float64(2), int64(7), object(3)
memory usage: 43.3+ KB


In [21]:
# One-hot encoding
df = pd.get_dummies(df, columns = ['Make', 'Type', 'DriveTrain'])

In [22]:
# Shape
df.shape

(426, 56)

In [23]:
# Visualiza
df.head()

Unnamed: 0,MSRP,EngineSize,Cylinders,Horsepower,MPG_City,MPG_Highway,Weight,Wheelbase,Length,Make_Acura,...,Make_Volvo,Type_Hybrid,Type_SUV,Type_Sedan,Type_Sports,Type_Truck,Type_Wagon,DriveTrain_All,DriveTrain_Front,DriveTrain_Rear
0,36945,3.5,6.0,265,17,23,4451,106,189,1,...,0,0,1,0,0,0,0,1,0,0
1,23820,2.0,4.0,200,24,31,2778,101,172,1,...,0,0,0,1,0,0,0,0,1,0
2,26990,2.4,4.0,200,22,29,3230,105,183,1,...,0,0,0,1,0,0,0,0,1,0
3,33195,3.2,6.0,270,20,28,3575,108,186,1,...,0,0,0,1,0,0,0,0,1,0
4,43755,3.5,6.0,225,18,24,3880,115,197,1,...,0,0,0,1,0,0,0,0,1,0


In [24]:
# Colunas
df.columns

Index(['MSRP', 'EngineSize', 'Cylinders', 'Horsepower', 'MPG_City',
       'MPG_Highway', 'Weight', 'Wheelbase', 'Length', 'Make_Acura',
       'Make_Audi', 'Make_BMW', 'Make_Buick', 'Make_Cadillac',
       'Make_Chevrolet', 'Make_Chrysler', 'Make_Dodge', 'Make_Ford',
       'Make_GMC', 'Make_Honda', 'Make_Hummer', 'Make_Hyundai',
       'Make_Infiniti', 'Make_Isuzu', 'Make_Jaguar', 'Make_Jeep', 'Make_Kia',
       'Make_Land Rover', 'Make_Lexus', 'Make_Lincoln', 'Make_MINI',
       'Make_Mazda', 'Make_Mercedes-Benz', 'Make_Mercury', 'Make_Mitsubishi',
       'Make_Nissan', 'Make_Oldsmobile', 'Make_Pontiac', 'Make_Porsche',
       'Make_Saab', 'Make_Saturn', 'Make_Scion', 'Make_Subaru', 'Make_Suzuki',
       'Make_Toyota', 'Make_Volkswagen', 'Make_Volvo', 'Type_Hybrid',
       'Type_SUV', 'Type_Sedan', 'Type_Sports', 'Type_Truck', 'Type_Wagon',
       'DriveTrain_All', 'DriveTrain_Front', 'DriveTrain_Rear'],
      dtype='object')

In [25]:
df.head()

Unnamed: 0,MSRP,EngineSize,Cylinders,Horsepower,MPG_City,MPG_Highway,Weight,Wheelbase,Length,Make_Acura,...,Make_Volvo,Type_Hybrid,Type_SUV,Type_Sedan,Type_Sports,Type_Truck,Type_Wagon,DriveTrain_All,DriveTrain_Front,DriveTrain_Rear
0,36945,3.5,6.0,265,17,23,4451,106,189,1,...,0,0,1,0,0,0,0,1,0,0
1,23820,2.0,4.0,200,24,31,2778,101,172,1,...,0,0,0,1,0,0,0,0,1,0
2,26990,2.4,4.0,200,22,29,3230,105,183,1,...,0,0,0,1,0,0,0,0,1,0
3,33195,3.2,6.0,270,20,28,3575,108,186,1,...,0,0,0,1,0,0,0,0,1,0
4,43755,3.5,6.0,225,18,24,3880,115,197,1,...,0,0,0,1,0,0,0,0,1,0


### Construindo o Modelo

In [26]:
# Separa x e y
X = df.drop('MSRP', axis = 1)
y = df['MSRP']
X = X.to_numpy()
y = y.to_numpy()

In [27]:
# Divisão dos dados em treino e teste
X_treino, X_teste, y_treino, y_teste = train_test_split(X, y, test_size = 0.3, random_state = 0)

In [28]:
# Cria o modelo
modelo = GradientBoostingRegressor(n_estimators = 5000, 
                                   learning_rate = 0.1,
                                   max_depth = 10,
                                   min_samples_leaf = 3,
                                   max_features = 0.1,
                                   loss = 'lad',
                                   random_state = 0)

In [29]:
# Treinamento do modelo
modelo.fit(X_treino, y_treino)

GradientBoostingRegressor(loss='lad', max_depth=10, max_features=0.1,
                          min_samples_leaf=3, n_estimators=5000,
                          random_state=0)

In [30]:
# Previsões
previsoes = modelo.predict(X_teste)

In [31]:
print("O R2 Score do Modelo é:", r2_score(y_teste, previsoes) * 100)

O R2 Score do Modelo é: 72.3030166271737


In [32]:
# Salva o modelo em disco
joblib.dump(modelo, '../modelo/modelo.pkl')

['../modelo/modelo.pkl']

# Fim