# Preços de Casas - Técnicas avançadas de Regressão

## 1 - Objetivo da Análise

#### Obj.:  Prever o preço de venda de cada casa, para cada ID no conjunto de teste. Será previsto o valor da variável SalePrice.

## 2 - Importação das bibliotecas básicas e das bases de dados Train e Test

#### Importação das Bibliotecas

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.impute import SimpleImputer

#### Importação das bases de dados

In [2]:
# Load the training and test datasets from the CSV files in the working directory

train_data = pd.read_csv('train.csv') # training set
test_data = pd.read_csv('test.csv') # test set

In [3]:
train_data.info() # column names, non-null counts and dtypes of the training set

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [4]:
test_data.info() # column names, non-null counts and dtypes of the test set

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1459 non-null   int64  
 1   MSSubClass     1459 non-null   int64  
 2   MSZoning       1455 non-null   object 
 3   LotFrontage    1232 non-null   float64
 4   LotArea        1459 non-null   int64  
 5   Street         1459 non-null   object 
 6   Alley          107 non-null    object 
 7   LotShape       1459 non-null   object 
 8   LandContour    1459 non-null   object 
 9   Utilities      1457 non-null   object 
 10  LotConfig      1459 non-null   object 
 11  LandSlope      1459 non-null   object 
 12  Neighborhood   1459 non-null   object 
 13  Condition1     1459 non-null   object 
 14  Condition2     1459 non-null   object 
 15  BldgType       1459 non-null   object 
 16  HouseStyle     1459 non-null   object 
 17  OverallQual    1459 non-null   int64  
 18  OverallC

In [5]:
# Drop the columns judged too sparse (more than 15% null values) from both the
# training and the test set. The column list is defined once so both frames are
# guaranteed to lose exactly the same columns.
# NOTE(review): MiscVal is dropped together with the sparse columns - confirm it
# really exceeds the 15% null threshold or is being removed for another reason.
colunas_descartadas = ['LotFrontage', 'Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature', 'MiscVal']

# errors='ignore' keeps this cell re-runnable: train_data is reassigned to the
# reduced frame, so a second execution would otherwise raise KeyError for the
# columns that are already gone.
train_data = train_data.drop(columns=colunas_descartadas, errors='ignore')
# The reduced test frame is stored as `teste_data` (the name the later test-set
# cells use); `test_data` keeps the raw, unmodified test set.
teste_data = test_data.drop(columns=colunas_descartadas, errors='ignore')


## 3 - Exploração e processamento da Base de Treinamento

### 3.1 - Análise Exploratória

In [6]:
# Column names of the training set after the sparse columns were dropped
train_data.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotArea', 'Street', 'LotShape',
       'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood',
       'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl',
       'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'ExterQual',
       'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
       'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF',
       'TotalBsmtSF', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical',
       '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath',
       'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr',
       'KitchenQual', 'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF',
       'Enclos

In [7]:
# Tail (last rows) of the training dataset

train_data.tail()

Unnamed: 0,Id,MSSubClass,MSZoning,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MoSold,YrSold,SaleType,SaleCondition,SalePrice
1455,1456,60,RL,7917,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,40,0,0,0,0,8,2007,WD,Normal,175000
1456,1457,20,RL,13175,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,2,2010,WD,Normal,210000
1457,1458,70,RL,9042,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,60,0,0,0,0,5,2010,WD,Normal,266500
1458,1459,20,RL,9717,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,112,0,0,0,4,2010,WD,Normal,142125
1459,1460,20,RL,9937,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,68,0,0,0,0,6,2008,WD,Normal,147500


In [8]:
# After dropping the sparse columns the training set has 1460 rows and 74 columns
# (the raw file had 81 columns)

train_data.shape

(1460, 74)

####  Análise das colunas numéricas

In [9]:
# Select the numeric columns (int64 / float64) of the training set and display
# their summary statistics

train_numericos = train_data.select_dtypes(include=['int64', 'float64'])
train_numericos.describe()

Unnamed: 0,Id,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,46.549315,...,472.980137,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,161.319273,...,213.804841,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,2.703626,1.328095,79442.502883
min,1.0,20.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,0.0,...,334.5,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,0.0,...,480.0,0.0,25.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,0.0,...,576.0,168.0,68.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,1474.0,...,1418.0,857.0,547.0,552.0,508.0,480.0,738.0,12.0,2010.0,755000.0


In [10]:
# Names of the numeric columns of the training set.
# NOTE: MSSubClass, OverallQual and OverallCond still appear in this listing because
# they are stored as integers, although in this analysis their numbers are regarded
# as codes for categories rather than as quantities.

train_numericos.columns

Index(['Id', 'MSSubClass', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MoSold',
       'YrSold', 'SalePrice'],
      dtype='object')

#### Análise colunas categóricas

In [11]:
# Select the categorical (object dtype) columns of the training set and display
# their summary (count, number of unique values, most frequent value and its frequency)

train_categoricos = train_data.select_dtypes(include=['object'])
train_categoricos.describe()

Unnamed: 0,MSZoning,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,...,Electrical,KitchenQual,Functional,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,SaleType,SaleCondition
count,1460,1460,1460,1460,1460,1460,1460,1460,1460,1460,...,1459,1460,1460,1379,1379,1379,1379,1460,1460,1460
unique,5,2,4,4,2,5,3,25,9,8,...,5,4,7,6,3,5,5,3,9,6
top,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Norm,Norm,...,SBrkr,TA,Typ,Attchd,Unf,TA,TA,Y,WD,Normal
freq,1151,1454,925,1311,1459,1052,1382,225,1260,1445,...,1334,735,1360,870,605,1311,1326,1340,1267,1198


In [12]:
# Names of the categorical columns of the training set.
# NOTE: MSSubClass, OverallQual and OverallCond are regarded as categorical in this
# analysis (numeric codes for categories), but they do NOT appear in this listing
# because only object-dtype columns were selected into train_categoricos.

train_categoricos.columns

Index(['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
       'PavedDrive', 'SaleType', 'SaleCondition'],
      dtype='object')

####  Valores Nulos

#### Tratamento de valores numéricos nulos

In [13]:
# Count the null values in each numeric column

nulos = train_numericos.isnull().sum()

In [14]:
# Null count per numeric column (up to 50 entries, enough to show every numeric column)

nulos.head(50)

Id                0
MSSubClass        0
LotArea           0
OverallQual       0
OverallCond       0
YearBuilt         0
YearRemodAdd      0
MasVnrArea        8
BsmtFinSF1        0
BsmtFinSF2        0
BsmtUnfSF         0
TotalBsmtSF       0
1stFlrSF          0
2ndFlrSF          0
LowQualFinSF      0
GrLivArea         0
BsmtFullBath      0
BsmtHalfBath      0
FullBath          0
HalfBath          0
BedroomAbvGr      0
KitchenAbvGr      0
TotRmsAbvGrd      0
Fireplaces        0
GarageYrBlt      81
GarageCars        0
GarageArea        0
WoodDeckSF        0
OpenPorchSF       0
EnclosedPorch     0
3SsnPorch         0
ScreenPorch       0
PoolArea          0
MoSold            0
YrSold            0
SalePrice         0
dtype: int64

##### O tratamento de valores nulos para colunas numéricas consistirá em substituir os valores nulos pela mediana dos valores da coluna. As colunas numéricas que apresentam dados nulos são: MasVnrArea e GarageYrBlt.

#### Tratando a coluna MasVnrArea

In [15]:
# Median of the MasVnrArea column (computed on the numeric-columns frame)

print(f'A mediana da coluna MasVnrArea é {train_numericos["MasVnrArea"].median()}')


A mediana da coluna MasVnrArea é 0.0


In [16]:
# Fill the missing MasVnrArea values with the column median.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the selected Series: the chained in-place form is
# deprecated in pandas 2.x and leaves train_data untouched under copy-on-write.

train_data["MasVnrArea"] = train_data["MasVnrArea"].fillna(train_data["MasVnrArea"].median())

# Sanity check: shows the rows whose MasVnrArea is still null (expected: none)
train_data[train_data["MasVnrArea"].isnull()]

Unnamed: 0,Id,MSSubClass,MSZoning,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MoSold,YrSold,SaleType,SaleCondition,SalePrice


#### Tratando a coluna GarageYrBlt

In [17]:
# Median of the GarageYrBlt column (computed on the numeric-columns frame)

print(f'A mediana dos valores na coluna GarageYrBlt é {train_numericos["GarageYrBlt"].median()}')

A mediana dos valores na coluna GarageYrBlt é 1980.0


In [18]:
# Fill the missing GarageYrBlt values with the column median (not the mean).
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the selected Series: the chained in-place form is
# deprecated in pandas 2.x and leaves train_data untouched under copy-on-write.

train_data["GarageYrBlt"] = train_data["GarageYrBlt"].fillna(train_data["GarageYrBlt"].median())

# Sanity check: shows the rows whose GarageYrBlt is still null (expected: none)
train_data[train_data["GarageYrBlt"].isnull()]

Unnamed: 0,Id,MSSubClass,MSZoning,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MoSold,YrSold,SaleType,SaleCondition,SalePrice


#### Tratamento de valores categóricos nulos

###### O tratamento de variáveis categóricas nulas será realizado utilizando o SimpleImputer da biblioteca Scikit-learn. Um dos parâmetros do SimpleImputer é a strategy, na qual se define se os dados faltantes serão preenchidos com a média (mean), a mediana (median), a moda (most_frequent) ou um valor constante (constant), sendo a constant a opção utilizada aqui. Essa estratégia consiste em preencher todas as células com dados faltantes com um mesmo valor fixo, definido pelo parâmetro fill_value; como fill_value não é informado, o Scikit-learn usa o valor padrão, que para colunas de texto é a string "missing_value".

In [19]:
from sklearn.impute import SimpleImputer # Importação da biblioteca usada para imputação 

In [20]:
# Imputer that replaces NaN with one constant value. No fill_value is passed, so
# scikit-learn's default applies (per its documentation: the string "missing_value"
# for string/object columns, 0 for numeric ones) - it does not copy neighbouring cells.
imp_constat = SimpleImputer(missing_values=np.nan, strategy='constant')

##### Quantidade de valores nulos por coluna categórica

In [21]:
# Null count per categorical column (up to 50 entries, enough to show every categorical column)
nulos2 = train_categoricos.isnull().sum()
nulos2.head(50)

MSZoning          0
Street            0
LotShape          0
LandContour       0
Utilities         0
LotConfig         0
LandSlope         0
Neighborhood      0
Condition1        0
Condition2        0
BldgType          0
HouseStyle        0
RoofStyle         0
RoofMatl          0
Exterior1st       0
Exterior2nd       0
MasVnrType        8
ExterQual         0
ExterCond         0
Foundation        0
BsmtQual         37
BsmtCond         37
BsmtExposure     38
BsmtFinType1     37
BsmtFinType2     38
Heating           0
HeatingQC         0
CentralAir        0
Electrical        1
KitchenQual       0
Functional        0
GarageType       81
GarageFinish     81
GarageQual       81
GarageCond       81
PavedDrive        0
SaleType          0
SaleCondition     0
dtype: int64

In [22]:
# Names of the categorical columns that contain at least one null value
nulos2 = train_categoricos.isnull().sum()
colunas_com_valores_nulos = [
    coluna for coluna, qtd_nulos in nulos2.items() if qtd_nulos > 0
]
colunas_com_valores_nulos

['MasVnrType',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Electrical',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond']

In [23]:
# Fit the constant imputer on the categorical columns that have nulls, then write
# the filled values back into train_data for those same columns.
imputer = imp_constat.fit(train_data[colunas_com_valores_nulos])
train_data[colunas_com_valores_nulos] = imputer.transform(train_data[colunas_com_valores_nulos])

In [24]:
# NOTE(review): train_categoricos was extracted from train_data before the imputation
# was written into train_data, so this preview does not reflect the filled values.
train_categoricos.head()

Unnamed: 0,MSZoning,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,...,Electrical,KitchenQual,Functional,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,SaleType,SaleCondition
0,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,...,SBrkr,Gd,Typ,Attchd,RFn,TA,TA,Y,WD,Normal
1,RL,Pave,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,...,SBrkr,TA,Typ,Attchd,RFn,TA,TA,Y,WD,Normal
2,RL,Pave,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,...,SBrkr,Gd,Typ,Attchd,RFn,TA,TA,Y,WD,Normal
3,RL,Pave,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,...,SBrkr,Gd,Typ,Detchd,Unf,TA,TA,Y,WD,Abnorml
4,RL,Pave,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,...,SBrkr,Gd,Typ,Attchd,RFn,TA,TA,Y,WD,Normal


In [25]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 74 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotArea        1460 non-null   int64  
 4   Street         1460 non-null   object 
 5   LotShape       1460 non-null   object 
 6   LandContour    1460 non-null   object 
 7   Utilities      1460 non-null   object 
 8   LotConfig      1460 non-null   object 
 9   LandSlope      1460 non-null   object 
 10  Neighborhood   1460 non-null   object 
 11  Condition1     1460 non-null   object 
 12  Condition2     1460 non-null   object 
 13  BldgType       1460 non-null   object 
 14  HouseStyle     1460 non-null   object 
 15  OverallQual    1460 non-null   int64  
 16  OverallCond    1460 non-null   int64  
 17  YearBuilt      1460 non-null   int64  
 18  YearRemo

### 3.2 - Pré-processamento dos dados

In [26]:
train_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,61,0,0,0,0,2,2008,WD,Normal,208500
1,2,20,RL,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,5,2007,WD,Normal,181500
2,3,60,RL,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,...,42,0,0,0,0,9,2008,WD,Normal,223500
3,4,70,RL,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,35,272,0,0,0,2,2006,WD,Abnorml,140000
4,5,60,RL,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,...,84,0,0,0,0,12,2008,WD,Normal,250000


In [27]:
# Feature matrix: every column except the identifier (Id) and the target (SalePrice).
# The two columns are excluded by name rather than with the positional slice 1:73,
# so the result no longer depends on the exact number or order of columns.
X_train_data = train_data.drop(columns=['Id', 'SalePrice']).values

In [28]:
X_train_data

array([[60, 'RL', 8450, ..., 2008, 'WD', 'Normal'],
       [20, 'RL', 9600, ..., 2007, 'WD', 'Normal'],
       [60, 'RL', 11250, ..., 2008, 'WD', 'Normal'],
       ...,
       [70, 'RL', 9042, ..., 2010, 'WD', 'Normal'],
       [20, 'RL', 9717, ..., 2010, 'WD', 'Normal'],
       [20, 'RL', 9937, ..., 2008, 'WD', 'Normal']], dtype=object)

In [29]:
# Target vector: the sale price, selected by name instead of by "last column"
# position so it stays correct if columns are ever appended to train_data.
y_train_data = train_data['SalePrice'].values

In [30]:
y_train_data

array([208500, 181500, 223500, ..., 266500, 142125, 147500], dtype=int64)

### 3.3 - Tratamento de atributos categóricos

#### LabelEncoder

In [31]:
from sklearn.preprocessing import LabelEncoder

In [32]:
X_train_data[0]

array([60, 'RL', 8450, 'Pave', 'Reg', 'Lvl', 'AllPub', 'Inside', 'Gtl',
       'CollgCr', 'Norm', 'Norm', '1Fam', '2Story', 7, 5, 2003, 2003,
       'Gable', 'CompShg', 'VinylSd', 'VinylSd', 'BrkFace', 196.0, 'Gd',
       'TA', 'PConc', 'Gd', 'TA', 'No', 'GLQ', 706, 'Unf', 0, 150, 856,
       'GasA', 'Ex', 'Y', 'SBrkr', 856, 854, 0, 1710, 1, 0, 2, 1, 3, 1,
       'Gd', 8, 'Typ', 0, 'Attchd', 2003.0, 'RFn', 2, 548, 'TA', 'TA',
       'Y', 0, 61, 0, 0, 0, 0, 2, 2008, 'WD', 'Normal'], dtype=object)

In [33]:
# Feature frame (train_data without Id and SalePrice); later cells use its column
# names to locate the categorical positions.
colunas_fora_das_features = ['Id', 'SalePrice']
train_data_X_train = train_data.drop(columns=colunas_fora_das_features)
train_data_X_train

Unnamed: 0,MSSubClass,MSZoning,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,...,0,61,0,0,0,0,2,2008,WD,Normal
1,20,RL,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,Veenker,...,298,0,0,0,0,0,5,2007,WD,Normal
2,60,RL,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,...,0,42,0,0,0,0,9,2008,WD,Normal
3,70,RL,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,...,0,35,272,0,0,0,2,2006,WD,Abnorml
4,60,RL,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,...,192,84,0,0,0,0,12,2008,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,RL,7917,Pave,Reg,Lvl,AllPub,Inside,Gtl,Gilbert,...,0,40,0,0,0,0,8,2007,WD,Normal
1456,20,RL,13175,Pave,Reg,Lvl,AllPub,Inside,Gtl,NWAmes,...,349,0,0,0,0,0,2,2010,WD,Normal
1457,70,RL,9042,Pave,Reg,Lvl,AllPub,Inside,Gtl,Crawfor,...,0,60,0,0,0,0,5,2010,WD,Normal
1458,20,RL,9717,Pave,Reg,Lvl,AllPub,Inside,Gtl,NAmes,...,366,0,112,0,0,0,4,2010,WD,Normal


In [34]:
# Plain Python list with the names of the categorical columns
lista = train_categoricos.columns.tolist()
lista

['MSZoning',
 'Street',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'SaleType',
 'SaleCondition']

In [35]:
# Code-generation helper: prints one "label_encoder_<column> = LabelEncoder()" source
# line per categorical column. Nothing is instantiated here; the printed text is only
# meant to be copied into a code cell.
for c in lista:
    print(f'label_encoder_{c} = LabelEncoder()')

label_encoder_MSZoning = LabelEncoder()
label_encoder_Street = LabelEncoder()
label_encoder_LotShape = LabelEncoder()
label_encoder_LandContour = LabelEncoder()
label_encoder_Utilities = LabelEncoder()
label_encoder_LotConfig = LabelEncoder()
label_encoder_LandSlope = LabelEncoder()
label_encoder_Neighborhood = LabelEncoder()
label_encoder_Condition1 = LabelEncoder()
label_encoder_Condition2 = LabelEncoder()
label_encoder_BldgType = LabelEncoder()
label_encoder_HouseStyle = LabelEncoder()
label_encoder_RoofStyle = LabelEncoder()
label_encoder_RoofMatl = LabelEncoder()
label_encoder_Exterior1st = LabelEncoder()
label_encoder_Exterior2nd = LabelEncoder()
label_encoder_MasVnrType = LabelEncoder()
label_encoder_ExterQual = LabelEncoder()
label_encoder_ExterCond = LabelEncoder()
label_encoder_Foundation = LabelEncoder()
label_encoder_BsmtQual = LabelEncoder()
label_encoder_BsmtCond = LabelEncoder()
label_encoder_BsmtExposure = LabelEncoder()
label_encoder_BsmtFinType1 = LabelEncoder()
label

In [36]:
# One independent LabelEncoder per categorical column, each bound to its own
# module-level name (label_encoder_<column>).
# NOTE(review): a dict keyed by column name would remove this repetition, but the
# individual names are kept as-is because other cells refer to them.
label_encoder_MSZoning = LabelEncoder()
label_encoder_Street = LabelEncoder()
label_encoder_LotShape = LabelEncoder()
label_encoder_LandContour = LabelEncoder()
label_encoder_Utilities = LabelEncoder()
label_encoder_LotConfig = LabelEncoder()
label_encoder_LandSlope = LabelEncoder()
label_encoder_Neighborhood = LabelEncoder()
label_encoder_Condition1 = LabelEncoder()
label_encoder_Condition2 = LabelEncoder()
label_encoder_BldgType = LabelEncoder()
label_encoder_HouseStyle = LabelEncoder()
label_encoder_RoofStyle = LabelEncoder()
label_encoder_RoofMatl = LabelEncoder()
label_encoder_Exterior1st = LabelEncoder()
label_encoder_Exterior2nd = LabelEncoder()
label_encoder_MasVnrType = LabelEncoder()
label_encoder_ExterQual = LabelEncoder()
label_encoder_ExterCond = LabelEncoder()
label_encoder_Foundation = LabelEncoder()
label_encoder_BsmtQual = LabelEncoder()
label_encoder_BsmtCond = LabelEncoder()
label_encoder_BsmtExposure = LabelEncoder()
label_encoder_BsmtFinType1 = LabelEncoder()
label_encoder_BsmtFinType2 = LabelEncoder()
label_encoder_Heating = LabelEncoder()
label_encoder_HeatingQC = LabelEncoder()
label_encoder_CentralAir = LabelEncoder()
label_encoder_Electrical = LabelEncoder()
label_encoder_KitchenQual = LabelEncoder()
label_encoder_Functional = LabelEncoder()
label_encoder_GarageType = LabelEncoder()
label_encoder_GarageFinish = LabelEncoder()
label_encoder_GarageQual = LabelEncoder()
label_encoder_GarageCond = LabelEncoder()
label_encoder_PavedDrive = LabelEncoder()
label_encoder_SaleType = LabelEncoder()
label_encoder_SaleCondition = LabelEncoder()

In [37]:
# Map each categorical column name to the *name* (a string, not the object) of the
# LabelEncoder variable associated with it.
dict_label_cat = {coluna: f'label_encoder_{coluna}' for coluna in lista}
dict_label_cat

{'MSZoning': 'label_encoder_MSZoning',
 'Street': 'label_encoder_Street',
 'LotShape': 'label_encoder_LotShape',
 'LandContour': 'label_encoder_LandContour',
 'Utilities': 'label_encoder_Utilities',
 'LotConfig': 'label_encoder_LotConfig',
 'LandSlope': 'label_encoder_LandSlope',
 'Neighborhood': 'label_encoder_Neighborhood',
 'Condition1': 'label_encoder_Condition1',
 'Condition2': 'label_encoder_Condition2',
 'BldgType': 'label_encoder_BldgType',
 'HouseStyle': 'label_encoder_HouseStyle',
 'RoofStyle': 'label_encoder_RoofStyle',
 'RoofMatl': 'label_encoder_RoofMatl',
 'Exterior1st': 'label_encoder_Exterior1st',
 'Exterior2nd': 'label_encoder_Exterior2nd',
 'MasVnrType': 'label_encoder_MasVnrType',
 'ExterQual': 'label_encoder_ExterQual',
 'ExterCond': 'label_encoder_ExterCond',
 'Foundation': 'label_encoder_Foundation',
 'BsmtQual': 'label_encoder_BsmtQual',
 'BsmtCond': 'label_encoder_BsmtCond',
 'BsmtExposure': 'label_encoder_BsmtExposure',
 'BsmtFinType1': 'label_encoder_BsmtFinTy

In [38]:
# Code-generation helper: for every categorical feature column, print the
# fit_transform statement that encodes that column position of X_train_data.
# `count` tracks the position of the current column in the feature frame.
lista_train_data = []  # not populated in this cell
count = 0
for c in train_data_X_train.columns:
    # A column qualifies when it is categorical and has an encoder name registered.
    if c in lista and c in dict_label_cat:
        print(f'X_train_data[:,{count}] = {dict_label_cat[c]}.fit_transform(X_train_data[:,{count}])')
    count += 1

X_train_data[:,1] = label_encoder_MSZoning.fit_transform(X_train_data[:,1])
X_train_data[:,3] = label_encoder_Street.fit_transform(X_train_data[:,3])
X_train_data[:,4] = label_encoder_LotShape.fit_transform(X_train_data[:,4])
X_train_data[:,5] = label_encoder_LandContour.fit_transform(X_train_data[:,5])
X_train_data[:,6] = label_encoder_Utilities.fit_transform(X_train_data[:,6])
X_train_data[:,7] = label_encoder_LotConfig.fit_transform(X_train_data[:,7])
X_train_data[:,8] = label_encoder_LandSlope.fit_transform(X_train_data[:,8])
X_train_data[:,9] = label_encoder_Neighborhood.fit_transform(X_train_data[:,9])
X_train_data[:,10] = label_encoder_Condition1.fit_transform(X_train_data[:,10])
X_train_data[:,11] = label_encoder_Condition2.fit_transform(X_train_data[:,11])
X_train_data[:,12] = label_encoder_BldgType.fit_transform(X_train_data[:,12])
X_train_data[:,13] = label_encoder_HouseStyle.fit_transform(X_train_data[:,13])
X_train_data[:,18] = label_encoder_RoofStyle.fit_transform(X_train

In [39]:
# Label-encode every categorical column of X_train_data in place: each statement fits
# the column's own encoder on that column position and overwrites the strings with
# the resulting integer codes. The positions are the ones printed by the index-listing
# loop over train_data_X_train.columns.
# NOTE(review): these integer codes are one-hot encoded right afterwards; confirm
# whether the intermediate label-encoding step is actually needed.
X_train_data[:,1] = label_encoder_MSZoning.fit_transform(X_train_data[:,1])
X_train_data[:,3] = label_encoder_Street.fit_transform(X_train_data[:,3])
X_train_data[:,4] = label_encoder_LotShape.fit_transform(X_train_data[:,4])
X_train_data[:,5] = label_encoder_LandContour.fit_transform(X_train_data[:,5])
X_train_data[:,6] = label_encoder_Utilities.fit_transform(X_train_data[:,6])
X_train_data[:,7] = label_encoder_LotConfig.fit_transform(X_train_data[:,7])
X_train_data[:,8] = label_encoder_LandSlope.fit_transform(X_train_data[:,8])
X_train_data[:,9] = label_encoder_Neighborhood.fit_transform(X_train_data[:,9])
X_train_data[:,10] = label_encoder_Condition1.fit_transform(X_train_data[:,10])
X_train_data[:,11] = label_encoder_Condition2.fit_transform(X_train_data[:,11])
X_train_data[:,12] = label_encoder_BldgType.fit_transform(X_train_data[:,12])
X_train_data[:,13] = label_encoder_HouseStyle.fit_transform(X_train_data[:,13])
X_train_data[:,18] = label_encoder_RoofStyle.fit_transform(X_train_data[:,18])
X_train_data[:,19] = label_encoder_RoofMatl.fit_transform(X_train_data[:,19])
X_train_data[:,20] = label_encoder_Exterior1st.fit_transform(X_train_data[:,20])
X_train_data[:,21] = label_encoder_Exterior2nd.fit_transform(X_train_data[:,21])
X_train_data[:,22] = label_encoder_MasVnrType.fit_transform(X_train_data[:,22])
X_train_data[:,24] = label_encoder_ExterQual.fit_transform(X_train_data[:,24])
X_train_data[:,25] = label_encoder_ExterCond.fit_transform(X_train_data[:,25])
X_train_data[:,26] = label_encoder_Foundation.fit_transform(X_train_data[:,26])
X_train_data[:,27] = label_encoder_BsmtQual.fit_transform(X_train_data[:,27])
X_train_data[:,28] = label_encoder_BsmtCond.fit_transform(X_train_data[:,28])
X_train_data[:,29] = label_encoder_BsmtExposure.fit_transform(X_train_data[:,29])
X_train_data[:,30] = label_encoder_BsmtFinType1.fit_transform(X_train_data[:,30])
X_train_data[:,32] = label_encoder_BsmtFinType2.fit_transform(X_train_data[:,32])
X_train_data[:,36] = label_encoder_Heating.fit_transform(X_train_data[:,36])
X_train_data[:,37] = label_encoder_HeatingQC.fit_transform(X_train_data[:,37])
X_train_data[:,38] = label_encoder_CentralAir.fit_transform(X_train_data[:,38])
X_train_data[:,39] = label_encoder_Electrical.fit_transform(X_train_data[:,39])
X_train_data[:,50] = label_encoder_KitchenQual.fit_transform(X_train_data[:,50])
X_train_data[:,52] = label_encoder_Functional.fit_transform(X_train_data[:,52])
X_train_data[:,54] = label_encoder_GarageType.fit_transform(X_train_data[:,54])
X_train_data[:,56] = label_encoder_GarageFinish.fit_transform(X_train_data[:,56])
X_train_data[:,59] = label_encoder_GarageQual.fit_transform(X_train_data[:,59])
X_train_data[:,60] = label_encoder_GarageCond.fit_transform(X_train_data[:,60])
X_train_data[:,61] = label_encoder_PavedDrive.fit_transform(X_train_data[:,61])
X_train_data[:,70] = label_encoder_SaleType.fit_transform(X_train_data[:,70])
X_train_data[:,71] = label_encoder_SaleCondition.fit_transform(X_train_data[:,71])

In [40]:
# Print, on a single line, the position of every categorical column within the
# feature frame; `count` is the running position of the current column.
count = 0
for c in train_data_X_train.columns:
    if c in lista:
        print(f'{count}, ', end='')
    count += 1

1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 18, 19, 20, 21, 22, 24, 25, 26, 27, 28, 29, 30, 32, 36, 37, 38, 39, 50, 52, 54, 56, 59, 60, 61, 70, 71, 

In [41]:
X_train_data[2]

array([60, 3, 11250, 1, 0, 3, 0, 4, 0, 5, 2, 2, 0, 5, 7, 5, 2001, 2002, 1,
       1, 12, 13, 1, 162.0, 2, 4, 2, 2, 3, 2, 2, 486, 5, 0, 434, 920, 1,
       0, 1, 4, 920, 866, 0, 1786, 1, 0, 2, 1, 3, 1, 2, 6, 6, 1, 1,
       2001.0, 1, 2, 608, 4, 4, 2, 0, 42, 0, 0, 0, 0, 9, 2008, 8, 4],
      dtype=object)

#### OneHotEncoder 

In [42]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [43]:
# Positions (within X_train_data) of the categorical columns, derived from the
# feature frame's column names instead of being typed by hand, so the list cannot
# drift out of sync with the data.
indices_categoricos = [
    posicao
    for posicao, coluna in enumerate(train_data_X_train.columns)
    if coluna in lista
]

# One-hot encode the categorical positions; remainder='passthrough' keeps every
# other column in the transformer's output.
onehotencoder_train_data = ColumnTransformer(transformers=[('OneHot', OneHotEncoder(), indices_categoricos)], remainder='passthrough')

In [44]:
# Fit the column transformer on the label-encoded features, then convert its output
# to a dense NumPy array before storing it back in X_train_data.
X_train_data_codificado = onehotencoder_train_data.fit_transform(X_train_data)
X_train_data = X_train_data_codificado.toarray()
X_train_data

array([[0.000e+00, 0.000e+00, 0.000e+00, ..., 0.000e+00, 2.000e+00,
        2.008e+03],
       [0.000e+00, 0.000e+00, 0.000e+00, ..., 0.000e+00, 5.000e+00,
        2.007e+03],
       [0.000e+00, 0.000e+00, 0.000e+00, ..., 0.000e+00, 9.000e+00,
        2.008e+03],
       ...,
       [0.000e+00, 0.000e+00, 0.000e+00, ..., 0.000e+00, 5.000e+00,
        2.010e+03],
       [0.000e+00, 0.000e+00, 0.000e+00, ..., 0.000e+00, 4.000e+00,
        2.010e+03],
       [0.000e+00, 0.000e+00, 0.000e+00, ..., 0.000e+00, 6.000e+00,
        2.008e+03]])

In [45]:
X_train_data[0]

array([0.000e+00, 0.000e+00, 0.000e+00, 1.000e+00, 0.000e+00, 0.000e+00,
       1.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 1.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 1.000e+00, 1.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 1.000e+00, 1.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       1.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 1.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 1.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 1.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 1.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 1.000e+00, 

In [46]:
X_train_data.shape

(1460, 279)

### 3.4 - Escalonamento dos Valores

In [47]:
from sklearn.preprocessing import StandardScaler
# Standardise every column of the training matrix (zero mean, unit variance).
# NOTE(review): the scaler is fitted on the whole training matrix before the
# hold-out split made in section 5.1, so the hold-out rows contribute to the
# scaling statistics.
scaler_census = StandardScaler().fit(X_train_data)
X_train_data = scaler_census.transform(X_train_data)

In [48]:
# Inspect the first row of the training matrix after standardisation.
X_train_data[0]

array([-0.08304548, -0.21585871, -0.10526316,  0.51813339, -0.41895507,
       -0.06423821,  0.06423821, -0.70420284, -0.16998114, -0.08304548,
        0.76051192, -0.21235968, -0.18831089, -0.15899968,  0.33712564,
        0.02618016, -0.02618016, -0.46873869, -0.26232433, -0.18238027,
       -0.05241424,  0.62276215,  0.23757092, -0.21585871, -0.09478452,
       -0.10854037, -0.03703704, -0.10526316, -0.20339487, -0.1398323 ,
        2.95522137, -0.19025216, -0.27116307, -0.23917551, -0.16124951,
       -0.10854037, -0.1863522 , -0.42683279, -0.07875671, -0.22941573,
       -0.16998114, -0.23595776, -0.28963792, -0.13199092, -0.23106504,
       -0.20521398, -0.25018188, -0.13199092, -0.16347148, -0.08712888,
       -0.18437553, -0.24235968,  0.39840954, -0.07422696, -0.11482721,
       -0.08712888, -0.13465178, -0.03703704, -0.05862104, -0.03703704,
       -0.06423821,  0.10188534, -0.02618016, -0.03703704, -0.02618016,
       -0.02618016, -0.03703704,  0.44353276, -0.14728711, -0.19

## 4 - Exploração e processamento da base de teste

### 4.1 - Análise Exploratória

In [49]:
# Column labels of the test set.

teste_data.head().columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotArea', 'Street', 'LotShape',
       'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood',
       'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl',
       'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'ExterQual',
       'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
       'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF',
       'TotalBsmtSF', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical',
       '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath',
       'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr',
       'KitchenQual', 'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF',
       'Enclos

In [50]:
# First rows (head, not tail) of the test set.

teste_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,11622,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,140,0,0,0,120,0,6,2010,WD,Normal
1,1462,20,RL,14267,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,393,36,0,0,0,0,6,2010,WD,Normal
2,1463,60,RL,13830,Pave,IR1,Lvl,AllPub,Inside,Gtl,...,212,34,0,0,0,0,3,2010,WD,Normal
3,1464,60,RL,9978,Pave,IR1,Lvl,AllPub,Inside,Gtl,...,360,36,0,0,0,0,6,2010,WD,Normal
4,1465,120,RL,5005,Pave,IR1,HLS,AllPub,Inside,Gtl,...,0,82,0,0,144,0,1,2010,WD,Normal


In [51]:
# (rows, columns) of the test set; the recorded output is (1459, 73).

teste_data.shape

(1459, 73)

####  Análise das colunas numéricas

In [52]:
# Numeric (int64 / float64) columns of the test set and their summary statistics.

teste_numericos = teste_data.select_dtypes(include=['int64', 'float64'])
teste_numericos.describe()

Unnamed: 0,Id,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MoSold,YrSold
count,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1444.0,1458.0,1458.0,...,1458.0,1458.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0
mean,2190.0,57.378341,9819.161069,6.078821,5.553804,1971.357779,1983.662783,100.709141,439.203704,52.619342,...,1.766118,472.768861,93.174777,48.313914,24.243317,1.79438,17.064428,1.744345,6.104181,2007.769705
std,421.321334,42.74688,4955.517327,1.436812,1.11374,30.390071,21.130467,177.6259,455.268042,176.753926,...,0.775945,217.048611,127.744882,68.883364,67.227765,20.207842,56.609763,30.491646,2.722432,1.30174
min,1461.0,20.0,1470.0,1.0,1.0,1879.0,1950.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0
25%,1825.5,20.0,7391.0,5.0,5.0,1953.0,1963.0,0.0,0.0,0.0,...,1.0,318.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,2007.0
50%,2190.0,50.0,9399.0,6.0,5.0,1973.0,1992.0,0.0,350.5,0.0,...,2.0,480.0,0.0,28.0,0.0,0.0,0.0,0.0,6.0,2008.0
75%,2554.5,70.0,11517.5,7.0,6.0,2001.0,2004.0,164.0,753.5,0.0,...,2.0,576.0,168.0,72.0,0.0,0.0,0.0,0.0,8.0,2009.0
max,2919.0,190.0,56600.0,10.0,9.0,2010.0,2010.0,1290.0,4010.0,1526.0,...,5.0,1488.0,1424.0,742.0,1012.0,360.0,576.0,800.0,12.0,2010.0


In [53]:
# Names of the numeric columns of the test set. The selection is made by dtype
# only, so MSSubClass, OverallQual and OverallCond are listed here as well,
# even though their numbers are meant to represent categories.

teste_numericos.columns

Index(['Id', 'MSSubClass', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MoSold',
       'YrSold'],
      dtype='object')

#### Análise colunas categóricas

In [54]:
# Categorical (object dtype) columns of the test set and their summary statistics.

teste_categoricos = teste_data.select_dtypes(include=['object'])
teste_categoricos.describe()

Unnamed: 0,MSZoning,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,...,Electrical,KitchenQual,Functional,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,SaleType,SaleCondition
count,1455,1459,1459,1459,1457,1459,1459,1459,1459,1459,...,1459,1458,1457,1383,1381,1381,1381,1459,1458,1459
unique,5,2,4,4,1,5,3,25,9,5,...,4,4,7,6,3,4,5,3,9,6
top,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Norm,Norm,...,SBrkr,TA,Typ,Attchd,Unf,TA,TA,Y,WD,Normal
freq,1114,1453,934,1311,1457,1081,1396,218,1251,1444,...,1337,757,1357,853,625,1293,1328,1301,1258,1204


In [55]:
# Names of the categorical (object dtype) columns of the test set. The
# integer-coded MSSubClass, OverallQual and OverallCond are not part of this
# list because the selection is made by dtype.

teste_categoricos.columns

Index(['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
       'PavedDrive', 'SaleType', 'SaleCondition'],
      dtype='object')

####  Valores Nulos

#### Tratamento de valores numéricos nulos

In [56]:
# Number of missing values in each numeric column of the test set.

nulos = teste_numericos.isnull().sum()

In [57]:
# Missing-value count per numeric column (at most the first 50 entries).

nulos.head(50)
 

Id                0
MSSubClass        0
LotArea           0
OverallQual       0
OverallCond       0
YearBuilt         0
YearRemodAdd      0
MasVnrArea       15
BsmtFinSF1        1
BsmtFinSF2        1
BsmtUnfSF         1
TotalBsmtSF       1
1stFlrSF          0
2ndFlrSF          0
LowQualFinSF      0
GrLivArea         0
BsmtFullBath      2
BsmtHalfBath      2
FullBath          0
HalfBath          0
BedroomAbvGr      0
KitchenAbvGr      0
TotRmsAbvGrd      0
Fireplaces        0
GarageYrBlt      78
GarageCars        1
GarageArea        1
WoodDeckSF        0
OpenPorchSF       0
EnclosedPorch     0
3SsnPorch         0
ScreenPorch       0
PoolArea          0
MoSold            0
YrSold            0
dtype: int64

In [58]:
# Names of the numeric columns that have at least one missing value.
colunastest_com_valores_nulos = [
    nome_coluna
    for nome_coluna, qtd_nulos in nulos.items()
    if qtd_nulos > 0
]
colunastest_com_valores_nulos

['MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 'BsmtFullBath',
 'BsmtHalfBath',
 'GarageYrBlt',
 'GarageCars',
 'GarageArea']

In [59]:
# Replace missing values in the numeric columns listed above with each
# column's median. Despite its name, imp_constat uses the 'median' strategy
# here. The medians are computed from the test set itself.
imp_constat = SimpleImputer(strategy='median', missing_values=np.nan)
_numericas_com_nulos = teste_data[colunastest_com_valores_nulos]
imputer = imp_constat.fit(_numericas_com_nulos)
teste_data[colunastest_com_valores_nulos] = imputer.transform(_numericas_com_nulos)

In [60]:
# Check the non-null counts again after the numeric imputation.
teste_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 73 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1459 non-null   int64  
 1   MSSubClass     1459 non-null   int64  
 2   MSZoning       1455 non-null   object 
 3   LotArea        1459 non-null   int64  
 4   Street         1459 non-null   object 
 5   LotShape       1459 non-null   object 
 6   LandContour    1459 non-null   object 
 7   Utilities      1457 non-null   object 
 8   LotConfig      1459 non-null   object 
 9   LandSlope      1459 non-null   object 
 10  Neighborhood   1459 non-null   object 
 11  Condition1     1459 non-null   object 
 12  Condition2     1459 non-null   object 
 13  BldgType       1459 non-null   object 
 14  HouseStyle     1459 non-null   object 
 15  OverallQual    1459 non-null   int64  
 16  OverallCond    1459 non-null   int64  
 17  YearBuilt      1459 non-null   int64  
 18  YearRemo

#### Tratamento de valores categoricos nulos

###### O tratamento de variáveis categóricas nulas será realizado utilizando o SimpleImputer da biblioteca scikit-learn. Um dos parâmetros do SimpleImputer é a strategy, na qual se define se os dados faltantes serão preenchidos com a média (mean), a mediana (median), a moda (most_frequent) ou um valor constante (constant), sendo a constant a opção utilizada aqui. Essa estratégia consiste em preencher todas as células com dados faltantes com um mesmo valor fixo, definido pelo parâmetro fill_value; como fill_value não é informado, é usado o valor padrão do scikit-learn (a string "missing_value" para colunas de texto).

In [61]:
# Imputer that replaces NaN with one constant value. No fill_value is passed,
# so scikit-learn's default constant is used.
imp_constat = SimpleImputer(missing_values=np.nan, strategy='constant')

##### Quantidade de valores nulos por coluna categórica

In [62]:
# Number of missing values in each categorical column of the test set
# (at most the first 50 entries are shown).
nulos2 = teste_categoricos.isnull().sum()
nulos2.head(50)

MSZoning          4
Street            0
LotShape          0
LandContour       0
Utilities         2
LotConfig         0
LandSlope         0
Neighborhood      0
Condition1        0
Condition2        0
BldgType          0
HouseStyle        0
RoofStyle         0
RoofMatl          0
Exterior1st       1
Exterior2nd       1
MasVnrType       16
ExterQual         0
ExterCond         0
Foundation        0
BsmtQual         44
BsmtCond         45
BsmtExposure     44
BsmtFinType1     42
BsmtFinType2     42
Heating           0
HeatingQC         0
CentralAir        0
Electrical        0
KitchenQual       1
Functional        2
GarageType       76
GarageFinish     78
GarageQual       78
GarageCond       78
PavedDrive        0
SaleType          1
SaleCondition     0
dtype: int64

In [63]:
# Names of the categorical columns that have at least one missing value.
colunas_com_valores_nulos = [
    nome_coluna
    for nome_coluna, qtd_nulos in nulos2.items()
    if qtd_nulos > 0
]
colunas_com_valores_nulos

['MSZoning',
 'Utilities',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'KitchenQual',
 'Functional',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'SaleType']

In [64]:
# Fill the missing categorical values with the constant imputer configured
# above, writing the result back into the same columns of teste_data.
_categoricas_com_nulos = teste_data[colunas_com_valores_nulos]
imputer = imp_constat.fit(_categoricas_com_nulos)
teste_data[colunas_com_valores_nulos] = imputer.transform(_categoricas_com_nulos)

In [65]:
# Check the non-null counts again after the categorical imputation.
teste_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 73 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1459 non-null   int64  
 1   MSSubClass     1459 non-null   int64  
 2   MSZoning       1459 non-null   object 
 3   LotArea        1459 non-null   int64  
 4   Street         1459 non-null   object 
 5   LotShape       1459 non-null   object 
 6   LandContour    1459 non-null   object 
 7   Utilities      1459 non-null   object 
 8   LotConfig      1459 non-null   object 
 9   LandSlope      1459 non-null   object 
 10  Neighborhood   1459 non-null   object 
 11  Condition1     1459 non-null   object 
 12  Condition2     1459 non-null   object 
 13  BldgType       1459 non-null   object 
 14  HouseStyle     1459 non-null   object 
 15  OverallQual    1459 non-null   int64  
 16  OverallCond    1459 non-null   int64  
 17  YearBuilt      1459 non-null   int64  
 18  YearRemo

### 4.2 - Pré-processamento dos dados

In [66]:
# First rows of the test set after missing values were filled.
teste_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,11622,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,140,0,0,0,120,0,6,2010,WD,Normal
1,1462,20,RL,14267,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,393,36,0,0,0,0,6,2010,WD,Normal
2,1463,60,RL,13830,Pave,IR1,Lvl,AllPub,Inside,Gtl,...,212,34,0,0,0,0,3,2010,WD,Normal
3,1464,60,RL,9978,Pave,IR1,Lvl,AllPub,Inside,Gtl,...,360,36,0,0,0,0,6,2010,WD,Normal
4,1465,120,RL,5005,Pave,IR1,HLS,AllPub,Inside,Gtl,...,0,82,0,0,144,0,1,2010,WD,Normal


In [67]:
# Feature matrix of the test set: every column except the first one
# (position 0 holds Id in the frame shown above), as a NumPy array.
X_teste_data = teste_data.iloc[:, 1:].to_numpy()

In [68]:
# Display the raw (not yet encoded) test feature matrix.
X_teste_data

array([[20, 'RH', 11622, ..., 2010, 'WD', 'Normal'],
       [20, 'RL', 14267, ..., 2010, 'WD', 'Normal'],
       [60, 'RL', 13830, ..., 2010, 'WD', 'Normal'],
       ...,
       [20, 'RL', 20000, ..., 2006, 'WD', 'Abnorml'],
       [85, 'RL', 10441, ..., 2006, 'WD', 'Normal'],
       [60, 'RL', 9627, ..., 2006, 'WD', 'Normal']], dtype=object)

### 4.3 - Tratamento de atributos categóricos

#### LabelEncoder

In [69]:
# First row of the test feature matrix before any encoding.
X_teste_data[0]

array([20, 'RH', 11622, 'Pave', 'Reg', 'Lvl', 'AllPub', 'Inside', 'Gtl',
       'NAmes', 'Feedr', 'Norm', '1Fam', '1Story', 5, 6, 1961, 1961,
       'Gable', 'CompShg', 'VinylSd', 'VinylSd', 'None', 0.0, 'TA', 'TA',
       'CBlock', 'TA', 'TA', 'No', 'Rec', 468.0, 'LwQ', 144.0, 270.0,
       882.0, 'GasA', 'TA', 'Y', 'SBrkr', 896, 0, 0, 896, 0.0, 0.0, 1, 0,
       2, 1, 'TA', 5, 'Typ', 0, 'Attchd', 1961.0, 'Unf', 1.0, 730.0, 'TA',
       'TA', 'Y', 140, 0, 0, 0, 120, 0, 6, 2010, 'WD', 'Normal'],
      dtype=object)

In [70]:
# The test frame without the Id column. The cells below enumerate its column
# order to obtain the positional indices of the categorical columns.
teste_data_X_teste = teste_data.drop(columns='Id')
teste_data_X_teste

Unnamed: 0,MSSubClass,MSZoning,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MoSold,YrSold,SaleType,SaleCondition
0,20,RH,11622,Pave,Reg,Lvl,AllPub,Inside,Gtl,NAmes,...,140,0,0,0,120,0,6,2010,WD,Normal
1,20,RL,14267,Pave,IR1,Lvl,AllPub,Corner,Gtl,NAmes,...,393,36,0,0,0,0,6,2010,WD,Normal
2,60,RL,13830,Pave,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,...,212,34,0,0,0,0,3,2010,WD,Normal
3,60,RL,9978,Pave,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,...,360,36,0,0,0,0,6,2010,WD,Normal
4,120,RL,5005,Pave,IR1,HLS,AllPub,Inside,Gtl,StoneBr,...,0,82,0,0,144,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,160,RM,1936,Pave,Reg,Lvl,AllPub,Inside,Gtl,MeadowV,...,0,0,0,0,0,0,6,2006,WD,Normal
1455,160,RM,1894,Pave,Reg,Lvl,AllPub,Inside,Gtl,MeadowV,...,0,24,0,0,0,0,4,2006,WD,Abnorml
1456,20,RL,20000,Pave,Reg,Lvl,AllPub,Inside,Gtl,Mitchel,...,474,0,0,0,0,0,9,2006,WD,Abnorml
1457,85,RL,10441,Pave,Reg,Lvl,AllPub,Inside,Gtl,Mitchel,...,80,32,0,0,0,0,7,2006,WD,Normal


In [71]:
# Plain Python list with the names of the categorical columns.
lista = teste_categoricos.columns.tolist()
lista

['MSZoning',
 'Street',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'SaleType',
 'SaleCondition']

In [72]:
# Code generation helper: this only PRINTS one "label_encoder_<column> =
# LabelEncoder()" line per categorical column; it does not create any encoder.
for c in lista:
    print(f'label_encoder_{c} = LabelEncoder()')

label_encoder_MSZoning = LabelEncoder()
label_encoder_Street = LabelEncoder()
label_encoder_LotShape = LabelEncoder()
label_encoder_LandContour = LabelEncoder()
label_encoder_Utilities = LabelEncoder()
label_encoder_LotConfig = LabelEncoder()
label_encoder_LandSlope = LabelEncoder()
label_encoder_Neighborhood = LabelEncoder()
label_encoder_Condition1 = LabelEncoder()
label_encoder_Condition2 = LabelEncoder()
label_encoder_BldgType = LabelEncoder()
label_encoder_HouseStyle = LabelEncoder()
label_encoder_RoofStyle = LabelEncoder()
label_encoder_RoofMatl = LabelEncoder()
label_encoder_Exterior1st = LabelEncoder()
label_encoder_Exterior2nd = LabelEncoder()
label_encoder_MasVnrType = LabelEncoder()
label_encoder_ExterQual = LabelEncoder()
label_encoder_ExterCond = LabelEncoder()
label_encoder_Foundation = LabelEncoder()
label_encoder_BsmtQual = LabelEncoder()
label_encoder_BsmtCond = LabelEncoder()
label_encoder_BsmtExposure = LabelEncoder()
label_encoder_BsmtFinType1 = LabelEncoder()
label

In [73]:
# One independent LabelEncoder per categorical column (38 in total), bound to
# the label_encoder_<column> names that the encoding cell below relies on.
(
    label_encoder_MSZoning,
    label_encoder_Street,
    label_encoder_LotShape,
    label_encoder_LandContour,
    label_encoder_Utilities,
    label_encoder_LotConfig,
    label_encoder_LandSlope,
    label_encoder_Neighborhood,
    label_encoder_Condition1,
    label_encoder_Condition2,
    label_encoder_BldgType,
    label_encoder_HouseStyle,
    label_encoder_RoofStyle,
    label_encoder_RoofMatl,
    label_encoder_Exterior1st,
    label_encoder_Exterior2nd,
    label_encoder_MasVnrType,
    label_encoder_ExterQual,
    label_encoder_ExterCond,
    label_encoder_Foundation,
    label_encoder_BsmtQual,
    label_encoder_BsmtCond,
    label_encoder_BsmtExposure,
    label_encoder_BsmtFinType1,
    label_encoder_BsmtFinType2,
    label_encoder_Heating,
    label_encoder_HeatingQC,
    label_encoder_CentralAir,
    label_encoder_Electrical,
    label_encoder_KitchenQual,
    label_encoder_Functional,
    label_encoder_GarageType,
    label_encoder_GarageFinish,
    label_encoder_GarageQual,
    label_encoder_GarageCond,
    label_encoder_PavedDrive,
    label_encoder_SaleType,
    label_encoder_SaleCondition,
) = [LabelEncoder() for _ in range(38)]

In [74]:
# Map each categorical column name to the NAME (a string, not the object) of
# its label encoder variable; used only to generate code text further down.
dict_label_cat = {c: f'label_encoder_{c}' for c in lista}
dict_label_cat

{'MSZoning': 'label_encoder_MSZoning',
 'Street': 'label_encoder_Street',
 'LotShape': 'label_encoder_LotShape',
 'LandContour': 'label_encoder_LandContour',
 'Utilities': 'label_encoder_Utilities',
 'LotConfig': 'label_encoder_LotConfig',
 'LandSlope': 'label_encoder_LandSlope',
 'Neighborhood': 'label_encoder_Neighborhood',
 'Condition1': 'label_encoder_Condition1',
 'Condition2': 'label_encoder_Condition2',
 'BldgType': 'label_encoder_BldgType',
 'HouseStyle': 'label_encoder_HouseStyle',
 'RoofStyle': 'label_encoder_RoofStyle',
 'RoofMatl': 'label_encoder_RoofMatl',
 'Exterior1st': 'label_encoder_Exterior1st',
 'Exterior2nd': 'label_encoder_Exterior2nd',
 'MasVnrType': 'label_encoder_MasVnrType',
 'ExterQual': 'label_encoder_ExterQual',
 'ExterCond': 'label_encoder_ExterCond',
 'Foundation': 'label_encoder_Foundation',
 'BsmtQual': 'label_encoder_BsmtQual',
 'BsmtCond': 'label_encoder_BsmtCond',
 'BsmtExposure': 'label_encoder_BsmtExposure',
 'BsmtFinType1': 'label_encoder_BsmtFinTy

In [75]:
# Code generation helper: for every categorical column, PRINT the statement
# that label-encodes it in X_teste_data, using the column's positional index
# in teste_data_X_teste. Nothing is encoded by this cell.
# lista holds each name once and dict_label_cat is keyed by the same names,
# so two membership tests stand in for the two inner matching loops.
lista_train_data = []
count = 0
for coluna in teste_data_X_teste.columns:
    if coluna in lista and coluna in dict_label_cat:
        print(f'X_teste_data[:,{count}] = {dict_label_cat[coluna]}.fit_transform(X_teste_data[:,{count}])')
    count += 1

X_teste_data[:,1] = label_encoder_MSZoning.fit_transform(X_teste_data[:,1])
X_teste_data[:,3] = label_encoder_Street.fit_transform(X_teste_data[:,3])
X_teste_data[:,4] = label_encoder_LotShape.fit_transform(X_teste_data[:,4])
X_teste_data[:,5] = label_encoder_LandContour.fit_transform(X_teste_data[:,5])
X_teste_data[:,6] = label_encoder_Utilities.fit_transform(X_teste_data[:,6])
X_teste_data[:,7] = label_encoder_LotConfig.fit_transform(X_teste_data[:,7])
X_teste_data[:,8] = label_encoder_LandSlope.fit_transform(X_teste_data[:,8])
X_teste_data[:,9] = label_encoder_Neighborhood.fit_transform(X_teste_data[:,9])
X_teste_data[:,10] = label_encoder_Condition1.fit_transform(X_teste_data[:,10])
X_teste_data[:,11] = label_encoder_Condition2.fit_transform(X_teste_data[:,11])
X_teste_data[:,12] = label_encoder_BldgType.fit_transform(X_teste_data[:,12])
X_teste_data[:,13] = label_encoder_HouseStyle.fit_transform(X_teste_data[:,13])
X_teste_data[:,18] = label_encoder_RoofStyle.fit_transform(X_teste

In [76]:
# Integer-encode every categorical column of X_teste_data in place. Each entry
# pairs the column's positional index with its dedicated LabelEncoder.
# NOTE(review): the encoders are fitted on the test values here, so the
# resulting integer codes are presumably not aligned with the ones produced
# for the training set. Confirm before using a model trained on X_train_data.
_codificadores_por_indice = [
    (1, label_encoder_MSZoning),
    (3, label_encoder_Street),
    (4, label_encoder_LotShape),
    (5, label_encoder_LandContour),
    (6, label_encoder_Utilities),
    (7, label_encoder_LotConfig),
    (8, label_encoder_LandSlope),
    (9, label_encoder_Neighborhood),
    (10, label_encoder_Condition1),
    (11, label_encoder_Condition2),
    (12, label_encoder_BldgType),
    (13, label_encoder_HouseStyle),
    (18, label_encoder_RoofStyle),
    (19, label_encoder_RoofMatl),
    (20, label_encoder_Exterior1st),
    (21, label_encoder_Exterior2nd),
    (22, label_encoder_MasVnrType),
    (24, label_encoder_ExterQual),
    (25, label_encoder_ExterCond),
    (26, label_encoder_Foundation),
    (27, label_encoder_BsmtQual),
    (28, label_encoder_BsmtCond),
    (29, label_encoder_BsmtExposure),
    (30, label_encoder_BsmtFinType1),
    (32, label_encoder_BsmtFinType2),
    (36, label_encoder_Heating),
    (37, label_encoder_HeatingQC),
    (38, label_encoder_CentralAir),
    (39, label_encoder_Electrical),
    (50, label_encoder_KitchenQual),
    (52, label_encoder_Functional),
    (54, label_encoder_GarageType),
    (56, label_encoder_GarageFinish),
    (59, label_encoder_GarageQual),
    (60, label_encoder_GarageCond),
    (61, label_encoder_PavedDrive),
    (70, label_encoder_SaleType),
    (71, label_encoder_SaleCondition),
]
for _indice, _codificador in _codificadores_por_indice:
    X_teste_data[:, _indice] = _codificador.fit_transform(X_teste_data[:, _indice])

In [77]:
# Print, on one line, the positional index of every categorical column in
# teste_data_X_teste; the printed list is what the one-hot encoder receives.
# lista holds each name once, so a membership test matches the inner loop.
count = 0
for coluna in teste_data_X_teste.columns:
    if coluna in lista:
        print(f'{count}, ', end='')
    count += 1

1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 18, 19, 20, 21, 22, 24, 25, 26, 27, 28, 29, 30, 32, 36, 37, 38, 39, 50, 52, 54, 56, 59, 60, 61, 70, 71, 

In [78]:
# Third row after label encoding: the categorical positions now hold integer codes.
X_teste_data[2]

array([60, 3, 13830, 1, 0, 3, 0, 4, 0, 8, 2, 2, 0, 4, 5, 5, 1997, 1998, 1,
       0, 10, 12, 2, 0.0, 3, 4, 2, 2, 3, 3, 2, 791.0, 5, 0.0, 137.0,
       928.0, 0, 2, 1, 3, 928, 701, 0, 1629, 0.0, 0.0, 2, 1, 3, 1, 3, 6,
       6, 1, 1, 1997.0, 0, 2.0, 482.0, 3, 4, 2, 212, 34, 0, 0, 0, 0, 3,
       2010, 8, 4], dtype=object)

#### OneHotEncoder 

In [79]:
# One-hot encode the columns found at these positional indices of the test
# matrix; remainder='passthrough' keeps every other column unchanged.
onehotencoder_teste_data = ColumnTransformer(transformers=[('OneHot', OneHotEncoder(), [1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 18, 19, 20, 21, 22, 24, 25, 26, 27, 28, 29, 30, 32, 36, 37, 38, 39, 50, 52, 54, 56, 59, 60, 61, 70, 71])], remainder='passthrough')

In [80]:
# Fit the one-hot transformer on the test matrix, then convert the result to a
# dense array. The final line displays the encoded matrix.
# NOTE(review): the transformer is fitted on the test data, and the recorded
# shapes are (1459, 269) for the test set against (1460, 279) for the training
# set, so the two matrices do not share a feature layout. Confirm how
# predictions on the test set are meant to be produced.
_teste_codificado = onehotencoder_teste_data.fit_transform(X_teste_data)
X_teste_data = _teste_codificado.toarray()
X_teste_data

array([[0.000e+00, 0.000e+00, 1.000e+00, ..., 0.000e+00, 6.000e+00,
        2.010e+03],
       [0.000e+00, 0.000e+00, 0.000e+00, ..., 0.000e+00, 6.000e+00,
        2.010e+03],
       [0.000e+00, 0.000e+00, 0.000e+00, ..., 0.000e+00, 3.000e+00,
        2.010e+03],
       ...,
       [0.000e+00, 0.000e+00, 0.000e+00, ..., 0.000e+00, 9.000e+00,
        2.006e+03],
       [0.000e+00, 0.000e+00, 0.000e+00, ..., 0.000e+00, 7.000e+00,
        2.006e+03],
       [0.000e+00, 0.000e+00, 0.000e+00, ..., 0.000e+00, 1.100e+01,
        2.006e+03]])

In [81]:
# Inspect the first row of the one-hot encoded test matrix.
X_teste_data[0]

array([0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       1.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 1.0000e+00, 1.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e+00,
       1.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e

In [82]:
# (rows, columns) of the test matrix after one-hot encoding.
X_teste_data.shape

(1459, 269)

### 4.4 - Escalonamento dos Valores

In [83]:
# Standardise every column of the test matrix.
# NOTE(review): scaler_census is rebound to a new scaler fitted on the test
# matrix, so the scaler fitted on the training data is no longer reachable
# under this name and the test set is scaled with its own statistics.
scaler_census = StandardScaler().fit(X_teste_data)
X_teste_data = scaler_census.transform(X_teste_data)

In [84]:
# Inspect the first row of the test matrix after standardisation.
X_teste_data[0]

array([-0.10192061, -0.23114844, 12.03744159, -1.79693781, -0.44592559,
       -0.05243225, -0.06426031,  0.06426031, -0.70456388, -0.1567758 ,
       -0.06426031,  0.74973229, -0.19604633, -0.22449046, -0.12932416,
        0.33599244,  0.03704974, -0.03704974, -0.45253666, -0.24402817,
       -0.16352899, -0.08307413,  0.59133428,  0.21243573, -0.20709364,
       -0.04539206, -0.08715896, -0.07425254, -0.09843059, -0.18837771,
       -0.10529963, -0.2952682 , -0.19224481, -0.2624204 , -0.25027298,
       -0.19978606, -0.11789207, -0.21593612,  2.3859297 , -0.09843059,
       -0.20346744, -0.14489204, -0.25487939, -0.30744696, -0.12655715,
       -0.23604312, -0.21766891, -0.26539192, -0.13469875, -0.15446569,
       -0.09481729, -0.1763389 ,  4.07164749, -2.45243207, -0.09106602,
       -0.11789207, -0.108578  , -0.12932416, -0.05243225, -0.05243225,
       -0.04539206, -0.06943297,  0.10192061, -0.04539206, -0.03704974,
        0.45911696, -0.14733867, -0.20163384, -0.19415352, -0.28

## 5. Teste de Algoritmos de Regressão

### 5.1 Divisão das bases X_train_data e y_train_data para as variáveis de treinamento e teste

In [97]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [171]:
# Hold out 25% of the labelled rows for evaluation; random_state=0 makes the
# split reproducible.
X_train_data_treinamento, X_train_data_teste, y_train_data_treinamento, y_train_data_teste = train_test_split(X_train_data, y_train_data, test_size=0.25, random_state=0)

In [172]:
# Shapes of the fitting partition (features, target).
X_train_data_treinamento.shape, y_train_data_treinamento.shape

((1095, 279), (1095,))

In [173]:
# Shapes of the held-out partition (features, target).
X_train_data_teste.shape, y_train_data_teste.shape

((365, 279), (365,))

### 5.2 Algoritmo de Regressão Random Forest - 85.47%

In [248]:
from sklearn.ensemble import RandomForestRegressor
# Random forest with 100 trees and a fixed seed, fitted on the fitting partition.
# Fix: the split criterion was misspelled as 'poison', which scikit-learn does
# not accept; the Poisson-deviance criterion is spelled 'poisson'.
# The name RandomForestRegressor is deliberately rebound from the class to the
# fitted model, because the prediction cell below calls
# RandomForestRegressor.predict(...). The import above keeps this cell
# re-runnable despite that rebinding.
RandomForestRegressor = RandomForestRegressor(n_estimators=100, criterion='poisson', random_state=0)
RandomForestRegressor = RandomForestRegressor.fit(X_train_data_treinamento, y_train_data_treinamento)

In [249]:
# Random-forest predictions for the held-out partition.
predict = RandomForestRegressor.predict(X_train_data_teste)

In [250]:
# R² of the random-forest predictions on the held-out partition.
print(r2_score(y_train_data_teste, predict))

0.8547101018178183


### 5.3. Algoritmo de Regressão Linear (Ridge) - 62,72%

In [251]:
from sklearn.linear_model import Ridge

In [252]:
# Fix: this cell rebinds the name Ridge from the class to the fitted model, so
# running it a second time used to fail (a fitted model is not callable).
# Importing the class inside the cell restores it on every run, while the name
# Ridge still ends up bound to the fitted model that the cells below use.
from sklearn.linear_model import Ridge

# Ridge regression with scikit-learn's default settings, fitted on the fitting
# partition.
Ridge = Ridge()
Ridge = Ridge.fit(X_train_data_treinamento, y_train_data_treinamento)

In [253]:
# Ridge predictions for the held-out partition.
predict_ridge = Ridge.predict(X_train_data_teste)

In [254]:
# R² and mean squared error of the Ridge predictions on the held-out partition.
print(r2_score(y_train_data_teste, predict_ridge))
print(mean_squared_error(y_train_data_teste, predict_ridge))

0.6272923734116987
2460584738.2645426


### 5.4. Algoritmo de Regressão com Lasso

In [256]:
from sklearn.linear_model import Lasso

In [257]:
# Fix: this cell rebinds the name Lasso from the class to the fitted model, so
# running it a second time used to fail (a fitted model is not callable).
# Importing the class inside the cell restores it on every run, while the name
# Lasso still ends up bound to the fitted model that the cells below use.
from sklearn.linear_model import Lasso

# Lasso regression with scikit-learn's default settings, fitted on the fitting
# partition.
Lasso = Lasso()
Lasso = Lasso.fit(X_train_data_treinamento, y_train_data_treinamento)

  model = cd_fast.enet_coordinate_descent(


In [258]:
# Lasso predictions for the held-out partition.
predict_lasso = Lasso.predict(X_train_data_teste)

In [259]:
# R² and mean squared error of the Lasso predictions on the held-out partition.
print(r2_score(y_train_data_teste, predict_lasso))
print(mean_squared_error(y_train_data_teste, predict_lasso))

0.6228200659698664
2490110539.861179


#### Salvar as variáveis

In [93]:
import pickle

In [None]:
# Persist the four partitions as one pickled list, in the order
# [X fit, y fit, X held-out, y held-out].
_particoes = [X_train_data_treinamento, y_train_data_treinamento, X_train_data_teste, y_train_data_teste]
with open('train_data.pkl', mode='wb') as arquivo_saida:
    pickle.dump(_particoes, arquivo_saida)