# Preços de Casas - Técnicas avançadas de Regressão

## 1 - Objetivo da Análise

#### Obj.:  Prever o preço de venda de cada casa, para cada ID no conjunto de teste. Será previsto o valor da variável SalePrice.

## 2 - Importação das bibliotecas básicas e das bases de dados Train e Test

#### Importação das Bibliotecas

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

#### Importação das bases de dados

In [2]:
# Base de dados de treinamento

train_data = pd.read_csv('train.csv') # base de teste
test_data = pd.read_csv('test.csv') # Base de treinamento

In [3]:
train_data.info() # informações das colunas da base de treinamento

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [4]:
test_data.info() # informações das colunas da base de teste

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1459 non-null   int64  
 1   MSSubClass     1459 non-null   int64  
 2   MSZoning       1455 non-null   object 
 3   LotFrontage    1232 non-null   float64
 4   LotArea        1459 non-null   int64  
 5   Street         1459 non-null   object 
 6   Alley          107 non-null    object 
 7   LotShape       1459 non-null   object 
 8   LandContour    1459 non-null   object 
 9   Utilities      1457 non-null   object 
 10  LotConfig      1459 non-null   object 
 11  LandSlope      1459 non-null   object 
 12  Neighborhood   1459 non-null   object 
 13  Condition1     1459 non-null   object 
 14  Condition2     1459 non-null   object 
 15  BldgType       1459 non-null   object 
 16  HouseStyle     1459 non-null   object 
 17  OverallQual    1459 non-null   int64  
 18  OverallC

## 3 - Exploração e processamento da Base de Treinamento

### 3.1 - Análise Exploratória

In [5]:
# Base de dados de treinamento

train_data = pd.read_csv('train.csv')
train_data.head().columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [6]:
# Calda da base de dados de treinamento

train_data.tail()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125
1459,1460,20,RL,75.0,9937,Pave,,Reg,Lvl,AllPub,...,0,,,,0,6,2008,WD,Normal,147500


In [7]:
# A base de dados tem 1460 linhas e 80 colunas

train_data.shape

(1460, 81)

####  Valores Nulos

In [8]:
# Colunas com valores nulos

nulos = train_data.isnull().sum()
# Caldanulos.head(50)

In [9]:
# Calda das colunas

nulos.tail(31)

HalfBath            0
BedroomAbvGr        0
KitchenAbvGr        0
KitchenQual         0
TotRmsAbvGrd        0
Functional          0
Fireplaces          0
FireplaceQu       690
GarageType         81
GarageYrBlt        81
GarageFinish       81
GarageCars          0
GarageArea          0
GarageQual         81
GarageCond         81
PavedDrive          0
WoodDeckSF          0
OpenPorchSF         0
EnclosedPorch       0
3SsnPorch           0
ScreenPorch         0
PoolArea            0
PoolQC           1453
Fence            1179
MiscFeature      1406
MiscVal             0
MoSold              0
YrSold              0
SaleType            0
SaleCondition       0
SalePrice           0
dtype: int64

####  Análise das colunas numéricas

In [10]:
# Colunas com os dados númericos

train_data_numericos = train_data.select_dtypes(include=['int64', 'float64'])
train_data_numericos.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,0,61,0,0,0,0,0,2,2008,208500
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,...,298,0,0,0,0,0,0,5,2007,181500
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,...,0,42,0,0,0,0,0,9,2008,223500
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,...,0,35,272,0,0,0,0,2,2006,140000
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,...,192,84,0,0,0,0,0,12,2008,250000


In [11]:
colunas_numéricos = train_data_numericos.columns
colunas_numéricos

Index(['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
       'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF',
       'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
       'MiscVal', 'MoSold', 'YrSold', 'SalePrice'],
      dtype='object')

###### O tratamento de valores nulos para colunas númericas consistirá em substituir o valores nulos pela média dos valores da coluna. As colunas numéricas que apresentam dados nulos são: GarageYrBlt e LotFrontage

#### Tratando a coluna LotFrontage

In [12]:
# Média dos valores na colunas LotFrontage

media_lotfrontage = train_data["LotFrontage"].mean()
media_lotfrontage

70.04995836802665

In [13]:
# Atibuindo a média às celulas com valores nulos na coluna LotFrontage

train_data["LotFrontage"].fillna(train_data["LotFrontage"].mean(), inplace=True)
train_data[train_data["LotFrontage"].isnull()]

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice


#### Trando a coluna GarageYrBlt

In [14]:
# Média dos valores na colunas GarageYrBlt

media_lotfrontage = train_data["GarageYrBlt"].mean()
media_lotfrontage

1978.5061638868744

In [15]:
# Atibuindo a média às celulas com valores nulos na coluna GarageYrBlt

train_data["GarageYrBlt"].fillna(train_data["GarageYrBlt"].mean(), inplace=True)
train_data[train_data["GarageYrBlt"].isnull()]

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice


#### Análise colunas categóricas

In [16]:
# Colunas com os dados categôricos

train_data_categoricos = train_data.select_dtypes(include=['object'])
train_data_categoricos.head()

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
1,RL,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
2,RL,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
3,RL,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,...,Detchd,Unf,TA,TA,Y,,,,WD,Abnorml
4,RL,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal


In [17]:
colunas_categoricas = train_data_categoricos.columns
colunas_categoricas

Index(['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
       'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature',
       'SaleType', 'SaleCondition'],
      dtype='object')

##### Para o caso das colunas categóricas, será apagado as colunas que apresentarem valores nulos, uma vez que não é possível substituir o v.n. por qualquer outro valor. As colunas categóricas que serão apagadas são: Alley, MasVnrType, BsmtQual, BsmtCond, BsmtExposure, BsmtFinType1, BsmtFinType2, Electrical, FireplaceQu, GarageType, GarageFinish, GarageQual, GarageCond e PoolQC.

In [18]:
train_data = train_data.drop(columns=['Alley', 'MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Electrical', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature'])
train_data.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrArea', 'ExterQual',
       'ExterCond', 'Foundation', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
       'TotalBsmtSF', 'Heating', 'HeatingQC', 'CentralAir', '1stFlrSF',
       '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath',
       'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'GarageYrBlt', 'GarageCars',
       'GarageArea', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'YrSold', 'SaleType', 'SaleCondition', 'SalePrice'],
      dtype='object')

#### Checagem de valores nulos na base de treinamento

In [19]:
nulos = train_data.isnull().sum()
nulos.tail(21)

KitchenAbvGr     0
KitchenQual      0
TotRmsAbvGrd     0
Functional       0
Fireplaces       0
GarageYrBlt      0
GarageCars       0
GarageArea       0
PavedDrive       0
WoodDeckSF       0
OpenPorchSF      0
EnclosedPorch    0
3SsnPorch        0
ScreenPorch      0
PoolArea         0
MiscVal          0
MoSold           0
YrSold           0
SaleType         0
SaleCondition    0
SalePrice        0
dtype: int64

#### Obtendo mais informações sobre as colunas

In [20]:
train_data.info()
train_data.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 65 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1460 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   LotShape       1460 non-null   object 
 7   LandContour    1460 non-null   object 
 8   Utilities      1460 non-null   object 
 9   LotConfig      1460 non-null   object 
 10  LandSlope      1460 non-null   object 
 11  Neighborhood   1460 non-null   object 
 12  Condition1     1460 non-null   object 
 13  Condition2     1460 non-null   object 
 14  BldgType       1460 non-null   object 
 15  HouseStyle     1460 non-null   object 
 16  OverallQual    1460 non-null   int64  
 17  OverallCond    1460 non-null   int64  
 18  YearBuil

(1460, 65)

### 3.2 - Pré-processamento dos dados

In [21]:
train_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,0,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,...,0,0,0,0,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,...,0,0,0,0,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,...,272,0,0,0,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,...,0,0,0,0,0,12,2008,WD,Normal,250000


In [22]:
X_train_data = train_data.iloc[:, 1:75].values

In [23]:
X_train_data

array([[60, 'RL', 65.0, ..., 'WD', 'Normal', 208500],
       [20, 'RL', 80.0, ..., 'WD', 'Normal', 181500],
       [60, 'RL', 68.0, ..., 'WD', 'Normal', 223500],
       ...,
       [70, 'RL', 66.0, ..., 'WD', 'Normal', 266500],
       [20, 'RL', 68.0, ..., 'WD', 'Normal', 142125],
       [20, 'RL', 75.0, ..., 'WD', 'Normal', 147500]], dtype=object)

In [24]:
y_train_data = train_data.iloc[:, -1].values

In [25]:
y_train_data

array([208500, 181500, 223500, ..., 266500, 142125, 147500], dtype=int64)

### 3.3 - Tratamento de atributos categóricos

#### LabelEncoder

In [26]:
from sklearn.preprocessing import LabelEncoder

In [27]:
X_train_data[0]

array([60, 'RL', 65.0, 8450, 'Pave', 'Reg', 'Lvl', 'AllPub', 'Inside',
       'Gtl', 'CollgCr', 'Norm', 'Norm', '1Fam', '2Story', 7, 5, 2003,
       2003, 'Gable', 'CompShg', 'VinylSd', 'VinylSd', 196.0, 'Gd', 'TA',
       'PConc', 706, 0, 150, 856, 'GasA', 'Ex', 'Y', 856, 854, 0, 1710, 1,
       0, 2, 1, 3, 1, 'Gd', 8, 'Typ', 0, 2003.0, 2, 548, 'Y', 0, 61, 0, 0,
       0, 0, 0, 2, 2008, 'WD', 'Normal', 208500], dtype=object)

In [28]:
train_data.columns
train_data_X_train = train_data.drop(columns=['Id', 'SalePrice'], axis=1)
train_data_X_train

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,61,0,0,0,0,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,...,42,0,0,0,0,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,35,272,0,0,0,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,...,84,0,0,0,0,0,12,2008,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,RL,62.0,7917,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,40,0,0,0,0,0,8,2007,WD,Normal
1456,20,RL,85.0,13175,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,0,2,2010,WD,Normal
1457,70,RL,66.0,9042,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,60,0,0,0,0,2500,5,2010,WD,Normal
1458,20,RL,68.0,9717,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,112,0,0,0,0,4,2010,WD,Normal


In [29]:
lista = list(train_data_categoricos.columns)
lista

['MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'PoolQC',
 'Fence',
 'MiscFeature',
 'SaleType',
 'SaleCondition']

In [30]:
for c in lista:
    print(f'label_encoder_{c} = LabelEncoder()')

label_encoder_MSZoning = LabelEncoder()
label_encoder_Street = LabelEncoder()
label_encoder_Alley = LabelEncoder()
label_encoder_LotShape = LabelEncoder()
label_encoder_LandContour = LabelEncoder()
label_encoder_Utilities = LabelEncoder()
label_encoder_LotConfig = LabelEncoder()
label_encoder_LandSlope = LabelEncoder()
label_encoder_Neighborhood = LabelEncoder()
label_encoder_Condition1 = LabelEncoder()
label_encoder_Condition2 = LabelEncoder()
label_encoder_BldgType = LabelEncoder()
label_encoder_HouseStyle = LabelEncoder()
label_encoder_RoofStyle = LabelEncoder()
label_encoder_RoofMatl = LabelEncoder()
label_encoder_Exterior1st = LabelEncoder()
label_encoder_Exterior2nd = LabelEncoder()
label_encoder_MasVnrType = LabelEncoder()
label_encoder_ExterQual = LabelEncoder()
label_encoder_ExterCond = LabelEncoder()
label_encoder_Foundation = LabelEncoder()
label_encoder_BsmtQual = LabelEncoder()
label_encoder_BsmtCond = LabelEncoder()
label_encoder_BsmtExposure = LabelEncoder()
label_encode

In [31]:
label_encoder_MSZoning = LabelEncoder()
label_encoder_Street = LabelEncoder()
label_encoder_Alley = LabelEncoder()
label_encoder_LotShape = LabelEncoder()
label_encoder_LandContour = LabelEncoder()
label_encoder_Utilities = LabelEncoder()
label_encoder_LotConfig = LabelEncoder()
label_encoder_LandSlope = LabelEncoder()
label_encoder_Neighborhood = LabelEncoder()
label_encoder_Condition1 = LabelEncoder()
label_encoder_Condition2 = LabelEncoder()
label_encoder_BldgType = LabelEncoder()
label_encoder_HouseStyle = LabelEncoder()
label_encoder_RoofStyle = LabelEncoder()
label_encoder_RoofMatl = LabelEncoder()
label_encoder_Exterior1st = LabelEncoder()
label_encoder_Exterior2nd = LabelEncoder()
label_encoder_MasVnrType = LabelEncoder()
label_encoder_ExterQual = LabelEncoder()
label_encoder_ExterCond = LabelEncoder()
label_encoder_Foundation = LabelEncoder()
label_encoder_BsmtQual = LabelEncoder()
label_encoder_BsmtCond = LabelEncoder()
label_encoder_BsmtExposure = LabelEncoder()
label_encoder_BsmtFinType1 = LabelEncoder()
label_encoder_BsmtFinType2 = LabelEncoder()
label_encoder_Heating = LabelEncoder()
label_encoder_HeatingQC = LabelEncoder()
label_encoder_CentralAir = LabelEncoder()
label_encoder_Electrical = LabelEncoder()
label_encoder_KitchenQual = LabelEncoder()
label_encoder_Functional = LabelEncoder()
label_encoder_FireplaceQu = LabelEncoder()
label_encoder_GarageType = LabelEncoder()
label_encoder_GarageFinish = LabelEncoder()
label_encoder_GarageQual = LabelEncoder()
label_encoder_GarageCond = LabelEncoder()
label_encoder_PavedDrive = LabelEncoder()
label_encoder_PoolQC = LabelEncoder()
label_encoder_Fence = LabelEncoder()
label_encoder_MiscFeature = LabelEncoder()
label_encoder_SaleType = LabelEncoder()
label_encoder_SaleCondition = LabelEncoder()

In [32]:
dict_label_cat = dict()

for c in lista:
    dict_label_cat[c] = f'label_encoder_{c}'
dict_label_cat

{'MSZoning': 'label_encoder_MSZoning',
 'Street': 'label_encoder_Street',
 'Alley': 'label_encoder_Alley',
 'LotShape': 'label_encoder_LotShape',
 'LandContour': 'label_encoder_LandContour',
 'Utilities': 'label_encoder_Utilities',
 'LotConfig': 'label_encoder_LotConfig',
 'LandSlope': 'label_encoder_LandSlope',
 'Neighborhood': 'label_encoder_Neighborhood',
 'Condition1': 'label_encoder_Condition1',
 'Condition2': 'label_encoder_Condition2',
 'BldgType': 'label_encoder_BldgType',
 'HouseStyle': 'label_encoder_HouseStyle',
 'RoofStyle': 'label_encoder_RoofStyle',
 'RoofMatl': 'label_encoder_RoofMatl',
 'Exterior1st': 'label_encoder_Exterior1st',
 'Exterior2nd': 'label_encoder_Exterior2nd',
 'MasVnrType': 'label_encoder_MasVnrType',
 'ExterQual': 'label_encoder_ExterQual',
 'ExterCond': 'label_encoder_ExterCond',
 'Foundation': 'label_encoder_Foundation',
 'BsmtQual': 'label_encoder_BsmtQual',
 'BsmtCond': 'label_encoder_BsmtCond',
 'BsmtExposure': 'label_encoder_BsmtExposure',
 'BsmtFi

In [33]:
lista_train_data = list()
count = 0
for c in train_data_X_train.columns:
    for d in lista:
        if c == d:
            for a in dict_label_cat.keys():
                if d == a:
                    print(f'X_train_data[:,{count}] = {dict_label_cat[a]}.fit_transform(X_train_data[:,{count}])')
    count+=1

X_train_data[:,1] = label_encoder_MSZoning.fit_transform(X_train_data[:,1])
X_train_data[:,4] = label_encoder_Street.fit_transform(X_train_data[:,4])
X_train_data[:,5] = label_encoder_LotShape.fit_transform(X_train_data[:,5])
X_train_data[:,6] = label_encoder_LandContour.fit_transform(X_train_data[:,6])
X_train_data[:,7] = label_encoder_Utilities.fit_transform(X_train_data[:,7])
X_train_data[:,8] = label_encoder_LotConfig.fit_transform(X_train_data[:,8])
X_train_data[:,9] = label_encoder_LandSlope.fit_transform(X_train_data[:,9])
X_train_data[:,10] = label_encoder_Neighborhood.fit_transform(X_train_data[:,10])
X_train_data[:,11] = label_encoder_Condition1.fit_transform(X_train_data[:,11])
X_train_data[:,12] = label_encoder_Condition2.fit_transform(X_train_data[:,12])
X_train_data[:,13] = label_encoder_BldgType.fit_transform(X_train_data[:,13])
X_train_data[:,14] = label_encoder_HouseStyle.fit_transform(X_train_data[:,14])
X_train_data[:,19] = label_encoder_RoofStyle.fit_transform(X_tra

In [35]:
X_train_data[:,1] = label_encoder_MSZoning.fit_transform(X_train_data[:,1])
X_train_data[:,4] = label_encoder_Street.fit_transform(X_train_data[:,4])
X_train_data[:,5] = label_encoder_LotShape.fit_transform(X_train_data[:,5])
X_train_data[:,6] = label_encoder_LandContour.fit_transform(X_train_data[:,6])
X_train_data[:,7] = label_encoder_Utilities.fit_transform(X_train_data[:,7])
X_train_data[:,8] = label_encoder_LotConfig.fit_transform(X_train_data[:,8])
X_train_data[:,9] = label_encoder_LandSlope.fit_transform(X_train_data[:,9])
X_train_data[:,10] = label_encoder_Neighborhood.fit_transform(X_train_data[:,10])
X_train_data[:,11] = label_encoder_Condition1.fit_transform(X_train_data[:,11])
X_train_data[:,12] = label_encoder_Condition2.fit_transform(X_train_data[:,12])
X_train_data[:,13] = label_encoder_BldgType.fit_transform(X_train_data[:,13])
X_train_data[:,14] = label_encoder_HouseStyle.fit_transform(X_train_data[:,14])
X_train_data[:,19] = label_encoder_RoofStyle.fit_transform(X_train_data[:,19])
X_train_data[:,20] = label_encoder_RoofMatl.fit_transform(X_train_data[:,20])
X_train_data[:,21] = label_encoder_Exterior1st.fit_transform(X_train_data[:,21])
X_train_data[:,22] = label_encoder_Exterior2nd.fit_transform(X_train_data[:,22])
X_train_data[:,24] = label_encoder_ExterQual.fit_transform(X_train_data[:,24])
X_train_data[:,25] = label_encoder_ExterCond.fit_transform(X_train_data[:,25])
X_train_data[:,26] = label_encoder_Foundation.fit_transform(X_train_data[:,26])
X_train_data[:,31] = label_encoder_Heating.fit_transform(X_train_data[:,31])
X_train_data[:,32] = label_encoder_HeatingQC.fit_transform(X_train_data[:,32])
X_train_data[:,33] = label_encoder_CentralAir.fit_transform(X_train_data[:,33])
X_train_data[:,44] = label_encoder_KitchenQual.fit_transform(X_train_data[:,44])
X_train_data[:,46] = label_encoder_Functional.fit_transform(X_train_data[:,46])
X_train_data[:,51] = label_encoder_PavedDrive.fit_transform(X_train_data[:,51])
X_train_data[:,61] = label_encoder_SaleType.fit_transform(X_train_data[:,61])
X_train_data[:,62] = label_encoder_SaleCondition.fit_transform(X_train_data[:,62])

In [36]:
count = 0
for c in train_data_X_train.columns:
    for d in lista:
        if c == d:
            print(f'{count}, ', end='')
    count+=1

1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 19, 20, 21, 22, 24, 25, 26, 31, 32, 33, 44, 46, 51, 61, 62, 

In [37]:
X_train_data[2]

array([60, 3, 68.0, 11250, 1, 0, 3, 0, 4, 0, 5, 2, 2, 0, 5, 7, 5, 2001,
       2002, 1, 1, 12, 13, 104, 2, 4, 2, 222, 0, 278, 214, 1, 0, 1, 920,
       866, 0, 570, 1, 0, 2, 1, 3, 1, 2, 6, 6, 1, 2001.0, 2, 608, 2, 0,
       30, 0, 0, 0, 0, 0, 9, 2, 8, 4, 223500], dtype=object)

#### OneHotEncoder 

In [38]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [39]:
onehotencoder_train_data = ColumnTransformer(transformers=[('OneHot', OneHotEncoder(), [1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 19, 20, 21, 22, 24, 25, 26, 31, 32, 33, 44, 46, 51, 61, 62])], remainder='passthrough')

In [40]:
X_train_data
X_train_data = onehotencoder_train_data.fit_transform(X_train_data).toarray()
X_train_data

array([[0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 2.00000e+00,
        2.00000e+00, 2.08500e+05],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 5.00000e+00,
        1.00000e+00, 1.81500e+05],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 9.00000e+00,
        2.00000e+00, 2.23500e+05],
       ...,
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 5.00000e+00,
        4.00000e+00, 2.66500e+05],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 4.00000e+00,
        4.00000e+00, 1.42125e+05],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 6.00000e+00,
        2.00000e+00, 1.47500e+05]])

In [41]:
X_train_data[0]

array([0.000e+00, 0.000e+00, 0.000e+00, 1.000e+00, 0.000e+00, 0.000e+00,
       1.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 1.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 1.000e+00, 1.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 1.000e+00, 1.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       1.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 1.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 1.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 1.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 1.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 1.000e+00, 

In [42]:
X_train_data.shape

(1460, 219)

### 3.4 - Escalonamento dos Valores

In [43]:
from sklearn.preprocessing import StandardScaler
scaler_census = StandardScaler()
X_train_data = scaler_census.fit_transform(X_train_data)

In [44]:
X_train_data[0]

array([-0.08304548, -0.21585871, -0.10526316,  0.51813339, -0.41895507,
       -0.06423821,  0.06423821, -0.70420284, -0.16998114, -0.08304548,
        0.76051192, -0.21235968, -0.18831089, -0.15899968,  0.33712564,
        0.02618016, -0.02618016, -0.46873869, -0.26232433, -0.18238027,
       -0.05241424,  0.62276215,  0.23757092, -0.21585871, -0.09478452,
       -0.10854037, -0.03703704, -0.10526316, -0.20339487, -0.1398323 ,
        2.95522137, -0.19025216, -0.27116307, -0.23917551, -0.16124951,
       -0.10854037, -0.1863522 , -0.42683279, -0.07875671, -0.22941573,
       -0.16998114, -0.23595776, -0.28963792, -0.13199092, -0.23106504,
       -0.20521398, -0.25018188, -0.13199092, -0.16347148, -0.08712888,
       -0.18437553, -0.24235968,  0.39840954, -0.07422696, -0.11482721,
       -0.08712888, -0.13465178, -0.03703704, -0.05862104, -0.03703704,
       -0.06423821,  0.10188534, -0.02618016, -0.03703704, -0.02618016,
       -0.02618016, -0.03703704,  0.44353276, -0.14728711, -0.19

## 4 - Exploração e processamento da base de teste

### 4.1 - Análise Exploratória

In [45]:
# Base de dados de treinamento
test_data.head().columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [46]:
# Calda da base de dados de treinamento

test_data.tail()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2006,WD,Normal
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2006,WD,Abnorml
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,9,2006,WD,Abnorml
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal
1458,2919,60,RL,74.0,9627,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,11,2006,WD,Normal


In [47]:
# A base de dados tem 1460 linhas e 80 colunas

test_data.shape

(1459, 80)

####  Valores Nulos

In [48]:
# Colunas com valores nulos

nulos = test_data.isnull().sum()
# Caldanulos.head(50)

In [52]:
# Calda das colunas

nulos.head(40)

Id                 0
MSSubClass         0
MSZoning           4
LotFrontage      227
LotArea            0
Street             0
Alley           1352
LotShape           0
LandContour        0
Utilities          2
LotConfig          0
LandSlope          0
Neighborhood       0
Condition1         0
Condition2         0
BldgType           0
HouseStyle         0
OverallQual        0
OverallCond        0
YearBuilt          0
YearRemodAdd       0
RoofStyle          0
RoofMatl           0
Exterior1st        1
Exterior2nd        1
MasVnrType        16
MasVnrArea        15
ExterQual          0
ExterCond          0
Foundation         0
BsmtQual          44
BsmtCond          45
BsmtExposure      44
BsmtFinType1      42
BsmtFinSF1         1
BsmtFinType2      42
BsmtFinSF2         1
BsmtUnfSF          1
TotalBsmtSF        1
Heating            0
dtype: int64

####  Análise das colunas numéricas

In [50]:
# Colunas com os dados númericos

test_data_numericos = test_data.select_dtypes(include=['int64', 'float64'])
test_data_numericos.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
0,1461,20,80.0,11622,5,6,1961,1961,0.0,468.0,...,730.0,140,0,0,0,120,0,0,6,2010
1,1462,20,81.0,14267,6,6,1958,1958,108.0,923.0,...,312.0,393,36,0,0,0,0,12500,6,2010
2,1463,60,74.0,13830,5,5,1997,1998,0.0,791.0,...,482.0,212,34,0,0,0,0,0,3,2010
3,1464,60,78.0,9978,6,6,1998,1998,20.0,602.0,...,470.0,360,36,0,0,0,0,0,6,2010
4,1465,120,43.0,5005,8,5,1992,1992,0.0,263.0,...,506.0,0,82,0,0,144,0,0,1,2010


In [51]:
colunas_numéricos = test_data_numericos.columns
colunas_numéricos

Index(['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
       'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF',
       'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
       'MiscVal', 'MoSold', 'YrSold'],
      dtype='object')

###### O tratamento de valores nulos para colunas númericas consistirá em substituir o valores nulos pela média dos valores da coluna. As colunas numéricas que apresentam dados nulos são: GarageYrBlt e LotFrontage

#### Tratando a coluna LotFrontage

In [12]:
# Média dos valores na colunas LotFrontage

media_lotfrontage = train_data["LotFrontage"].mean()
media_lotfrontage

70.04995836802665

In [13]:
# Atibuindo a média às celulas com valores nulos na coluna LotFrontage

train_data["LotFrontage"].fillna(train_data["LotFrontage"].mean(), inplace=True)
train_data[train_data["LotFrontage"].isnull()]

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice


#### Trando a coluna GarageYrBlt

In [14]:
# Média dos valores na colunas GarageYrBlt

media_lotfrontage = train_data["GarageYrBlt"].mean()
media_lotfrontage

1978.5061638868744

In [15]:
# Atibuindo a média às celulas com valores nulos na coluna GarageYrBlt

train_data["GarageYrBlt"].fillna(train_data["GarageYrBlt"].mean(), inplace=True)
train_data[train_data["GarageYrBlt"].isnull()]

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice


#### Análise colunas categóricas

In [16]:
# Colunas com os dados categôricos

train_data_categoricos = train_data.select_dtypes(include=['object'])
train_data_categoricos.head()

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
1,RL,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
2,RL,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
3,RL,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,...,Detchd,Unf,TA,TA,Y,,,,WD,Abnorml
4,RL,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal


In [17]:
colunas_categoricas = train_data_categoricos.columns
colunas_categoricas

Index(['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
       'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature',
       'SaleType', 'SaleCondition'],
      dtype='object')

##### Para o caso das colunas categóricas, será apagado as colunas que apresentarem valores nulos, uma vez que não é possível substituir o v.n. por qualquer outro valor. As colunas categóricas que serão apagadas são: Alley, MasVnrType, BsmtQual, BsmtCond, BsmtExposure, BsmtFinType1, BsmtFinType2, Electrical, FireplaceQu, GarageType, GarageFinish, GarageQual, GarageCond e PoolQC.

In [18]:
train_data = train_data.drop(columns=['Alley', 'MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Electrical', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature'])
train_data.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrArea', 'ExterQual',
       'ExterCond', 'Foundation', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
       'TotalBsmtSF', 'Heating', 'HeatingQC', 'CentralAir', '1stFlrSF',
       '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath',
       'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'GarageYrBlt', 'GarageCars',
       'GarageArea', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'YrSold', 'SaleType', 'SaleCondition', 'SalePrice'],
      dtype='object')

#### Checagem de valores nulos na base de treinamento

In [19]:
nulos = train_data.isnull().sum()
nulos.tail(21)

KitchenAbvGr     0
KitchenQual      0
TotRmsAbvGrd     0
Functional       0
Fireplaces       0
GarageYrBlt      0
GarageCars       0
GarageArea       0
PavedDrive       0
WoodDeckSF       0
OpenPorchSF      0
EnclosedPorch    0
3SsnPorch        0
ScreenPorch      0
PoolArea         0
MiscVal          0
MoSold           0
YrSold           0
SaleType         0
SaleCondition    0
SalePrice        0
dtype: int64

#### Obtendo mais informações sobre as colunas

In [20]:
train_data.info()
train_data.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 65 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1460 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   LotShape       1460 non-null   object 
 7   LandContour    1460 non-null   object 
 8   Utilities      1460 non-null   object 
 9   LotConfig      1460 non-null   object 
 10  LandSlope      1460 non-null   object 
 11  Neighborhood   1460 non-null   object 
 12  Condition1     1460 non-null   object 
 13  Condition2     1460 non-null   object 
 14  BldgType       1460 non-null   object 
 15  HouseStyle     1460 non-null   object 
 16  OverallQual    1460 non-null   int64  
 17  OverallCond    1460 non-null   int64  
 18  YearBuil

(1460, 65)

### 3.2 - Pré-processamento dos dados

In [21]:
train_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,0,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,...,0,0,0,0,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,...,0,0,0,0,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,...,272,0,0,0,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,...,0,0,0,0,0,12,2008,WD,Normal,250000


In [22]:
X_train_data = train_data.iloc[:, 1:75].values

In [23]:
X_train_data

array([[60, 'RL', 65.0, ..., 'WD', 'Normal', 208500],
       [20, 'RL', 80.0, ..., 'WD', 'Normal', 181500],
       [60, 'RL', 68.0, ..., 'WD', 'Normal', 223500],
       ...,
       [70, 'RL', 66.0, ..., 'WD', 'Normal', 266500],
       [20, 'RL', 68.0, ..., 'WD', 'Normal', 142125],
       [20, 'RL', 75.0, ..., 'WD', 'Normal', 147500]], dtype=object)

In [24]:
y_train_data = train_data.iloc[:, -1].values

In [25]:
y_train_data

array([208500, 181500, 223500, ..., 266500, 142125, 147500], dtype=int64)

### 3.3 - Tratamento de atributos categóricos

#### LabelEncoder

In [26]:
from sklearn.preprocessing import LabelEncoder

In [27]:
X_train_data[0]

array([60, 'RL', 65.0, 8450, 'Pave', 'Reg', 'Lvl', 'AllPub', 'Inside',
       'Gtl', 'CollgCr', 'Norm', 'Norm', '1Fam', '2Story', 7, 5, 2003,
       2003, 'Gable', 'CompShg', 'VinylSd', 'VinylSd', 196.0, 'Gd', 'TA',
       'PConc', 706, 0, 150, 856, 'GasA', 'Ex', 'Y', 856, 854, 0, 1710, 1,
       0, 2, 1, 3, 1, 'Gd', 8, 'Typ', 0, 2003.0, 2, 548, 'Y', 0, 61, 0, 0,
       0, 0, 0, 2, 2008, 'WD', 'Normal', 208500], dtype=object)

In [28]:
train_data.columns
train_data_X_train = train_data.drop(columns=['Id', 'SalePrice'], axis=1)
train_data_X_train

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,61,0,0,0,0,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,...,42,0,0,0,0,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,35,272,0,0,0,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,...,84,0,0,0,0,0,12,2008,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,RL,62.0,7917,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,40,0,0,0,0,0,8,2007,WD,Normal
1456,20,RL,85.0,13175,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,0,2,2010,WD,Normal
1457,70,RL,66.0,9042,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,60,0,0,0,0,2500,5,2010,WD,Normal
1458,20,RL,68.0,9717,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,112,0,0,0,0,4,2010,WD,Normal


In [29]:
lista = list(train_data_categoricos.columns)
lista

['MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'PoolQC',
 'Fence',
 'MiscFeature',
 'SaleType',
 'SaleCondition']

In [30]:
for c in lista:
    print(f'label_encoder_{c} = LabelEncoder()')

label_encoder_MSZoning = LabelEncoder()
label_encoder_Street = LabelEncoder()
label_encoder_Alley = LabelEncoder()
label_encoder_LotShape = LabelEncoder()
label_encoder_LandContour = LabelEncoder()
label_encoder_Utilities = LabelEncoder()
label_encoder_LotConfig = LabelEncoder()
label_encoder_LandSlope = LabelEncoder()
label_encoder_Neighborhood = LabelEncoder()
label_encoder_Condition1 = LabelEncoder()
label_encoder_Condition2 = LabelEncoder()
label_encoder_BldgType = LabelEncoder()
label_encoder_HouseStyle = LabelEncoder()
label_encoder_RoofStyle = LabelEncoder()
label_encoder_RoofMatl = LabelEncoder()
label_encoder_Exterior1st = LabelEncoder()
label_encoder_Exterior2nd = LabelEncoder()
label_encoder_MasVnrType = LabelEncoder()
label_encoder_ExterQual = LabelEncoder()
label_encoder_ExterCond = LabelEncoder()
label_encoder_Foundation = LabelEncoder()
label_encoder_BsmtQual = LabelEncoder()
label_encoder_BsmtCond = LabelEncoder()
label_encoder_BsmtExposure = LabelEncoder()
label_encode

In [31]:
label_encoder_MSZoning = LabelEncoder()
label_encoder_Street = LabelEncoder()
label_encoder_Alley = LabelEncoder()
label_encoder_LotShape = LabelEncoder()
label_encoder_LandContour = LabelEncoder()
label_encoder_Utilities = LabelEncoder()
label_encoder_LotConfig = LabelEncoder()
label_encoder_LandSlope = LabelEncoder()
label_encoder_Neighborhood = LabelEncoder()
label_encoder_Condition1 = LabelEncoder()
label_encoder_Condition2 = LabelEncoder()
label_encoder_BldgType = LabelEncoder()
label_encoder_HouseStyle = LabelEncoder()
label_encoder_RoofStyle = LabelEncoder()
label_encoder_RoofMatl = LabelEncoder()
label_encoder_Exterior1st = LabelEncoder()
label_encoder_Exterior2nd = LabelEncoder()
label_encoder_MasVnrType = LabelEncoder()
label_encoder_ExterQual = LabelEncoder()
label_encoder_ExterCond = LabelEncoder()
label_encoder_Foundation = LabelEncoder()
label_encoder_BsmtQual = LabelEncoder()
label_encoder_BsmtCond = LabelEncoder()
label_encoder_BsmtExposure = LabelEncoder()
label_encoder_BsmtFinType1 = LabelEncoder()
label_encoder_BsmtFinType2 = LabelEncoder()
label_encoder_Heating = LabelEncoder()
label_encoder_HeatingQC = LabelEncoder()
label_encoder_CentralAir = LabelEncoder()
label_encoder_Electrical = LabelEncoder()
label_encoder_KitchenQual = LabelEncoder()
label_encoder_Functional = LabelEncoder()
label_encoder_FireplaceQu = LabelEncoder()
label_encoder_GarageType = LabelEncoder()
label_encoder_GarageFinish = LabelEncoder()
label_encoder_GarageQual = LabelEncoder()
label_encoder_GarageCond = LabelEncoder()
label_encoder_PavedDrive = LabelEncoder()
label_encoder_PoolQC = LabelEncoder()
label_encoder_Fence = LabelEncoder()
label_encoder_MiscFeature = LabelEncoder()
label_encoder_SaleType = LabelEncoder()
label_encoder_SaleCondition = LabelEncoder()

In [32]:
dict_label_cat = dict()

for c in lista:
    dict_label_cat[c] = f'label_encoder_{c}'
dict_label_cat

{'MSZoning': 'label_encoder_MSZoning',
 'Street': 'label_encoder_Street',
 'Alley': 'label_encoder_Alley',
 'LotShape': 'label_encoder_LotShape',
 'LandContour': 'label_encoder_LandContour',
 'Utilities': 'label_encoder_Utilities',
 'LotConfig': 'label_encoder_LotConfig',
 'LandSlope': 'label_encoder_LandSlope',
 'Neighborhood': 'label_encoder_Neighborhood',
 'Condition1': 'label_encoder_Condition1',
 'Condition2': 'label_encoder_Condition2',
 'BldgType': 'label_encoder_BldgType',
 'HouseStyle': 'label_encoder_HouseStyle',
 'RoofStyle': 'label_encoder_RoofStyle',
 'RoofMatl': 'label_encoder_RoofMatl',
 'Exterior1st': 'label_encoder_Exterior1st',
 'Exterior2nd': 'label_encoder_Exterior2nd',
 'MasVnrType': 'label_encoder_MasVnrType',
 'ExterQual': 'label_encoder_ExterQual',
 'ExterCond': 'label_encoder_ExterCond',
 'Foundation': 'label_encoder_Foundation',
 'BsmtQual': 'label_encoder_BsmtQual',
 'BsmtCond': 'label_encoder_BsmtCond',
 'BsmtExposure': 'label_encoder_BsmtExposure',
 'BsmtFi

In [33]:
lista_train_data = list()
count = 0
for c in train_data_X_train.columns:
    for d in lista:
        if c == d:
            for a in dict_label_cat.keys():
                if d == a:
                    print(f'X_train_data[:,{count}] = {dict_label_cat[a]}.fit_transform(X_train_data[:,{count}])')
    count+=1

X_train_data[:,1] = label_encoder_MSZoning.fit_transform(X_train_data[:,1])
X_train_data[:,4] = label_encoder_Street.fit_transform(X_train_data[:,4])
X_train_data[:,5] = label_encoder_LotShape.fit_transform(X_train_data[:,5])
X_train_data[:,6] = label_encoder_LandContour.fit_transform(X_train_data[:,6])
X_train_data[:,7] = label_encoder_Utilities.fit_transform(X_train_data[:,7])
X_train_data[:,8] = label_encoder_LotConfig.fit_transform(X_train_data[:,8])
X_train_data[:,9] = label_encoder_LandSlope.fit_transform(X_train_data[:,9])
X_train_data[:,10] = label_encoder_Neighborhood.fit_transform(X_train_data[:,10])
X_train_data[:,11] = label_encoder_Condition1.fit_transform(X_train_data[:,11])
X_train_data[:,12] = label_encoder_Condition2.fit_transform(X_train_data[:,12])
X_train_data[:,13] = label_encoder_BldgType.fit_transform(X_train_data[:,13])
X_train_data[:,14] = label_encoder_HouseStyle.fit_transform(X_train_data[:,14])
X_train_data[:,19] = label_encoder_RoofStyle.fit_transform(X_tra

In [35]:
X_train_data[:,1] = label_encoder_MSZoning.fit_transform(X_train_data[:,1])
X_train_data[:,4] = label_encoder_Street.fit_transform(X_train_data[:,4])
X_train_data[:,5] = label_encoder_LotShape.fit_transform(X_train_data[:,5])
X_train_data[:,6] = label_encoder_LandContour.fit_transform(X_train_data[:,6])
X_train_data[:,7] = label_encoder_Utilities.fit_transform(X_train_data[:,7])
X_train_data[:,8] = label_encoder_LotConfig.fit_transform(X_train_data[:,8])
X_train_data[:,9] = label_encoder_LandSlope.fit_transform(X_train_data[:,9])
X_train_data[:,10] = label_encoder_Neighborhood.fit_transform(X_train_data[:,10])
X_train_data[:,11] = label_encoder_Condition1.fit_transform(X_train_data[:,11])
X_train_data[:,12] = label_encoder_Condition2.fit_transform(X_train_data[:,12])
X_train_data[:,13] = label_encoder_BldgType.fit_transform(X_train_data[:,13])
X_train_data[:,14] = label_encoder_HouseStyle.fit_transform(X_train_data[:,14])
X_train_data[:,19] = label_encoder_RoofStyle.fit_transform(X_train_data[:,19])
X_train_data[:,20] = label_encoder_RoofMatl.fit_transform(X_train_data[:,20])
X_train_data[:,21] = label_encoder_Exterior1st.fit_transform(X_train_data[:,21])
X_train_data[:,22] = label_encoder_Exterior2nd.fit_transform(X_train_data[:,22])
X_train_data[:,24] = label_encoder_ExterQual.fit_transform(X_train_data[:,24])
X_train_data[:,25] = label_encoder_ExterCond.fit_transform(X_train_data[:,25])
X_train_data[:,26] = label_encoder_Foundation.fit_transform(X_train_data[:,26])
X_train_data[:,31] = label_encoder_Heating.fit_transform(X_train_data[:,31])
X_train_data[:,32] = label_encoder_HeatingQC.fit_transform(X_train_data[:,32])
X_train_data[:,33] = label_encoder_CentralAir.fit_transform(X_train_data[:,33])
X_train_data[:,44] = label_encoder_KitchenQual.fit_transform(X_train_data[:,44])
X_train_data[:,46] = label_encoder_Functional.fit_transform(X_train_data[:,46])
X_train_data[:,51] = label_encoder_PavedDrive.fit_transform(X_train_data[:,51])
X_train_data[:,61] = label_encoder_SaleType.fit_transform(X_train_data[:,61])
X_train_data[:,62] = label_encoder_SaleCondition.fit_transform(X_train_data[:,62])

In [36]:
count = 0
for c in train_data_X_train.columns:
    for d in lista:
        if c == d:
            print(f'{count}, ', end='')
    count+=1

1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 19, 20, 21, 22, 24, 25, 26, 31, 32, 33, 44, 46, 51, 61, 62, 

In [37]:
X_train_data[2]

array([60, 3, 68.0, 11250, 1, 0, 3, 0, 4, 0, 5, 2, 2, 0, 5, 7, 5, 2001,
       2002, 1, 1, 12, 13, 104, 2, 4, 2, 222, 0, 278, 214, 1, 0, 1, 920,
       866, 0, 570, 1, 0, 2, 1, 3, 1, 2, 6, 6, 1, 2001.0, 2, 608, 2, 0,
       30, 0, 0, 0, 0, 0, 9, 2, 8, 4, 223500], dtype=object)

#### OneHotEncoder 

In [38]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [39]:
onehotencoder_train_data = ColumnTransformer(transformers=[('OneHot', OneHotEncoder(), [1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 19, 20, 21, 22, 24, 25, 26, 31, 32, 33, 44, 46, 51, 61, 62])], remainder='passthrough')

In [40]:
X_train_data
X_train_data = onehotencoder_train_data.fit_transform(X_train_data).toarray()
X_train_data

array([[0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 2.00000e+00,
        2.00000e+00, 2.08500e+05],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 5.00000e+00,
        1.00000e+00, 1.81500e+05],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 9.00000e+00,
        2.00000e+00, 2.23500e+05],
       ...,
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 5.00000e+00,
        4.00000e+00, 2.66500e+05],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 4.00000e+00,
        4.00000e+00, 1.42125e+05],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 6.00000e+00,
        2.00000e+00, 1.47500e+05]])

In [41]:
X_train_data[0]

array([0.000e+00, 0.000e+00, 0.000e+00, 1.000e+00, 0.000e+00, 0.000e+00,
       1.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 1.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 1.000e+00, 1.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 1.000e+00, 1.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       1.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 1.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 1.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 1.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 1.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 1.000e+00, 

In [42]:
X_train_data.shape

(1460, 219)

### 3.4 - Escalonamento dos Valores

In [43]:
from sklearn.preprocessing import StandardScaler
scaler_census = StandardScaler()
X_train_data = scaler_census.fit_transform(X_train_data)

In [44]:
X_train_data[0]

array([-0.08304548, -0.21585871, -0.10526316,  0.51813339, -0.41895507,
       -0.06423821,  0.06423821, -0.70420284, -0.16998114, -0.08304548,
        0.76051192, -0.21235968, -0.18831089, -0.15899968,  0.33712564,
        0.02618016, -0.02618016, -0.46873869, -0.26232433, -0.18238027,
       -0.05241424,  0.62276215,  0.23757092, -0.21585871, -0.09478452,
       -0.10854037, -0.03703704, -0.10526316, -0.20339487, -0.1398323 ,
        2.95522137, -0.19025216, -0.27116307, -0.23917551, -0.16124951,
       -0.10854037, -0.1863522 , -0.42683279, -0.07875671, -0.22941573,
       -0.16998114, -0.23595776, -0.28963792, -0.13199092, -0.23106504,
       -0.20521398, -0.25018188, -0.13199092, -0.16347148, -0.08712888,
       -0.18437553, -0.24235968,  0.39840954, -0.07422696, -0.11482721,
       -0.08712888, -0.13465178, -0.03703704, -0.05862104, -0.03703704,
       -0.06423821,  0.10188534, -0.02618016, -0.03703704, -0.02618016,
       -0.02618016, -0.03703704,  0.44353276, -0.14728711, -0.19

## Divisão das bases de treinamento e teste

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train_data_treinamento, X_train_data_teste, y_train_data_treinamento, y_train_data_teste = train_test_split(X_train_data, y_train_data, test_size=0.25, random_state=0)

In [None]:
X_train_data_treinamento.shape

In [None]:
y_train_data_treinamento.shape

In [None]:
X_train_data_teste.shape, y_train_data_teste.shape

#### Salvar as variáveis

In [None]:
import pickle

In [None]:
#with open('train_data.pkl', mode='wb') as f:
    pickle.dump([X_train_data_treinamento, y_train_data_treinamento, X_train_data_teste, y_train_data_teste],f)