# Trabalho 2

**Pesquise e apresente um trabalho sobre aplicações da técnica Random Forest**

* Dataset utilizado: https://www.kaggle.com/datasets/camnugent/california-housing-prices

* Adaptado com base no artigo: https://medium.com/@alexrubino05/house-price-prediction-in-python-using-random-forest-59dc51bf7498

In [1]:
import pandas as pd

In [2]:
data = pd.read_csv('housing.csv')

In [3]:
data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [5]:
# Convertendo valores categóricos em colunas dummy
data = pd.get_dummies(data, columns=['ocean_proximity'])
data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,False,False,False,True,False
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,False,False,False,True,False
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,False,False,False,True,False
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,False,False,False,True,False
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,False,False,False,True,False


In [6]:
data.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity_<1H OCEAN',
       'ocean_proximity_INLAND', 'ocean_proximity_ISLAND',
       'ocean_proximity_NEAR BAY', 'ocean_proximity_NEAR OCEAN'],
      dtype='object')

In [8]:
# Converter as colunas específicas de booleano para inteiro
data = data.astype({col: 'int' for col in data.columns if data[col].dtype == 'bool'})



In [9]:
data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,0,0,0,1,0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,0,0,0,1,0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,0,0,0,1,0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,0,0,0,1,0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,0,0,0,1,0


In [10]:
X = data.drop(['median_house_value'],axis=1)
y = data['median_house_value']

In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [15]:
# Remover linhas com NaN em X_train e y_train
X_train = X_train.dropna()
y_train = y_train[X_train.index]  # Atualizar y_train para corresponder aos índices de X_train

X_test = X_test.dropna()
y_test = y_test[X_test.index]  # Atualizar y_test para corresponder aos índices de X_test


In [16]:
from sklearn.ensemble import RandomForestRegressor

forest = RandomForestRegressor()
forest.fit(X_train, y_train)


In [18]:
forest.score(X_test, y_test)

0.8209011274655333

In [21]:
from sklearn.preprocessing import MinMaxScaler

In [31]:
# Ajuste para escalar apenas as colunas de X_train
scaler = MinMaxScaler()

X_train_scaled = scaler.fit_transform(X_train)  # Isso retorna um array NumPy
X_test_scaled = scaler.transform(X_test)

# Para manter o formato DataFrame com os nomes das colunas:
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test_scaled = pd.ataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)


In [28]:
X_train_scaled.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
7641,0.604582,0.134043,0.54902,0.070019,0.078535,0.057261,0.079934,0.324375,1.0,0.0,0.0,0.0,0.0
17586,0.241036,0.505319,0.254902,0.159596,0.270061,0.0847,0.267763,0.245886,1.0,0.0,0.0,0.0,0.0
3579,0.578685,0.178723,0.666667,0.086983,0.092969,0.047283,0.094079,0.267245,1.0,0.0,0.0,0.0,0.0
3066,0.507968,0.343617,0.509804,0.03843,0.05277,0.037641,0.052796,0.154563,0.0,1.0,0.0,0.0,0.0
13460,0.693227,0.164894,0.352941,0.025383,0.032438,0.015948,0.0375,0.131233,0.0,1.0,0.0,0.0,0.0


In [33]:
scoreTrain = reg.score(X_train, y_train)
prediction = reg.predict(X_test)

print("Score de Treinamento: ", scoreTrain)

Score de Treinamento:  0.9639562606602377


In [39]:
# Suponha que você tenha as características de um novo imóvel no seu dataset
test_house = {
    'longitude': -122.23,
    'latitude': 37.88,
    'housing_median_age': 41,
    'total_rooms': 9,
    'total_bedrooms': 129,
    'population': 322,
    'households': 126,
    'median_income': 8.3252,
    'ocean_proximity_<1H OCEAN': 1,
    'ocean_proximity_INLAND': 0,
    'ocean_proximity_ISLAND': 0,
    'ocean_proximity_NEAR BAY': 0,
    'ocean_proximity_NEAR OCEAN': 0
}

# Cria um DataFrame com os dados do novo imóvel
test_house_df = pd.DataFrame([test_house])

# Escala as colunas numéricas de test_house_df da mesma forma que o conjunto de treino foi escalado
test_house_scaled = scaler.transform(test_house_df)

# Fazer a predição do preço do novo imóvel
predicted_price = forest.predict(test_house_scaled)

print("Preço previsto do novo imóvel:", predicted_price[0])

Preço previsto do novo imóvel: 235937.24


