In [34]:
import numpy as np
import pandas as pd
import basedosdados as bd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, r2_score, mean_squared_error, precision_score, recall_score, confusion_matrix

from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier

## Dados

In [35]:
main_table = bd.read_sql(query = """ SELECT * FROM `rj-cor-dev.clima_pluviometro.main_table_fields_1H`""", billing_project_id = 'projeto-fgv1', use_bqstorage_api = True)
main_table.to_csv('main_table.csv')

In [36]:
main_table = pd.read_csv("main_table.csv")
main_table.head()

Unnamed: 0.1,Unnamed: 0,id_h3,estacoes,chuva_15min,chuva_1h,chuva_4h,chuva_24h,chuva_96h,data_hora,estacao_ano,quinzenas,alagamento_inicio,alagamento_fim,alagamento_pop,alagamento_lat,alagamento_long,gravidade_alagamento
0,0,88a8a07191fffff,"Alto da boa vista,Barrinha,Rocinha",0.0,0.0,0.0,0.600326,6.400151,2015-01-01 00:00:00,Verão,quinzena_2015_1_1,,,,,,
1,1,88a8a07191fffff,"Alto da boa vista,Barrinha,Rocinha",0.0,0.0,0.0,0.600326,6.400151,2015-01-01 01:00:00,Verão,quinzena_2015_1_1,,,,,,
2,2,88a8a07191fffff,"Alto da boa vista,Barrinha,Rocinha",0.0,0.0,0.0,0.600326,6.400151,2015-01-01 02:00:00,Verão,quinzena_2015_1_1,,,,,,
3,3,88a8a07191fffff,"Alto da boa vista,Barrinha,Rocinha",0.0,0.0,0.0,0.600326,6.400151,2015-01-01 03:00:00,Verão,quinzena_2015_1_1,,,,,,
4,4,88a8a07191fffff,"Alto da boa vista,Barrinha,Rocinha",0.0,0.0,0.0,0.600326,6.400151,2015-01-01 04:00:00,Verão,quinzena_2015_1_1,,,,,,


## Pré-processamento

In [37]:
main_table['target'] = main_table['alagamento_pop'].notna() # 1 se houve alagamento, 0 se não houve
main_table['target'] = main_table['target'].astype(int)

# Aplicar one-hot encoding
one_hot_encoder = OneHotEncoder(sparse=False)
encoded_cols = one_hot_encoder.fit_transform(main_table[['estacao_ano']])
encoded_labels = one_hot_encoder.categories_[0]

# Adicionar as colunas codificadas ao DataFrame original
for i, label in enumerate(encoded_labels):
    main_table[f'estacao_ano_{label}'] = encoded_cols[:, i]

main_table.drop(columns=['data_hora','id_h3', 'estacao_ano', 'alagamento_fim',
                         'estacoes', 'Unnamed: 0', 'alagamento_pop', 'alagamento_inicio', 
                         'quinzenas', 'alagamento_lat', 'alagamento_long',
                         'gravidade_alagamento'], inplace=True)

main_table.fillna(0, inplace=True)

main_table

Unnamed: 0,chuva_15min,chuva_1h,chuva_4h,chuva_24h,chuva_96h,target,estacao_ano_Inverno,estacao_ano_Outono,estacao_ano_Primavera,estacao_ano_Verão
0,0.000000,0.000000,0.000000,0.600326,6.400151,0,0.0,0.0,0.0,1.0
1,0.000000,0.000000,0.000000,0.600326,6.400151,0,0.0,0.0,0.0,1.0
2,0.000000,0.000000,0.000000,0.600326,6.400151,0,0.0,0.0,0.0,1.0
3,0.000000,0.000000,0.000000,0.600326,6.400151,0,0.0,0.0,0.0,1.0
4,0.000000,0.000000,0.000000,0.600326,6.400151,0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...
76302,0.000002,0.400000,2.199972,26.399820,26.599871,0,0.0,0.0,1.0,0.0
76303,0.000002,0.000003,1.199995,26.399823,26.599875,0,0.0,0.0,1.0,0.0
76304,0.000000,0.000005,0.600007,10.799813,26.599879,0,0.0,0.0,1.0,0.0
76305,0.000000,0.000000,0.400009,10.599800,26.599879,0,0.0,0.0,1.0,0.0


In [38]:
X = main_table.drop(columns=['target'])
y = main_table['target']



X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5)

## Modelos

In [39]:
# Logistic Regression

logreg = LogisticRegression(class_weight='balanced')
logreg.fit(X_train, y_train)

y_pred_lr = logreg.predict(X_test)

print(accuracy_score(y_test, y_pred_lr))
print(r2_score(y_test, y_pred_lr))
print(mean_squared_error(y_test, y_pred_lr))
print(precision_score(y_test, y_pred_lr))
print(recall_score(y_test, y_pred_lr))
conf = confusion_matrix(y_test, y_pred_lr)

print("False Positive Rate:", conf)

0.9307242072158645
-5.077659716921759
0.06927579278413558
0.11525029103608847
0.75
False Positive Rate: [[10555   760]
 [   33    99]]


In [40]:
# Neural Network

# 10 camadas ocultas com 10 neurônios cada relu
mlp = MLPClassifier(hidden_layer_sizes=(10,10), max_iter=1000, activation='relu', 
                    solver='adam', random_state=1, early_stopping=True)
mlp.fit(X_train, y_train)

y_pred_nn = mlp.predict(X_test)

print(accuracy_score(y_test, y_pred_nn))
print(r2_score(y_test, y_pred_nn))
print(mean_squared_error(y_test, y_pred_nn))
print(precision_score(y_test, y_pred_nn))
print(recall_score(y_test, y_pred_nn))
conf = confusion_matrix(y_test, y_pred_nn)

print("False Positive Rate:", conf)

0.9902157770594916
0.14161678651294207
0.00978422294050843
0.7272727272727273
0.24242424242424243
False Positive Rate: [[11303    12]
 [  100    32]]


In [41]:
# Decision Tree

dt = DecisionTreeClassifier(class_weight='balanced')
dt.fit(X_train, y_train)

y_pred_dt = dt.predict(X_test)

print(accuracy_score(y_test, y_pred_dt))
print(r2_score(y_test, y_pred_dt))
print(mean_squared_error(y_test, y_pred_dt))
print(recall_score(y_test, y_pred_dt, average='macro'))
print(precision_score(y_test, y_pred_dt, average='macro'))
conf = confusion_matrix(y_test, y_pred_dt)

print("False Positive Rate:", conf)

0.9871582073905827
-0.12662796770176366
0.012841792609417315
0.6940090252949289
0.7149563215116554
False Positive Rate: [[11248    67]
 [   80    52]]
