In [1]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import numpy as np
import os

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "ml_prf_project"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure under IMAGES_PATH.

    The file is named ``<fig_id>.<fig_extension>``. ``resolution`` is passed
    to ``plt.savefig`` as ``dpi``. When ``tight_layout`` is true,
    ``plt.tight_layout()`` is applied before saving.
    """
    filename = fig_id + "." + fig_extension
    target = os.path.join(IMAGES_PATH, filename)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target, format=fig_extension, dpi=resolution)

### Preparação dos dados

In [2]:
import pandas as pd

# Load data/acidentes2017-2023.json (path relative to the working directory)
# into a DataFrame.
df_prf_1723 = pd.read_json("data/acidentes2017-2023.json")

In [3]:
# Preview the first rows of the loaded frame.
df_prf_1723.head()

Unnamed: 0,data_inversa,dia_semana,horario,uf,br,km,municipio,causa_acidente,tipo_acidente,classificacao_acidente,...,idade,sexo,marca_veiculo,modelo_veiculo,ilesos,feridos_leves,feridos_graves,mortos,latitude,longitude
0,2017-01-01,domingo,01:45:00,RS,116,349,vacaria,defeito_mecanico_no_veiculo,colisao_traseira,com_vitimas_feridas,...,31,masculino,VW,17.280 CRM 4X2 4P,1,0,0,0,-285071196,-50941176
1,2017-01-01,domingo,01:00:00,PR,376,636,tijucas_do_sul,velocidade_incompativel,saida_de_leito_carrocavel,com_vitimas_fatais,...,28,masculino,M.BENZ,ATEGO 2430,1,0,0,0,-25754,-491266
2,2017-01-01,domingo,04:40:00,BA,101,65,entre_rios,condutor_dormindo,colisao_frontal,com_vitimas_fatais,...,53,masculino,SCANIA,G 380 A4X2,1,0,0,0,-119618,-380953
3,2017-01-01,domingo,07:40:00,RN,405,30,mossoro,ingestao_de_alcool,colisao_frontal,com_vitimas_fatais,...,44,masculino,SCANIA,R 480 A6X4,1,0,0,0,-53136,-37569
4,2017-01-01,domingo,10:35:00,PR,376,2495,apucarana,nao_guardar_distancia_de_seguranca,colisao_traseira,com_vitimas_feridas,...,56,masculino,FORD,CARGO 1723,1,0,0,0,-2361073749,-5139895052


In [4]:
# Column names, non-null counts and dtypes of the loaded frame.
df_prf_1723.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 977884 entries, 0 to 977883
Data columns (total 32 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   data_inversa            977884 non-null  object
 1   dia_semana              977884 non-null  object
 2   horario                 977884 non-null  object
 3   uf                      977884 non-null  object
 4   br                      977884 non-null  int64 
 5   km                      977884 non-null  object
 6   municipio               977884 non-null  object
 7   causa_acidente          977884 non-null  object
 8   tipo_acidente           977884 non-null  object
 9   classificacao_acidente  977884 non-null  object
 10  fase_dia                977884 non-null  object
 11  sentido_via             977884 non-null  object
 12  condicao_metereologica  977884 non-null  object
 13  tipo_pista              977884 non-null  object
 14  tracado_via             977884 non-n

In [5]:
def definir_gravidade(row):
    """Return the worst outcome recorded in ``row`` as a label.

    The 'mortos', 'feridos_graves' and 'feridos_leves' entries are checked in
    that order; the name of the first one greater than zero is returned.
    When none of them is greater than zero the result is "ilesos".
    """
    for campo in ('mortos', 'feridos_graves', 'feridos_leves'):
        if row[campo] > 0:
            return campo
    return "ilesos"

In [6]:
# Label each record with its worst outcome, using the same priority as
# definir_gravidade (mortos > feridos_graves > feridos_leves > ilesos).
# np.select picks the first matching condition, so this is the vectorized
# equivalent of the row-wise apply without one Python call per row.
df_prf_1723['gravidade_acidente'] = np.select(
    [
        df_prf_1723['mortos'] > 0,
        df_prf_1723['feridos_graves'] > 0,
        df_prf_1723['feridos_leves'] > 0,
    ],
    ["mortos", "feridos_graves", "feridos_leves"],
    default="ilesos",
)

# Discard rows whose outcome counts are missing.
df_prf_1723 = df_prf_1723.dropna(subset=['ilesos', 'feridos_leves', 'feridos_graves', 'mortos'])
# Remove the listed columns, including the outcome counts from which the
# label above was derived.
df_prf_1723 = df_prf_1723.drop(['estado_fisico', 'horario', 'data_inversa', 'uso_solo','modelo_veiculo','id_veiculo', 'classificacao_acidente', 'ilesos', 'feridos_leves', 'feridos_graves', 'mortos'], axis=1)
# Finally drop any row that still has a missing value in a remaining column.
df_prf_1723 = df_prf_1723.dropna()

In [7]:
# Distinct labels present in the derived target column.
df_prf_1723['gravidade_acidente'].unique()

array(['ilesos', 'feridos_leves', 'mortos', 'feridos_graves'],
      dtype=object)

In [8]:
# Re-inspect columns and dtypes after dropping columns and incomplete rows.
df_prf_1723.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 977884 entries, 0 to 977883
Data columns (total 22 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   dia_semana              977884 non-null  object
 1   uf                      977884 non-null  object
 2   br                      977884 non-null  int64 
 3   km                      977884 non-null  object
 4   municipio               977884 non-null  object
 5   causa_acidente          977884 non-null  object
 6   tipo_acidente           977884 non-null  object
 7   fase_dia                977884 non-null  object
 8   sentido_via             977884 non-null  object
 9   condicao_metereologica  977884 non-null  object
 10  tipo_pista              977884 non-null  object
 11  tracado_via             977884 non-null  object
 12  tipo_veiculo            977884 non-null  object
 13  marca                   977884 non-null  object
 14  ano_fabricacao_veiculo  977884 non-n

### Pré-processamento

In [9]:
def is_within_brazil(lat, lon):
    """Return whether (lat, lon) lies inside a fixed latitude/longitude box.

    The box is hard-coded below (per the function name, an approximate
    bounding box of Brazil). The comparison is element-wise: scalar inputs
    give a single boolean, pandas Series / NumPy arrays give a boolean mask.
    Bounds are inclusive.
    """
    return (lat >= -33.7422) & (lat <= 5.2718) & (lon >= -73.989) & (lon <= -34.793)

# Coordinates may use a decimal comma. Going through str first keeps this
# consistent with km/idade below and lets the cell run when a column is
# already numeric (e.g. when the cell is executed a second time).
df_prf_1723['longitude'] = df_prf_1723['longitude'].astype(str).str.replace(",", ".").astype(float).round(6)
df_prf_1723['latitude'] = df_prf_1723['latitude'].astype(str).str.replace(",", ".").astype(float).round(6)

# Vectorized bounding-box filter. .copy() makes the result an independent
# frame so the column assignments below do not write into a slice.
df_prf_1723 = df_prf_1723[is_within_brazil(df_prf_1723['latitude'], df_prf_1723['longitude'])].copy()
df_prf_1723['km'] = df_prf_1723['km'].astype(str).str.replace(',', '.').astype(float)
df_prf_1723['idade'] = df_prf_1723['idade'].astype(str).str.replace(',', '.').astype(float)

In [10]:
# Independent copy: the preprocessing that follows mutates df_processed,
# leaving df_prf_1723 untouched.
df_processed = df_prf_1723.copy()

In [12]:
# Preprocessing for the classifiers: encode categoricals, split, then scale.
# stratify=y keeps the class proportions equal in the train and test splits
# (it does not balance the classes).

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Categorical feature columns
categorical_columns = [
    'dia_semana', 'uf', 'municipio', 'causa_acidente', 'tipo_acidente', 
    'fase_dia', 'sentido_via', 'condicao_metereologica', 'tipo_pista', 
    'tracado_via', 'tipo_veiculo', 'marca_veiculo', 'sexo',
    'marca', 'tipo_envolvido'
]

# Integer-encode each categorical column
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    df_processed[col] = le.fit_transform(df_processed[col])
    label_encoders[col] = le  # keep the fitted encoder for future predictions

# Encode the target once and reuse the result for both y and the frame
le_target = LabelEncoder()
y = le_target.fit_transform(df_processed['gravidade_acidente'])
df_processed['gravidade_acidente'] = y

# Features (X): every column except the target
X = df_processed.drop(columns=['gravidade_acidente'])

# Split into train and test (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Standardize the numeric columns using statistics from the training split
# only; fitting the scaler on the full data would leak test-set information.
# Note: df_processed itself keeps the unscaled values.
numerical_columns = ['km', 'idade', 'latitude', 'longitude']
scaler = StandardScaler()
X_train = X_train.copy()  # own the data before assigning into it
X_test = X_test.copy()
X_train[numerical_columns] = scaler.fit_transform(X_train[numerical_columns])
X_test[numerical_columns] = scaler.transform(X_test[numerical_columns])

# Show the size of each split
X_train.shape, X_test.shape, y_train.shape, y_test.shape


((782222, 21), (195556, 21), (782222,), (195556,))

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Train a Random Forest; class_weight='balanced' reweights classes to
# compensate for their unequal frequencies.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
rf_model.fit(X_train, y_train)

# Predict on the held-out test split
y_pred = rf_model.predict(X_test)

# Evaluate the model
report = classification_report(y_test, y_pred, target_names=le_target.classes_)
conf_matrix = confusion_matrix(y_test, y_pred)

# classification_report returns a preformatted multi-line string: print it so
# the table renders, instead of echoing its repr with literal "\n" escapes.
print(report)
conf_matrix


('                precision    recall  f1-score   support\n\nferidos_graves       0.46      0.12      0.20     20913\n feridos_leves       0.63      0.69      0.66     73949\n        ilesos       0.75      0.85      0.79     94889\n        mortos       0.44      0.11      0.17      5805\n\n      accuracy                           0.69    195556\n     macro avg       0.57      0.44      0.45    195556\n  weighted avg       0.66      0.69      0.66    195556\n',
 array([[ 2598, 13034,  4807,   474],
        [ 1842, 50714, 21146,   247],
        [  388, 13827, 80608,    66],
        [  804,  2751,  1631,   619]], dtype=int64))

In [14]:
# `report` is a preformatted multi-line string; print it so the table renders
# rather than showing the repr with "\n" escapes.
print(report)

'                precision    recall  f1-score   support\n\nferidos_graves       0.46      0.12      0.20     20913\n feridos_leves       0.63      0.69      0.66     73949\n        ilesos       0.75      0.85      0.79     94889\n        mortos       0.44      0.11      0.17      5805\n\n      accuracy                           0.69    195556\n     macro avg       0.57      0.44      0.45    195556\n  weighted avg       0.66      0.69      0.66    195556\n'

In [None]:
# NOTE(review): repeats the preceding cell's display of `report`; candidate
# for removal.
report

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the training split only; the test split is left as-is.
# NOTE(review): X_train contains label-encoded categorical columns, and plain
# SMOTE interpolates between those integer codes as if they were continuous.
# imblearn's SMOTENC targets mixed numeric/categorical data — consider
# switching after checking its runtime on this dataset.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Retrain the Random Forest on the resampled data
rf_model_smote = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
rf_model_smote.fit(X_resampled, y_resampled)

# Predict on the (untouched) test split
y_pred_smote = rf_model_smote.predict(X_test)

# Evaluate the model trained after SMOTE
report_smote = classification_report(y_test, y_pred_smote, target_names=le_target.classes_)
conf_matrix_smote = confusion_matrix(y_test, y_pred_smote)

# Print the preformatted report so it renders as a table, then show the
# confusion matrix as the cell output.
print(report_smote)
conf_matrix_smote


In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
# Histograms (50 bins each) of df_prf_1723's columns on a 20x15 figure,
# saved through save_fig and then displayed.
df_prf_1723.hist(bins=50, figsize=(20,15))
save_fig("attribute_histogram_plots")
plt.show()

### Create a Test Set

In [64]:
# to make this notebook's output identical at every run
# Seed NumPy's global RNG so that code drawing from np.random (such as
# np.random.permutation) gives the same result on every run.
np.random.seed(42)

In [65]:
import numpy as np

# For illustration only. Sklearn has train_test_split()
def split_train_test(data, test_ratio):
    """Randomly split ``data`` into ``(train, test)`` by row position.

    A permutation of the row positions is drawn from NumPy's global RNG; the
    first ``int(len(data) * test_ratio)`` positions form the test set and the
    remainder the training set. Rows are selected with ``data.iloc``.
    """
    n_rows = len(data)
    order = np.random.permutation(n_rows)
    n_test = int(n_rows * test_ratio)
    # Cut the permutation once: head -> test, tail -> train.
    test_pos, train_pos = np.split(order, [n_test])
    return data.iloc[train_pos], data.iloc[test_pos]

In [None]:
# Random 80/20 split with the illustrative helper; show the training-set size.
train_set, test_set = split_train_test(df_prf_1723, 0.2)
len(train_set)

In [None]:
# Size of the test split.
len(test_set)

In [68]:
from zlib import crc32

def test_set_check(identifier, test_ratio):
    return crc32(np.int64(identifier)) & 0xffffffff < test_ratio * 2**32

def split_train_test_by_id(data, test_ratio, id_column):
    """Split ``data`` into ``(train, test)`` based on each row's identifier.

    ``test_set_check`` is applied to every value of ``data[id_column]``; rows
    for which it returns True form the test set, the others the training set.
    """
    def _goes_to_test(id_):
        return test_set_check(id_, test_ratio)

    test_mask = data[id_column].apply(_goes_to_test)
    train_part = data.loc[~test_mask]
    test_part = data.loc[test_mask]
    return train_part, test_part

In [69]:
import hashlib

def test_set_check(identifier, test_ratio, hash=hashlib.md5):
    return hash(np.int64(identifier)).digest()[-1] < 256 * test_ratio

In [70]:
def test_set_check(identifier, test_ratio, hash=hashlib.md5):
    return bytearray(hash(np.int64(identifier)).digest())[-1] < 256 * test_ratio

In [71]:
# reset_index() exposes the current index as an `index` column, which is then
# used as the row identifier for the hash-based split.
accident_with_id = df_prf_1723.reset_index()   # adds an `index` column
train_set, test_set = split_train_test_by_id(accident_with_id, 0.2, "index")

In [75]:
# Drop rows whose "id" is missing (in place).
# NOTE(review): the "id" column is only assigned in the following cell, so on
# a fresh top-to-bottom run this line raises KeyError — confirm the intended
# cell order.
accident_with_id.dropna(subset=["id"], inplace=True)

In [76]:
# Build the id from accident_with_id's own columns. df_prf_1723 still carries
# its pre-reset index labels, so assigning a Series computed from it would be
# aligned by label against the fresh index created by reset_index(): rows get
# another record's coordinates, and NaN wherever a label no longer exists.
accident_with_id["id"] = accident_with_id["longitude"] * 1000 + accident_with_id["latitude"]
train_set, test_set = split_train_test_by_id(accident_with_id, 0.2, "id")

In [None]:
# Preview the first rows of the id-based test split.
test_set.head()

In [78]:
from sklearn.model_selection import train_test_split

# Random 80/20 split via scikit-learn; random_state=42 fixes the shuffle.
train_set, test_set = train_test_split(df_prf_1723, test_size=0.2, random_state=42)

In [None]:
# Preview the first rows of the scikit-learn test split.
test_set.head()

### Discover and Visualize the Data to Gain Insights


In [None]:
# Plain longitude/latitude scatter of every record (fully opaque markers).
df_prf_1723.plot(kind="scatter", x="longitude", y="latitude")
save_fig("bad_visualization_plot")

In [None]:
# Same scatter with alpha=0.1, so overlapping points show up as darker areas.
df_prf_1723.plot(kind="scatter", x="longitude", y="latitude", alpha=0.1)
save_fig("better_visualization_plot")

In [None]:
# The 'mortos' column is dropped from df_prf_1723 during data preparation, so
# it cannot drive the marker size here (indexing it raises KeyError). Plot
# every record with the default marker size, coloured by the 'idade' column.
df_prf_1723.plot(kind="scatter", x="longitude", y="latitude", alpha=0.4,
             label="registros", figsize=(10,7),
             c="idade", cmap=plt.get_cmap("jet"), colorbar=True,
             sharex=False)
plt.legend()
save_fig("housing_prices_scatterplot")