## Calcular probabilidades de evento por una determinada ruta

In [1]:
import utils
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

alerts, _ = utils.load_data()

In [2]:
alerts_cleaned = utils.extract_event(
    alerts, ["ACCIDENT", "JAM"], extra_col=["type"]
).drop(["uuid", "x", "y"], axis=1)
alerts_cleaned["day"] = alerts_cleaned.inicio.dt.day
alerts_cleaned["minute"] = alerts_cleaned.inicio.dt.minute
alerts_cleaned = alerts_cleaned.drop(["inicio", "fin", "geometry"], axis=1)

alerts_cleaned.head()


Unnamed: 0,street,type,hour,week_day,day_type,day,minute
1,Av. Edmundo Pérez Zujovic,JAM,10,4,s,23,37
2,Pasaje El Volcán,JAM,11,4,s,23,3
3,Azapa,JAM,11,4,s,23,7
4,Irarrázaval,ACCIDENT,11,4,s,23,12
5,Coquimbo,JAM,11,4,s,23,40


In [3]:
alerts_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7995 entries, 1 to 11248
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   street    7963 non-null   object
 1   type      7995 non-null   object
 2   hour      7995 non-null   int32 
 3   week_day  7995 non-null   int32 
 4   day_type  7995 non-null   object
 5   day       7995 non-null   int32 
 6   minute    7995 non-null   int32 
dtypes: int32(4), object(3)
memory usage: 374.8+ KB


In [4]:
alerts_cleaned["street"].value_counts()[:20]

street
Av. Edmundo Pérez Zujovic               1981
Av. Pedro Aguirre Cerda                  689
Av. Grecia                               567
Av. Antonio Rendic                       395
Av. Iquique                              385
Av. Balmaceda                            258
Nicolás Tirado                           191
Av. Argentina                            161
Av. Óscar Bonilla                        139
Av. Séptimo de Línea                     133
Av. Salvador Allende                     110
Av. Huamachuco                            99
Los Leones                                95
Av. Arturo Pérez Canto                    83
Av. Andrés Sabella                        82
General Óscar Bonilla                     81
14 de Febrero                             74
Av. República de Croacia                  74
Circunvalación Padre Alberto Hurtado      68
Victor Farías                             60
Name: count, dtype: int64

In [5]:
print(
    f"{len(alerts_cleaned[alerts_cleaned['street'].isna()]) / len(alerts_cleaned['street']) * 100:.2f}% de lo datos es nulos en 'street'"
)
# Eliminamos valores nulos y filtro por calle

streets = alerts_cleaned["street"].value_counts().index[:20].to_numpy()

alerts_cleaned = alerts_cleaned[alerts_cleaned["street"].apply(lambda x: x in streets)]
alerts_cleaned.info()

0.40% de lo datos es nulos en 'street'
<class 'pandas.core.frame.DataFrame'>
Index: 5725 entries, 1 to 11248
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   street    5725 non-null   object
 1   type      5725 non-null   object
 2   hour      5725 non-null   int32 
 3   week_day  5725 non-null   int32 
 4   day_type  5725 non-null   object
 5   day       5725 non-null   int32 
 6   minute    5725 non-null   int32 
dtypes: int32(4), object(3)
memory usage: 268.4+ KB


In [6]:
# Balanceo de eventos y creación de no eventos

events = alerts_cleaned.copy()
events["happen"] = 1

q_events = len(events)

street = events["street"]
type = np.random.choice(["ACCIDENT", "JAM"], q_events)
hour = np.random.randint(events.hour.min(), events.hour.max(), q_events)
minute = np.random.randint(events.minute.min(), events.minute.max(), q_events)
week_day = np.random.randint(events.week_day.min(), events.week_day.max(), q_events)
day_type = np.random.choice(["s", "f"], q_events)
day = np.random.randint(events.day.min(), events.day.max() + 1, q_events)


no_events = pd.DataFrame(
    {
        "street": street,
        "type": type,
        "hour": hour,
        "minute": minute,
        "week_day": week_day,
        "day_type": day_type,
        "day": day,
        "happen": 0,
    }
)

no_events


Unnamed: 0,street,type,hour,minute,week_day,day_type,day,happen
1,Av. Edmundo Pérez Zujovic,JAM,20,53,0,f,18,0
7,Av. Grecia,ACCIDENT,6,18,4,f,19,0
8,Av. Grecia,ACCIDENT,11,29,3,f,26,0
9,Av. Balmaceda,ACCIDENT,17,8,3,s,4,0
11,Av. Balmaceda,ACCIDENT,6,37,4,s,14,0
...,...,...,...,...,...,...,...,...
11241,Los Leones,JAM,19,25,3,s,20,0
11242,General Óscar Bonilla,JAM,10,18,3,f,8,0
11246,Av. Edmundo Pérez Zujovic,JAM,12,5,3,f,3,0
11247,Av. Edmundo Pérez Zujovic,ACCIDENT,8,23,3,f,9,0


In [7]:
total_events = pd.concat([events, no_events], axis=0)
total_events["happen"].value_counts()

happen
1    5725
0    5725
Name: count, dtype: int64

In [8]:
X_happen = total_events.drop(["happen"], axis=1)
y_happen = total_events["happen"]

In [9]:
dt_bin = {"f": 0, "s": 1}
type_bin = {"ACCIDENT": 0, "JAM": 1}

X_happen["day_type"] = X_happen["day_type"].map(dt_bin)
X_happen["type"] = X_happen["type"].map(type_bin)

In [23]:
from sklearn.preprocessing import OneHotEncoder


onehot = OneHotEncoder(handle_unknown='ignore')
oht = onehot.fit_transform(total_events[['street', 'type', 'day_type']])
                 
street = pd.DataFrame(oht.toarray(), columns=onehot.get_feature_names_out())

street.head()

total_events

Unnamed: 0,street,type,hour,week_day,day_type,day,minute,happen
1,Av. Edmundo Pérez Zujovic,JAM,10,4,s,23,37,1
7,Av. Grecia,JAM,12,4,s,23,5,1
8,Av. Grecia,JAM,12,4,s,23,6,1
9,Av. Balmaceda,JAM,12,4,s,23,5,1
11,Av. Balmaceda,JAM,12,4,s,23,14,1
...,...,...,...,...,...,...,...,...
11241,Los Leones,JAM,19,3,s,20,25,0
11242,General Óscar Bonilla,JAM,10,3,f,8,18,0
11246,Av. Edmundo Pérez Zujovic,JAM,12,3,f,3,5,0
11247,Av. Edmundo Pérez Zujovic,ACCIDENT,8,3,f,9,23,0


In [24]:
from sklearn.model_selection import train_test_split

X_happen_labeled = pd.concat([total_events.drop(["street", "type", 'day_type'], axis=1).reset_index(drop=True), street.reset_index(drop=True)], axis=1)

X_train_happen, X_test_happen, y_train_happen, y_test_happen = train_test_split(
    X_happen_labeled, y_happen, test_size=0.2, random_state=42
)


In [25]:
from xgboost import XGBClassifier

xgb_happen = XGBClassifier(
    learning_rate=0.03,
    random_state=42,
    n_estimators=50,
    max_depth=5,
    gamma=0.2,
    colsample_bytree=0.7,
)
xgb_happen.fit(X_train_happen, y_train_happen)


In [26]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score

cvs = cross_val_score(xgb_happen, X_happen_labeled, y_happen, cv=10)

cvs

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [27]:
f1_score(y_test_happen, xgb_happen.predict(X_test_happen))

np.float64(1.0)

In [28]:
y_type = X_happen_labeled["type"]
X_type = X_happen_labeled.drop("type", axis=1)


X_train_type, X_test_type, y_train_type, y_test_type = train_test_split(X_type, y_type, test_size=0.2, random_state=42)

xgb_type = XGBClassifier(
    learning_rate=0.03,
    random_state=42,
    n_estimators=50,
    max_depth=5,
    gamma=0.2,
    colsample_bytree=0.7,
)
xgb_type.fit(X_train_type, y_train_type)

KeyError: 'type'

In [16]:
cvs = cross_val_score(xgb_type, X_type, y_type, cv=10)

cvs


array([0.69917203, 0.62465501, 0.69733211, 0.69917203, 0.700092  ,
       0.69733211, 0.69244936, 0.6786372 , 0.66482505, 0.6611418 ])

In [17]:
f1_score(y_test_type, xgb_type.predict(X_test_type))

np.float64(0.8268551236749117)

### Parámetros iniciales

In [29]:
day_type = 1
day = 10
hour = 7
minute = 30
week_day = 2
type_ = 0

#### Av. Grecia -> Av. República de Croacia -> Av. Grecia -> Av. Balmaceda -> Av. Séptimo de Línea -> Av. Edmundo Pérez Zujovic

In [30]:
# Preparing a sample with the same structure as X_train_happen
obj = pd.DataFrame(columns=X_train_happen.columns)
obj.loc[0] = 0  # Initialize all values to 0

probs = []

# Set desired features
street_ = "Av. Edmundo Pérez Zujovic"
obj["day_type"] = day_type
obj["day"] = day
obj["hour"] = hour
obj["minute"] = minute
obj["week_day"] = week_day
obj["type"] = type_
obj[f"street_{street_}"] = 1

prob_happen = xgb_happen.predict_proba(obj)[0][1] 
prob_type = xgb_type.predict_proba(obj.drop("type", axis=1))[0][0]

print(f"Proba de que ocurrió una alerta: {prob_happen:.2f}")
print(f"Proba de que es un accidente: {prob_type:.2f}")
print(f"Probabilidad total: {(prob_happen * prob_type):.2f}")

probs.append(prob_happen * prob_type)


Proba de que ocurrió una alerta: 0.51
Proba de que es un accidente: 0.16
Probabilidad total: 0.08


In [31]:
obj[f"street_{street_}"] = 0
street_ = "Av. Grecia"
obj[f"street_{street_}"] = 1

prob_happen = xgb_happen.predict_proba(obj)[0][1]
prob_type = xgb_type.predict_proba(obj.drop("type", axis=1))[0][0]

print(f"Proba de que ocurrió una alerta: {prob_happen:.2f}")
print(f"Proba de que es un accidente: {prob_type:.2f}")
print(f"Probabilidad total: {(prob_happen * prob_type):.2f}")

probs.append(prob_happen * prob_type)

Proba de que ocurrió una alerta: 0.53
Proba de que es un accidente: 0.16
Probabilidad total: 0.08


In [32]:
obj[f"street_{street_}"] = 0
street_ = "Av. Balmaceda"
obj[f"street_{street_}"] = 1

prob_happen = xgb_happen.predict_proba(obj)[0][1]
prob_type = xgb_type.predict_proba(obj.drop("type", axis=1))[0][0]

print(f"Proba de que ocurrió una alerta: {prob_happen:.2f}")
print(f"Proba de que es un accidente: {prob_type:.2f}")
print(f"Probabilidad total: {(prob_happen * prob_type):.2f}")

probs.append(prob_happen * prob_type)


Proba de que ocurrió una alerta: 0.50
Proba de que es un accidente: 0.16
Probabilidad total: 0.08


In [33]:
obj[f"street_{street_}"] = 0
street_ = "Av. República de Croacia"
obj[f"street_{street_}"] = 1

prob_happen = xgb_happen.predict_proba(obj)[0][1]
prob_type = xgb_type.predict_proba(obj.drop("type", axis=1))[0][0]

print(f"Proba de que ocurrió una alerta: {prob_happen:.2f}")
print(f"Proba de que es un accidente: {prob_type:.2f}")
print(f"Probabilidad total: {(prob_happen * prob_type):.2f}")

probs.append(prob_happen * prob_type)

Proba de que ocurrió una alerta: 0.53
Proba de que es un accidente: 0.16
Probabilidad total: 0.08


In [34]:
np.mean(probs)

np.float32(0.08199933)

#### Av. Argentina -> Av. Iquique -> El Yodo -> Nicolás Tirado -> Av. Pedro Aguirre Cerda

In [35]:
probs = []

obj[f"street_{street_}"] = 0
street_ = "Av. Argentina"
obj[f"street_{street_}"] = 1

prob_happen = xgb_happen.predict_proba(obj)[0][1]
prob_type = xgb_type.predict_proba(obj.drop("type", axis=1))[0][0]

print(f"Proba de que ocurrió una alerta: {prob_happen:.2f}")
print(f"Proba de que es un accidente: {prob_type:.2f}")
print(f"Probabilidad total: {(prob_happen * prob_type):.2f}")

probs.append(prob_happen * prob_type)

Proba de que ocurrió una alerta: 0.53
Proba de que es un accidente: 0.16
Probabilidad total: 0.08


In [36]:
obj[f"street_{street_}"] = 0
street_ = "Av. Iquique"
obj[f"street_{street_}"] = 1

prob_happen = xgb_happen.predict_proba(obj)[0][1]
prob_type = xgb_type.predict_proba(obj.drop("type", axis=1))[0][0]

print(f"Proba de que ocurrió una alerta: {prob_happen:.2f}")
print(f"Proba de que es un accidente: {prob_type:.2f}")
print(f"Probabilidad total: {(prob_happen * prob_type):.2f}")

probs.append(prob_happen * prob_type)

Proba de que ocurrió una alerta: 0.53
Proba de que es un accidente: 0.16
Probabilidad total: 0.08


In [37]:
obj[f"street_{street_}"] = 0
street_ = "Nicolás Tirado"
obj[f"street_{street_}"] = 1

prob_happen = xgb_happen.predict_proba(obj)[0][1]
prob_type = xgb_type.predict_proba(obj.drop("type", axis=1))[0][0]

print(f"Proba de que ocurrió una alerta: {prob_happen:.2f}")
print(f"Proba de que es un accidente: {prob_type:.2f}")
print(f"Probabilidad total: {(prob_happen * prob_type):.2f}")

probs.append(prob_happen * prob_type)


Proba de que ocurrió una alerta: 0.56
Proba de que es un accidente: 0.21
Probabilidad total: 0.12


In [38]:
obj[f"street_{street_}"] = 0
street_ = "Av. Pedro Aguirre Cerda"
obj[f"street_{street_}"] = 1

prob_happen = xgb_happen.predict_proba(obj)[0][1]
prob_type = xgb_type.predict_proba(obj.drop("type", axis=1))[0][0]

print(f"Proba de que ocurrió una alerta: {prob_happen:.2f}")
print(f"Proba de que es un accidente: {prob_type:.2f}")
print(f"Probabilidad total: {(prob_happen * prob_type):.2f}")

probs.append(prob_happen * prob_type)

Proba de que ocurrió una alerta: 0.64
Proba de que es un accidente: 0.22
Probabilidad total: 0.14


In [39]:
np.mean(probs)

np.float32(0.10589569)