In [1]:
# --- Manejo de datos ---
import pandas as pd
import numpy as np
import geopandas as gpd

# --- Visualización ---
import matplotlib.pyplot as plt
import seaborn as sns
from folium.plugins import HeatMap
# --- Preprocesamiento ---
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

# --- Modelado (Clasificación y Clustering) ---
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline

# --- Evaluación de modelos ---
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, ConfusionMatrixDisplay

# --- Configuración general ---
pd.set_option('display.max_columns', None)

# --- Suprimir advertencias ---
import warnings
warnings.filterwarnings("ignore")

from scipy.stats import zscore
from scipy.stats import zscore
import folium

In [2]:
df = pd.read_pickle("data/df_EDA.pkl")

TTO de columnas 

In [3]:
print(df[['id', 'case_number']].head())

         id case_number
0  13311263    JG503434
1  13053066    JG103252
2  11227634    JB147599
3  13203321    JG415333
4  13204489    JG416325


variable block

In [4]:
print(df['block'].unique().shape)

(65050,)


In [5]:
df['block'].value_counts()

block
100XX W OHARE ST        16874
001XX N STATE ST        16731
076XX S CICERO AVE      10677
0000X N STATE ST        10022
008XX N MICHIGAN AVE     9690
                        ...  
002XX N Wells st            1
026XX S Emerald Ave         1
058XX S Artesian ave        1
002XX W Illinois ST         1
026XX N St Louis Ave        1
Name: count, Length: 65050, dtype: int64

In [6]:
#he separado el bloque deacuerdo a la direccion cardinal 
df['direction']=df['block'].str.extract(r'\b([NSEW])\b')

In [7]:
df

Unnamed: 0,id,case_number,block,iucr,primary_type,description,location_description,arrest,domestic,beat,district,ward,community_area,fbi_code,x_coordinate,y_coordinate,year,latitude,longitude,date_only,time_only,updated_on_only,updated_on_time_only,direction
0,13311263,JG503434,023XX S TROY ST,1582,OFFENSE INVOLVING CHILDREN,CHILD PORNOGRAPHY,RESIDENCE,1,0,1033,10.0,25.0,30.0,17,1166183.0,1890983.0,2022,41.856422,-87.665619,2022-07-29,03:39:00,2024-04-18,15:40:59,S
1,13053066,JG103252,039XX W WASHINGTON BLVD,2017,NARCOTICS,MANUFACTURE / DELIVER - CRACK,SIDEWALK,1,0,1122,11.0,28.0,26.0,18,1166183.0,1890983.0,2023,41.856422,-87.665619,2023-01-03,16:44:00,2024-01-20,15:41:12,W
2,11227634,JB147599,001XX W RANDOLPH ST,0281,CRIM SEXUAL ASSAULT,NON-AGGRAVATED,HOTEL/MOTEL,0,0,122,1.0,42.0,32.0,02,1166183.0,1890983.0,2017,41.856422,-87.665619,2017-08-26,10:00:00,2018-02-11,15:57:41,W
3,13203321,JG415333,002XX N Wells st,1320,CRIMINAL DAMAGE,TO VEHICLE,PARKING LOT / GARAGE (NON RESIDENTIAL),0,0,122,1.0,42.0,32.0,14,1174694.0,1901831.0,2023,41.886018,-87.633938,2023-09-06,17:00:00,2023-11-04,15:40:18,N
4,13204489,JG416325,0000X E 8TH ST,0810,THEFT,OVER $500,PARKING LOT / GARAGE (NON RESIDENTIAL),0,0,123,1.0,4.0,32.0,06,1176857.0,1896680.0,2023,41.871835,-87.626151,2023-09-06,11:00:00,2023-11-04,15:40:18,E
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8316824,13837992,JJ256829,033XX N HALSTED ST,0870,THEFT,POCKET-PICKING,BAR OR TAVERN,0,0,1925,19.0,44.0,6.0,06,1170328.0,1922582.0,2025,41.943056,-87.649363,2025-05-10,00:00:00,2025-05-18,15:40:55,N
8316825,13837994,JJ257013,011XX W WINONA ST,0810,THEFT,OVER $500,RESIDENCE - GARAGE,0,0,2033,20.0,48.0,3.0,06,1167719.0,1934285.0,2025,41.975227,-87.658614,2025-04-16,00:00:00,2025-05-18,15:40:55,W
8316826,13838005,JJ256878,009XX W GUNNISON ST,1154,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT $300 AND UNDER,STREET,0,0,2024,20.0,46.0,3.0,11,1169143.0,1932410.0,2025,41.970051,-87.653432,2025-01-17,12:00:00,2025-05-18,15:40:55,W
8316827,13838008,JJ256744,026XX S Emerald Ave,0820,THEFT,$500 AND UNDER,RESIDENCE - YARD (FRONT / BACK),0,0,914,9.0,11.0,60.0,06,1171739.0,1886866.0,2025,41.845019,-87.645230,2025-05-02,12:23:00,2025-05-18,15:40:55,S


In [8]:
df['direction'].value_counts()

direction
S    3368455
W    2572140
N    1840184
E     535997
Name: count, dtype: int64

In [9]:
df.groupby('direction')['block'].value_counts(50)

direction  block                 
E          0000X E GRAND AVE         7.244444e-03
           005XX E BROWNING AVE      6.199662e-03
           021XX E 71ST ST           5.994437e-03
           0000X E ROOSEVELT RD      5.923541e-03
           002XX E HURON ST          5.388090e-03
                                         ...     
W          114XX W TOUHY AVE         3.887813e-07
           119XX W IRVING PARK RD    3.887813e-07
           121XX W IRVING PARK RD    3.887813e-07
           123XX W IRVING PARK RD    3.887813e-07
           130XX W RWY 14R           3.887813e-07
Name: proportion, Length: 65045, dtype: float64

In [10]:
df.groupby('direction')['arrest'].value_counts(50)

direction  arrest
E          0         0.750885
           1         0.249115
N          0         0.759604
           1         0.240396
S          0         0.754334
           1         0.245666
W          0         0.725803
           1         0.274197
Name: proportion, dtype: float64