# Imports

In [34]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from torch.optim.lr_scheduler import StepLR

# Load Data

In [2]:
# Read the unified incident dataset from the working directory and preview the first rows.
data = pd.read_csv(filepath_or_buffer='unified_dataset.csv')
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,Incident Year,Incident Day of Week,Row ID,Incident ID,Incident Number,Incident Code,Incident Category,Incident Subcategory,Incident Description,...,Supervisor District,Supervisor District 2012,Latitude,Longitude,Neighborhoods,Current Supervisor Districts,Current Police Districts,time,month,month_cont
0,9,2023,Thursday,125482604134,1254826,230187101,4134,Assault,Simple Assault,Battery,...,10.0,10.0,37.76229,-122.401324,54.0,9.0,2.0,17.5,3,3.52
1,191,2021,Wednesday,104785804134,1047858,210426383,4134,Assault,Simple Assault,Battery,...,9.0,9.0,37.753837,-122.418594,53.0,2.0,3.0,8.3,7,7.23
2,204,2021,Friday,103693404134,1036934,210345909,4134,Assault,Simple Assault,Battery,...,5.0,6.0,37.785893,-122.419739,20.0,10.0,4.0,9.67,6,6.13
3,262,2021,Monday,104980919057,1049809,210440703,19057,Disorderly Conduct,Intimidation,Terrorist Threats,...,5.0,6.0,37.783214,-122.410765,20.0,10.0,5.0,12.33,7,7.39
4,267,2019,Tuesday,103770002004,1037700,210348157,2004,Sex Offense,Rape,"Rape, Forcible, W/ Force",...,6.0,6.0,37.775953,-122.408846,32.0,10.0,1.0,16.5,6,6.37


In [3]:
# Inspect the column labels of the freshly loaded frame.
data.columns

Index(['Unnamed: 0', 'Incident Year', 'Incident Day of Week', 'Row ID',
       'Incident ID', 'Incident Number', 'Incident Code', 'Incident Category',
       'Incident Subcategory', 'Incident Description', 'Resolution',
       'Intersection', 'Police District', 'Analysis Neighborhood',
       'Supervisor District', 'Supervisor District 2012', 'Latitude',
       'Longitude', 'Neighborhoods', 'Current Supervisor Districts',
       'Current Police Districts', 'time', 'month', 'month_cont'],
      dtype='object')

In [4]:
# Columns removed before feature engineering: the saved index column, the incident
# identifiers/codes, the finer-grained incident labels, the resolution, and the integer
# 'month' ('month_cont' is kept).
drop_cols = [
    'Unnamed: 0', 'Row ID', 'Incident ID', 'Incident Number', 'Incident Code',
    'Incident Subcategory', 'Incident Description', 'Resolution', 'month',
]
data = data.drop(columns=drop_cols)
data.head()

Unnamed: 0,Incident Year,Incident Day of Week,Incident Category,Intersection,Police District,Analysis Neighborhood,Supervisor District,Supervisor District 2012,Latitude,Longitude,Neighborhoods,Current Supervisor Districts,Current Police Districts,time,month_cont
0,2023,Thursday,Assault,18TH ST \ DE HARO ST,Bayview,Potrero Hill,10.0,10.0,37.76229,-122.401324,54.0,9.0,2.0,17.5,3.52
1,2021,Wednesday,Assault,23RD ST \ MISSION ST,Mission,Mission,9.0,9.0,37.753837,-122.418594,53.0,2.0,3.0,8.3,7.23
2,2021,Friday,Assault,GEARY ST \ POLK ST,Northern,Tenderloin,5.0,6.0,37.785893,-122.419739,20.0,10.0,4.0,9.67,6.13
3,2021,Monday,Disorderly Conduct,TURK ST \ TAYLOR ST,Tenderloin,Tenderloin,5.0,6.0,37.783214,-122.410765,20.0,10.0,5.0,12.33,7.39
4,2019,Tuesday,Sex Offense,HALLAM ST \ FOLSOM ST,Southern,South of Market,6.0,6.0,37.775953,-122.408846,32.0,10.0,1.0,16.5,6.37


In [5]:
# Confirm which columns remain after the drop above.
data.columns

Index(['Incident Year', 'Incident Day of Week', 'Incident Category',
       'Intersection', 'Police District', 'Analysis Neighborhood',
       'Supervisor District', 'Supervisor District 2012', 'Latitude',
       'Longitude', 'Neighborhoods', 'Current Supervisor Districts',
       'Current Police Districts', 'time', 'month_cont'],
      dtype='object')

In [6]:
# Desired column order: temporal fields, then location fields, then district fields,
# with the target ('Incident Category') last.
reorder_cols = [
    # temporal
    'Incident Year', 'month_cont', 'Incident Day of Week', 'time',
    # location
    'Latitude', 'Longitude', 'Analysis Neighborhood', 'Neighborhoods', 'Intersection',
    # districts
    'Police District', 'Supervisor District', 'Supervisor District 2012',
    'Current Supervisor Districts', 'Current Police Districts',
    # target
    'Incident Category',
]
data = data.loc[:, reorder_cols]
data.head()

Unnamed: 0,Incident Year,month_cont,Incident Day of Week,time,Latitude,Longitude,Analysis Neighborhood,Neighborhoods,Intersection,Police District,Supervisor District,Supervisor District 2012,Current Supervisor Districts,Current Police Districts,Incident Category
0,2023,3.52,Thursday,17.5,37.76229,-122.401324,Potrero Hill,54.0,18TH ST \ DE HARO ST,Bayview,10.0,10.0,9.0,2.0,Assault
1,2021,7.23,Wednesday,8.3,37.753837,-122.418594,Mission,53.0,23RD ST \ MISSION ST,Mission,9.0,9.0,2.0,3.0,Assault
2,2021,6.13,Friday,9.67,37.785893,-122.419739,Tenderloin,20.0,GEARY ST \ POLK ST,Northern,5.0,6.0,10.0,4.0,Assault
3,2021,7.39,Monday,12.33,37.783214,-122.410765,Tenderloin,20.0,TURK ST \ TAYLOR ST,Tenderloin,5.0,6.0,10.0,5.0,Disorderly Conduct
4,2019,6.37,Tuesday,16.5,37.775953,-122.408846,South of Market,32.0,HALLAM ST \ FOLSOM ST,Southern,6.0,6.0,10.0,1.0,Sex Offense


In [7]:
# Short aliases keyed by the original column name. Renaming by name (instead of assigning
# a positional list to `data.columns`) cannot silently mislabel a column if the upstream
# column order ever changes, and re-running this cell is a harmless no-op.
column_aliases = {
    'Incident Year': 'year',
    'month_cont': 'month_cont',
    'Incident Day of Week': 'day',
    'time': 'time',
    'Latitude': 'lat',
    'Longitude': 'long',
    'Analysis Neighborhood': 'a_neigh',
    'Neighborhoods': 'neigh',
    'Intersection': 'intsct',
    'Police District': 'pd',
    'Supervisor District': 'sd',
    'Supervisor District 2012': 'sd_2012',
    'Current Supervisor Districts': 'csd',
    'Current Police Districts': 'cpd',
    'Incident Category': 'cat',
}
# Kept under its original name: the ordered list of short column labels.
rename_cols = list(column_aliases.values())
data = data.rename(columns=column_aliases)
# Fail loudly if a column is missing, extra, or out of the expected order.
assert list(data.columns) == rename_cols, f'unexpected columns after rename: {list(data.columns)}'
data.head()

Unnamed: 0,year,month_cont,day,time,lat,long,a_neigh,neigh,intsct,pd,sd,sd_2012,csd,cpd,cat
0,2023,3.52,Thursday,17.5,37.76229,-122.401324,Potrero Hill,54.0,18TH ST \ DE HARO ST,Bayview,10.0,10.0,9.0,2.0,Assault
1,2021,7.23,Wednesday,8.3,37.753837,-122.418594,Mission,53.0,23RD ST \ MISSION ST,Mission,9.0,9.0,2.0,3.0,Assault
2,2021,6.13,Friday,9.67,37.785893,-122.419739,Tenderloin,20.0,GEARY ST \ POLK ST,Northern,5.0,6.0,10.0,4.0,Assault
3,2021,7.39,Monday,12.33,37.783214,-122.410765,Tenderloin,20.0,TURK ST \ TAYLOR ST,Tenderloin,5.0,6.0,10.0,5.0,Disorderly Conduct
4,2019,6.37,Tuesday,16.5,37.775953,-122.408846,South of Market,32.0,HALLAM ST \ FOLSOM ST,Southern,6.0,6.0,10.0,1.0,Sex Offense


## Handling Categorical

In [8]:
# Continuous features.
cont_cols = [
    'year', 'month_cont', 'time',
    'lat', 'long',
]
# Categorical features (the target 'cat' is deliberately not listed here).
cat_cols = [
    'day',
    'a_neigh', 'neigh', 'intsct',
    'pd', 'sd', 'sd_2012', 'csd', 'cpd',
]

# Number of distinct values per categorical feature.
data.loc[:, cat_cols].nunique()

day           7
a_neigh      41
neigh       116
intsct     9743
pd           11
sd           11
sd_2012      11
csd          11
cpd          10
dtype: int64

In [9]:
# tree models (XGBoost, LightGBM, and CatBoost) handle categoricals natively
#data.to_csv('tree_dataset.csv')

### Frequency Encoding 
Frequency encoding should only be fit on the training data, so instead of compiling an encoded dataset up front, we'll write a function.

In [10]:
def dfFreqEncoder(df, cat_cols, fit_df=None):
    """Frequency-encode the categorical columns of a feature dataframe.

    Each value in every column listed in ``cat_cols`` is replaced by the number of
    times that value occurs in the reference frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode (continuous + categorical columns, without the target).
        It is not modified; a copy is returned.
    cat_cols : iterable of str
        Names of the columns to encode.
    fit_df : pandas.DataFrame, optional
        Frame the frequencies are counted on. Defaults to ``df`` itself, which is
        the original behaviour. Pass the training split here to encode a
        validation/test split without counting on its own rows.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the ``cat_cols`` columns replaced by counts. Missing
        values stay missing; values absent from ``fit_df`` get a frequency of 0.
    """
    reference = df if fit_df is None else fit_df
    encoded = df.copy()
    for cat in cat_cols:
        freq_map = reference[cat].value_counts().to_dict()
        mapped = df[cat].map(freq_map)
        # A non-null value that is not in the map was never seen in the reference
        # frame: its frequency there is 0 (only possible when fit_df is given).
        unseen = mapped.isna() & df[cat].notna()
        if unseen.any():
            mapped = mapped.mask(unseen, 0)
        encoded[cat] = mapped
    return encoded

# Preview the frequency-encoded features (target column removed first).
dfFreqEncoder(data.drop(columns='cat'), cat_cols).head()

Unnamed: 0,year,month_cont,day,time,lat,long,a_neigh,neigh,intsct,pd,sd,sd_2012,csd,cpd
0,2023,3.52,86851,17.5,37.76229,-122.401324,11137,12218,88,54530,58062,64531,64531,55815
1,2021,7.23,89912,8.3,37.753837,-122.418594,62828,46939,879,75862,69437,60497,60497,76483
2,2021,6.13,94643,9.67,37.785893,-122.419739,56969,37233,694,91698,91714,134486,134486,93134
3,2021,7.39,86055,12.33,37.783214,-122.410765,56969,37233,565,55713,91714,134486,134486,52049
4,2019,6.37,85455,16.5,37.775953,-122.408846,48280,59422,35,72193,96899,134486,134486,75157


### Neural Network Embedding
Use an autoencoder to learn a latent space representation of our categorical data.
1. convert cat data to initial numerical form
2. initialize model
3. define loss
4. train

In [11]:
# Autoencoder input: the categorical features only (the target 'cat' is not in cat_cols).
ae_data = data[cat_cols]
# One-hot alternative (unused): pd.get_dummies(ae_data, columns=ae_data.columns)

le = LabelEncoder()
scl = MinMaxScaler()

# Step 1: integer-code every column. The single encoder is refit per column, so only
# the resulting codes are kept, not the per-column class mappings.
ae_data_le = ae_data.copy()
for col in cat_cols:
    ae_data_le[col] = le.fit_transform(ae_data[col])

# Step 2: min-max scale the integer codes column by column.
normalized_ae_data = scl.fit_transform(ae_data_le)
ae_data_le_norm = pd.DataFrame(normalized_ae_data, columns=ae_data_le.columns)

print(f'ae_data shape: {ae_data.shape}')
print(f'ae_data_le_norm shape: {ae_data_le_norm.shape}')
ae_data_le_norm.head()

ae_data shape: (612306, 9)
ae_data_le_norm shape: (612306, 9)


Unnamed: 0,day,a_neigh,neigh,intsct,pd,sd,sd_2012,csd,cpd
0,0.666667,0.7,0.452174,0.073086,0.0,0.9,0.9,0.8,0.111111
1,1.0,0.45,0.443478,0.113118,0.3,0.8,0.8,0.1,0.222222
2,0.0,0.875,0.165217,0.531513,0.4,0.4,0.5,0.9,0.333333
3,0.166667,0.875,0.165217,0.935229,1.0,0.4,0.5,0.9,0.444444
4,0.833333,0.825,0.269565,0.573086,0.8,0.5,0.5,0.9,0.0


In [12]:
# autoencoder model
class NaiveAutoEncoder(nn.Module):
    """Minimal autoencoder with a single hidden layer on each side.

    Encoder: ``input_dim -> hidden_dim -> latent_dim``; decoder mirrors it back:
    ``latent_dim -> hidden_dim -> input_dim``. Both halves use a ReLU after the
    hidden layer and no activation on their output.

    Args:
        input_dim: number of input (and reconstructed) features.
        latent_dim: size of the bottleneck representation.
        hidden_dim: width of the single hidden layer in each half.
    """

    def __init__(self, input_dim, latent_dim, hidden_dim):
        super().__init__()
        # Encoder is created before the decoder, so parameters are registered
        # (and initialised) in that order.
        self.encoder = self._two_layer_mlp(input_dim, hidden_dim, latent_dim)
        self.decoder = self._two_layer_mlp(latent_dim, hidden_dim, input_dim)

    @staticmethod
    def _two_layer_mlp(n_in, n_hidden, n_out):
        """Return Linear -> ReLU -> Linear as an ``nn.Sequential``."""
        return nn.Sequential(
            nn.Linear(n_in, n_hidden),
            nn.ReLU(),
            nn.Linear(n_hidden, n_out),
        )

    def forward(self, x):
        """Encode ``x`` and return its reconstruction."""
        return self.decoder(self.encoder(x))

    def get_latents(self, x):
        """Return only the encoder output for ``x``."""
        return self.encoder(x)

class BetterAutoEncoder(nn.Module):
    """Deeper autoencoder with batch norm, LeakyReLU and dropout in every hidden block.

    The encoder maps ``input_dim`` through the widths in ``hidden_dims`` down to
    ``latent_dim``; the decoder walks the same widths in reverse order back up to
    ``input_dim``. Each hidden block is
    ``Linear -> BatchNorm1d -> LeakyReLU(0.2) -> Dropout(0.1)``, followed by a
    final plain ``Linear`` projection.

    Args:
        input_dim: number of input (and reconstructed) features.
        latent_dim: size of the bottleneck representation.
        hidden_dims: sequence of hidden-layer widths for the encoder, outermost
            first. The argument is never modified. An empty sequence yields a
            single linear layer on each side.
    """

    def __init__(self, input_dim, latent_dim, hidden_dims=(128, 64)):
        super().__init__()

        # Work on a private copy. The previous implementation reversed the list in
        # place, which flipped the shared default for every later instantiation and
        # mutated lists passed in by the caller.
        encoder_dims = list(hidden_dims)
        decoder_dims = encoder_dims[::-1]

        self.encoder = nn.Sequential(*self._build_stack(input_dim, encoder_dims, latent_dim))
        self.decoder = nn.Sequential(*self._build_stack(latent_dim, decoder_dims, input_dim))

    @staticmethod
    def _build_stack(in_dim, hidden_dims, out_dim):
        """Return the layer list for one half of the network (hidden blocks + output Linear)."""
        layers = []
        for dim in hidden_dims:
            layers.append(nn.Linear(in_dim, dim))
            layers.append(nn.BatchNorm1d(dim))
            layers.append(nn.LeakyReLU(0.2))
            layers.append(nn.Dropout(0.1))
            in_dim = dim
        # in_dim is the last hidden width, or the stack's input width when
        # hidden_dims is empty (indexing hidden_dims[-1] would fail there).
        layers.append(nn.Linear(in_dim, out_dim))
        return layers

    def forward(self, x):
        """Encode ``x`` and return its reconstruction."""
        latents = self.encoder(x)
        reconstruction = self.decoder(latents)
        return reconstruction

    def get_latents(self, x):
        """Return only the encoder output for ``x``."""
        return self.encoder(x)

# dataset - wraps a fully numeric dataframe and serves its rows as float32 numpy arrays
class aeDataset(Dataset):
    """Row-wise torch ``Dataset`` over a numeric pandas dataframe.

    The frame is converted to a float32 numpy array once at construction time, so
    fetching a sample is a plain array lookup instead of a pandas ``iloc`` plus
    ``astype`` on every access.

    Args:
        df: dataframe whose values can all be cast to float32. The values are
            snapshotted at construction; later changes to ``df`` are not reflected.
    """

    def __init__(self, df):
        self.df = df  # kept for callers that inspect the source frame
        self._values = df.to_numpy(dtype=np.float32)

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return row(s) ``idx`` (positional) as a new float32 numpy array."""
        # Copy so callers never receive a view into the cached array.
        return self._values[idx].copy()

Comparing Naive and Improved Autoencoder architectures

In [13]:
# Seed PyTorch so weight initialisation and batch shuffling repeat across runs.
torch.manual_seed(0)

ae_torch_dataset = aeDataset(ae_data_le_norm)
trainloader = DataLoader(ae_torch_dataset, batch_size=256, shuffle=True)

input_dim = ae_data_le_norm.shape[1]
latent_dim = 4
hidden_dim = 6
n_autoencoder = NaiveAutoEncoder(input_dim, latent_dim, hidden_dim)

# Reconstruction objective: mean squared error between input and output.
criterion = nn.MSELoss()
lr = 1e-2
optimizer = optim.AdamW(params=n_autoencoder.parameters(), lr=lr)
# Learning rate is multiplied by 0.1 every 3 epochs.
scheduler = StepLR(optimizer, step_size=3, gamma=0.1)
# torch.backends.mps.is_available() is the documented availability check and exists
# in every PyTorch release that ships the MPS backend.
device = torch.device('mps') if torch.backends.mps.is_available() else torch.device('cpu')

epochs = 10
n_autoencoder.to(device)
for epoch in range(epochs):
    n_autoencoder.train()
    running_loss = 0.0  # sum of batch losses; divided by the batch count for display

    # Wrapping the loader lets tqdm advance itself, and the context manager closes
    # the bar even if a batch raises.
    with tqdm(trainloader) as pbar:
        for idx, batch in enumerate(pbar):
            optimizer.zero_grad()
            batch = batch.to(device)
            outputs = n_autoencoder(batch)

            # The autoencoder's target is its own input.
            loss = criterion(outputs, batch)
            loss.backward()

            optimizer.step()

            running_loss += loss.item()
            pbar.set_description(f'Epoch {epoch+1}/{epochs}, batch {idx+1}/{len(trainloader)}, avg_loss = {(running_loss/(idx+1)):0.6f}')
    scheduler.step()

Epoch 1/10, batch 2392/2392, avg_loss = 0.072210: 100%|██████████| 2392/2392 [00:27<00:00, 87.95it/s]
Epoch 2/10, batch 2392/2392, avg_loss = 0.064770: 100%|██████████| 2392/2392 [00:26<00:00, 89.60it/s]
Epoch 3/10, batch 2392/2392, avg_loss = 0.062557: 100%|██████████| 2392/2392 [00:26<00:00, 89.36it/s]
Epoch 4/10, batch 2392/2392, avg_loss = 0.062302: 100%|██████████| 2392/2392 [00:26<00:00, 88.67it/s]
Epoch 5/10, batch 2392/2392, avg_loss = 0.053172: 100%|██████████| 2392/2392 [00:26<00:00, 88.96it/s]
Epoch 6/10, batch 2392/2392, avg_loss = 0.047958: 100%|██████████| 2392/2392 [00:26<00:00, 90.36it/s]
Epoch 7/10, batch 2392/2392, avg_loss = 0.047748: 100%|██████████| 2392/2392 [00:26<00:00, 90.27it/s]
Epoch 8/10, batch 2392/2392, avg_loss = 0.047735: 100%|██████████| 2392/2392 [00:26<00:00, 89.86it/s]
Epoch 9/10, batch 2392/2392, avg_loss = 0.047723: 100%|██████████| 2392/2392 [00:26<00:00, 89.89it/s]
Epoch 10/10, batch 2392/2392, avg_loss = 0.047714: 100%|██████████| 2392/2392 [00:

In [14]:
# Seed PyTorch so weight initialisation, dropout and batch shuffling repeat across runs.
torch.manual_seed(0)

# Input width is taken from the data instead of a hard-coded 9, so the cell keeps
# working if the set of categorical columns changes. The latent size stays at 3.
autoencoder = BetterAutoEncoder(input_dim=ae_data_le_norm.shape[1], latent_dim=3)

# Reconstruction objective: mean squared error between input and output.
criterion = nn.MSELoss()
lr = 1e-2
optimizer = optim.AdamW(params=autoencoder.parameters(), lr=lr)
# Learning rate is multiplied by 0.1 every 3 epochs.
scheduler = StepLR(optimizer, step_size=3, gamma=0.1)
# torch.backends.mps.is_available() is the documented availability check and exists
# in every PyTorch release that ships the MPS backend.
device = torch.device('mps') if torch.backends.mps.is_available() else torch.device('cpu')

epochs = 10
autoencoder.to(device)
for epoch in range(epochs):
    autoencoder.train()
    running_loss = 0.0  # sum of batch losses; divided by the batch count for display

    # Wrapping the loader lets tqdm advance itself, and the context manager closes
    # the bar even if a batch raises.
    with tqdm(trainloader) as pbar:
        for idx, batch in enumerate(pbar):
            optimizer.zero_grad()
            batch = batch.to(device)
            outputs = autoencoder(batch)

            # The autoencoder's target is its own input.
            loss = criterion(outputs, batch)
            loss.backward()

            optimizer.step()

            running_loss += loss.item()
            pbar.set_description(f'Epoch {epoch+1}/{epochs}, batch {idx+1}/{len(trainloader)}, avg_loss = {(running_loss/(idx+1)):0.6f}')
    scheduler.step()

Epoch 1/10, batch 2392/2392, avg_loss = 0.020953: 100%|██████████| 2392/2392 [00:37<00:00, 63.86it/s]
Epoch 2/10, batch 2392/2392, avg_loss = 0.016360: 100%|██████████| 2392/2392 [00:37<00:00, 64.25it/s]
Epoch 3/10, batch 2392/2392, avg_loss = 0.015983: 100%|██████████| 2392/2392 [00:37<00:00, 64.15it/s]
Epoch 4/10, batch 2392/2392, avg_loss = 0.014123: 100%|██████████| 2392/2392 [00:37<00:00, 63.92it/s]
Epoch 5/10, batch 2392/2392, avg_loss = 0.014029: 100%|██████████| 2392/2392 [00:37<00:00, 63.86it/s]
Epoch 6/10, batch 2392/2392, avg_loss = 0.013987: 100%|██████████| 2392/2392 [00:37<00:00, 64.22it/s]
Epoch 7/10, batch 2392/2392, avg_loss = 0.013732: 100%|██████████| 2392/2392 [00:37<00:00, 63.87it/s]
Epoch 8/10, batch 2392/2392, avg_loss = 0.013746: 100%|██████████| 2392/2392 [00:37<00:00, 63.99it/s]
Epoch 9/10, batch 2392/2392, avg_loss = 0.013718: 100%|██████████| 2392/2392 [00:37<00:00, 64.08it/s]
Epoch 10/10, batch 2392/2392, avg_loss = 0.013692: 100%|██████████| 2392/2392 [00:

In [None]:
# Sanity check: a loader whose single batch is the entire dataset, in original row order
# (no shuffling requested), and a look at that batch.
latentloader = DataLoader(dataset=ae_torch_dataset, batch_size=len(ae_torch_dataset))
next(iter(latentloader))

tensor([[0.6667, 0.7000, 0.4522,  ..., 0.9000, 0.8000, 0.1111],
        [1.0000, 0.4500, 0.4435,  ..., 0.8000, 0.1000, 0.2222],
        [0.0000, 0.8750, 0.1652,  ..., 0.5000, 0.9000, 0.3333],
        ...,
        [0.6667, 0.3000, 0.8522,  ..., 0.1000, 0.5000, 0.3333],
        [0.5000, 0.4500, 0.4435,  ..., 0.8000, 0.1000, 0.2222],
        [0.3333, 0.0000, 0.7304,  ..., 0.9000, 0.8000, 0.1111]])

In [None]:
# Encode every row with the trained encoder to obtain the categorical embeddings.
ae_data_le_norm_array = torch.tensor(ae_data_le_norm.astype(np.float32).to_numpy())
ae_data_le_norm_array = ae_data_le_norm_array.to(device)
autoencoder.eval()  # switch dropout / batch norm to evaluation behaviour
# No gradients are needed for inference; disabling autograd avoids keeping the
# intermediate activations of the whole dataset alive for a backward pass.
with torch.no_grad():
    latents = autoencoder.get_latents(ae_data_le_norm_array)
latents = latents.cpu().numpy()
latents = pd.DataFrame(latents)
display(latents.head())

# save latents
#latents.to_csv('cat_embeddings.csv')

Unnamed: 0,0,1,2
0,1.670652,1.708243,2.362822
1,2.623498,5.801712,3.710731
2,-1.999077,-4.358907,1.704254
3,-6.174358,-6.333201,2.651307
4,5.04915,-4.938178,-2.569633


In [None]:
# Put the continuous features and the learned embeddings side by side (aligned on index).
data_numeric = pd.concat(objs=[data.loc[:, cont_cols], latents], axis=1)
data_numeric.columns = ['year','month_cont','time','lat','long','latent1','latent2','latent3']
display(data_numeric.head())

# Temporal features are min-max scaled.
# NOTE(review): both scalers are fit on the full dataset here; confirm this is intended
# if a train/test split is made later.
mm_scl = MinMaxScaler()
time_data = data_numeric.loc[:, ['year','month_cont','time']]
time_data = pd.DataFrame(data=mm_scl.fit_transform(time_data), columns=time_data.columns)

# Coordinates and latent dimensions are standardised.
standard_scl = StandardScaler()
pos_latent_data = data_numeric.loc[:, ['lat','long','latent1','latent2','latent3']]
pos_latent_data = pd.DataFrame(data=standard_scl.fit_transform(pos_latent_data), columns=pos_latent_data.columns)

# Reassemble the two scaled groups into the final feature table.
data_numeric_scaled = pd.concat(objs=[time_data, pos_latent_data], axis=1)
display(data_numeric_scaled.head())

# save final scaled and embedded data
#data_numeric_scaled.to_csv('preprocessed_data.csv')

Unnamed: 0,year,month_cont,time,lat,long,latent1,latent2,latent3
0,2023,3.52,17.5,37.76229,-122.401324,1.670652,1.708243,2.362822
1,2021,7.23,8.3,37.753837,-122.418594,2.623498,5.801712,3.710731
2,2021,6.13,9.67,37.785893,-122.419739,-1.999077,-4.358907,1.704254
3,2021,7.39,12.33,37.783214,-122.410765,-6.174358,-6.333201,2.651307
4,2019,6.37,16.5,37.775953,-122.408846,5.04915,-4.938178,-2.569633


Unnamed: 0,year,month_cont,time,lat,long,latent1,latent2,latent3
0,0.714286,0.20802,0.729775,-0.28624,0.876764,0.40543,0.229879,0.705834
1,0.428571,0.517962,0.346122,-0.629501,0.228733,0.648177,1.189576,1.019907
2,0.428571,0.426065,0.403253,0.672192,0.185776,-0.529472,-1.192541,0.552383
3,0.428571,0.531328,0.514178,0.563424,0.522513,-1.593168,-1.655406,0.773053
4,0.142857,0.446115,0.688073,0.268577,0.594524,1.266137,-1.328349,-0.443464
