# Task for Today  

***

## 911 Call Type Prediction  

Given *data about 911 calls*, let's try to predict the **type** of a given call.  
  
We will use a multi-input TensorFlow neural network to make our predictions.

# Getting Started

In [None]:
import numpy as np
import pandas as pd
import re

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

import tensorflow as tf

In [None]:
# Load the 911 calls CSV; nrows=50000 caps the read at the first 50,000 rows.
data = pd.read_csv('../input/montcoalert/911.csv', nrows=50000)

In [None]:
# Show the loaded frame (the notebook renders the cell's last expression).
data

In [None]:
# Print the column names, non-null counts and dtypes of the loaded frame.
data.info()

# Preprocessing

In [None]:
# Count the missing values in each column.
data.isna().sum()

In [None]:
# Number of distinct values per column; dropna=False keeps a missing value
# counted as its own entry.
{name: data[name].nunique(dropna=False) for name in data.columns}

In [None]:
def get_sequences(texts, vocab_length=10000):
    """Encode texts as equal-length integer sequences.

    A fresh ``Tokenizer`` (created with ``num_words=vocab_length``) is fit on
    ``texts`` and then used to encode those same texts. Every sequence is
    padded at the end (``padding='post'``) up to the length of the longest
    encoded text.

    Parameters
    ----------
    texts : iterable of str
        The documents to fit the tokenizer on and to encode.
    vocab_length : int, default 10000
        Passed to ``Tokenizer`` as ``num_words``.

    Returns
    -------
    The padded sequences as returned by ``pad_sequences``.
    """
    word_indexer = Tokenizer(num_words=vocab_length)
    word_indexer.fit_on_texts(texts)

    encoded = word_indexer.texts_to_sequences(texts)

    # Pad everything up to the longest encoded text so no tokens are cut off.
    longest = np.max([len(tokens) for tokens in encoded])

    return pad_sequences(encoded, maxlen=longest, padding='post')

In [None]:
def onehot_encode(df, columns, prefixes):
    """Replace categorical columns with one-hot indicator columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is copied, not modified.
    columns : sequence of str
        Names of the columns to encode.
    prefixes : sequence of str
        Prefix for the indicator columns of each entry in ``columns``
        (paired positionally).

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which each listed column has been removed and
        its indicator columns appended on the right, in the order given.
    """
    encoded = df.copy()

    for source_col, tag in zip(columns, prefixes):
        indicators = pd.get_dummies(encoded[source_col], prefix=tag)
        # Append the indicators first, then remove the column they replace.
        encoded = pd.concat([encoded, indicators], axis=1).drop(source_col, axis=1)

    return encoded

In [None]:
def preprocess_inputs(df, vocab_length=10000):
    """Turn the raw 911-call frame into model-ready inputs and labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``e``, ``title``, ``timeStamp``, ``desc``,
        ``addr``, ``zip`` and ``twp``. All remaining columns are fed to the
        scaler as-is, so they have to be numeric. The frame passed in is
        not modified.
    vocab_length : int, default 10000
        Vocabulary cap forwarded to ``get_sequences`` for both text columns.
        Keep it in sync with the ``input_dim`` of the embedding layers that
        consume the sequences.

    Returns
    -------
    X : pandas.DataFrame
        Standard-scaled tabular features (coordinates and other numeric
        columns, date/time parts, one-hot ``zip``/``twp``), carrying the
        same index as ``df``.
    desc_sequences, addr_sequences
        Padded integer sequences for the ``desc`` and ``addr`` columns.
    y : pandas.Series
        Integer class labels: EMS=0, Traffic=1, Fire=2.

    Raises
    ------
    ValueError
        If a title does not start with one of the three known call types.
    """
    # Drop e column (only 1 value). drop() returns a new frame, so the
    # column assignments below never touch the caller's data.
    df = df.drop('e', axis=1)

    # Create label column (leading word of the title) and drop the title.
    # Titles without a leading word become NaN and are rejected below.
    df['type'] = df['title'].str.extract(r'^(\w+)', expand=False)
    df = df.drop('title', axis=1)

    # Create date/time features with the vectorised .dt accessor
    timestamps = pd.to_datetime(df['timeStamp'])
    for part in ('year', 'month', 'day', 'hour', 'minute', 'second'):
        df[part] = getattr(timestamps.dt, part)
    df = df.drop('timeStamp', axis=1)

    # Get sequences for desc and addr columns (and drop original columns)
    desc_sequences = get_sequences(df['desc'], vocab_length=vocab_length)
    addr_sequences = get_sequences(df['addr'], vocab_length=vocab_length)
    df = df.drop(['desc', 'addr'], axis=1)

    # One-hot encode remaining categorical columns (zip and twp)
    df = onehot_encode(df, columns=['zip', 'twp'], prefixes=['z', 't'])

    # Map labels to integers. map() (rather than replace()) yields an integer
    # Series directly and exposes unmapped labels as NaN instead of silently
    # leaving strings in y.
    label_mapping = {'EMS': 0, 'Traffic': 1, 'Fire': 2}
    y = df['type'].map(label_mapping)
    unmapped = y.isna()
    if unmapped.any():
        unknown = sorted(set(df.loc[unmapped, 'type'].astype(str)))
        raise ValueError(
            "Unrecognised call types in 'title': {}".format(unknown)
        )
    y = y.astype(int)

    # Everything except the label is a feature
    X = df.drop('type', axis=1)

    # Scale X with a standard scaler, keeping the original index so that X
    # and y stay aligned even when df does not have a default RangeIndex.
    # NOTE(review): the scaler (and the tokenizers above) are fit on every
    # row passed in, i.e. before any train/test split done by the caller.
    scaler = StandardScaler()
    X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

    return X, desc_sequences, addr_sequences, y

In [None]:
# Build the scaled tabular features, the two padded text-sequence arrays and
# the integer labels from the raw frame.
X, desc_sequences, addr_sequences, y = preprocess_inputs(data)

In [None]:
# Inspect the scaled tabular features.
X

In [None]:
# Shape of the padded desc sequences.
desc_sequences.shape

In [None]:
# Shape of the padded addr sequences.
addr_sequences.shape

In [None]:
# Class balance: number of rows per integer label (EMS=0, Traffic=1, Fire=2).
y.value_counts()

In [None]:
# 70/30 train/test split, applied jointly so that the tabular features, both
# sequence arrays and the labels stay row-aligned. random_state fixes the
# shuffle for reproducibility.
split_parts = train_test_split(
    X, desc_sequences, addr_sequences, y,
    train_size=0.7,
    random_state=123,
)
(X_train, X_test,
 desc_train, desc_test,
 addr_train, addr_test,
 y_train, y_test) = split_parts

# Modeling

In [None]:
# Peek at the padded desc sequences of the training split.
desc_train

In [None]:
# Three-branch network: a dense stack for the tabular features plus one
# embedding branch per text column, concatenated into a 3-way softmax.

# Vocabulary size of the text inputs. Must match the vocab_length used when
# the desc/addr sequences were tokenised in preprocess_inputs.
vocab_length = 10000

X_inputs = tf.keras.Input(shape=(X_train.shape[1],))
desc_inputs = tf.keras.Input(shape=(desc_train.shape[1],))
addr_inputs = tf.keras.Input(shape=(addr_train.shape[1],))

# X_inputs
X_dense1 = tf.keras.layers.Dense(128, activation='relu')(X_inputs)
X_dense2 = tf.keras.layers.Dense(128, activation='relu')(X_dense1)

# desc_inputs
# The sequence length is already fixed by the Input shape above, so the
# deprecated input_length argument is not passed to Embedding.
desc_embedding = tf.keras.layers.Embedding(
    input_dim=vocab_length,
    output_dim=64
)(desc_inputs)
desc_flatten = tf.keras.layers.Flatten()(desc_embedding)

# addr_inputs
addr_embedding = tf.keras.layers.Embedding(
    input_dim=vocab_length,
    output_dim=64
)(addr_inputs)
addr_flatten = tf.keras.layers.Flatten()(addr_embedding)

# Concatenate results
concat = tf.keras.layers.concatenate([X_dense2, desc_flatten, addr_flatten])

# Make predictions (one probability per call type)
outputs = tf.keras.layers.Dense(3, activation='softmax')(concat)


model = tf.keras.Model(inputs=[X_inputs, desc_inputs, addr_inputs], outputs=outputs)

# summary() prints the table itself and returns None, so wrapping it in
# print() would emit a stray "None" line.
model.summary()
tf.keras.utils.plot_model(model)

# Training

In [None]:
# Integer class labels -> sparse categorical cross-entropy.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Learning-rate reduction on plateau, with the callback's default settings.
lr_on_plateau = tf.keras.callbacks.ReduceLROnPlateau()

# The model takes its three inputs in the order they were declared; 20% of
# the training data is held out for validation.
history = model.fit(
    x=[X_train, desc_train, addr_train],
    y=y_train,
    batch_size=32,
    epochs=20,
    validation_split=0.2,
    callbacks=[lr_on_plateau],
)

# Results

In [None]:
# Score the model on the test split; verbose=0 suppresses progress output.
# The cell below reads results[0] as the loss and results[1] as the accuracy.
results = model.evaluate([X_test, desc_test, addr_test], y_test, verbose=0)

In [None]:
# results holds the loss followed by the accuracy metric.
test_loss, test_accuracy = results
print("Model loss: {:.5f}".format(test_loss))
print("Model accuracy: {:.2f}%".format(test_accuracy * 100))

# Data Every Day  

This notebook is featured on Data Every Day, a YouTube series where I train models on a new dataset each day.  

***

Check it out!  
https://youtu.be/I78TAjpPFD8