In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Default every figure in this notebook to a 16x9 canvas.
plt.rcParams.update({"figure.figsize": [16, 9]})

In [2]:
%%time
ems_data = pd.read_csv('./ems-incident-dispatch-data.csv', parse_dates=['INCIDENT_DATETIME'], nrows=300000)

CPU times: user 1.99 s, sys: 174 ms, total: 2.16 s
Wall time: 2.18 s


In [3]:
# Build the modelling frame: keep the four columns of interest and derive
# calendar features from the incident timestamp.
#
# `.assign` returns a new DataFrame, so the derived columns are never written
# onto a slice of `ems_data` (the original item-assignment form triggers
# pandas' SettingWithCopyWarning) and re-running this cell is idempotent.
dataset = ems_data[[
    'INCIDENT_DATETIME',
    'INITIAL_CALL_TYPE',
    'INITIAL_SEVERITY_LEVEL_CODE',
    'DISPATCH_RESPONSE_SECONDS_QY',
]].assign(
    MONTH=lambda df: df.INCIDENT_DATETIME.dt.month_name(),
    DAY=lambda df: df.INCIDENT_DATETIME.dt.day_name(),
    # Hour is stored as a string because it is one-hot encoded as a
    # categorical feature further down, not used as a number.
    HOUR=lambda df: df.INCIDENT_DATETIME.dt.hour.astype('str'),
)
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 7 columns):
 #   Column                        Non-Null Count   Dtype         
---  ------                        --------------   -----         
 0   INCIDENT_DATETIME             300000 non-null  datetime64[ns]
 1   INITIAL_CALL_TYPE             300000 non-null  object        
 2   INITIAL_SEVERITY_LEVEL_CODE   300000 non-null  int64         
 3   DISPATCH_RESPONSE_SECONDS_QY  300000 non-null  int64         
 4   MONTH                         300000 non-null  object        
 5   DAY                           300000 non-null  object        
 6   HOUR                          300000 non-null  object        
dtypes: datetime64[ns](1), int64(2), object(4)
memory usage: 16.0+ MB


In [4]:
# Eyeball five randomly chosen rows of the modelling frame.
dataset.sample(n=5)

Unnamed: 0,INCIDENT_DATETIME,INITIAL_CALL_TYPE,INITIAL_SEVERITY_LEVEL_CODE,DISPATCH_RESPONSE_SECONDS_QY,MONTH,DAY,HOUR
196056,2018-11-13 12:40:50,STATEP,2,21,November,Tuesday,12
213867,2018-11-08 21:46:59,GYNHEM,5,4,November,Thursday,21
198456,2018-11-12 20:37:15,INHALE,5,13,November,Monday,20
285804,2018-10-22 21:09:29,EDP,7,14,October,Monday,21
291856,2018-10-21 10:03:22,STATEP,2,7,October,Sunday,10


In [5]:
# Correlation between the initial severity code and the dispatch response time
# (Series.corr with its default method).
dataset['INITIAL_SEVERITY_LEVEL_CODE'].corr(dataset['DISPATCH_RESPONSE_SECONDS_QY'])

0.10428916291114526

In [7]:
from sklearn.model_selection import train_test_split, cross_val_score

# Feature columns, grouped by how they are preprocessed before modelling.
numeric_features = ['INITIAL_SEVERITY_LEVEL_CODE']
categorical_features = ['INITIAL_CALL_TYPE','MONTH','DAY','HOUR']

# Design matrix and regression target (dispatch response time in seconds).
X = dataset[numeric_features + categorical_features]
y = dataset.DISPATCH_RESPONSE_SECONDS_QY

# Seed the shuffle so that Restart & Run All reproduces the same train/test
# partition; without it every reported score changes from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline

# Numeric branch: expand to degree-2 polynomial terms, then standardise.
# NOTE(review): PolynomialFeatures presumably also emits a constant bias column
# here (its default), which carries no information after scaling; consider
# include_bias=False -- left unchanged so the feature layout stays the same.
numeric_transformer = Pipeline(steps=[('poly',PolynomialFeatures(degree = 2)),
                                      ('scaler', StandardScaler())])

# Categorical branch: one-hot encode; categories not seen during fit are
# ignored at transform time instead of raising.
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Route each group of columns through its own branch.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# SGD is a stochastic optimiser; seeding it makes the fitted model and every
# score derived from it reproducible across kernel restarts.
model = SGDRegressor(random_state=42)

# End-to-end estimator: preprocessing followed by the linear model.
p = Pipeline([('preprocessor', preprocessor), ('model', model)])

In [9]:
%%time
p.fit(X_train, y_train)
p.score(X_test, y_test)

CPU times: user 2.34 s, sys: 77.5 ms, total: 2.42 s
Wall time: 2.45 s


0.06383433584428122

In [10]:
# 10-fold cross-validation over the whole dataset. The scorer returns the
# negated MSE of each fold, so flip the sign and take the square root to get a
# per-fold RMSE, then average those.
scores = cross_val_score(
    p,
    X,
    y,
    scoring='neg_mean_squared_error',
    cv=10,
)
np.sqrt(-scores).mean()

581.5555404876784

## Feature selection
