In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from math import log

Import the data set from the city of Boston:

In [2]:
olddf = pd.read_csv('boston_311_service_requests.csv', low_memory=False);

Making a copy of the dataframe, but saving the old one in case I want to refer to it later:

In [3]:
df = olddf.copy()

Removing cases that were listed as duplicates or opened/submitted in error:

In [4]:
df = df[~(df.closure_reason.str.contains('uplicate', na=False))]
df = df[~(df.closure_reason.str.contains('ubmitted in error', na=False))]
df = df[~(df.closure_reason.str.contains('pened in error', na=False))]
df = df[~(df.closure_reason.str.contains('NOBASE', na=False))]

Want to consider only cases that are closed and with a neighborhood listed, so drop rows where those columns are null:

In [5]:
df = df.dropna(subset=['closed_dt', 'neighborhood'])

Adding a new column that gives how long the case was open:

In [6]:
df['time_diff'] = pd.to_datetime(df['closed_dt']) - pd.to_datetime(df['open_dt'])

Removing the "Boston" neighborhood because it is ambiguous:

In [7]:
df = df[df.neighborhood != 'Boston']

Removing the "Chestnut Hill" neighborhood because there were very few calls form it:

In [8]:
df = df[df.neighborhood != 'Chestnut Hill']

Combining similar (in location) neighborhoods:

In [9]:
df = df.replace({'neighborhood': {'Allston': 'Allston / Brighton', \
                             'Brighton': 'Allston / Brighton', \
                             'Greater Mattapan': 'Mattapan', \
                             'South Boston / South Boston Waterfront':'South Boston'}})

Removing the features that I don't use. Some of these features are geographic, time, too related to what I want to predict, or not something a Boston resident may easily know. Removing closure_reason, too:

In [10]:
df = df.drop(columns = ['pwd_district', \
                        'police_district', 'ward','precinct','location_zipcode',\
                        'subject', 'case_enquiry_id', \
                        'fire_district', 'city_council_district',\
                        'neighborhood_services_district', \
                         'ontime', 'case_status', \
                        'case_title', 'queue', 'department','closedphoto',\
                        'location','location_street_name', \
                        'latitude', 'longitude', 'target_dt',\
                        'open_dt', 'closed_dt', 'closure_reason'])

If a photo was submitted (has a URL) turning it into a 1. If it was not submitted, turning the null into a 0:

In [11]:
df.submittedphoto = ~df.submittedphoto.isnull()

Converting the time_diff (which is a time delta) into the number of seconds:

In [12]:
sec = lambda t: t.total_seconds() 
df.time_diff = df.time_diff.map(sec)

Some of the cases were listed as open for a very short time or even for negative time, so removing cases open for less than two minutes:

In [13]:
df = df[df.time_diff > 120]

Converting the time difference to days since that is more on the scale that I am interested in:

In [14]:
df.time_diff = df.time_diff/(60*60*24)

Saving a version of the dataframe in case I want to look at it without the dummy variables I am about to introduce. It also keeps the reason column:

In [15]:
cleandf = df.copy()

Dropping the reason column since each type only corresponds with one reason:

In [16]:
df = df.drop(columns = ['reason'])

Turning the categorical variables (neighborhood, type, source) into dummy variables:

In [17]:
df = pd.get_dummies(df) 

I noted that dropping the extra dummy did not improve the model

The time_diff was highly skewed, so I will perform a log transform.To to this, I need to map with a log function. I also want to cap the maximum value at 7 (meaning if log(time_diff) > 7, then it will give 7 instead). Note that exp(7) is approximately 1096 days, which is about 3 years. Creating the log function:

In [18]:
def customlog(x): #input days, want output to be between -infinity and 7
    return min(log(x), 7)

Splitting into training and test sets:

In [19]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns = ['time_diff']), df.time_diff, test_size=0.2)

Want to train the model on the time_diff after the log transform. Won't need to do that to the test set since I'm interested in how it works as a predictor for the actual days. Transforming y_train:

In [20]:
log_y_train = y_train.map(customlog)

I ultimately used a random forest regression model. Defining that here:

In [21]:
def forestmodel(Xtrain, ytrain):
    fregr = RandomForestRegressor()
    fregr.fit(Xtrain, ytrain)
    return fregr

Fitting the model to my training data:

In [22]:
F = forestmodel(X_train, log_y_train)



In some of the models, I was getting very large predictions, so I wanted to cap them at exp(7) (this is why I defined the log the way I did above). Capping the output at 7:

In [23]:
f_y_pred = np.minimum(F.predict(X_test), 7)

Exponentiating the output so that it is in days, not log(days):

In [24]:
f_y_pred = np.exp(f_y_pred)

Defining the mean absolute error:

In [25]:
MAE = ((abs(f_y_pred - y_test)).mean())

Comparing the MAE to the mean difference between the mean and the test values:

In [27]:
print(1 - MAE/((abs(y_train.mean() - y_test)).mean()))

0.4257967222589367


So, my predictions are 42.6% better than predicting the mean time