## 1. Preprocessing

Import relevant packages

In [None]:
#import relevant packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score, mean_squared_error

In [None]:
# Load the Austin 311 service-request export into a DataFrame.
csv_path = "../input/austin-311/austin_311.csv"
df = pd.read_csv(csv_path)

In [None]:
# First look: (rows, columns), then the first five records.
print(df.shape)
df.head(5)

'SR Number' is a unique identifier for every row, so it would not be very useful when making predictions. 'SR Location' is just a combination of other columns, as is '(Latitude, Longitude)'. 'Street Number' also did not seem useful for this analysis. I removed these columns.

In [None]:
# Columns kept for the analysis; everything else is dropped from the frame.
keep_cols = [
    "SR Description",
    "Method Received",
    "SR Status",
    "Street Name",
    "City",
    "Zip Code",
    "County",
    "Latitude Coordinate",
    "Longitude Coordinate",
    "Council District",
    "Created Date",
    "Closed Date",
]
df = df[keep_cols]
print(df.shape)
df.head()

Removing rows with incomplete data.

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Discard every row that has at least one missing value.
df = df.dropna(axis=0, how="any")
print(df.shape)
df.head()

Note that the total number of rows has dropped from 912301 to 850918. Total number of columns dropped from 16 to 12.

In [None]:
# Sorted list of the distinct spellings found in the City column.
city_values = df['City'].to_numpy()
print(np.unique(city_values))

Look at all the different spellings for Austin! Typos and differences like this are common in manually entered datasets. Let's try to fix this by replacing the mistyped values with a standard version.

In [None]:
# Every observed variant of "Austin" (typos, abbreviations, stray punctuation).
austin_spellings = ['ATX', 'AUST', 'AUSTIN', 'AUSTIN 5 ETJ', 'AUSTUN', 'AUsti', 'AUstin', 'Au',
                    'AuStin', 'Aus', 'Ausitn', 'Aust', 'AustIn', 'AustiN', 'Austin', 'Austin, TX',
                    'Austin.', 'Austn', 'Austtin', 'a', 'aUSTIN', 'aus', 'ausitn', 'austi', 'austibn',
                    'austin', 'austin `', 'austn']

# Map all variants in a single pass and assign the result back to the column.
# Calling `df['City'].replace(..., inplace=True)` operates on a temporary
# Series; with pandas Copy-on-Write it leaves `df` unchanged, so the explicit
# assignment is required for the cleanup to take effect.
city_fixes = {spelling: "AUSTIN" for spelling in austin_spellings}

# Upper-casing afterwards also unifies the 'Del Valle' / 'Del valle' /
# 'del valle' variants, so they need no dedicated mapping.
df['City'] = df['City'].replace(city_fixes).str.upper()

print(df.shape)
df.head()

In [None]:
# Zip code and council district are identifiers, not quantities: cast each
# value to an integer first and then store the column as a categorical.
for id_col in ('Zip Code', 'Council District'):
    as_integers = df[id_col].apply(int)
    df[id_col] = as_integers.astype('category')

Creating a column for the time it takes for a ticket to be closed

In [None]:
# Parse both date columns into datetimes.
df['Created Date'] = pd.to_datetime(df['Created Date'])
df['Closed Date'] = pd.to_datetime(df['Closed Date'])

# Ticket duration in seconds. The vectorised `.dt` accessor replaces a
# row-by-row Python `apply` over the timedelta column.
df['time to close'] = (df['Closed Date'] - df['Created Date']).dt.total_seconds()

Converting the dates into a number. This will be easier to use in our modeling algorithms.

In [None]:
def _to_epoch_seconds(moment):
    """Return the timestamp of `moment` truncated to whole seconds."""
    return int(moment.timestamp())


# Replace both datetime columns with integer epoch seconds.
for date_col in ('Created Date', 'Closed Date'):
    df[date_col] = df[date_col].apply(_to_epoch_seconds)

In [None]:
# Shape and first five rows after the date conversion.
print(df.shape)
df.head(n=5)

Let's look at each column's value and see if we can find anything else to clean.

In [None]:
# For each column: its name, the number of distinct values, and the values.
for column_name in df.columns:
    distinct_values = np.unique(df[column_name])
    print(column_name, len(distinct_values))
    print(distinct_values)
    print()

The only thing that looks like it could use some extra cleaning is the Street Name, but it may be too complicated to do that now. There are packages and methods to find minor typos in manually entered data programmatically, but it's not necessary for this class.

For the classification problem, we need to label our classes based on the case's status. We are looking to predict closed or resolved cases.

In [None]:
# Number of tickets per status value.
rows_by_status = df.groupby('SR Status')
rows_by_status['SR Description'].count()

In [None]:
# Binary target: 1 for tickets that were closed or resolved, 0 for all others.
status_labels = {
    'Closed': 1,
    'Closed -Incomplete': 0,
    'Closed -Incomplete Information': 0,
    'Duplicate (closed)': 0,
    'Duplicate (open)': 0,
    'Incomplete': 0,
    'New': 0,
    'Open': 0,
    'Resolved': 1,
    'TO BE DELETED': 0,
    'Work In Progress': 0,
}

# Assign the result back instead of using a chained `inplace=True` replace,
# which modifies a temporary Series under pandas Copy-on-Write and would leave
# `df` untouched. The explicit integer cast keeps the target numeric and fails
# loudly if a status outside the mapping ever shows up.
df['SR Status'] = df['SR Status'].replace(status_labels).astype('int64')
df.groupby('SR Status')['SR Description'].count()

## 2. EDA

In this section we will try to do some basic exploratory data analysis by visualizing the data.

The first thing I want to look at is whether complaints from different zip codes are handled differently. We can look at the zip codes with the most 311 complaints, as well as how their 311 complaints were resolved.

In [None]:
#find top and bottom zip codes for number of complaints
top_zips = [(i, df['Zip Code'][df['Zip Code'] == i].count()) for i in np.unique(df['Zip Code'])]
top_zips = sorted(top_zips, key = lambda x: x[1], reverse = True)[0:8]

bot_zips = [(i, df['Zip Code'][df['Zip Code'] == i].count()) for i in np.unique(df['Zip Code'])]
bot_zips = sorted(bot_zips, key = lambda x: x[1], reverse = False)[0:8]

In [None]:
#seperate zip codes by number of completed and incomplete compaints
zips = [i[0] for i in top_zips]
zips_zeros = [df['Zip Code'][(df['Zip Code'] == i) & (df['SR Status'] == 0)].count() for i in zips]
zips_ones = [df['Zip Code'][(df['Zip Code'] == i) & (df['SR Status'] == 1)].count() for i in zips]

bzips = [i[0] for i in bot_zips]
bzips_zeros = [df['Zip Code'][(df['Zip Code'] == i) & (df['SR Status'] == 0)].count() for i in bzips]
bzips_ones = [df['Zip Code'][(df['Zip Code'] == i) & (df['SR Status'] == 1)].count() for i in bzips]

In [None]:
#plot top zip codes
x = np.arange(len(zips))
width = 0.35

fig, ax = plt.subplots(figsize = (8,3))
rects1 = ax.bar(x - width/2, zips_zeros, width, label='0')
rects2 = ax.bar(x + width/2, zips_ones, width, label='1')

ax.set_ylabel('Scores')
ax.set_title('Scores by Zip Code and Status')
ax.set_xticks(x)
ax.set_xticklabels(zips)
ax.legend()


fig.tight_layout()

plt.show()

There are a lot of 311 calls in 78704. This area of Austin is pretty packed, as it's near downtown, so that seems to make sense. 

Similarly,  we can look at the zip codes with the least 311 complaints and how they were resolved.

In [None]:
#plot bottom zip codes
x = np.arange(len(zips))
width = 0.35

fig, ax = plt.subplots(figsize = (8,3))
rects1 = ax.bar(x - width/2, bzips_zeros, width, label='0')
rects2 = ax.bar(x + width/2, bzips_ones, width, label='1')

ax.set_ylabel('Scores')
ax.set_title('Scores by Zip Code and Status')
ax.set_xticks(x)
ax.set_xticklabels(bzips)
ax.legend()


fig.tight_layout()

plt.show()

I don't know much about Austin, but 78664 appears to be in Round Rock. Maybe Round Rock is a less populated area, or it is generally safer or has little to no problems. It could also be possible that Round Rock has a separate 311 line that most complaints get directed to, and only a few of them make it to the Austin 311 dataset. 

Let's also look at the ratio of incomplete complaints to total complaints, and see if we can learn anything from there.

In [None]:
#find ratio of incomplete complaints to total complaints for each zip code

def _rows_with_status(zip_code, status_flag):
    """Count the rows of `zip_code` whose SR Status equals `status_flag`."""
    in_zip = df['Zip Code'] == zip_code
    return df['Zip Code'][in_zip & (df['SR Status'] == status_flag)].count()


# All zip codes, ordered from most to fewest complaints.
top_zips = sorted(
    ((zip_code, df['Zip Code'][df['Zip Code'] == zip_code].count())
     for zip_code in np.unique(df['Zip Code'])),
    key=lambda pair: pair[1],
    reverse=True,
)
zips = [pair[0] for pair in top_zips]

# Share of tickets with status 0 among all tickets of each zip code.
ratio_zips = []
for zip_code in zips:
    n_incomplete = _rows_with_status(zip_code, 0)
    n_complete = _rows_with_status(zip_code, 1)
    ratio_zips.append((zip_code, n_incomplete / (n_complete + n_incomplete)))

# Keep the eight zip codes with the highest incompletion rate.
ratio_zips = sorted(ratio_zips, key=lambda pair: pair[1], reverse=True)[0:8]
zips = [pair[0] for pair in ratio_zips]
ratios = [pair[1] for pair in ratio_zips]

In [None]:
#plot
# One bar per zip code, showing its incompletion rate.
width = 0.35
x = np.arange(len(ratio_zips))

fig, ax = plt.subplots(figsize=(8, 3))
rects1 = ax.bar(x, ratios, width)

ax.set_xticks(x)
ax.set_xticklabels(zips)
ax.set_title('Highest Incompletion Rate')
ax.set_ylabel('Incompletion Rate')

fig.tight_layout()
plt.show()

What's going on in 78737? They seem to have a ton of incomplete complaints! This could be something worth digging further into.

As a final part of my EDA, I wanted to look at the distribution of the time it takes to close a ticket.

In [None]:
# Density of the time-to-close values. `sns.distplot` is deprecated; with
# hist=False it only drew the kernel density estimate, which is what
# `sns.kdeplot` draws.
_ = sns.kdeplot(df['time to close'])

From this plot we can see there are some strong outliers that may influence us when we try to model the time to close. I filter out some outliers by removing rows that are more than three standard deviations outside of our mean time to close. Note that this reduces our total number of rows down to 836854. 

In [None]:
# Keep only rows whose time to close has an absolute z-score below 3.
ttc_zscores = stats.zscore(df['time to close'])
within_three_sigma = np.abs(ttc_zscores) < 3
df = df[within_three_sigma]
df.shape

In [None]:
# Density of the time-to-close values after outlier removal. `sns.distplot`
# is deprecated; with hist=False it only drew the kernel density estimate,
# which is what `sns.kdeplot` draws.
_ = sns.kdeplot(df['time to close'])

## 3. Making Predictions

In [None]:
# Work on an independent copy so the encoding below leaves `df` untouched.
X = df.copy(deep=True)

Encoding each of the categorical variables to make it easier to put into the algorithms. I keep the label encoders in a dictionary so that I can re-use them later if necessary.

In [None]:
# Columns holding categorical values that are turned into integer codes.
categorical_cols = ['SR Description', 'Method Received', 'Street Name', 'City',
                    'Zip Code', 'County', 'Council District']

# One fitted encoder per column, kept so the codes can be mapped back later.
le_dict = {col: LabelEncoder().fit(X[col]) for col in categorical_cols}

for col, encoder in le_dict.items():
    X[col] = encoder.transform(X[col])

Removed Closed Date and time to close from classification problem. Removed Closed Date from regression problem. Including those columns would provide information to the model that it wouldn't have if it were to be used for making predictions in real time.

In [None]:
#remove the target variable from each problem
# Classification: predict SR Status.
classification_features = [
    'SR Description',
    'Method Received',
    'Street Name',
    'City',
    'Zip Code',
    'County',
    'Latitude Coordinate',
    'Longitude Coordinate',
    'Council District',
    'Created Date',
]
X1 = X[classification_features]
y1 = X['SR Status']

# Regression: predict time to close; SR Status is available as a feature.
regression_features = [
    'SR Description',
    'Method Received',
    'SR Status',
    'Street Name',
    'City',
    'Zip Code',
    'County',
    'Latitude Coordinate',
    'Longitude Coordinate',
    'Council District',
    'Created Date',
]
X2 = X[regression_features]
y2 = X['time to close']

In [None]:
#splitting into a train and test set
# A fixed seed makes the split, and every metric computed from it,
# reproducible across kernel restarts.
SPLIT_SEED = 42
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.33, random_state=SPLIT_SEED)
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.33, random_state=SPLIT_SEED)

# Targets as plain 1-D numpy arrays.
y1_train = y1_train.to_numpy()
y1_test = y1_test.to_numpy()
y2_train = y2_train.to_numpy()
y2_test = y2_test.to_numpy()

In [None]:
# First five rows of the classification training features.
X1_train.head(n=5)

### 3.1 Classification

Using a Random Forest for a simple classification algorithm. We want to use a classification algorithm to see if we can predict if a call will be successfully resolved.

In [None]:
# Baseline random forest. The fixed seed makes the fitted model and the
# metrics derived from it reproducible between runs.
rfc = RandomForestClassifier(n_estimators = 10, random_state = 42)
rfc.fit(X1_train, y1_train)
y1_pred = rfc.predict(X1_test)

In [None]:
# ROC AUC is a ranking metric and needs a continuous score, so use the
# predicted probability of class 1 (second column of predict_proba for the
# 0/1 target) rather than the hard 0/1 labels.
y1_score = rfc.predict_proba(X1_test)[:, 1]

# Confusion matrix and accuracy use the hard class predictions (y1_pred);
# there is no need to call rfc.predict a second time.
print(confusion_matrix(y1_test, y1_pred))
print(roc_auc_score(y1_test, y1_score), accuracy_score(y1_test, y1_pred))

This isn't too bad of a prediction based on the confusion matrix, auc score, and accuracy score. But let's see if we can make some improvements using hyperparameter tuning.

In [None]:
n_estimators = [10, 20, 30]

for i in n_estimators:
    # Same seed for every candidate so differences in the score come from the
    # hyperparameter, not from random variation between fits.
    rfc = RandomForestClassifier(n_estimators = i, random_state = 42)
    rfc.fit(X1_train, y1_train)
    # AUC is computed from class-1 probabilities, not from hard 0/1 labels.
    y1_score = rfc.predict_proba(X1_test)[:, 1]
    print(roc_auc_score(y1_test, y1_score))


Playing with n_estimators didn't seem to make a big difference, so let's keep it at 10, the value used for the baseline model above.

In [None]:
max_depth = [None, 10, 100, 500]

for i in max_depth:
    # Same seed for every candidate so differences in the score come from the
    # hyperparameter, not from random variation between fits.
    rfc = RandomForestClassifier(n_estimators = 10, max_depth = i, random_state = 42)
    rfc.fit(X1_train, y1_train)
    # AUC is computed from class-1 probabilities, not from hard 0/1 labels.
    y1_score = rfc.predict_proba(X1_test)[:, 1]
    print(roc_auc_score(y1_test, y1_score))


Playing with max_depth seems to say that having more max depth is (generally) best for our auc score. Let's keep it at None.

### 3.2 Regression

Using a Gradient Boosting Tree for a simple regression algorithm. We want to use a regression algorithm to see if we can predict the mean time to resolution for a case.

In [None]:
# Fit a gradient-boosting regressor with default settings and predict the
# time to close for the held-out rows.
gbr = GradientBoostingRegressor().fit(X2_train, y2_train)
y2_pred = gbr.predict(X2_test)

In [None]:
# Mean squared error of the regression on the test set.
test_mse = mean_squared_error(y2_test, y2_pred)
print(test_mse)

Wow this looks like a huge error... Let's look at it a little closer

In [None]:
#plotting the predictions and the true values side by side. (When we plot on top it's hard to see them)
wide_figsize = (80, 5)

# Predictions first ...
plt.figure(figsize=wide_figsize)
plt.plot(y2_pred)
plt.show()

# ... then the true values in a second figure.
plt.figure(figsize=wide_figsize)
_ = plt.plot(y2_test, color='orange')

They look pretty similar; maybe looking at the differences will help show me why the error is so large.

In [None]:
# Difference between true and predicted time to close for each test row.
residuals = y2_test - y2_pred
plt.figure(figsize=(15, 5))
_ = plt.plot(residuals)

The errors don't look crazy on this graph. They're centered near 0 and are fairly stable.

The error was not really as bad as it first looked: it only looked bad because I was looking at the mean SQUARED error, and given the magnitudes of the errors, squaring them made it seem worse.

With additional hyperparameter tuning, or by trying different models, we may be able to lower the error further. But what we have here is a valid model that gives us reasonable predictions depending on what we are using them for. 