**Analysing 311 Service Requests for New York City**

Forecast of the volume of calls for a particular day

In [9]:
import pandas as pd
import numpy as np
import matplotlib as mpl
from matplotlib import pyplot as plt
%matplotlib inline
plt.style.use(['fivethirtyeight'])
mpl.rcParams['lines.linewidth'] = 3
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split

In [12]:
# Open the 311 export lazily: because `chunksize` is given, `read_csv` hands
# back an iterator of DataFrames (up to one million rows each) rather than a
# single frame. The three timestamp columns are parsed as datetimes and the
# request's 'Unique Key' becomes the index.
df_chunk = pd.read_csv(
    r'311_Service_Requests_from_2010_to_Present.csv',
    sep=',',
    header=0,
    index_col='Unique Key',
    parse_dates=['Created Date', 'Closed Date', 'Resolution Action Updated Date'],
    chunksize=1000000,
)

Calculating Resolution Time in terms of Days

In [13]:
def prepareData(df):
    """Clean one chunk of 311 requests and add date-derived columns.

    Rows are kept only when all of the following hold:
      * the resolution time can be computed (neither date is missing),
      * 'Closed Date' is not earlier than 'Created Date',
      * 'Borough' is not the literal string 'Unspecified'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain datetime columns 'Created Date' and 'Closed Date' and a
        'Borough' column. The frame is NOT modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the surviving rows plus the columns
        'Resolution_Time' (whole days between creation and closure),
        'Day of Week', 'Day of Month', 'Month' and 'Year' (all derived from
        'Created Date').
    """
    # Whole days between creation and closure; NaN where either date is missing.
    resolution_days = (df['Closed Date'] - df['Created Date']).dt.days

    keep = (
        resolution_days.notnull()
        & (df['Closed Date'] >= df['Created Date'])
        & (df['Borough'] != 'Unspecified')
    )

    created = df.loc[keep, 'Created Date']
    # `assign` returns a fresh frame, so nothing is written onto a slice of
    # `df` (the original code triggered chained-assignment warnings and also
    # added 'Resolution_Time' to the caller's frame as a side effect).
    return df.loc[keep].assign(**{
        'Resolution_Time': resolution_days[keep],
        'Day of Week': created.dt.dayofweek,
        'Day of Month': created.dt.day,
        'Month': created.dt.month,
        'Year': created.dt.year,
    })

In [14]:
# Clean each chunk as it is read, then stitch the cleaned pieces together.
chunk_list = []

for chunk in df_chunk:
    # The cleaning function is defined as `prepareData`; the previous call to
    # `prepare_data` raised NameError.
    chunk_good = prepareData(chunk)
    chunk_list.append(chunk_good)

df_concat = pd.concat(chunk_list)
# The rest of the notebook refers to the cleaned data as `df_perfect`.
df_perfect = df_concat
# `shape` is an attribute (a tuple), not a method.
df_concat.shape

KeyboardInterrupt: 

The most frequent and least frequent complaints in NYC

In [None]:
# Bar chart of the 25 complaint types with the highest request counts
# (value_counts sorts in descending order, so head() takes the top ones).
complaint_counts = df_perfect['Complaint Type'].value_counts()
complaint_counts.head(25).plot(kind='bar', figsize=(10,6), title='Most common Complaints')

In [None]:
# Bar chart of the 25 complaint types with the lowest request counts
# (value_counts sorts in descending order, so tail() takes the rarest ones).
complaint_counts = df_perfect['Complaint Type'].value_counts()
complaint_counts.tail(25).plot(kind='bar', figsize=(10,6), title='Least frequent Complaints')

Complaint distribution across boroughs

In [None]:
# Share of complaints per borough as a pie chart.
colors = ['#639ace','#ca6b39','#7f67ca','#5ba85f','#c360aa','#a7993f','#cc566a']
borough_counts = df_perfect['Borough'].value_counts()
# Offset only the largest wedge. The explode sequence is sized from the data
# so the chart keeps working when the number of boroughs present is not five.
explode = [0.15 if position == 0 else 0 for position in range(len(borough_counts))]
borough_counts.plot(kind='pie',autopct='%1.1f%%',
                    explode = explode, startangle=45, shadow=False, colors = colors,
                    figsize = (8,6))
#plt.legend(title='BOROUGH', loc='upper right', bbox_to_anchor=(1.5,1))
plt.axis('equal')
plt.title('# complaints distribution across Boroughs (2015)\n')
plt.tight_layout()
plt.show()

In [None]:
# Spatial density of 'Air Quality' complaints, binned into hexagons.
is_air_quality = df_perfect['Complaint Type'] == 'Air Quality'
df_perfect_Air = df_perfect[is_air_quality]
df_perfect_Air.plot(
    kind='hexbin',
    x='Longitude',
    y='Latitude',
    gridsize=40,
    colormap='jet',
    mincnt=1,  # leave hexagons without any complaint blank
    title='Air Quality issues across NYC\n',
    figsize=(10,6),
).axis('equal')

In [None]:
# Restrict the cleaned data to requests filed for Brooklyn and show its size.
in_brooklyn = df_perfect['Borough'] == 'BROOKLYN'
df_Brooklyn = df_perfect[in_brooklyn]
df_Brooklyn.shape

In [None]:
# The 25 most frequent complaint types within Brooklyn.
brooklyn_complaint_counts = df_Brooklyn['Complaint Type'].value_counts()
brooklyn_complaint_counts.head(25).plot(
    kind='bar', figsize=(10,6), title='Most Frequent Complaints in Brooklyn')

In [None]:
import matplotlib.ticker as ticker

# Mean resolution time (in days) per complaint type in Brooklyn, restricted to
# the 25 most frequent complaint types and kept in frequency order.
var = df_Brooklyn.groupby('Complaint Type').Resolution_Time.mean()
frequent = df_Brooklyn['Complaint Type'].value_counts().head(25)
# `.ix` was removed in pandas 1.0; label-based `.loc` is its replacement here.
var = var.loc[frequent.index]

fig = plt.figure()
ax1 = fig.add_subplot(1,1,1)
# Draw onto ax1 explicitly rather than relying on the implicit current axes.
var.head(15).plot(kind='bar', ax=ax1)
# Labels and tick spacing are applied after plotting so the plot call cannot
# replace them (pandas may set the x-label from the index name).
ax1.set_xlabel('Complaint_Type')
ax1.set_ylabel('Average Response Time')
ax1.set_title("Avg Response Time of Complaints")
tick_spacing = 2
ax1.yaxis.set_major_locator(ticker.MultipleLocator(tick_spacing))

In [None]:
import matplotlib.ticker as ticker

# Mean resolution time (in days) per agency in Brooklyn, for (at most) the 25
# agencies with the most requests, kept in frequency order.
var = df_Brooklyn.groupby('Agency').Resolution_Time.mean()
frequent = df_Brooklyn['Agency'].value_counts().head(25)
# `.ix` was removed in pandas 1.0; label-based `.loc` is its replacement here.
var1 = var.loc[frequent.index]

fig = plt.figure()
ax1 = fig.add_subplot(1,1,1)
# Draw onto ax1 explicitly rather than relying on the implicit current axes.
var1.head(25).plot(kind='bar', ax=ax1)
# Labels and tick spacing are applied after plotting so the plot call cannot
# replace them (pandas may set the x-label from the index name).
ax1.set_xlabel('Agencies')
ax1.set_ylabel('Average Response Time')
ax1.set_title("Avg Response Time of Agencies")
tick_spacing = 5
ax1.yaxis.set_major_locator(ticker.MultipleLocator(tick_spacing))

In [None]:
# Number of Brooklyn complaints per location type (25 most common types).
location_type_counts = df_Brooklyn['Location Type'].value_counts()
location_type_counts.head(25).plot(
    kind='bar', figsize=(10,6), title='Location Type vs # Complaints')

In [None]:
# One point per Brooklyn complaint, positioned by its coordinates.
brooklyn_coords = df_Brooklyn[['Longitude', 'Latitude']]
brooklyn_coords.plot(
    kind='scatter',
    x='Longitude',
    y='Latitude',
    figsize=(10,8),
    title='Complaints concentration across Brooklyn',
).axis('equal')

In [None]:
# Same spatial view as a hexagonal density map; hexagons without any
# complaint are left blank (mincnt=1).
df_Brooklyn.plot(
    kind='hexbin',
    x='Longitude',
    y='Latitude',
    gridsize=40,
    colormap='jet',
    mincnt=1,
    title='Complaints concentration across Brooklyn\n',
    figsize=(10,6),
).axis('equal')

In [None]:
# City-wide breakdown of the 'HEAT/HOT WATER' complaints by their 'Descriptor' value.
df_perfect[df_perfect['Complaint Type'] == 'HEAT/HOT WATER']['Descriptor'].value_counts()

In [None]:
# Brooklyn requests of type 'HEAT/HOT WATER', shown as a hexagonal density map.
df_Brook_Heat = df_Brooklyn[df_Brooklyn['Complaint Type'] == 'HEAT/HOT WATER']
# df_Brook_Heat is already filtered to this complaint type, so it is plotted
# directly instead of applying the same filter a second time.
df_Brook_Heat.plot(
    kind='hexbin', x='Longitude', y='Latitude', gridsize=40,title = 'Heat issues concentration across Brooklyn\n',
    colormap='jet', mincnt=1, figsize=(10,6)).axis('equal')

In [None]:
# Monthly volume of 'Noise - Residential' complaints in Brooklyn.
df_Brook_Noise = df_Brooklyn[df_Brooklyn['Complaint Type'] == 'Noise - Residential']
# value_counts() orders by frequency; sort_index() puts the bars back in
# calendar order (1..12) so the chart reads as a volume-by-month series.
df_Brook_Noise['Month'].value_counts().sort_index().plot(
    kind = 'bar',figsize=(10,6), title = 'Volume of Noise issues by Month\n')

In [None]:
def getDfSummary(input_data):
    """Return a per-column summary of missing and distinct values.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    pandas.DataFrame
        Indexed by the column names of `input_data`, with two columns:
        'number_nan' (count of null entries) and 'number_distinct'
        (count of distinct non-null values).
    """
    summary_columns = {
        'number_nan': input_data.isnull().sum(),
        'number_distinct': input_data.nunique(dropna=True),
    }
    return pd.DataFrame(summary_columns)

# Null / distinct counts for every column of the Brooklyn heat complaints.
getDfSummary(df_Brook_Heat)

In [None]:
def buildFeatures(dataset):
    """Build the modelling table from a frame of cleaned complaints.

    Steps:
      1. keep 'Descriptor', 'Incident Zip', 'Resolution_Time', 'Day of Week',
         'Day of Month' and 'Month' (`filter` silently skips absent columns),
      2. one-hot encode 'Descriptor' into 'descriptor_<value>' columns and
         drop the original text column,
      3. drop rows with any missing value,
      4. cast 'Incident Zip' and 'Resolution_Time' to int,
      5. shift 'Incident Zip' so that the smallest zip code becomes 0.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing the columns listed in step 1.

    Returns
    -------
    pandas.DataFrame
        New frame with the numeric features, the dummy columns and
        'Resolution_Time'.
    """
    df_features = dataset.filter(
        ['Descriptor','Incident Zip','Resolution_Time','Day of Week','Day of Month','Month'],
        axis = 1)
    data = pd.DataFrame({'descriptor': df_features.Descriptor})
    dummies = pd.get_dummies(data)
    df_model = pd.concat([df_features, dummies], axis=1)
    # `columns=` instead of a positional axis: pandas 2.0 no longer accepts
    # `drop('Descriptor', 1)`.
    df_model = df_model.drop(columns='Descriptor')
    df_model = df_model.dropna()
    df_model['Incident Zip'] = df_model['Incident Zip'].astype(int)
    df_model['Resolution_Time'] = df_model['Resolution_Time'].astype(int)
    # Express zip codes as an offset from the smallest one present.
    df_model['Incident Zip'] = df_model['Incident Zip'] - df_model['Incident Zip'].min()
    return df_model

In [None]:
# Modelling table for Brooklyn heat complaints: four numeric features, the two
# descriptor dummies and the target source column, in this fixed order.
model_columns = [
    'Incident Zip',
    'Day of Week',
    'Day of Month',
    'Month',
    'descriptor_APARTMENT ONLY',
    'descriptor_ENTIRE BUILDING',
    'Resolution_Time',
]
df_model = buildFeatures(df_Brook_Heat)[model_columns]
df_model.describe()

Train Test Split

In [None]:
def splitter(dataset, y, test_size=0.3, random_state=None):
    """Split features and target into train and test parts.

    Thin wrapper around `train_test_split`.

    Parameters
    ----------
    dataset : array-like / pandas.DataFrame
        Feature table.
    y : array-like / pandas.Series
        Target aligned with `dataset`.
    test_size : float, default 0.3
        Fraction of rows placed in the test part (the previous hard-coded value).
    random_state : int or None, default None
        Seed forwarded to `train_test_split`. Pass an integer to make the
        split reproducible; None keeps the previous unseeded behaviour.

    Returns
    -------
    tuple
        (X_train, X_test, y_train, y_test)
    """
    X_train, X_test, y_train, y_test = train_test_split(
        dataset, y, test_size=test_size, random_state=random_state)
    return X_train,X_test,y_train,y_test

df_model.describe()

Binning the Target variable

In [None]:
# Since the data is highly imbalanced we are removing a few outliers:
# requests that took more than 28 days to resolve.
# A boolean mask is used instead of an in-place drop by index label, so a row
# is removed only because of its own value (drop-by-label would also remove
# any other row sharing an index label with an outlier), and re-running the
# cell never mutates a frame that an earlier cell displayed.
df_model = df_model[~(df_model.Resolution_Time > 28)].copy()
df_model.Resolution_Time.value_counts()

In [None]:
# Bin edges in days. With pd.cut's default right-closed intervals and
# include_lowest=True, the three classes are [0, 2], (2, 6] and (6, 28].
bins = [0,2,6,28]
# Integer class labels assigned to the three intervals, in order.
group_names = [0,1,2]
# Target variable for the classifiers: the resolution-time class of each request.
df_model['categories'] = pd.cut(df_model['Resolution_Time'], bins,include_lowest=True,labels=group_names)
df_model.describe()

In [None]:
# Null / distinct counts for every column of the modelling table.
getDfSummary(df_model)

Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Split rows into train / test, then strip the target ('categories') and the
# column it was derived from ('Resolution_Time') out of the feature matrices.
X_train, X_test, y_train, y_test = splitter(df_model,df_model['categories'])
# `columns=` instead of a positional axis: pandas 2.0 no longer accepts
# `drop([...], 1)`.
X1 = X_train.drop(columns=['Resolution_Time','categories'])
Y1 = y_train
X2 = X_test.drop(columns=['Resolution_Time','categories'])
Y2 = y_test
# C is the inverse regularisation strength; 1e30 makes the penalty negligible.
log=LogisticRegression(C=1e30)
log.fit(X1,Y1)
# Accuracy on the held-out test rows.
accuracy_score(Y2,log.predict(X2))

In [None]:
# Predicted class for each test row from the logistic regression model.
print(log.predict(X2))

In [None]:
# Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import scipy as sp

# Fit an entropy-based decision tree on the training features (`fit` returns
# the estimator itself) and report accuracy on both train and test rows.
clf = DecisionTreeClassifier(criterion='entropy').fit(X1, Y1)
y1_model, y2_model = clf.score(X1, Y1), clf.score(X2, Y2)
print("Accuracy on training dataset = ",y1_model*100,"%","\nAccuracy on test dataset = ",y2_model*100,"%")

Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Train a 100-tree random forest on the training features (`fit` returns the
# estimator itself) and score its predictions on the held-out test rows.
rf = RandomForestClassifier(n_estimators=100).fit(X1, Y1)

predicted = rf.predict(X2)
accuracy = accuracy_score(y_true=Y2, y_pred=predicted)
accuracy

Evaluation Metric

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print and plot a confusion matrix on the current matplotlib figure.

    Parameters
    ----------
    cm : numpy.ndarray
        Square confusion matrix; rows are true labels, columns predictions.
    classes : sequence of str
        Tick labels, one per row/column of `cm`.
    normalize : bool, default False
        When True every row is divided by its total, so each cell shows the
        fraction of that true class. Rows whose total is zero are left as
        zeros instead of turning into NaN.
    title : str
        Plot title.
    cmap : matplotlib colormap
        Colormap used for the cell shading.

    Returns
    -------
    None
    """
    # Imported here so the function does not depend on a later notebook cell
    # having imported itertools before it is called.
    import itertools

    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # Avoid 0/0 for classes that have no true samples.
        safe_totals = np.where(row_totals == 0, 1, row_totals)
        cm = cm.astype('float') / safe_totals
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    # Cells darker than half of the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

For Random Forest Classifier

In [None]:
from sklearn.metrics import confusion_matrix
import itertools

# Confusion matrix of the random forest on the test rows.
Y2_pred = rf.predict(X2)
# Pin the label order so the matrix is always 3x3 and lines up with
# class_names, even if one class is absent from this particular test split.
cnf_matrix=confusion_matrix(Y2, Y2_pred, labels=[0, 1, 2])
# Labels follow the bins [0, 2, 6, 28] used to build `categories`:
# class 0 = 0-2 days, class 1 = 3-6 days, class 2 = 7-28 days.
class_names = ['<= 2 days', '3 - 6 days', '7 - 28 days']
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
                      title='Normalized confusion matrix')
plt.show()

For Decision Tree Classifier

In [None]:
from sklearn.metrics import confusion_matrix
import itertools

# Confusion matrix of the decision tree on the test rows.
Y2_pred = clf.predict(X2)
# Pin the label order so the matrix is always 3x3 and lines up with
# class_names, even if one class is absent from this particular test split.
cnf_matrix=confusion_matrix(Y2, Y2_pred, labels=[0, 1, 2])
# Labels follow the bins [0, 2, 6, 28] used to build `categories`:
# class 0 = 0-2 days, class 1 = 3-6 days, class 2 = 7-28 days.
class_names = ['<= 2 days', '3 - 6 days', '7 - 28 days']
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
                      title='Normalized confusion matrix')
plt.show()

In [None]:
# Scaling using sklearn preprocessing

In [None]:
from sklearn import preprocessing

# Standardise the four numeric features; the descriptor dummies and the target
# are carried over unchanged.
numeric_cols = ['Incident Zip', 'Day of Week', 'Day of Month', 'Month']
# Selected by name rather than by position (previously iloc[:, 4:6]), so the
# cell does not silently pick other columns if df_model's layout changes.
passthrough_cols = ['descriptor_APARTMENT ONLY', 'descriptor_ENTIRE BUILDING', 'categories']

scaled = preprocessing.scale(df_model[numeric_cols])
data_clean = pd.DataFrame(scaled, index=df_model.index, columns=numeric_cols)
data_clean = pd.concat([data_clean, df_model[passthrough_cols]], axis=1)
data_clean.describe()

Logistic Regression on Scaled Data

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Same model as before, now trained on the standardised features.
X_train, X_test, y_train, y_test = splitter(data_clean,data_clean.categories)
# `columns=` instead of a positional axis: pandas 2.0 no longer accepts
# `drop('categories', 1)`.
X1 = X_train.drop(columns='categories')
Y1 = y_train
X2 = X_test.drop(columns='categories')
Y2 = y_test
# C is the inverse regularisation strength; 1e30 makes the penalty negligible.
log=LogisticRegression(C=1e30)
log.fit(X1,Y1)
# Accuracy on the held-out test rows.
accuracy_score(Y2,log.predict(X2))

Decision Tree classifier on Scaled Data

In [None]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import scipy as sp

# Entropy-based decision tree refitted on the current X1/Y1 (`fit` returns
# the estimator itself); accuracy is reported on both train and test rows.
clf = DecisionTreeClassifier(criterion='entropy').fit(X1, Y1)
y1_model, y2_model = clf.score(X1, Y1), clf.score(X2, Y2)
print("Accuracy on training dataset = ",y1_model*100,"%","\nAccuracy on test dataset = ",y2_model*100,"%")

Random Forest Classifier on Scaled Data

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# 100-tree random forest refitted on the current X1/Y1 (`fit` returns the
# estimator itself), scored on the held-out test rows.
rf = RandomForestClassifier(n_estimators=100).fit(X1, Y1)

predicted = rf.predict(X2)
accuracy = accuracy_score(y_true=Y2, y_pred=predicted)
accuracy