# Open311 Classifier
This notebook contains some exploration of the Open311 data set and some classification experiments.

In [1]:
#%reset
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
%matplotlib inline 

# Folder (relative to the notebook) that holds the raw Open311 CSV export.
DATA_FOLDER = './data/'

# Read every service request into one DataFrame, then summarise its columns,
# dtypes and non-null counts.
# NOTE(review): the truncated warning in this cell's saved output looks like
# pandas' DtypeWarning (mixed column types) - confirm, and consider passing
# an explicit dtype= to read_csv.
open311_csv = os.path.join(DATA_FOLDER, 'Open311Data.csv')
reports = pd.read_csv(open311_csv)
reports.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92286 entries, 0 to 92285
Data columns (total 14 columns):
service_request_id      92286 non-null int64
requested_datetime      92286 non-null object
updated_datetime        92286 non-null object
closed_date             92286 non-null object
status                  92286 non-null object
source                  58419 non-null object
revised_service_name    591 non-null object
service_name            92286 non-null object
service_subtype         0 non-null float64
description             84009 non-null object
agency_responsible      24915 non-null object
address                 86808 non-null object
lat                     74468 non-null float64
long                    74470 non-null float64
dtypes: float64(3), int64(1), object(10)
memory usage: 9.9+ MB


  interactivity=interactivity, compiler=compiler, result=result)


Of particular interest is the 'description' column. No doubt location, date, and time information could also prove useful, but first we use only the text.

In [2]:
# Number of reports filed under each service category, most frequent first.
service_name_counts = reports['service_name'].value_counts()
service_name_counts

Trash                                 40124
Recycling                             18606
Excessive Growth                       7445
Yard Waste                             4586
Other                                  2556
Potholes, Other Street Repair          2020
Temporary Signage w/o permit           1730
Sewer Problems (Sanitary Sewers)       1467
Water Utility Problems                 1358
Sidewalk Snow Removal                  1297
Parking on Unimproved Surface          1009
Street Lights                           922
Graffiti                                749
Water Quality                           730
Traffic Related Complaints              654
Street Snow Removal                     635
Line of Sight                           572
Blocked Street                          558
Traffic Signals                         503
Website & Mobile Apps Feedback          451
Sidewalk & Curb Complaints              444
Parks & Playgrounds                     396
Unsafe Buildings                

In [3]:
import nltk
from sklearn.feature_extraction.text import CountVectorizer
#nltk.download() # kicks off a GUI thing - do we always need?
from nltk.stem.snowball import SnowballStemmer
# try other (or no?) stemmers?
# English Snowball stemmer shared by every StemmedCountVectorizer below.
# ignore_stopwords=True is the alternative still to be tried.
stemmer = SnowballStemmer("english", ignore_stopwords=False)


class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer whose analyzer stems every token it produces."""

    def build_analyzer(self):
        """Return a callable that runs the inherited analyzer, then stems each token."""
        base_analyzer = super().build_analyzer()

        def stemmed_tokens(doc):
            return [stemmer.stem(token) for token in base_analyzer(doc)]

        return stemmed_tokens


stemmed_count_vect = StemmedCountVectorizer(stop_words='english')

Oh snap: lots of NaN descriptions. That's no help!
The data massaging happens here: add a season feature and keep only the reports that have a description.

In [4]:
"""
how's about a season feature?
Yes, we are using numbers. it does happen in order, but could do this using 2 features
to capture the cyclical nature

SPRING EQUINOX	March 20, 12:15 P.M. EDT 0
SUMMER SOLSTICE	June 21, 6:07 A.M. EDT 1
FALL EQUINOX	September 22, 9:54 P.M. EDT 2
WINTER SOLSTICE	December 21, 5:23 P.M. EST 3

oops, I didn't include time below, slap me
"""
from datetime import datetime

def get_season(in_date):
    """Map a date to a season code using fixed equinox/solstice days.

    Codes: 0 = spring (from March 20), 1 = summer (from June 21),
    2 = fall (from September 22), 3 = winter (from December 21 up to
    March 19). Only ``in_date.month`` and ``in_date.day`` are read, so the
    time of day is ignored.
    """
    month_day = (in_date.month, in_date.day)
    # Season start dates, latest first: the first boundary that is at or
    # before the given (month, day) decides the season.
    season_starts = (
        ((12, 21), 3),
        ((9, 22), 2),
        ((6, 21), 1),
        ((3, 20), 0),
    )
    for start, code in season_starts:
        if month_day >= start:
            return code
    # Anything before March 20 still belongs to the winter that began in December.
    return 3

# Parse the request timestamps so date parts are available, then tag every
# report with its season code as returned by get_season.
reports['parsed_requested_datetime'] = pd.to_datetime(reports['requested_datetime'])
reports['season'] = reports['parsed_requested_datetime'].apply(get_season)

# The classifier works on free text, so keep only the reports that have a
# description. `.copy()` makes this an independent frame instead of a slice of
# `reports`, so adding columns to it later cannot raise SettingWithCopyWarning.
goodreports = reports[reports['description'].notna()].copy()

# Convenience handles on the text and season columns of the filtered frame.
# (`x` is not used in the cells visible here; kept in case later cells rely on it.)
x = goodreports['description']
season = goodreports['season']

# The original cell also called goodreports['service_name'].value_counts()
# here, but its result was discarded (it was not the last expression), so that
# dead statement has been dropped.
goodreports.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 84009 entries, 0 to 92285
Data columns (total 16 columns):
service_request_id           84009 non-null int64
requested_datetime           84009 non-null object
updated_datetime             84009 non-null object
closed_date                  84009 non-null object
status                       84009 non-null object
source                       56819 non-null object
revised_service_name         591 non-null object
service_name                 84009 non-null object
service_subtype              0 non-null float64
description                  84009 non-null object
agency_responsible           21476 non-null object
address                      78549 non-null object
lat                          66779 non-null float64
long                         66781 non-null float64
parsed_requested_datetime    84009 non-null datetime64[ns]
season                       84009 non-null int64
dtypes: datetime64[ns](1), float64(3), int64(2), object(10)
memory usage

In [5]:
from sklearn import preprocessing
import warnings
# Silence DeprecationWarnings for the rest of the session. This works around
# the spurious scikit-learn warning discussed at:
# https://stackoverflow.com/questions/48687375/deprecation-error-in-sklearn-about-empty-array-without-any-empty-array-in-my-cod
warnings.filterwarnings(category=DeprecationWarning, action='ignore')

# Target labels: the service category of every report that has a description.
y = goodreports['service_name']

In [6]:
from sklearn.linear_model import LogisticRegression

# use df as train set
# do da splits 33 for test? 25 is better?
from sklearn.model_selection import train_test_split
# Hold out a third of the described reports for testing; the fixed seed keeps
# the split reproducible between runs.
descriptions = goodreports['description']
X_train, X_test, y_train, y_test = train_test_split(
    descriptions,
    y,
    test_size=0.33,
    random_state=42,
)

### Solver notes

From reading the documentation: liblinear is really slow, while lbfgs is faster at a slight cost in accuracy.

'The “sag” solver uses a Stochastic Average Gradient descent [6]. It is faster than other solvers for large datasets, when both the number of samples and the number of features are large.'

Trying lbfgs, it is still really slow on this data set! sag comes back in under 40 seconds.
lbfgs: CPU times: user 8min 4s, sys: 21.9 s, total: 8min 26s (79.59% accuracy, though).
Let's pickle it!

In [7]:
%%time
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer

# Multiclass strategy handed to LogisticRegression; 'ovr' is the other option.
multi_class = 'multinomial'

# ngram range 8/30/2018
# first one works, second one works, both together? nope
# Steps: unigram+bigram counts -> tf-idf weighting -> logistic regression.
logreg_steps = [
    ('vect', CountVectorizer(ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer()),
    ('clf-lr', LogisticRegression(
        solver='sag',
        # max_iter can be lowered to get shorter test cycles -
        # takes about 36 sec out of the box
        max_iter=300,
        random_state=42,
        multi_class=multi_class,
    )),
]
text_clf_logreg = Pipeline(logreg_steps)

# Fit on the training descriptions, predict the held-out ones and print the
# fraction of predictions that match the true category.
_ = text_clf_logreg.fit(X_train, y_train)
predicted_logreg = text_clf_logreg.predict(X_test)
is_correct = predicted_logreg == y_test
print(np.mean(is_correct))

0.7959456047325325
CPU times: user 24.2 s, sys: 349 ms, total: 24.6 s
Wall time: 24.6 s


In [52]:
#from sklearn.externals import joblib
#joblib.dump(text_clf_logreg, 'open311lbfgs.pkl') 
# pickled package is 100MB!

['open311lbfgs.pkl']

In [8]:
# Play time: pull the fitted MODEL back out of the pipeline by its step name.
pipeline_steps = text_clf_logreg.named_steps
classifier = pipeline_steps['clf-lr']

# Dump the fitted attributes in turn: the raw coefficients (hard to read by
# eye), their shape, the class labels, and the solver's iteration count.
for fitted_attribute in (
    classifier.coef_,
    classifier.coef_.shape,
    classifier.classes_,
    classifier.n_iter_,
):
    print(fitted_attribute)

[[-4.03333287e-02 -1.42552236e-03 -7.54719853e-04 ... -4.88119223e-04
  -1.15734164e-03 -1.15734164e-03]
 [-5.39983460e-03 -1.77908930e-04 -1.06847308e-04 ... -5.87132315e-05
  -1.39070863e-04 -1.39070863e-04]
 [ 3.44714731e-02 -2.93593087e-03 -2.11402409e-03 ... -8.20859452e-04
  -1.72053419e-03 -1.72053419e-03]
 ...
 [ 2.26567722e-01 -6.04123988e-03 -2.09849472e-03 ... -1.32281230e-03
  -5.98955375e-03 -5.98955375e-03]
 [-3.42084896e-02 -3.11003532e-03 -2.91982872e-03 ... -2.17289810e-03
  -3.97082918e-03 -3.97082918e-03]
 [ 2.45993083e-01 -7.97564671e-03 -4.91910423e-03 ... -1.46787318e-03
  -2.06941702e-02 -2.06941702e-02]]
(52, 207809)
['Abandoned Vehicle' 'Accessibility Problem' 'Animal Control'
 'Biking & Walking' 'Blocked Street' 'Blocked sidewalk'
 'Bus Services (Bloomington Transit)' 'Business' 'City Performance'
 'Crow Sightings' 'Drainage or Runoff' 'Excessive Growth' 'Fire Hazards'
 'Graffiti' 'Inaccessible Parking' 'Leaf Collection' 'Line of Sight'
 'Open311 API Key Reque

In [56]:
# Spot-check one test example: its description followed by the predicted category.
sample_position = 3
print(X_test.iloc[sample_position], predicted_logreg[sample_position])

# Put every test description next to its prediction for eyeballing.
# TODO: add cat map
bumpup = pd.DataFrame({
    "description": X_test,
    "prediction": predicted_logreg,
})
bumpup.head(200)

Recycable items not properly sorted. Recycling


Unnamed: 0,description,prediction
20134,Bags must have sticker attached.,Trash
56810,from 717 w 4th to 903 w 4th,Trash
9364,Is the street sign at the corner of W.17th and...,Traffic Related Complaints
70132,Recycable items not properly sorted.,Recycling
79888,S,Trash
77018,Recyclables & non-recyclables waste mixed. Ite...,Recycling
24553,10/24/07\r\ncan exceeds 35 gal,Trash
83264,must be sorted,Recycling
66568,too heavy,Trash
12450,Lumber/Junk in the yard.,Trash


In [45]:
# Class probabilities for one hand-written description.
# NOTE(review): the saved output of this cell is a TypeError that does not
# obviously come from this code - possibly stale kernel state; confirm with
# Restart & Run All.
pp = text_clf_logreg.predict_proba(['snow on the sidewalk'])

# Most likely class and its probability.
best_index = np.argmax(pp)
print(classifier.classes_[best_index])
print(pp[0][best_index])

# Full table: one row per class with its predicted probability.
probtab = pd.DataFrame({
    "class": list(classifier.classes_),
    "prob": pp[0],
})
probtab

TypeError: list indices must be integers or slices, not str

In [21]:
# I kind of don't like this - need more detail!
from sklearn.metrics import f1_score, precision_score, recall_score
# Summary scores on the held-out set. Everything uses macro averaging except
# fmeasure2, which uses micro averaging.
macro = {"average": "macro"}
micro = {"average": "micro"}

fmeasure1 = f1_score(y_test, predicted_logreg, **macro)
fmeasure2 = f1_score(y_test, predicted_logreg, **micro)
precision = precision_score(y_test, predicted_logreg, **macro)
recall = recall_score(y_test, predicted_logreg, **macro)

(fmeasure1, fmeasure2, precision, recall)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


(0.3251744280327871,
 0.7959456047325325,
 0.4555137356380711,
 0.2940286964268814)

In [46]:
from sklearn import metrics

# Short aliases for the fitted pipeline and its test-set predictions.
clf = text_clf_logreg
pred = predicted_logreg

score = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)

# Print each heading, then build and print the matching report, in order.
for heading, build_report in (
    ("classification report:", metrics.classification_report),
    ("confusion matrix:", metrics.confusion_matrix),
):
    print(heading)
    print(build_report(y_test, pred))

accuracy:   0.796
classification report:
                                    precision    recall  f1-score   support

                 Abandoned Vehicle       1.00      0.04      0.08        50
             Accessibility Problem       0.00      0.00      0.00         1
                    Animal Control       1.00      0.19      0.32        64
                  Biking & Walking       0.56      0.10      0.17        88
                    Blocked Street       0.60      0.19      0.29       187
                  Blocked sidewalk       0.33      0.02      0.03        66
Bus Services (Bloomington Transit)       0.00      0.00      0.00         8
                          Business       0.00      0.00      0.00        13
                  City Performance       0.00      0.00      0.00        74
                    Crow Sightings       0.00      0.00      0.00         1
                Drainage or Runoff       0.00      0.00      0.00        52
                  Excessive Growth       0.71 

  'precision', 'predicted', average, warn_for)


In [None]:
#ps = pd.Series(pred)
#le.transform(pred) # strings to nums
#ys = pd.Series(y_test)

In [None]:
#ys.value_counts()
#ps.value_counts()

In [None]:
from sklearn.metrics import log_loss
# later

In [23]:
# Show each description in full instead of truncating it.
# NOTE(review): newer pandas releases deprecate -1 here in favour of None -
# confirm against the installed pandas version before changing it.
pd.set_option('display.max_colwidth', -1)

# Every description filed under 'Sidewalk Snow Removal'.
is_sidewalk_snow = goodreports["service_name"] == 'Sidewalk Snow Removal'
goodreports.loc[is_sidewalk_snow, 'description']

55838    Storm water has broken concrete and asphalt draining under street. Two small holes on either side of inlet. The drain elevation is also a problem on other end.                                                                                                                                                                                                                                                                                                                                                                              
55839    will not comply and use stickers.  i sent them a letter on 5/4 letting them know that if they did not comply then they would be turned over to you, so i am turning them over to you.  no stickers again today.                                                                                                                                                                                                                                                         

In [None]:
# All reports (with or without a description) for one small category;
# swap in 'Abandoned Vehicle' to inspect that one instead.
is_selected = reports['service_name'] == 'Crow Sightings'
stuff = reports[is_selected]
stuff.info()

In [None]:
# Descriptions of the reports selected into `stuff`.
stuff.loc[:, 'description']

Let's get confused: the confusion-matrix plotting code below is taken from the scikit-learn docs. Also, the resulting plot looks terrible!

In [1]:
import itertools

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print the confusion matrix and draw it as an annotated heat map.

    Parameters
    ----------
    cm : 2-D numpy array
        Confusion matrix. Rows are plotted on the 'True label' axis and
        columns on the 'Predicted label' axis.
    classes : sequence of str
        Tick labels, in the same order as the rows/columns of ``cm``.
    normalize : bool, default False
        If True, every row is divided by its row total before plotting.
        Rows whose total is zero (no samples for that true label) stay
        all-zero instead of turning into NaN.
    title : str
        Title placed above the plot.
    cmap : matplotlib colormap
        Colormap used for the heat map.

    The plot is drawn on the current matplotlib figure; creating and showing
    the figure is left to the caller.
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # Divide only where the row total is non-zero; a plain division would
        # produce 0/0 = NaN cells and a RuntimeWarning for empty rows.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype='float'),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    # Anchor the rotated labels at their right end so long category names
    # line up with their tick instead of overlapping their neighbours.
    plt.xticks(tick_marks, classes, rotation=45,
               ha="right", rotation_mode="anchor")
    plt.yticks(tick_marks, classes)

    # Fractions get two decimals, raw counts are shown as integers.
    fmt = '.2f' if normalize else 'd'
    # Cells darker than the midpoint get white text so the number stays legible.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

# Compute confusion matrix - try w/ subset of things?
#cnf_matrix = confusion_matrix(y_test, y_pred)

# Restrict the confusion matrix to a hand-picked subset of categories.
c_names = [
    'Trash',
    'Recycling',
    'Potholes, Other Street Repair',
    'Sewer Problems (Sanitary Sewers)',
    'Street Lights',
    'Graffiti',
]
class_names = c_names  # classifier.classes_

cnf_matrix = metrics.confusion_matrix(y_test, pred, labels=c_names)
np.set_printoptions(precision=2)

# TODO: make these figures BIGGER
# One figure per variant: raw counts first, then the row-normalized matrix.
for use_normalized, figure_title in (
    (False, 'Confusion matrix, without normalization'),
    (True, 'Normalized confusion matrix'),
):
    plt.figure()
    plot_confusion_matrix(cnf_matrix, classes=class_names,
                          normalize=use_normalized,
                          title=figure_title)

plt.show()

NameError: name 'plt' is not defined

In [25]:
# Boolean mask marking the test rows whose prediction differs from the true label.
is_wrong = predicted_logreg != y_test
baddies = X_test[is_wrong]

# Side-by-side view of each misclassified description, what the model
# predicted and what the report was actually filed under.
baddies_check = pd.DataFrame({
    "desc": baddies,
    "predict": predicted_logreg[is_wrong],
    "actual": y_test[is_wrong],
})
baddies_check.head(100)
# wait why NaN in there?!?!?


Unnamed: 0,desc,predict,actual
56810,from 717 w 4th to 903 w 4th,Trash,Sidewalk Snow Removal
9364,Is the street sign at the corner of W.17th and formerly Rogers/Kinser Pike correct? It is now labeled as Madison St.. I've lived on W. 17th since 1977. The street going South off of W. 17th at this intersection has always been Rogers St. The street going North at this intersection with W. 17th was always Kinser Pike. Was there a change so that this street(s) has now 3 different names in <1 block?,Traffic Related Complaints,Other
79888,S,Trash,Recycling
66568,too heavy,Trash,Yard Waste
52138,"There is an abandoned house located behind my home(5015 S Rogers St) that is unsafe and is attracting young people who break the windows, hang out in the house, generally doing no good. Multiple neighbors fear for our own safety and our property due to this abandoned property.",Excessive Growth,Unsafe Buildings
4941,Trash piled behind duplex and blowing in alley.,Trash,Excessive Growth
49004,Grass needs cutting,Excessive Growth,Trash
53261,LINE LOCATE NEEDED FOR VECTREN,Other,Water Utility Problems
80778,B,Trash,Recycling
80763,"P,C,S",Trash,Recycling


In [26]:
# (rows, columns) of the misclassification table; one row per wrong prediction.
misclassified_shape = baddies_check.shape
misclassified_shape

(5657, 3)

In [27]:
# Category frequencies among the reports that have a description.
described_counts = goodreports['service_name'].value_counts()
described_counts

Trash                                 36865
Recycling                             15744
Excessive Growth                      6677 
Yard Waste                            3771 
Other                                 2460 
Potholes, Other Street Repair         1971 
Temporary Signage w/o permit          1724 
Sewer Problems (Sanitary Sewers)      1460 
Water Utility Problems                1354 
Sidewalk Snow Removal                 1167 
Parking on Unimproved Surface         975  
Street Lights                         916  
Water Quality                         730  
Graffiti                              690  
Traffic Related Complaints            649  
Street Snow Removal                   568  
Blocked Street                        548  
Line of Sight                         543  
Traffic Signals                       501  
Website & Mobile Apps Feedback        442  
Sidewalk & Curb Complaints            429  
Parks & Playgrounds                   394  
Sewer Problems (Storm Sewers)   

In [40]:
# Season code per report, shown as a one-column DataFrame.
reports.loc[:, ['season']]

Unnamed: 0,season
0,1
1,1
2,1
3,0
4,0
5,0
6,0
7,0
8,1
9,1
