In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
df = pd.read_csv('Main.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,inspection_id,violation_number,violation_description,violation_text,dcmr_25_code,establishment_name,address,telephone,email,...,ifnoncritical,noncritical_left,date_inspection,year_inspection,month_inspection,season_inspection,establishment_category,risk_category_factor,year,inspection_category
0,7088,7105,49.0,Plumbing installed; proper backflow devices,Leaking faucet at the 3-compartment sink.,2418.1,SANS RIVAL CATERERS,"3636 16TH ST NW WASHINGTON, DC 20010",,,...,1.0,1.0,2010-06-03,2010,6,Summer,Restaurants and Hotels,3.0,2010,Others
1,7089,7106,51.0,"Toilet facilities: properly constructed, suppl...",No covered receptacle in toilet room.,2707.1,EL SAVSALITO RESTAURANT,"1424 PARK RD NW Washington, DC 20010",,,...,1.0,2.0,2010-06-03,2010,6,Summer,Restaurants and Hotels,3.0,2010,Others
2,7090,7106,35.0,Food properly labeled; original container,"Prepared food items,stored in the reach-in ref...",803.1,EL SAVSALITO RESTAURANT,"1424 PARK RD NW Washington, DC 20010",,,...,1.0,2.0,2010-06-03,2010,6,Summer,Restaurants and Hotels,3.0,2010,Others
3,7105,7108,1.0,Correct response to questions,No certified food handler on duty at call.(COR...,203.2,PHO 14,"2436 18TH ST NW WASHINGTON, DC 20009",,,...,0.0,0.0,2010-06-03,2010,6,Summer,Restaurants and Hotels,2.0,2010,Routine
4,7113,7110,2.0,Management awareness; policy present,The establishment does not have an employee he...,300.1,CUP'A CUP'A CAFE & ESPRESSO,"3000 S RANDOLPH ST ARLINGTON, VA 22206",,,...,1.0,9.0,2010-05-26,2010,5,Spring,Restaurants and Hotels,3.0,2010,Routine


In [23]:
df['establishment_category'] = df['establishment_category'].astype('category')

In [24]:
df['inspection_category'] = df['inspection_category'].astype('category')

In [25]:
df['risk_category_factor']=df['risk_category_factor'].astype('category')

In [26]:
df['season_inspection'] = df['season_inspection'].astype('category')

In [27]:
df['year'] = df['year'].astype('category')

In [28]:
df1 = df[['risk_category_factor', 'year', 'season_inspection', 'inspection_category', 'establishment_category', 
        'month_inspection', 'noncritical_left', 'ifnoncritical', 'critical_left', 'ifcritical', 
         'core_left', 'priority_foundation_left', 'priority_left', 'ifcore', 'ifpriorityfoundation', 'ifpriority', 
         'total_violations', 'priority_violations', 'priority_violations_corrected_on_site', 
          'priority_violations_repeated', 'priority_foundation_violations', 
          'priority_foundation_violations_corrected_on_site', 'priority_foundation_violations_repeated',
          'core_violations', 'core_violations_corrected_on_site', 'core_violations_repeated', 'critical_violations', 
          'critical_violations_corrected_on_site', 'critical_violations_repeated', 'noncritical_violations', 
          'noncritical_violations_corrected_on_site', 'noncritical_violations_repeated']]

In [29]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 165881 entries, 0 to 165880
Data columns (total 32 columns):
risk_category_factor                                165820 non-null category
year                                                165881 non-null category
season_inspection                                   165881 non-null category
inspection_category                                 165881 non-null category
establishment_category                              165881 non-null category
month_inspection                                    165881 non-null int64
noncritical_left                                    92548 non-null float64
ifnoncritical                                       92548 non-null float64
critical_left                                       92548 non-null float64
ifcritical                                          92548 non-null float64
core_left                                           73333 non-null float64
priority_foundation_left                            733

In [94]:
df_key_predictors = df1[['establishment_category','year', 'inspection_category', 
                                                 'risk_category_factor', 'season_inspection']]
df_key_dummies = pd.get_dummies(df_key_predictors, columns = ['establishment_category','year', 'inspection_category', 
                                                 'risk_category_factor', 'season_inspection'])
df_key_dummies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 165881 entries, 0 to 165880
Data columns (total 26 columns):
establishment_category_Confectionary and Catering    165881 non-null uint8
establishment_category_Grocery and Food Products     165881 non-null uint8
establishment_category_Marine                        165881 non-null uint8
establishment_category_Others                        165881 non-null uint8
establishment_category_Restaurants and Hotels        165881 non-null uint8
establishment_category_Vending and Cafeteria         165881 non-null uint8
year_2010                                            165881 non-null uint8
year_2011                                            165881 non-null uint8
year_2012                                            165881 non-null uint8
year_2013                                            165881 non-null uint8
year_2014                                            165881 non-null uint8
year_2015                                            165881 non-

In [76]:
df_with_dummies = pd.get_dummies( df1, columns = ['establishment_category','year', 'inspection_category', 
                                                 'risk_category_factor', 'season_inspection'])

In [79]:
df_with_dummies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 165881 entries, 0 to 165880
Data columns (total 53 columns):
month_inspection                                     165881 non-null int64
noncritical_left                                     92548 non-null float64
ifnoncritical                                        92548 non-null float64
critical_left                                        92548 non-null float64
ifcritical                                           92548 non-null float64
core_left                                            73333 non-null float64
priority_foundation_left                             73333 non-null float64
priority_left                                        73333 non-null float64
ifcore                                               73333 non-null float64
ifpriorityfoundation                                 73333 non-null float64
ifpriority                                           73333 non-null float64
total_violations                                     1

In [78]:
df2 = df_with_dummies.drop(['ifcore', 'ifcritical', 'ifnoncritical', 'ifpriority', 'ifpriorityfoundation'], axis=1)
df2.head()

Unnamed: 0,month_inspection,noncritical_left,critical_left,core_left,priority_foundation_left,priority_left,total_violations,priority_violations,priority_violations_corrected_on_site,priority_violations_repeated,...,inspection_category_Routine,risk_category_factor_1.0,risk_category_factor_2.0,risk_category_factor_3.0,risk_category_factor_4.0,risk_category_factor_5.0,season_inspection_Fall,season_inspection_Spring,season_inspection_Summer,season_inspection_Winter
0,6,1.0,0.0,,,,1,,,,...,0,0,0,1,0,0,0,0,1,0
1,6,2.0,0.0,,,,2,,,,...,0,0,0,1,0,0,0,0,1,0
2,6,2.0,0.0,,,,2,,,,...,0,0,0,1,0,0,0,0,1,0
3,6,0.0,1.0,,,,1,,,,...,1,0,1,0,0,0,0,0,1,0
4,5,9.0,5.0,,,,15,,,,...,1,0,0,1,0,0,0,1,0,0


In [80]:
df3 = df_with_dummies[['ifcore', 'ifcritical', 'ifnoncritical', 'ifpriority', 'ifpriorityfoundation']]
df3.head()

Unnamed: 0,ifcore,ifcritical,ifnoncritical,ifpriority,ifpriorityfoundation
0,,0.0,1.0,,
1,,0.0,1.0,,
2,,0.0,1.0,,
3,,1.0,0.0,,
4,,1.0,1.0,,


In [84]:
df2.fillna(0, axis = 1, inplace = True)
df2

Unnamed: 0,month_inspection,noncritical_left,critical_left,core_left,priority_foundation_left,priority_left,total_violations,priority_violations,priority_violations_corrected_on_site,priority_violations_repeated,...,inspection_category_Routine,risk_category_factor_1.0,risk_category_factor_2.0,risk_category_factor_3.0,risk_category_factor_4.0,risk_category_factor_5.0,season_inspection_Fall,season_inspection_Spring,season_inspection_Summer,season_inspection_Winter
0,6,1.0,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0,...,0,0,0,1,0,0,0,0,1,0
1,6,2.0,0.0,0.0,0.0,0.0,2,0.0,0.0,0.0,...,0,0,0,1,0,0,0,0,1,0
2,6,2.0,0.0,0.0,0.0,0.0,2,0.0,0.0,0.0,...,0,0,0,1,0,0,0,0,1,0
3,6,0.0,1.0,0.0,0.0,0.0,1,0.0,0.0,0.0,...,1,0,1,0,0,0,0,0,1,0
4,5,9.0,5.0,0.0,0.0,0.0,15,0.0,0.0,0.0,...,1,0,0,1,0,0,0,1,0,0
5,5,9.0,5.0,0.0,0.0,0.0,15,0.0,0.0,0.0,...,1,0,0,1,0,0,0,1,0,0
6,5,9.0,5.0,0.0,0.0,0.0,15,0.0,0.0,0.0,...,1,0,0,1,0,0,0,1,0,0
7,5,9.0,5.0,0.0,0.0,0.0,15,0.0,0.0,0.0,...,1,0,0,1,0,0,0,1,0,0
8,5,9.0,5.0,0.0,0.0,0.0,15,0.0,0.0,0.0,...,1,0,0,1,0,0,0,1,0,0
9,5,9.0,5.0,0.0,0.0,0.0,15,0.0,0.0,0.0,...,1,0,0,1,0,0,0,1,0,0


In [87]:
df3.fillna(-1.0, inplace = True)
df3

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


Unnamed: 0,ifcore,ifcritical,ifnoncritical,ifpriority,ifpriorityfoundation
0,-1.0,0.0,1.0,-1.0,-1.0
1,-1.0,0.0,1.0,-1.0,-1.0
2,-1.0,0.0,1.0,-1.0,-1.0
3,-1.0,1.0,0.0,-1.0,-1.0
4,-1.0,1.0,1.0,-1.0,-1.0
5,-1.0,1.0,1.0,-1.0,-1.0
6,-1.0,1.0,1.0,-1.0,-1.0
7,-1.0,1.0,1.0,-1.0,-1.0
8,-1.0,1.0,1.0,-1.0,-1.0
9,-1.0,1.0,1.0,-1.0,-1.0


# Critical Violations

In [95]:
#Random Forest
X = df_key_dummies
y = df3['ifcritical']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)
rfc = RandomForestClassifier(n_estimators=200)
rfc.fit(X_train, y_train)
rfc_pred = rfc.predict(X_test)

print(classification_report(rfc_pred, y_test))
print('\n')
print(confusion_matrix(rfc_pred, y_test))


             precision    recall  f1-score   support

       -1.0       0.97      0.96      0.96     22283
        0.0       0.25      0.67      0.36      3144
        1.0       0.92      0.73      0.81     24338

avg / total       0.90      0.83      0.85     49765



[[21360   298   625]
 [   87  2099   958]
 [  577  6043 17718]]


In [None]:
# SVC
X = df_key_dummies
y = df3['ifcritical']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

from sklearn.svm import SVC
svc = SVC()
svc.fit(X_train, y_train)
predictions = svc.predict(X_test)

print('Classification Report')
print(classification_report(predictions, y_test))
print('\n')
print('Confusion Matrix')
print(confusion_matrix(predictions, y_test))