# Challenge: Neural Network #

## By: Lorenz Madarang ##

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import re
from string import punctuation
from collections import Counter
import operator
from sklearn import linear_model
import statsmodels.formula.api as smf

# Display preferences.
%matplotlib inline
pd.options.display.float_format = '{:.3f}'.format

### Ingesting Las Vegas Restaurant Inspections Data ###
This data is from the same one I used for my Mid-Course Capstone.  I will be building a neural network on this data.

In [2]:
restaurant=pd.read_csv('Restaurant_Inspections2Copy.csv')

### Cleaning the Data and Feature Selection ###
I utilized the same data cleaning steps that I did for my Mid-Course Capstone.  I created a column that had boolean values that identified whether a restaurant received a downgrade in grade or not.  I also converted all the of the time variables to datetime objects.  Also, I created seperate variables that denote the year, month, and hour of the inspection.  Also, I conducted a feature selection process that limited the variables to the 10 most significant variables. 

In [3]:
#Create an empty list that will hold the boolean expression indicating a downgrade
downgrade_cat = []

#Iterate through Dataframe 
for index, row in restaurant.iterrows():
    #Use regex to see if 'Downgrade' is in the 'Inspection Result' column
    #Append downgrade_cat with '1' if present, '0' otherwise
    if re.match('.*(Downgrade)',str(row['Inspection Result'])):
        downgrade_cat.append(1)
    else:
        downgrade_cat.append(0)

#Create new boolean column indicating downgrade or not
restaurant['Downgrade'] = downgrade_cat

In [4]:
#Convert 'Inspection Time' and 'Inspection Date' to datetime object
restaurant["Inspection Time"] = pd.to_datetime(restaurant["Inspection Time"])
restaurant["Inspection Date"] = pd.to_datetime(restaurant["Inspection Date"])

#Create empty list that will hold the year, month, and hour values
year = []
month = []
hour = []

#Iterate through Dataframe
for index, row in restaurant.iterrows():
    year.append(row['Inspection Time'].year)
    month.append(row['Inspection Time'].month)
    hour.append(row['Inspection Time'].hour)
    
restaurant['Year'] = year
restaurant['Month'] = month
restaurant['Hour'] = hour  

In [5]:
#Sort the restaurant data so that the oldest inspection date is at the top
inspectdate_sort = restaurant.sort_values('Inspection Date', ascending=True)

#Create a Cumulative count column that keeps a count a restaurant's past downgrades
inspectdate_sort["Cum_Count"] = inspectdate_sort.groupby(['Restaurant Name'])['Downgrade'].apply(lambda x: x.cumsum())

In [6]:
#Create dataframe does not have any null values for "Cum_Count"
inspectdate_sort = inspectdate_sort[pd.notnull(inspectdate_sort['Cum_Count'])]

#Reset the index
inspectdate_sort = inspectdate_sort.reset_index()

In [7]:
inspectdate_sort['Year'] = inspectdate_sort['Year'].astype('category')
inspectdate_sort['Month'] = inspectdate_sort['Month'].astype('category')
inspectdate_sort['Hour'] = inspectdate_sort['Hour'].astype('category')

In [8]:
#Create feature dataset 
X = inspectdate_sort[['Category Name', 'City', 'Zip', 'Current Demerits', 'Current Grade', 'Inspection Type', 'Cum_Count' ]]

#Get numerical values for the categorical variables
X = pd.get_dummies(X)

In [9]:
#Create Target column
Y = inspectdate_sort['Downgrade']

from sklearn.model_selection import train_test_split

# Use train_test_split to create the necessary training and test groups
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=20)

In [10]:
from sklearn.feature_selection import SelectKBest

sel = SelectKBest()

sel.fit(X_train, y_train)

 1702 1812 1817 2048 2124 2261 2425 2523 2524 2655 2856 3287 3557 3611] are constant.
  f = msb / msw


SelectKBest(k=10, score_func=<function f_classif at 0x1a0d661400>)

In [11]:
#Apply Feature Selection on the train and test data

X_train = sel.transform(X_train)
X_test = sel.transform(X_test)

### Neural Network ###
A neural network was run on the restaurant inspection data.  For this neural network, I used a single 1000 perceptron layer.  Just like my Mid-Course capstone, the method of evaluation is not the accuracy of the model but the precision of the model.  The neural network it is not as accurate as the gradient search optimzed Random Forest model from my Mid-Course Capstone.  

In [12]:
# Import the model.
from sklearn.neural_network import MLPClassifier

# Establish and fit the model, with a single, 1000 perceptron layer.
mlp = MLPClassifier(hidden_layer_sizes=(1000,))
mlp.fit(X_train, y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(1000,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [13]:
mlp.score(X_train, y_train)

0.89090266875981161

In [14]:
predict_test = mlp.predict(X_test)

In [15]:
#Sensitivity and Specificity Score on the test dataset
from sklearn.metrics import confusion_matrix
matrix = confusion_matrix(y_test, predict_test)
print('Sensitivity is {}'.format(matrix[1,1]/(matrix[0,1]+matrix[1,1])))
print('Specificity is {}'.format(matrix[0,0]/(matrix[0,0]+matrix[1,0])))

Sensitivity is 0.5240384615384616
Specificity is 0.8929503085830629


In [17]:
#Cross-validation Precision Scores on the test and train dataset from Best GB GridSearch parameters
from sklearn.model_selection import cross_val_score
precision_MLPtest = cross_val_score(mlp, X_test, y_test, cv=10, scoring='precision')
print('Cross Validation Precision Scores - Test Set: {:.5f}(+/- {:.2f})'.format(precision_MLPtest.mean(), 
                                                                               precision_MLPtest.std()*2))
precision_MLPtrain = cross_val_score(mlp, X_train, y_train, cv=10, scoring='precision')
print('Cross Validation Precision Scores - Train Set: {:.5f}(+/- {:.2f})'.format(precision_MLPtrain.mean(), 
                                                                               precision_MLPtrain.std()*2))

Cross Validation Precision Scores - Test Set: 0.60537(+/- 0.23)
Cross Validation Precision Scores - Train Set: 0.56519(+/- 0.09)


### Precision Scores from Random Forest Model ###
Cross Validation Precision Scores - Training Set: 0.636(+/- 0.12)

Cross Validation Precision Scores - Test Set: 0.659(+/- 0.23)