# Task for Today  

***

## Restaurant Inspection Result Prediction  

Given *data about restaurants in New York*, let's try to predict whether a given restaurant will be marked with a **critical flag**.

We will use a logistic regression model to make our predictions.

# Getting Started

In [None]:
import numpy as np
import pandas as pd
# Show every column when displaying DataFrames. The fully qualified option name is
# used because the bare 'max_columns' pattern matches more than one option in
# recent pandas releases, which makes set_option raise OptionError.
pd.set_option('display.max_columns', None)

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# Load the raw inspection results CSV from this relative path into a DataFrame
data = pd.read_csv('../input/nyc-inspections/DOHMH_New_York_City_Restaurant_Inspection_Results.csv')

In [None]:
# Bare expression: rendered as the cell output when run in a notebook
data

In [None]:
# Print a summary of the raw data (columns, non-null counts, dtypes)
data.info()

# Preprocessing

In [None]:
def preprocess_inputs(df):
    """Clean the raw inspection data and return scaled train/test splits.

    The input frame is copied first, so the caller's DataFrame is not modified.

    Processing steps, in order:
      1. Drop identifier, duplicate, single-value, mostly-missing and
         high-cardinality columns.
      2. Cast ZIPCODE to string so it is treated as a category.
      3. Remove rows whose target is 'Not Applicable'.
      4. Replace INSPECTION DATE with year / month / day columns.
      5. One-hot encode the remaining object-dtype columns (except the target).
      6. Split 70/30 with a fixed random seed.
      7. Impute missing SCORE values and standardize all features, using
         statistics computed on the training split only.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw inspection records. Must contain the columns referenced in the
        body (e.g. 'CAMIS', 'DBA', 'SCORE', 'INSPECTION DATE', 'CRITICAL FLAG').

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Standardized features; each keeps the row index and column names of
        its split.
    y_train, y_test : pandas.Series
        The matching 'CRITICAL FLAG' labels.

    Raises
    ------
    KeyError
        If one of the expected columns is missing from ``df``.
    """
    df = df.copy()

    # Drop CAMIS and DBA columns, the unnecessary (duplicate) description
    # column, and the single-value RECORD DATE column
    df = df.drop(['CAMIS', 'DBA', 'VIOLATION DESCRIPTION', 'RECORD DATE'], axis=1)

    # Drop columns with at least 25% missing values
    missing_value_columns = df.columns[df.isna().mean() >= 0.25]
    df = df.drop(missing_value_columns, axis=1)

    # Drop high-cardinality columns
    high_cardinality_columns = [
        'BUILDING',
        'STREET',
        'PHONE',
    ]
    df = df.drop(high_cardinality_columns, axis=1)

    # Convert ZIPCODE column into a string column
    df['ZIPCODE'] = df['ZIPCODE'].astype(str)

    # Drop rows with missing target values
    df = df[df['CRITICAL FLAG'] != 'Not Applicable'].reset_index(drop=True)

    # Extract date features with the vectorized .dt accessor
    inspection_date = pd.to_datetime(df['INSPECTION DATE'])
    df['INSPECTION YEAR'] = inspection_date.dt.year
    df['INSPECTION MONTH'] = inspection_date.dt.month
    df['INSPECTION DAY'] = inspection_date.dt.day
    df = df.drop('INSPECTION DATE', axis=1)

    # One-hot encode remaining categorical columns in a single pass; the
    # dummy columns are prefixed with the source column name
    categorical_columns = list(df.select_dtypes('object').columns.drop('CRITICAL FLAG'))
    df = pd.get_dummies(df, columns=categorical_columns)

    # Split df into X and y
    y = df['CRITICAL FLAG']
    X = df.drop('CRITICAL FLAG', axis=1)

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=1)

    # Fill missing SCORE values with the training-split mean only, so that no
    # information from the test rows leaks into the features (this mirrors
    # how the scaler below is fit on the training split alone)
    score_mean = X_train['SCORE'].mean()
    X_train = X_train.assign(SCORE=X_train['SCORE'].fillna(score_mean))
    X_test = X_test.assign(SCORE=X_test['SCORE'].fillna(score_mean))

    # Scale X using statistics learned from the training split
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

    return X_train, X_test, y_train, y_test

In [None]:
# Build the train/test feature frames and their matching target labels
X_train, X_test, y_train, y_test = preprocess_inputs(data)

In [None]:
# Bare expression: shows the training features as the cell output
X_train

In [None]:
# Bare expression: shows the training labels as the cell output
y_train

# Training

In [None]:
# Fit a logistic regression classifier, using the estimator's default settings,
# on the training split
model = LogisticRegression()
model.fit(X_train, y_train)

# Results

In [None]:
# Score the fitted model on the held-out split and report it as a percentage
acc = model.score(X_test, y_test)
print("Test Accuracy: {:.2f}%".format(acc * 100))

In [None]:
# Confusion matrix and classification report

# Class order shared by the metrics and by both plot axes
class_labels = ['Not Critical', 'Critical']
# Tick positions for the two rows / columns of the heatmap
tick_positions = [0.5, 1.5]

y_pred = model.predict(X_test)

cm = confusion_matrix(y_test, y_pred, labels=class_labels)
clr = classification_report(y_test, y_pred, labels=class_labels)

# Draw the confusion matrix as an annotated heatmap (rows: actual, columns: predicted)
plt.figure(figsize=(6, 6))
sns.heatmap(cm, annot=True, vmin=0, fmt='g', cmap='Blues', cbar=False)
for set_ticks in (plt.xticks, plt.yticks):
    set_ticks(ticks=tick_positions, labels=class_labels)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()

print("Classification Report:\n----------------------\n", clr)

# Feature Importance

In [None]:
# Use the weights of the model to find the most important feature.
# np.argmax without an axis works on the flattened coefficient array and picks
# the largest signed weight, not the largest magnitude.
# NOTE(review): assumes a binary target so that model.coef_ has a single row and
# the flat index maps directly onto a column of X_train — confirm.
most_important_feature = np.argmax(model.coef_)
X_train.columns[most_important_feature]

In [None]:
# Look up the description recorded for violation code 10F in the raw data
rows_for_code = data.query("`VIOLATION CODE` == '10F'")
descriptions = rows_for_code['VIOLATION DESCRIPTION'].values

# Every matching row is expected to carry the same text, so take the first one
code_description = descriptions[0]

print("CODE 10F:\n=========\n" + code_description)

# Data Every Day  

This notebook is featured on Data Every Day, a YouTube series where I train models on a new dataset each day.  

***

Check it out!  
https://youtu.be/tMaSOYLO-EQ