In [None]:
import pandas as pd
import numpy as np
import math
import pylab as plt
from sklearn import model_selection
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

# Load the source table; the path is relative to the notebook's working directory.
csv_path = '../data/Table_E.csv'
dataframe = pd.read_csv(csv_path)

# A. DATASET SUMMARY
## 1. Peek into the data

In [None]:
# Show the first three rows as a quick sanity check of the loaded table.
preview_rows = dataframe.head(3)
print(preview_rows)

## 2. Statistics

In [None]:
# Summary statistics as produced by DataFrame.describe().
summary_stats = dataframe.describe()
print(summary_stats)

## 3. Transforming string categorical variables to integers

In [None]:
# Replace each string-valued categorical column with integer codes.
# A single LabelEncoder is refitted per column, so after the loop `le`
# only remembers the classes of the last column encoded ('rating').
le = preprocessing.LabelEncoder()
categorical_columns = [
    'violation_code',
    'price',
    'critical_flag',
    'grade',
    'neighborhood',
    'borough',
    'rating',
]
for column_name in categorical_columns:
    dataframe[column_name] = le.fit_transform(dataframe[column_name])

# Plain array view of the (now encoded) table; later cells index it by column position.
data = dataframe.values

## 4. Histogram plot of attributes

In [None]:
# One histogram per column, drawn on a 15x12 inch figure.
histogram_figsize = (15, 12)
dataframe.hist(figsize=histogram_figsize)

# B. CLASSIFICATION

In [None]:
# function that implements different classification algorithms
def classify(X,y,name):
    """Cross-validate a fixed set of classifiers and compare them in a box plot.

    Each candidate model is scored with 5-fold cross-validation on accuracy.
    The mean and standard deviation of the fold scores are printed per model,
    and the per-fold scores of all models are drawn side by side.

    Parameters
    ----------
    X : array-like
        Feature matrix passed unchanged to ``cross_val_score``.
    y : array-like
        Class labels aligned with the rows of ``X``.
    name : str
        Title of the comparison plot.

    Returns
    -------
    None
        Results are reported via ``print`` and a matplotlib figure only.
    """
    metric = 'accuracy'
    seed = 7

    # Set of all classification algorithms considered
    models = [
        ('LR', LogisticRegression()),
        ('LDA', LinearDiscriminantAnalysis()),
        ('KNN', KNeighborsClassifier()),
        # Seeded so the tree's random tie-breaking is repeatable between runs.
        ('CART', DecisionTreeClassifier(random_state=seed)),
        ('NB', GaussianNB()),
        ('SVM', SVC()),
    ]

    # random_state only has an effect when shuffle=True; recent scikit-learn
    # releases raise ValueError if a seed is given without shuffling.
    # The splitter is identical for every model, so build it once; an integer
    # seed makes every split() call yield the same folds.
    k_fold = model_selection.KFold(n_splits=5, shuffle=True, random_state=seed)

    # Cross-validation accuracy for different algorithms
    names = []
    performance = []
    for technique, model in models:
        cv_perf = model_selection.cross_val_score(model, X, y, cv=k_fold, scoring=metric)
        performance.append(cv_perf)
        names.append(technique)
        print("Technique:%s, Accuracy_mean: %f, Accuracy_std: %f" % (technique, cv_perf.mean(), cv_perf.std()))


    # Comparison of the techniques
    plt.figure()
    plt.title(name)
    plt.boxplot(performance)
    # Box plot positions are 1-based, hence the shifted tick range.
    plt.xticks(range(1,len(names)+1),names)
    plt.ylabel('Accuracy')
    plt.xlabel('Techniques')
    plt.show()

## 1. Predicting restaurant price from other attributes

In [None]:
# Selection of the best classifier
# Column positions in `data`: features to use and the label column (price).
feature_attrs = [2,5,6,7,8,9,10,11,12,13,14,15,16]
label_attrs = 4

print('Predicting restaurant price from other attributes')
print('Unique classes of the labels in the data ', np.unique(data[:, label_attrs]))
X = data[:, feature_attrs]
y = list(data[:, label_attrs])
# Train/test split
# NOTE(review): test_size=0.8 holds out 80% of the rows and trains on only 20% - confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=0)
# Standardise using statistics of the training rows only, so that nothing
# about the held-out rows leaks into the features the models are trained on.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
classify(X_train, y_train, 'Predict Restuarant Prices')

### Best classifier is chosen as logistic regression

In [None]:
# Candidate values for the regularisation parameter C, chosen by cross-validation
tuned_parameters = [{'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000] }]

scores = ['accuracy','precision_macro', 'recall_macro']

# Run one independent 5-fold grid search per scoring metric and report how
# the selected model performs on the held-out test rows.
for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()
    clf = GridSearchCV(
        LogisticRegression(penalty='l2'),
        tuned_parameters,
        cv=5,
        scoring=score,
    )
    clf.fit(X_train, y_train)
    print()
    print("Grid scores for the best parameter on development set:")
    # Pull the cross-validation mean/std and parameters of the winning setting.
    best_position = clf.best_index_
    cv_summary = clf.cv_results_
    best_row = (
        cv_summary['mean_test_score'][best_position],
        cv_summary['std_test_score'][best_position],
        cv_summary['params'][best_position],
    )
    print("%0.3f (+/-%0.03f) for %r" % best_row)

    print("Classification report:")
    print()
    print("The model is trained on the training set.")
    print("The scores are computed on the test set.")
    print()
    y_true = y_test
    y_pred = clf.predict(X_test)
    print('Test set accuracy: %f '% accuracy_score(y_true, y_pred))
    print('Test set precision, recall, f1-score, support:')
    print(classification_report(y_true, y_pred))
    print()


## 2. Predicting restaurant rating from other attributes

In [None]:
# Column positions in `data`: features to use and the label column (rating).
feature_attrs = [2,4,6,7,8,9,10,11,12,13,14,15,16]
label_attrs = 5

print('Predicting restaurant rating from other attributes')
print('Unique classes of the labels in the data ', np.unique(data[:, label_attrs]))
X = data[:, feature_attrs]
y = list(data[:, label_attrs])
# Train/test split
# NOTE(review): test_size=0.8 holds out 80% of the rows and trains on only 20% - confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=0)
# Standardise using statistics of the training rows only, so that nothing
# about the held-out rows leaks into the features the models are trained on.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
classify(X_train, y_train, 'Predict Restuarant Ratings')

### Best classifier is chosen as LinearDiscriminantAnalysis

In [None]:
# Candidate LDA solvers, chosen by cross-validation
tuned_parameters = [{'solver': ['svd', 'lsqr', 'eigen'] }]

scores = ['accuracy','precision_macro', 'recall_macro']

# Run one independent 5-fold grid search per scoring metric and report how
# the selected model performs on the held-out test rows.
for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()
    clf = GridSearchCV(
        LinearDiscriminantAnalysis(),
        tuned_parameters,
        cv=5,
        scoring=score,
    )
    clf.fit(X_train, y_train)
    print()
    print("Grid scores for the best parameter on development set:")
    # Pull the cross-validation mean/std and parameters of the winning setting.
    best_position = clf.best_index_
    cv_summary = clf.cv_results_
    best_row = (
        cv_summary['mean_test_score'][best_position],
        cv_summary['std_test_score'][best_position],
        cv_summary['params'][best_position],
    )
    print("%0.3f (+/-%0.03f) for %r" % best_row)

    print("Classification report:")
    print()
    print("The model is trained on the training set.")
    print("The scores are computed on the test set.")
    print()
    y_true = y_test
    y_pred = clf.predict(X_test)
    print('Test set accuracy: %f '% accuracy_score(y_true, y_pred))
    print('Test set precision, recall, f1-score, support:')
    print(classification_report(y_true, y_pred))
    print()


## 3. Predicting critical flag from other attributes

In [None]:
# Column positions in `data`: features to use and the label column (critical flag).
feature_attrs = [2,5,6,7,4,9,10,11,12,13,14,15,16]
label_attrs = 8

print('Predicting critical flag from other attributes')
print('Unique classes of the labels in the data ', np.unique(data[:, label_attrs]))
X = data[:, feature_attrs]
y = list(data[:, label_attrs])
# Train/test split
# NOTE(review): test_size=0.8 holds out 80% of the rows and trains on only 20% - confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=0)
# Standardise using statistics of the training rows only, so that nothing
# about the held-out rows leaks into the features the models are trained on.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
classify(X_train, y_train, 'Predict Restuarant Critical Flag')

### Best classifier is chosen as Decision Trees

In [None]:
# Set the parameters by cross-validation
tuned_parameters = [{'max_depth':range(3,20),
                    'criterion':['gini','entropy']}]

scores = ['accuracy','precision_macro', 'recall_macro']

# Run one independent 5-fold grid search per scoring metric and report how
# the selected tree performs on the held-out test rows.
for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()
    # Seed the tree so that its random tie-breaking between equally good
    # splits does not change the selected parameters from run to run.
    clf = GridSearchCV(DecisionTreeClassifier(random_state=0), tuned_parameters, cv=5, scoring=score)
    clf.fit(X_train, y_train)
    print()
    print("Grid scores for the best parameter on development set:")
    print("%0.3f (+/-%0.03f) for %r" % (clf.cv_results_['mean_test_score'][clf.best_index_], clf.cv_results_['std_test_score'][clf.best_index_], clf.cv_results_['params'][clf.best_index_]))

    print("Classification report:")
    print()
    print("The model is trained on the training set.")
    print("The scores are computed on the test set.")
    print()
    y_true, y_pred = y_test, clf.predict(X_test)
    print('Test set accuracy: %f '% accuracy_score(y_true, y_pred))
    print('Test set precision, recall, f1-score, support:')
    print(classification_report(y_true, y_pred))
    print()


## 4. Predicting borough from other attributes

In [None]:
# Column positions in `data`: features to use and the label column (borough).
feature_attrs = [2,5,6,7,4,9,10,11,12,13,14,15,8]
label_attrs = 16

print('Predicting borough from other attributes')
print('Unique classes of the labels in the data ', np.unique(data[:, label_attrs]))
X = data[:, feature_attrs]
y = list(data[:, label_attrs])
# Train/test split
# NOTE(review): test_size=0.8 holds out 80% of the rows and trains on only 20% - confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=0)
# Standardise using statistics of the training rows only, so that nothing
# about the held-out rows leaks into the features the models are trained on.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
classify(X_train, y_train, 'Predict borough of the Restuarant')

### Best classifier is chosen as Decision Trees

In [None]:
# Set the parameters by cross-validation
tuned_parameters = [{'max_depth':range(3,20),
                    'criterion':['gini','entropy']}]

scores = ['accuracy','precision_macro', 'recall_macro']

# Run one independent 5-fold grid search per scoring metric and report how
# the selected tree performs on the held-out test rows.
for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()
    # Seed the tree so that its random tie-breaking between equally good
    # splits does not change the selected parameters from run to run.
    clf = GridSearchCV(DecisionTreeClassifier(random_state=0), tuned_parameters, cv=5, scoring=score)
    clf.fit(X_train, y_train)
    print()
    print("Grid scores for the best parameter on development set:")
    print("%0.3f (+/-%0.03f) for %r" % (clf.cv_results_['mean_test_score'][clf.best_index_], clf.cv_results_['std_test_score'][clf.best_index_], clf.cv_results_['params'][clf.best_index_]))

    print("Classification report:")
    print()
    print("The model is trained on the training set.")
    print("The scores are computed on the test set.")
    print()
    y_true, y_pred = y_test, clf.predict(X_test)
    print('Test set accuracy: %f '% accuracy_score(y_true, y_pred))
    print('Test set precision, recall, f1-score, support:')
    print(classification_report(y_true, y_pred))
    print()
