In [None]:
import numpy as np 
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Load the 311 cases export, reading every column as a string.
cases = pd.read_csv("311_Cases.csv", sep=",", dtype="str")

# Discard the columns at positions 20 through 46; everything else is kept.
unused_columns = cases.columns[range(20, 47)]
cases = cases.drop(columns=unused_columns)

In [None]:
# Columns that hold timestamps and must be parsed before doing date arithmetic.
date_columns = ["Opened", "Closed", "Updated"]

# Parse the timestamp columns. Values that do not match the format become NaT
# (errors='coerce') instead of raising.
### NOTE: the datetime format changes depending on where you run the program.
### Switch to the alternative format below if needed.
cases[date_columns] = cases[date_columns].apply(
    pd.to_datetime, format="%m/%d/%Y %I:%M:%S %p", errors='coerce'
)
# cases[date_columns] = cases[date_columns].apply(pd.to_datetime, format = "%m/%d/%Y %H:%M", errors = 'coerce')

# Closed - Opened, expressed in hours (NaN wherever either timestamp is NaT).
time_elapsed = (cases["Closed"] - cases["Opened"]) / np.timedelta64(1, 'h')

# Place "Time Elapsed" as the 4th column. DataFrame.insert raises a ValueError
# when the column already exists, so drop any copy left by a previous run of
# this cell first; that keeps the cell safe to re-run.
if "Time Elapsed" in cases.columns:
    cases = cases.drop(columns="Time Elapsed")
cases.insert(3, "Time Elapsed", time_elapsed)

## Question 1

In [None]:
# Keep only cases with a strictly positive elapsed time. Rows whose
# "Time Elapsed" is NaN fail the comparison and are excluded as well, so no
# separate dropna() is needed. (The earlier `cases.dropna()` result was
# overwritten by this filter before it was ever used.)
new_cases = cases[cases['Time Elapsed'] > 0]

# Display the retained cases ordered from shortest to longest elapsed time.
new_cases.sort_values(by="Time Elapsed", ascending=True)

In [None]:
# Categorical columns that are one-hot encoded into model features.
cat_name = ['Status', 'Category', 'Neighborhood']

# Encode only the rows that survived the "Time Elapsed" filter. Building the
# dummies from the full `cases` frame and concatenating along axis=1 performs
# an outer join on the index, which would bring every filtered-out row back
# with a NaN "Time Elapsed".
kept_rows = new_cases.index
encoded_parts = [
    pd.get_dummies(cases.loc[kept_rows, column], drop_first=True)
    for column in cat_name
]

# Single concat: the target column first, followed by all dummy columns.
new_cases = pd.concat([new_cases[['Time Elapsed']]] + encoded_parts, axis=1)


In [None]:
### Histogram of Time Elapsed

y = new_cases['Time Elapsed']

# density=True normalises the bars so they integrate to 1; without it
# plt.hist draws raw counts and the "Density" labels below would be wrong.
plt.hist(y, 50, density=True)

plt.xlabel("Time Elapsed")
plt.ylabel("Density")
plt.title("Density of Time Elapsed")

# As you can see, the data is heavily skewed towards 0, but a few observations
# are within the 10,000 range.
# Apply a log transformation to recenter the data.

In [None]:
### Histogram of Log(Time Elapsed)

# density=True so the y-axis really is a density, matching the labels below
# (the default for plt.hist is raw counts).
plt.hist(np.log(y), 50, density=True)

plt.xlabel("Log(Time Elapsed)")
plt.ylabel("Density")
plt.title("Log Density of Time Elapsed")

In [None]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

### Lecture 16 notebook

# Features: every column except the target.
X = new_cases.loc[:, new_cases.columns != 'Time Elapsed']

# Target: natural log of the elapsed hours.
y = np.log(new_cases['Time Elapsed'])

# Fixed random_state so the 75/25 split (and every score computed from it)
# is the same on each run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# I chose not to scale the features because most of them are categorical (0 or 1)

### LASSO and Ridge Regression

In [None]:
# Lasso regression with a very small penalty, a very high iteration cap and a
# very tight tolerance.
lasso = linear_model.Lasso(tol=0.000001, max_iter=10000000, alpha=0.0001)
lasso.fit(X_train, y_train)

# Held-out score and mean squared error.
lasso_score = lasso.score(X_test, y_test)
lasso_mse = mean_squared_error(y_test, lasso.predict(X_test))

print("lasso score:", lasso_score)
print("lasso MSE:", lasso_mse)

In [None]:
# Ridge regression with the library's default settings.
ridge = linear_model.Ridge()
ridge.fit(X_train, y_train)

# Held-out score and mean squared error.
ridge_score = ridge.score(X_test, y_test)
ridge_mse = mean_squared_error(y_test, ridge.predict(X_test))

print("ridge score:", ridge_score)
print("ridge MSE:", ridge_mse)

### Decision Tree and Random Forest

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Single decision tree, seeded so the fitted tree is repeatable.
regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(X_train, y_train)

# fit() returns the estimator itself, so dt_reg is the same fitted object.
dt_reg = regressor

# Held-out score of the fitted tree (shown as the cell output).
dt_reg.score(X_test, y_test)

In [None]:
### Different Way of Measuring Accuracy 

from sklearn.metrics import mean_squared_error


def evaluate(model, test_features, test_labels):
    """Print and return error metrics for a fitted model on a held-out set.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(features)`` and
        ``score(features, labels)``.
    test_features : array-like
        Feature rows passed to ``model.predict`` and ``model.score``.
    test_labels : array-like
        True target values, one per row of ``test_features``.

    Returns
    -------
    dict
        ``{'MAE': ..., 'MSE': ..., 'R2': ...}`` where MAE/MSE are the mean
        absolute / mean squared residual and R2 is ``model.score(...)``.
    """
    predictions = model.predict(test_features)

    # Compare positionally; converting to arrays avoids any index alignment
    # when the labels arrive as a pandas Series.
    residuals = np.asarray(test_labels) - np.asarray(predictions)
    MAE = float(np.mean(np.abs(residuals)))
    MSE = float(np.mean(residuals ** 2))

    # Score against the data passed in, not whatever X_test / y_test happen
    # to be bound to globally at call time.
    R2 = model.score(test_features, test_labels)

    print('Model Performance')
    print('MAE: {:0.4f} log(hrs).'.format(MAE))
    print('MSE = ', MSE)
    print('R2 = ', R2)

    return {'MAE': MAE, 'MSE': MSE, 'R2': R2}

In [None]:
# Base Accuracy

from sklearn.ensemble import RandomForestRegressor
from pprint import pprint

# Baseline random forest with default hyperparameters. The seed is fixed so
# the baseline metrics are comparable across runs and against the tuned models.
base_model = RandomForestRegressor(random_state=0)

base_model.fit(X_train, y_train)

# Report the held-out metrics for the baseline.
base_accuracy = evaluate(base_model, X_test, y_test)

print('Parameters currently in use:\n')
pprint(base_model.get_params())

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate numbers of trees: 10 evenly spaced values from 70 to 90, truncated to int.
n_estimators = [int(x) for x in np.linspace(start=70, stop=90, num=10)]
# 1.0 means "use all features", which is what 'auto' meant for regressors.
# 'auto' itself is deprecated and rejected by newer scikit-learn releases,
# while the float form is accepted by old and new versions alike.
max_features = [1.0, 'sqrt']
min_samples_split = list(range(2, 10))
max_depth = [2, 3, 4, 5]

# Search space sampled by the randomized search.
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split}


In [None]:
# Hyperparameter Tuning with RandomizedSearchCV

# Seed both the forest and the search: the search draws its 500 candidates at
# random and the forest bootstraps at random, so without seeds the reported
# best parameters can change from run to run.
rf = RandomForestRegressor(random_state=0)

rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid, n_iter=500,
                               cv=3, verbose=2, n_jobs=-1, random_state=0)

rand_reg = rf_random.fit(X_train, y_train)

print('Parameters currently in use:\n')
pprint(rand_reg.best_params_)


# Held-out metrics for the best estimator found by the search.
rand_accuracy = evaluate(rand_reg, X_test, y_test)

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter Tuning
#
# Most important:
# - n_estimators = number of trees in the forest
# - max_features = number of features considered when looking for a split
param_grid = dict(
    n_estimators=[*range(70, 90, 2)],
    max_features=[*range(2, 8, 2)],
)

In [None]:
from pprint import pprint

# Seeded forest so the exhaustive search picks the same best parameters on
# every run instead of varying with the bootstrap samples.
rf = RandomForestRegressor(random_state=0)

# Exhaustive 3-fold search over param_grid (both names are kept bound to the
# same search object, as before).
rf_grid = grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                                     cv=3, n_jobs=-1)

rf_reg = rf_grid.fit(X_train, y_train)

print('Parameters currently in use:\n')
pprint(rf_reg.best_params_)


# Held-out metrics for the best estimator found by the search.
grid_accuracy = evaluate(rf_reg, X_test, y_test)

# Source: https://towardsdatascience.com/random-forest-in-python-24d0893d51c0
# Source: https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74
# Source: https://towardsdatascience.com/hyperparameters-optimization-526348bb8e2d

In [None]:
# Plot real vs predicted

rf_pred = rf_reg.predict(X_test)

# Two vertically stacked panels sharing one y-axis so the scales are comparable.
fig, (ax1, ax2) = plt.subplots(2, figsize=(10, 8), sharey=True)
# fig.suptitle('Vertically stacked subplots')
panels = (
    (ax1, y_test, "True Data Values"),
    (ax2, rf_pred, "Predicted Values"),
)
for axis, values, heading in panels:
    # x is simply the position of each observation in the test set.
    axis.plot(range(len(values)), values)
    axis.set_title(heading)
    axis.set_ylabel("Log Hours Elapsed")

fig.tight_layout()

# Source: https://matplotlib.org/devdocs/gallery/subplots_axes_and_figures/subplots_demo.html

## Question 2

In [None]:
# Binary target describing each case 24 hours after it was opened:
#   open  = 0  (elapsed time is 24 hours or more)
#   close = 1  (elapsed time is under 24 hours)
elapsed_hours = new_cases['Time Elapsed']

# Missing elapsed times stay missing rather than being forced into a class.
closed_within_24h = elapsed_hours.lt(24).astype(float).where(elapsed_hours.notna())

# Build cases_q2 as a new frame. Aliasing new_cases and editing it in place
# would overwrite the hours that the regression cells rely on, and a second
# run of this cell would then label every row as 1.
cases_q2 = new_cases.rename(columns={'Time Elapsed': "Status_24hrs"})
cases_q2['Status_24hrs'] = closed_within_24h

In [None]:
# Features: every column except the binary target.
X = cases_q2.loc[:, cases_q2.columns != 'Status_24hrs']

# Target: 1 if the case was closed within 24 hours, 0 otherwise.
y = cases_q2['Status_24hrs']

# Fixed random_state so the 50/50 split, and therefore every accuracy and
# confusion matrix below, is the same on each run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=1)

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# L1-penalised logistic regression fitted with the liblinear solver.
lr = LogisticRegression(random_state=1, solver='liblinear', penalty='l1')
lr.fit(X_train, y_train)

# Held-out score (shown as the cell output).
lr.score(X_test, y_test)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix 

# Values of C to compare for the L1-penalised logistic regression.
C = [10, 1, .1, .001]
train = []
test = []

for c in C:
    clf = LogisticRegression(penalty='l1', C=c, solver='liblinear', random_state=1)
    clf.fit(X_train, y_train)
    # Record the score on both splits so over/under-fitting is visible.
    train.append(clf.score(X_train, y_train))
    test.append(clf.score(X_test, y_test))
    y_pred = clf.predict(X_test)
    print(confusion_matrix(y_test, y_pred))


results = pd.DataFrame({'C': C, 'Training Accuracy': train, 'Test Accuracy': test})

# Show the comparison table; previously it was built but never displayed.
results

In [None]:
# Base estimator for the logistic-regression grid search.
lr2 = LogisticRegression(random_state=1)

# One grid: both penalties, 20 log-spaced values of C from 1e-4 to 1e4,
# and the liblinear solver.
param_grid = [
    dict(
        penalty=['l1', 'l2'],
        C=np.logspace(-4, 4, 20),
        solver=['liblinear'],
    )
]

In [None]:
# 5-fold exhaustive search over the logistic-regression grid.
grid_lr = GridSearchCV(lr2, param_grid, cv=5, verbose=True, n_jobs=-1)

grid_lr.fit(X_train, y_train)

y_pred = grid_lr.predict(X_test)

# A bare expression in the middle of a cell is evaluated and thrown away, so
# the held-out score has to be printed to appear in the output.
print('Accuracy: ', grid_lr.score(X_test, y_test))

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

### Decision Tree and Random Forest

In [None]:
# Basic Decision Tree Classifier (1 decision tree)

from sklearn.tree import DecisionTreeClassifier  
from sklearn.metrics import classification_report, confusion_matrix  


# Seeded so the fitted tree, and the report below, is the same on every run,
# in line with the other classifiers in this section.
classifier = DecisionTreeClassifier(random_state=1)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)


print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seeded random forest used as the base estimator of the grid search.
forest = RandomForestClassifier(random_state=1)

# Search space: number of trees and number of features tried per split.
param_grid = [
    dict(
        n_estimators=[*range(10, 101, 5)],
        max_features=[*range(6, 32, 2)],
    )
]

# 5-fold exhaustive search; fit() returns the fitted search object.
clf = GridSearchCV(forest, param_grid=param_grid, cv=5, verbose=True, n_jobs=-1)
best_clf = clf.fit(X_train, y_train)
y_pred = best_clf.predict(X_test)

print('Parameters currently in use:\n')
pprint(best_clf.best_params_)

print('Accuracy: ', best_clf.score(X_test, y_test))

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

### Plot ROC Curves

In [None]:
# plot_roc_curve was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer its replacement, RocCurveDisplay.from_estimator, and fall back to the
# old function on releases that predate it.
try:
    from sklearn.metrics import RocCurveDisplay
    draw_roc = RocCurveDisplay.from_estimator
except (ImportError, AttributeError):
    from sklearn.metrics import plot_roc_curve as draw_roc

# Fitted classifiers to compare, with the legend label for each.
models = [best_clf, classifier, grid_lr, lr]
name = ['Random Forest', 'Decision Tree', 'GridSearchCV LR','Default Logistic Regression']

# Diagonal reference line for a classifier that guesses at random.
ax = plt.gca()
ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
         label='Chance', alpha=.8)


# Draw every model's ROC curve on the same axes.
for model, label in zip(models, name):
    draw_roc(model, X_test, y_test, ax=ax, name=label)