In [None]:
import numpy as np 
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Load the 311 cases export, reading every field as text.
raw_cases = pd.read_csv("311_Cases.csv", sep=",", dtype="str")

# Discard the columns at positions 20 through 46.
unused_columns = raw_cases.columns[range(20, 47)]
cases = raw_cases.drop(columns=unused_columns)

In [None]:
# Columns holding timestamps that need to be parsed.
date_columns = ["Opened", "Closed", "Updated"]

# Convert the timestamp columns to datetime; values that do not match the format become NaT.
### NOTE: datetime changes depending on where you run the program. Change accordingly ###

cases[date_columns] = cases[date_columns].apply(pd.to_datetime, format = "%m/%d/%Y %I:%M:%S %p", errors = 'coerce')
# cases[date_columns] = cases[date_columns].apply(pd.to_datetime, format = "%m/%d/%Y %H:%M", errors = 'coerce')

# Closed - Opened in hours (NaN when either timestamp is missing).
time_elapsed = (cases["Closed"] - cases["Opened"])/np.timedelta64(1, 'h')

# DataFrame.insert raises if the column already exists, which made this cell fail
# when it was run a second time. Drop any previous copy first.
if "Time Elapsed" in cases.columns:
    cases = cases.drop(columns="Time Elapsed")

# Insert Time Elapsed into cases df at position 3.
cases.insert(3, "Time Elapsed", time_elapsed)

## Question 1

In [None]:
# Keep only cases with a positive closing time. The comparison is False for NaN,
# so cases without a computed "Time Elapsed" are excluded as well.
# (A previous `cases.dropna()` result was overwritten by this statement before it
# was ever used, so that dead assignment has been removed.)
new_cases = cases[cases['Time Elapsed'] > 0].copy()

new_cases.sort_values(by = "Time Elapsed", ascending=True)

In [None]:
# Categorical columns to one-hot encode as predictors.
cat_name = ['Status','Category','Neighborhood']

# Index of the rows that passed the "Time Elapsed" filter. The reshaping below keeps
# this index, so re-running the cell gives the same result.
kept_rows = new_cases.index

# Encode only the kept rows. Encoding the full `cases` frame and concatenating on the
# index (an outer join by default) brings every filtered-out row back with a NaN
# "Time Elapsed".
dummy_frames = [pd.get_dummies(cases.loc[kept_rows, col], drop_first=True) for col in cat_name]

# One concat instead of growing the frame inside a loop.
new_cases = pd.concat([new_cases[['Time Elapsed']]] + dummy_frames, axis = 1)


In [None]:
### Histogram of Time Elapsed

y = new_cases['Time Elapsed']

# density=True normalizes the bars so the "Density" axis label and title are accurate;
# without it plt.hist draws raw counts.
plt.hist(y, 50, density=True)

plt.xlabel("Time Elapsed")
plt.ylabel("Density")
plt.title("Density of Time Elapsed");

# As you can see, the data is heavily skewed towards 0, but we have a few observations that are within the 10,000 range.
# Apply log transformation to recenter the data

In [None]:
### Histogram of Log(Time Elapsed)

# density=True normalizes the bars so the "Density" axis label and title are accurate;
# without it plt.hist draws raw counts.
plt.hist(np.log(y), 50, density=True)

plt.xlabel("Log(Time Elapsed)")
plt.ylabel("Density")
plt.title("Log Density of Time Elapsed");

In [None]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

### Lecture 16 notebook 

# Predictors: every column except the response.
X = new_cases.loc[:, new_cases.columns != 'Time Elapsed']

# Response: log of the hours elapsed.
y = np.log(new_cases['Time Elapsed'])

# Fixed seed so the split, and every metric computed from it, is the same on each run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# I chose not to transform the data because most of the data is categorical (0 or 1)

### LASSO and Ridge Regression

In [None]:
# Lasso with a small penalty, a tight tolerance and a large iteration cap.
lasso = linear_model.Lasso(alpha=1e-4, max_iter=10_000_000, tol=1e-6)
lasso.fit(X_train, y_train)

# Score and mean squared error on the held-out data.
lasso_r2 = lasso.score(X_test, y_test)
lasso_mse = mean_squared_error(y_test, lasso.predict(X_test))
print("lasso score:", lasso_r2)
print("lasso MSE:", lasso_mse)

In [None]:
# Ridge regression with the estimator's default settings.
ridge = linear_model.Ridge()
ridge.fit(X_train, y_train)

# Score and mean squared error on the held-out data.
ridge_r2 = ridge.score(X_test, y_test)
ridge_mse = mean_squared_error(y_test, ridge.predict(X_test))
print("ridge score:", ridge_r2)
print("ridge MSE:", ridge_mse)

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Single regression tree with a fixed seed.
regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(X_train, y_train)
dt_reg = regressor  # fit() returns the estimator itself

# Score on the held-out data (shown as the cell output).
dt_reg.score(X_test, y_test)

In [None]:
### Different Way of Measuring Accuracy 

from sklearn.metrics import mean_squared_error


def evaluate(model, test_features, test_labels):
    """Print MAE, MSE and R2 of `model` on the supplied evaluation data.

    Parameters
    ----------
    model : fitted estimator exposing ``predict(X)`` and ``score(X, y)``.
    test_features : features passed to ``model.predict`` and ``model.score``.
    test_labels : true target values aligned with ``test_features``.

    Returns
    -------
    dict
        ``{'MAE': ..., 'MSE': ..., 'R2': ...}`` computed on the supplied data.
    """
    predictions = np.asarray(model.predict(test_features))   # predicted samples
    residuals = np.asarray(test_labels) - predictions
    mae = float(np.mean(np.abs(residuals)))
    # Every metric is computed from the arguments. Reading the notebook-level
    # X_test / y_test here would report numbers for the wrong data whenever the
    # function is called with anything else.
    mse = float(np.mean(residuals ** 2))
    r2 = model.score(test_features, test_labels)
    print('Model Performance')
    print('MAE: {:0.4f} log(hrs).'.format(mae))
    print('MSE = ', mse)
    print('R2 = ', r2)
    return {'MAE': mae, 'MSE': mse, 'R2': r2}

In [None]:
# Base Accuracy: random forest with default hyperparameters

from sklearn.ensemble import RandomForestRegressor
from pprint import pprint

# Seeded so the baseline metrics are the same on every run.
base_model = RandomForestRegressor(random_state=0)

base_model.fit(X_train, y_train)

base_accuracy = evaluate(base_model, X_test, y_test)

print('Parameters currently in use:\n')
pprint(base_model.get_params())

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values sampled by the randomized search.
n_estimators = [int(x) for x in np.linspace(start = 70, stop = 90, num = 10)]
# 1.0 means "use every feature at each split". That is what 'auto' meant for
# regressors before scikit-learn removed the option (deprecated in 1.1, removed in 1.3).
max_features = [1.0, 'sqrt']
min_samples_split = list(range(2,10))
max_depth = [2,3, 4,5]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split }


In [None]:
# Hyperparameter Tuning with RandomSearchCV 

# Both the forest and the parameter sampling are seeded so the selected
# parameters and the reported metrics are reproducible.
rf = RandomForestRegressor(random_state=0)

rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 500,
                               cv = 3, verbose=2, n_jobs = -1, random_state=0)

rand_reg = rf_random.fit(X_train, y_train)

print('Parameters currently in use:\n')
pprint(rand_reg.best_params_)


rand_accuracy = evaluate(rand_reg, X_test, y_test)

In [None]:
from sklearn.model_selection import GridSearchCV

# Search space for the exhaustive grid search. Most important:
# - n_estimators = number of trees in the forest
# - max_features = number of features considered when looking for the best split
param_grid = dict(
    n_estimators=list(range(70, 90, 2)),
    max_features=list(range(2, 8, 2)),
)

In [None]:
from pprint import pprint

# Seeded so the grid search picks the same parameters on every run.
rf = RandomForestRegressor(random_state=0)

rf_grid = grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1)

rf_reg = rf_grid.fit(X_train, y_train)

print('Parameters currently in use:\n')
pprint(rf_reg.best_params_)


grid_accuracy = evaluate(rf_reg, X_test, y_test)

# Source: https://towardsdatascience.com/random-forest-in-python-24d0893d51c0
# Source: https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74
# Source: https://towardsdatascience.com/hyperparameters-optimization-526348bb8e2d

In [None]:
# Compare the held-out targets with the grid-searched forest's predictions, one panel each.
rf_pred = rf_reg.predict(X_test)

fig, (ax1, ax2) = plt.subplots(2, figsize=(10, 8), sharey=True)
panels = ((ax1, y_test, "True Data Values"), (ax2, rf_pred, "Predicted Values"))
for panel, values, heading in panels:
    panel.plot(range(len(values)), values)
    panel.set_title(heading)
    panel.set_ylabel("Log Hours Elapsed")

fig.tight_layout()

# Source: https://matplotlib.org/devdocs/gallery/subplots_axes_and_figures/subplots_demo.html

## Question 2

In [None]:
# Work on a copy: a plain assignment only aliases `new_cases`, so the edits below
# used to overwrite its "Time Elapsed" column and shifted it by another 24 hours
# every time this cell was re-run.
cases_q2 = new_cases.copy()

# open = 0
# close = 1
# A case counts as closed (1) when it took less than 24 hours, otherwise open (0).
elapsed_hours = cases_q2['Time Elapsed']
closed_within_24h = (elapsed_hours < 24).astype(float)

# Missing durations stay missing instead of silently becoming "open".
cases_q2['Time Elapsed'] = closed_within_24h.where(elapsed_hours.notna())

cases_q2 = cases_q2.rename(columns = {'Time Elapsed': "Status_24hrs"})

In [None]:
# Predictors: every column except the 24-hour status flag.
X = cases_q2.loc[:, cases_q2.columns != 'Status_24hrs']

# Response: the 24-hour status flag.
y = cases_q2['Status_24hrs']

# Fixed seed so the 50/50 split, and every metric computed from it, is the same on each run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# L1-penalised logistic regression with a fixed seed.
lr = LogisticRegression(solver='liblinear', penalty='l1', random_state=1)
lr.fit(X_train, y_train)

# Mean accuracy on the held-out data (shown as the cell output).
lr.score(X_test, y_test)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix 

# Values of C to compare, with the accuracies collected for each.
C = [10, 1, .1, .001]
train = []
test = []

for strength in C:
    # Refit the L1 model at this value of C and record both accuracies.
    clf = LogisticRegression(penalty='l1', C=strength, solver='liblinear', random_state=1)
    clf.fit(X_train, y_train)
    train += [clf.score(X_train, y_train)]
    test += [clf.score(X_test, y_test)]

    # Confusion matrix on the held-out data for this value of C.
    y_pred = clf.predict(X_test)
    print(confusion_matrix(y_test, y_pred))

# One row per value of C.
results = pd.DataFrame({'C': C, 'Training Accuracy': train, 'Test Accuracy': test})

In [None]:
# Base estimator for the grid search below.
lr2 = LogisticRegression(random_state=1)

# Both penalties, over 20 log-spaced values of C from 1e-4 to 1e4, with liblinear.
param_grid = [dict(
    penalty=['l1', 'l2'],
    C=np.logspace(-4, 4, 20),
    solver=['liblinear'],
)]

In [None]:
# 5-fold grid search over the logistic-regression parameter grid.
grid_lr = GridSearchCV(lr2, param_grid, cv = 5, verbose=True, n_jobs=-1)

grid_lr.fit(X_train, y_train)

# The score used to be computed and thrown away: only the last expression of a
# cell is displayed, so it has to be printed explicitly.
print('Test accuracy:', grid_lr.score(X_test, y_test))

y_pred = grid_lr.predict(X_test)

print(confusion_matrix(y_test, y_pred))  
print(classification_report(y_test, y_pred)) 

### Decision Tree

In [None]:
# Basic Decision Tree Classifier (1 decision tree)

from sklearn.tree import DecisionTreeClassifier  
from sklearn.metrics import classification_report, confusion_matrix  


# Seeded so the fitted tree, and the report below, is the same on every run.
classifier = DecisionTreeClassifier(random_state=0)  
classifier.fit(X_train, y_train)  
y_pred = classifier.predict(X_test) 


print(confusion_matrix(y_test, y_pred))  
print(classification_report(y_test, y_pred)) 

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Search space: number of trees and number of features tried per split.
param_grid = [{
    'n_estimators': list(range(10, 101, 5)),
    'max_features': list(range(6, 32, 2)),
}]

# 5-fold grid search wrapped around a seeded random forest.
clf = GridSearchCV(RandomForestClassifier(random_state=1), param_grid=param_grid,
                   cv=5, verbose=True, n_jobs=-1)
best_clf = clf.fit(X_train, y_train)
y_pred = best_clf.predict(X_test)

print('Parameters currently in use:\n')
pprint(best_clf.best_params_)

print('Accuracy: ', best_clf.score(X_test, y_test))

# Held-out confusion matrix and per-class report.
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

### Plot ROC Curves

In [None]:
# plot_roc_curve was removed in scikit-learn 1.2. RocCurveDisplay.from_estimator
# (available since 1.0) is its replacement and takes the same arguments used here.
from sklearn.metrics import RocCurveDisplay

# Fitted classifiers to compare and the legend label for each.
models = [best_clf, classifier, grid_lr, lr]
name = ['Random Forest', 'Decision Tree', 'GridSearchCV LR','Default Logistic Regression']

# Diagonal reference line for a classifier that guesses at random.
ax = plt.gca()
ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
         label='Chance', alpha=.8)

# Draw every model's ROC curve on the same axes.
for model, label in zip(models, name):
    RocCurveDisplay.from_estimator(model, X_test, y_test, ax=ax, name=label)

## Question 3

In [None]:
# Columns that are not used as predictors for this question.
q3_unused_columns = ['Request Details','CaseID','Media URL','Closed','Updated',
                     'Status Notes','Point','Address']

# Keep rows without any missing value, then remove the unused columns.
cases3 = cases.dropna()
cases_q3 = cases3.drop(columns=q3_unused_columns)

In [None]:
# Label each case by the time it was opened: day = 0, night = 1.
opened_at = pd.to_datetime(cases_q3['Opened'])
cases_q3 = cases_q3.set_index(opened_at)

# Flag everything as night first, then clear the flag for cases opened
# between 06:00 and 18:00.
cases_q3["day_night"] = 1
daytime_labels = cases_q3.between_time("06:00", "18:00").index
cases_q3.loc[daytime_labels, "day_night"] = 0

In [None]:
# Number of cases per class; the classes are imbalanced.
day_night_counts = cases_q3["day_night"].value_counts()
day_night_counts

In [None]:
def _q3_dummies(column):
    """One-hot encode one column of `cases_q3`, leaving out the first level."""
    return pd.get_dummies(cases_q3[column], drop_first=True)


# One indicator frame per categorical column.
status = _q3_dummies('Status')
category = _q3_dummies('Category')
request = _q3_dummies('Request Type')
agency = _q3_dummies('Responsible Agency')
street = _q3_dummies('Street')
supervisor = _q3_dummies('Supervisor District')
neighbor = _q3_dummies('Neighborhood')
police = _q3_dummies('Police District')
source = _q3_dummies('Source')

In [None]:
# Put the indicator frames next to the remaining columns.
encoded_blocks = [status, category, request, agency, street, supervisor, neighbor, police, source]
df_cases = pd.concat([cases_q3] + encoded_blocks, axis=1)

# Remove the raw columns that were encoded above, plus the two timing columns.
raw_columns = ['Status','Category','Request Type','Time Elapsed','Opened','Responsible Agency','Street',
               'Supervisor District','Neighborhood','Police District','Source']
df_cases = df_cases.drop(columns=raw_columns)

In [None]:
# Dimension Reduction & Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Predictors: every column except the day/night flag.
is_response = df_cases.columns == 'day_night'
X = df_cases.loc[:, ~is_response]

# Response: the day/night flag.
y = df_cases['day_night']

### Logistic Regression After PCA

In [None]:
from sklearn.decomposition import PCA

# Project everything except the day_night column onto two principal components.
pca = PCA(random_state=1, n_components=2)
predictors = df_cases.loc[:, df_cases.columns != 'day_night']
pca_data = pca.fit_transform(predictors)

# Cumulative share of variance explained by the components.
cumulative_variance = pca.explained_variance_ratio_.cumsum()
print(cumulative_variance)

In [None]:
# Seeded split of the PCA-projected data: 30% held out for testing, 70% for training.
X_train, X_test, y_train, y_test = train_test_split(
    pca_data,
    y,
    test_size=0.3,
    random_state=0,
)

In [None]:
# Class-weighted L2 logistic regression on the PCA-projected data.
clf = LogisticRegression(penalty='l2', solver='liblinear', class_weight='balanced')
clf.fit(X_train, y_train)

train_accuracy = clf.score(X_train, y_train)
test_accuracy = clf.score(X_test, y_test)
print('Coefficient of each feature:', clf.coef_)
print('Training accuracy:', train_accuracy)
print('Test accuracy:', test_accuracy)
print('')

In [None]:
# Confusion matrix of the fitted model on the training split.
from sklearn.metrics import confusion_matrix, recall_score, accuracy_score

y_predict = clf.predict(X_train)
train_confusion = confusion_matrix(y_train, y_predict)

print("Confusion Matrix")
print(train_confusion)
print()

### Logistic Regression Without PCA (Original Data)

In [None]:
# Seeded split of the original predictors: 30% held out for testing, 70% for training.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [None]:
# Class-weighted L2 logistic regression on the original predictors (no PCA).
clf = LogisticRegression(penalty='l2', solver='liblinear', class_weight='balanced')
clf.fit(X_train, y_train)

train_accuracy = clf.score(X_train, y_train)
test_accuracy = clf.score(X_test, y_test)
print('Training accuracy:', train_accuracy)
print('Test accuracy:', test_accuracy)
print('')

In [None]:
# Confusion matrix of the fitted model on the training split.
y_predict = clf.predict(X_train)
train_confusion = confusion_matrix(y_train, y_predict)

print("Confusion Matrix")
print(train_confusion)
print()

## Question 4

In [None]:
# Second '-'-separated field of the request details (NaN where there is none).
cars = cases['Request Details'].str.split('-').str.get(1)

In [None]:
# separating car brands by price levels
# Each list is later joined into a pattern and matched against the request text.
high = ['BMW','Mercedes','Audi','Tesla','Porsche','Ferrari','Land Rover',
        'Lamborghini','Maserati','Jaguar','Cadillac']
med = ['Acura','Lexus','Mini','Infiniti','Volvo']
# 'Volkswagon' is a misspelling, so rows with the correct spelling never matched.
# The original entry is kept so that misspelled rows still match.
low = ['Honda','Kia','Toyota','Hyundai','Ford','Nissan','Chevrolet',
       'Volkswagen','Volkswagon','VW','Jeep','Mazda','Subaru']

In [None]:
# Pair each extracted car field with its request type, then keep only the rows
# where both are present.
car_and_type = pd.concat([cars, cases['Request Type']], axis=1)
df_cars = car_and_type.dropna()

In [None]:
req = df_cars['Request Details']

# Label each request with a price level according to the brands it mentions.
# Levels are applied in this order and later ones overwrite earlier ones, so text
# matching several lists ends up with the last matching level.
for level, brands in (('High', high), ('Medium', med), ('Low', low)):
    brand_pattern = '|'.join(brands)
    df_cars.loc[req.str.contains(brand_pattern), 'Price Level'] = level

In [None]:
# Final cleaned dataset: rows without any missing value, renumbered from 0.
labelled_cars = df_cars.dropna()
data_cars = labelled_cars.reset_index(drop=True)

In [None]:
# Look for a relationship between price level and request type, starting from a
# dense TF-IDF matrix of the request-type text.
from sklearn.feature_extraction.text import TfidfVectorizer

tf_idf = TfidfVectorizer()
request_type_text = data_cars['Request Type']
tfidf = tf_idf.fit_transform(request_type_text).toarray()

In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Seeded two-component PCA of the TF-IDF matrix.
pca = PCA(random_state=1, n_components=2)
tfidf_pca = pca.fit_transform(tfidf)

In [None]:
# Row labels of data_cars for each price level.
# NOTE(review): these assignments reuse the names `high`, `med` and `low`, so any
# earlier value bound to those names is replaced from here on.
price_level = data_cars['Price Level']
high = data_cars.index[price_level == 'High']
med = data_cars.index[price_level == 'Medium']
low = data_cars.index[price_level == 'Low']

In [None]:
from collections import Counter

# Density of request types for high priced cars; density is later shown by point size.
# PCA coordinates of the high-price rows:
x = tfidf_pca[high,0]
y = tfidf_pca[high,1]

# Number of rows at each exact (PC1, PC2) point, then one size per row.
c = Counter(zip(x,y))
s = [c[point] for point in zip(x, y)]

In [None]:
# Scatter of the high-price rows; marker size is the number of rows at that point.
panel = plt.gca()
panel.scatter(x, y, s=s)
panel.set_title('Q4: PCA for High Price Level')
panel.set_xlabel('PC1')
panel.set_ylabel('PC2')
plt.show()

In [None]:
# Coordinates of the most frequent point among the high-price rows.
(x_coord, y_coord), top_count = c.most_common(1)[0]

# Positions, within the high-price rows, that sit exactly on that point.
high_points = tfidf_pca[high]
on_top_point = (high_points[:, 0] == x_coord) & (high_points[:, 1] == y_coord)
car_index = np.where(on_top_point)

In [None]:
# Request type at the most frequent point: high-price rows renumbered from 0,
# then the first matching position is looked up.
high_car = data_cars.loc[data_cars['Price Level'] == 'High'].reset_index(drop=True)
first_match = car_index[0][0]
high_car.loc[first_match, data_cars.columns == 'Request Type']

In [None]:
# Density of request types for medium priced cars.
# PCA coordinates of the medium-price rows:
x = tfidf_pca[med,0]
y = tfidf_pca[med,1]

# Number of rows at each exact (PC1, PC2) point, then one size per row.
c = Counter(zip(x,y))
s = [c[point] for point in zip(x, y)]

# Scatter with marker size equal to that count.
panel = plt.gca()
panel.scatter(x, y, s=s)
panel.set_title('Q4: PCA for Medium Price Level')
panel.set_xlabel('PC1')
panel.set_ylabel('PC2')
plt.show()

In [None]:
# Coordinates of the most frequent point among the medium-price rows.
(x_coord, y_coord), top_count = c.most_common(1)[0]

# Positions, within the medium-price rows, that sit exactly on that point.
med_points = tfidf_pca[med]
on_top_point = (med_points[:, 0] == x_coord) & (med_points[:, 1] == y_coord)
car_index = np.where(on_top_point)

In [None]:
# Request type at the most frequent point: medium-price rows renumbered from 0,
# then the first matching position is looked up.
med_car = data_cars.loc[data_cars['Price Level'] == 'Medium'].reset_index(drop=True)
first_match = car_index[0][0]
med_car.loc[first_match, data_cars.columns == 'Request Type']

In [None]:
# Density of request types for low priced cars.
# PCA coordinates of the low-price rows:
x = tfidf_pca[low,0]
y = tfidf_pca[low,1]

# Number of rows at each exact (PC1, PC2) point, then one size per row.
c = Counter(zip(x,y))
s = [c[point] for point in zip(x, y)]

# Scatter with marker size equal to that count.
panel = plt.gca()
panel.scatter(x, y, s=s)
panel.set_title('Q4: PCA for Low Price Level')
panel.set_xlabel('PC1')
panel.set_ylabel('PC2')
plt.show()

In [None]:
# Coordinates of the most frequent point among the low-price rows.
(x_coord, y_coord), top_count = c.most_common(1)[0]

# Positions, within the low-price rows, that sit exactly on that point.
low_points = tfidf_pca[low]
on_top_point = (low_points[:, 0] == x_coord) & (low_points[:, 1] == y_coord)
car_index = np.where(on_top_point)

In [None]:
# Request type at the most frequent point: low-price rows renumbered from 0,
# then the first matching position is looked up.
low_car = data_cars.loc[data_cars['Price Level'] == 'Low'].reset_index(drop=True)
first_match = car_index[0][0]
low_car.loc[first_match, data_cars.columns == 'Request Type']

## Question 7

In [None]:
# Cases whose "Closed" value is missing.
closed_is_missing = cases['Closed'].isna()
cases7 = cases[closed_is_missing]

In [None]:
# Columns that are not used for this question.
q7_unused_columns = ['Request Details','CaseID','Media URL','Closed','Updated',
                     'Status Notes','Point','Address']
cases_q7 = cases7.drop(columns=q7_unused_columns)

In [None]:
def _q7_dummies(column):
    """One-hot encode one column of `cases_q7`, leaving out the first level."""
    return pd.get_dummies(cases_q7[column], drop_first=True)


# One indicator frame per categorical column.
status = _q7_dummies('Status')
category = _q7_dummies('Category')
agency = _q7_dummies('Responsible Agency')
street = _q7_dummies('Street')
supervisor = _q7_dummies('Supervisor District')
neighbor = _q7_dummies('Neighborhood')
police = _q7_dummies('Police District')
source = _q7_dummies('Source')

In [None]:
# Put the indicator frames next to the remaining columns.
df7_cases = pd.concat([cases_q7,status,category,agency,street,supervisor,neighbor,police,source],axis=1)

# dropping the original columns
# "Time Elapsed" has to go as well: it is Closed - Opened, and this subset only
# holds cases whose "Closed" is missing, so it is NaN on every row. Leaving it in
# makes the dropna() below remove the whole frame.
df7_cases = df7_cases.drop(['Status','Category','Responsible Agency','Street',
              'Supervisor District','Neighborhood','Police District',
              'Opened','Source','Time Elapsed'],axis=1)
df7_cases = df7_cases.dropna()

In [None]:
# Seven most common request types, counted by non-null Latitude values per type.
request_counts = df7_cases.groupby('Request Type')['Latitude'].count()
na_data = request_counts.sort_values(ascending=False).head(7)

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Predictors: every column except the request type.
X = df7_cases.loc[:,df7_cases.columns != 'Request Type']

# Response: the request type, renumbered from 0 so it lines up row by row with the
# PCA output. A new Series is built instead of resetting the index in place on a
# column taken from df7_cases.
y = df7_cases['Request Type'].reset_index(drop=True)

# scaling data
sc = StandardScaler()
x = sc.fit_transform(X)

# Use a dedicated two-component PCA here. The cell used to refit whatever `pca`
# object an earlier question had left in the kernel, so it depended on hidden state.
pca_q7_model = PCA(n_components=2, random_state=1)
pca_q7 = pca_q7_model.fit_transform(x)

principalDf = pd.DataFrame(data = pca_q7,
                           columns = ['principal component 1', 'principal component 2'])
finalDf = pd.concat([principalDf, y], axis=1)

In [None]:
# PCA plot: one colour per request type, for the most common types only.
fig, ax = plt.subplots(figsize=(8, 8))
ax.set_xlabel('Principal Component 1', fontsize = 15)
ax.set_ylabel('Principal Component 2', fontsize = 15)
ax.set_title('2 component PCA', fontsize = 20)

targets = list(na_data.index) # target values
colors = ['red', 'yellow', 'green', 'cyan', 'blue', 'magenta','black']

for target, color in zip(targets, colors):
    # Rows of this request type, drawn in this type's colour.
    subset = finalDf[finalDf['Request Type'] == target]
    ax.scatter(subset['principal component 1'],
               subset['principal component 2'],
               c = color,
               s = 50)

ax.legend(targets,loc = 'upper center',bbox_to_anchor = (1.45, 0.8), shadow = True, ncol = 1)
ax.grid()