Browse files

Merge pull request #17 from sfbrigade/10_4

10 4
  • Loading branch information...
yamariva2000 committed Nov 2, 2017
2 parents ffb5ac3 + 834b627 commit 7ab7ab40389a86baee456575d40456da31b282fd
Showing with 164 additions and 11 deletions.
  1. BIN ROC Curve.png
  2. BIN geo_export_ba2a7c3c-9fd2-4383-947f-399256d6ad60.shp
  3. +43 −0
  4. +75 −0
  5. +46 −11
BIN +28.9 KB ROC Curve.png
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,43 @@
import pandas as pd
import shapefile
def pip(x, y, poly):
    """Ray-casting point-in-polygon test.

    Casts a horizontal ray from (x, y) and toggles `inside` every time the
    ray crosses a polygon edge (classic pnpoly algorithm).

    Parameters
    ----------
    x, y : float
        Coordinates of the point to test.
    poly : sequence of (x, y) pairs
        Vertices of the polygon, in order; the polygon is closed implicitly.

    Returns
    -------
    bool : True if the point lies inside the polygon.
    """
    vertex_count = len(poly)
    inside = False
    prev_x, prev_y = poly[0]
    # walk every edge poly[k-1] -> poly[k % n], wrapping back to poly[0]
    for k in range(1, vertex_count + 1):
        cur_x, cur_y = poly[k % vertex_count]
        # edge can only cross the ray if y is strictly above the lower end,
        # at or below the upper end, and x is not right of both endpoints
        if min(prev_y, cur_y) < y <= max(prev_y, cur_y) and x <= max(prev_x, cur_x):
            if prev_y != cur_y:
                # x-coordinate where the edge intersects the horizontal ray
                x_cross = (y - prev_y) * (cur_x - prev_x) / (cur_y - prev_y) + prev_x
            if prev_x == cur_x or x <= x_cross:
                inside = not inside
        prev_x, prev_y = cur_x, cur_y
    return inside
def tract_id(x, y, r):
    """Return the census tract containing the point (x, y), or None.

    Loops through each tract polygon in the shapefile reader and tests the
    point against it with pip().

    Parameters
    ----------
    x, y : float
        Longitude/latitude of the location.
    r : shapefile.Reader
        Open reader over the tract shapefile.

    Returns
    -------
    The tract identifier (field index 3 of the matching record), or None
    when the point falls in no tract.
    """
    # Fetch the shape list once: the original called r.shapes() for the
    # length and then r.shape(i) per iteration, re-reading shapes from disk.
    for i, shape in enumerate(r.shapes()):
        if pip(x, y, shape.points):
            # field 3 of the record contains the tract info
            return r.record(i)[3]
    # point not inside any tract polygon
    return None
# open the census-tract polygon shapefile used for point-in-polygon lookups
r = shapefile.Reader("geo_export_ba2a7c3c-9fd2-4383-947f-399256d6ad60.shp")
# master data table; low_memory=False reads the whole file at once to avoid
# mixed-dtype chunk inference
k = pd.read_csv('masterdf_20170920.csv', low_memory=False, )
#convert text coordinates to x,y floats
# Location_y appears to hold "(lat, lon)" text: strip the parens, split on the comma
k['yx'] = k.Location_y.apply(lambda x: x[1:-1].split(','))
# NOTE(review): element [1] is used as x and [0] as y, i.e. the source text is
# assumed to be (lat, lon) ordered — confirm against the source column
k['x'] = k.yx.apply(lambda x: float(x[1]))
k['y'] = k.yx.apply(lambda x: float(x[0]))
# tag each row with the tract whose polygon contains its coordinates
k['tract'] = k.apply(lambda cols: tract_id(cols['x'], cols['y'], r), axis=1)
#save data to new file with additional tract information
@@ -0,0 +1,75 @@
import sys
from fuzzywuzzy import process
import pandas as pd
import numpy as np
import math
from sklearn.neighbors import BallTree
# widen pandas console output so wide joined frames are not truncated
pd.set_option("display.max_columns", 101)
def join_data_on_address_GPS(radius=40, df=None):
    """Link each row of *df* to its closest EAS address record.

    Uses 'Addresses_-_Enterprise_Addressing_System.csv' as the reference
    address table.  For every row of *df* (which must carry 'Longitude',
    'Latitude' and an 'Address' or 'Permit Address' column), all EAS
    addresses within *radius* metres are found with a haversine BallTree,
    then the best fuzzy text match among them is selected.

    Parameters
    ----------
    radius : float
        Search radius in metres around each point.
    df : pandas.DataFrame
        Table to be linked by closest lon/lat and address text.

    Returns
    -------
    pandas.Series of dicts with keys 'EAS', 'Address' and 'Score';
    'EAS'/'Address' are None (Score 0) when no candidate is found.
    """
    reference = pd.read_csv('./raw_csv/Addresses_-_Enterprise_Addressing_System.csv')
    # BallTree's haversine metric expects coordinates in radians
    reference['LonRad'] = reference['Longitude'].apply(math.radians)
    reference['LatRad'] = reference['Latitude'].apply(math.radians)
    # normalize the address column name used by the fuzzy matcher below
    if 'Permit Address' in df.columns:
        df.rename(columns={'Permit Address': 'Address'}, inplace=True)

    class r_closest_EAS(BallTree):
        # Given lon/lat and an address, finds the nearby EAS locations with
        # the BallTree and fuzzy-matches the address text among them.
        def __init__(self, reference=None, *args, **kwargs):
            # keep the reference table so tree indices map back to rows
            self.reference = reference
            data = reference[['LonRad', 'LatRad']].values
            super(r_closest_EAS, self).__init__(data=data, *args, **kwargs)

        def search_around(self, lon=None, lat=None, address=None, radius=None):
            # Bug fix: the original tested `(lon or lat or address) == None`,
            # which is only True when ALL three are falsy and also fires on a
            # legitimate lon/lat of 0.0 — test each argument for None instead,
            # and return a sentinel match rather than falling through to a crash.
            if lon is None or lat is None or address is None:
                print('missing variables')
                return {'EAS': None, 'Address': None, 'Score': 0}
            # indices of all reference addresses within `radius` of the point
            indices = self.query_radius(np.array([lon, lat]).reshape(1, -1), r=radius)
            indices = indices[0].tolist()
            # rows in the reference table found by the radius query
            found_places = self.reference.iloc[indices]
            found_addresses = found_places['Address'].values.tolist()
            if not found_addresses:
                # Bug fix: report a miss explicitly — process.extractOne
                # raises on an empty choice list.
                return {'EAS': None, 'Address': None, 'Score': 0}
            # fuzzywuzzy edit-distance: best text match among nearby addresses
            closest_address, score = process.extractOne(query=address, choices=found_addresses)
            # map the winning string back to its row in the candidate frame
            closest_index = found_addresses.index(closest_address)
            closest_place = found_places.iloc[closest_index]
            closest_eas = closest_place['EAS BaseID']
            if closest_eas is None:
                print('None found')
            return {'EAS': closest_eas, 'Address': closest_address, 'Score': score}

    df['LonRad'] = df['Longitude'].apply(math.radians)
    df['LatRad'] = df['Latitude'].apply(math.radians)
    # metres -> radians of arc: fraction of Earth's circumference (40075000 m)
    # times 2*pi; the 0.7 factor is an empirical shrink — TODO confirm intent
    r_radians = radius / 40075000 * 2 * math.pi * .7
    # instantiate the BallTree over the reference EAS coordinates
    k = r_closest_EAS(reference=reference, metric='haversine')
    # row-wise lookup for every record in df
    eas_match = df.apply(
        lambda cols: k.search_around(cols['LonRad'], cols['LatRad'], cols['Address'], radius=r_radians), axis=1)
    # return the match series (dicts of EAS id, matched address, score)
    return eas_match
@@ -4,9 +4,11 @@
def XY_data(multiclass=False):
#will process binary or multiclass
# set target to Fire Incident Type
# assign classes
# Nan becomes no incident
@@ -25,26 +27,49 @@ def XY_data(multiclass=False):
#create one-hot variables for property type and neighborhood
return x,y,unique
def Data_normalized(multiclass=False):
# get quantitative features
x_quantitative=x[['age','Num_Bathrooms', 'Num_Bedrooms',
'Num_Rooms', 'Num_Stories', 'Num_Units', 'Land_Value',
'Property_Area', 'Assessed_Improvement_Val', 'Tot_Rooms' ]]
'Property_Area', 'Assessed_Improvement_Val', 'Tot_Rooms','Perc_Ownership' ,
'count potential fire control', 'count all complaints',
'count all complaints not corrected',
'count potential fire control not corrected',
'count fire emergency safety', 'count potential fire cause',
'count fire emergency safety not corrected',
'count potential fire cause not corrected'
#normalize quantitative features
#combine x dummies and x scaled data
return x_all,y,unique
return x_all,y,unique,x_ids
def classifier(train=True,x=None,y=None,target_names=None,class_weight=None,multiclass=False,plot=False,cross_val=False):
@@ -84,9 +109,10 @@ def classifier(train=True,x=None,y=None,target_names=None,class_weight=None,mult
from sklearn.metrics import classification_report
print('labels {}'.format(target_names))
from datetime import datetime
print('model run time {}'.format(
if multiclass == False:
@@ -103,17 +129,26 @@ def classifier(train=True,x=None,y=None,target_names=None,class_weight=None,mult
plt.title('ROC Curve for Binary Class')
if plot:
print([xtrain.columns[i] for i in np.argsort(rf_model.feature_importances_)[::-1]])
print([xtrain.columns[i] for i in np.argsort(rf_model.feature_importances_)[::-1]])
from sklearn.model_selection import cross_val_score
if cross_val:
print('cross validation {}'.format(scores))
return rf_model
def visualization_table(model=None, x=None, y=None, target_names=None):
    """Build a side-by-side table of ids, class probabilities, predictions and truth.

    Runs the fitted model over x and concatenates, column-wise: the
    module-level x_ids frame, one probability column per class (named from
    target_names), a 'prediction' column, and the true labels y.
    """
    probability_frame = pd.DataFrame(model.predict_proba(x), columns=target_names)
    prediction_frame = pd.DataFrame(model.predict(x), columns=["prediction"])
    # x_ids is a module-level frame of row identifiers
    pieces = [x_ids, probability_frame, prediction_frame, y]
    return pd.concat(pieces, axis=1)
if __name__ == '__main__':
multiclass = False
rf_model=classifier(train=True,x=x,y=y,target_names=target_names, class_weight=None,multiclass=multiclass,plot=False,cross_val=False)
classifier(train=True,x=x,y=y,target_names=target_names, class_weight=None,multiclass=False,plot=False,cross_val=True)

0 comments on commit 7ab7ab4

Please sign in to comment.