In [1]:
%matplotlib inline
import os
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import seaborn as sns
import shapely.wkt
from shapely.geometry import Point
from sklearn.datasets.base import Bunch
from timeit import default_timer as timer
from datetime import timedelta
import pickle
import warnings

pd.set_option('display.max_columns', None)
warnings.filterwarnings('ignore', module='matplotlib.font_manager')

In [2]:
# Longitude/latitude coordinates (EPSG:4326); the CRS assigned to the raw inputs.
crs_lon_lat = dict(init='epsg:4326')

# Azimuthal equidistant projection centred on (lon_0, lat_0) with metre units;
# layers are re-projected to it before buffering and spatial joins.
crs_for_distances = dict(
    proj='aeqd',
    lon_0=-83.10,
    lat_0=42.38,
    units='m',
)

In [3]:
output_root = 'output/2'

os.makedirs(output_root, exist_ok=True)

def maybe_pickle(name, generate, force=False):
    """Return the cached object called `name`, generating and caching it on a miss.

    Parameters
    ----------
    name : str
        Cache key; the object is stored at ``<output_root>/<name>.pickle``.
    generate : callable
        Zero-argument function that builds the object when no cache is used.
    force : bool, default False
        When true, ignore an existing cache file and regenerate it.

    Returns
    -------
    object
        The unpickled cached object, or the freshly generated one.
    """
    filename = os.path.join(output_root, name + '.pickle')
    if os.path.exists(filename) and not force:
        # NOTE: pickle.load can execute arbitrary code; only point this at
        # cache files that this notebook wrote itself.
        with open(filename, 'rb') as f:
            return pickle.load(f)

    print('generating', name)
    start_time = timer()
    obj = generate()
    elapsed = timedelta(seconds=timer() - start_time)
    print('finished generating {}; {} elapsed'.format(name, elapsed))

    # Dump to a temporary file and rename it into place, so a failed or
    # interrupted dump never leaves a truncated cache (and never destroys a
    # previous good one when force=True).
    tmp_filename = filename + '.tmp'
    try:
        with open(tmp_filename, 'wb') as f:
            pickle.dump(obj, f)
        os.replace(tmp_filename, filename)
    except BaseException:
        if os.path.exists(tmp_filename):
            os.remove(tmp_filename)
        raise
    return obj

In [4]:
def gpd_read_csv(filename):
    """Load a CSV into a GeoDataFrame, parsing its ``geometry`` column as WKT.

    The ``geometry`` text is converted with ``shapely.wkt.loads`` and the
    resulting frame is tagged with ``crs_lon_lat``.
    """
    table = pd.read_csv(filename)
    parsed_shapes = table['geometry'].map(shapely.wkt.loads)
    table['geometry'] = parsed_shapes
    return gpd.GeoDataFrame(table, crs=crs_lon_lat)

def read_parcels(filename):
    """Read the parcels CSV, index it by ``ParcelNo`` and project it for distances.

    ``ParcelNo`` also stays available as a regular column (``drop=False``).
    The result is re-projected to ``crs_for_distances``.
    """
    return (
        gpd_read_csv(filename)
        .set_index('ParcelNo', drop=False)
        .to_crs(crs_for_distances)
    )

# Parse the parcels CSV on the first run; later runs reuse the pickled result.
parcels_all = maybe_pickle(
    'parcels',
    lambda: read_parcels('output/1_parcels.csv'),
)
print('{} parcels, {} blighted'.format(len(parcels_all), parcels_all.IsBlighted.sum()))

generating parcels
finished generating parcels; 0:01:13.494929 elapsed
384600 parcels, 6222 blighted


In [5]:
# Number of parcels of each class (blighted / not blighted) held out for testing.
num_test_per_class = 1000

def choose_random_indices(index, size):
    """Draw `size` distinct labels from `index` using the global NumPy RNG."""
    sampled = np.random.choice(index, size, replace=False)
    return pd.Index(sampled)

def train_test_split(parcels, seed=None):
    """Split `parcels` into class-balanced train and test frames.

    The test set holds ``num_test_per_class`` blighted and as many
    non-blighted parcels. The train set holds every remaining blighted
    parcel plus an equally sized random sample of the remaining
    non-blighted ones.

    Parameters
    ----------
    parcels : DataFrame
        Must have a boolean ``IsBlighted`` column.
    seed : int or None
        Passed to ``np.random.seed`` (this reseeds the global RNG).

    Returns
    -------
    (train, test) : tuple of DataFrame
    """
    np.random.seed(seed)

    blighted = parcels[parcels.IsBlighted].index
    healthy = parcels.index.difference(blighted)

    # The three random draws happen in this exact order so that a given seed
    # always yields the same split.
    test_blighted = choose_random_indices(blighted, num_test_per_class)
    test_healthy = choose_random_indices(healthy, num_test_per_class)

    train_blighted = blighted.difference(test_blighted)
    train_healthy = choose_random_indices(
        healthy.difference(test_healthy),
        len(train_blighted),
    )

    train = parcels.loc[train_blighted.union(train_healthy)]
    test = parcels.loc[test_blighted.union(test_healthy)]

    print(len(parcels), 'parcels')
    for label, subset in (('train:', train), ('test:', test)):
        print(label, len(subset), 'total; blighted ratio:', subset.IsBlighted.mean())

    return train, test

# A fixed seed keeps the split identical from run to run.
parcels_train, parcels_test = train_test_split(parcels=parcels_all, seed=2017)

384600 parcels
train: 10444 total; blighted ratio: 0.5
test: 2000 total; blighted ratio: 0.5


In [28]:
def get_parcel_features(parcels):
    """Return the centroid coordinates of each parcel as ``X`` / ``Y`` columns.

    The result is indexed like `parcels`; coordinates are read from the first
    coordinate tuple of each centroid.
    """
    centers = parcels.centroid
    xs = centers.map(lambda point: point.coords[0][0])
    ys = centers.map(lambda point: point.coords[0][1])
    return pd.DataFrame({'X': xs, 'Y': ys}, index=parcels.index)

# Centroid features for the train and test parcels, in that order.
parcel_features_train, parcel_features_test = map(
    get_parcel_features,
    (parcels_train, parcels_test),
)

In [6]:
def read_csv(filename, extract_lat_lon):
    """Read a CSV of located events into a GeoDataFrame projected for distances.

    Parameters
    ----------
    filename : str
        Path of the CSV file.
    extract_lat_lon : callable
        Receives the raw DataFrame and returns a frame with ``Lat`` and
        ``Lon`` columns that has the same index and row order.

    Returns
    -------
    geopandas.GeoDataFrame
        Only the rows that have both coordinates, with a point ``geometry``
        created in ``crs_lon_lat`` and re-projected to ``crs_for_distances``.
    """
    df = pd.read_csv(filename)
    lat_lons = extract_lat_lon(df)
    has_coords = lat_lons[['Lat', 'Lon']].notnull().all(axis=1)
    valid = lat_lons[has_coords]
    # Build the points with a plain comprehension rather than
    # DataFrame.apply(axis=1): apply returns a DataFrame (not a Series) when
    # there are no rows, and it is much slower per row.
    geometry = [Point(lon, lat) for lon, lat in zip(valid.Lon, valid.Lat)]
    # Copy the filtered rows so attaching the geometry never writes into a
    # slice of the frame returned by pd.read_csv.
    gdf = gpd.GeoDataFrame(df[has_coords].copy(), geometry=geometry, crs=crs_lon_lat)
    return gdf.to_crs(crs_for_distances)

In [7]:
def extract_lat_lon_calls(df):
    """Return the ``lat`` / ``lng`` columns of the 311 calls as ``Lat`` / ``Lon``."""
    return pd.DataFrame({'Lat': df['lat'], 'Lon': df['lng']})

# 311 calls as projected points (cached after the first run).
calls = maybe_pickle(
    'calls',
    lambda: read_csv('data/detroit-311.csv', extract_lat_lon_calls),
)
calls.head()

generating calls
finished generating calls; 0:00:02.612992 elapsed


Unnamed: 0,ticket_id,city,issue_type,ticket_status,issue_description,rating,ticket_closed_date_time,acknowledged_at,ticket_created_date_time,ticket_last_updated_date_time,address,lat,lng,location,image,geometry
0,1516722,City of Detroit,Clogged Drain,Acknowledged,"Two drains one on each side of street, street ...",3,,03/06/2015 10:03:38 PM,03/06/2015 09:57:52 PM,04/12/2015 01:01:10 AM,"13120-13130 Ilene Street Detroit, MI 48238, USA",42.383998,-83.161039,"(42.3839977668, -83.1610385642)",,POINT (-5026.573136771617 445.8795403950064)
1,1525361,City of Detroit,Clogged Drain,Acknowledged,standing water on lumplin,2,,03/11/2015 04:23:11 PM,03/11/2015 04:14:29 PM,04/07/2015 02:04:44 PM,"1485 E. Outer Drive Detroit, Michigan",42.440471,-83.080919,"(42.4404708, -83.080919)",,POINT (1569.926552646237 6717.348674800637)
2,1525218,City of Detroit,Clogged Drain,Closed,CITZEN CALLED TO REPORT CLOGGED DRAINS,2,08/15/2015 12:03:43 AM,03/11/2015 03:39:05 PM,03/11/2015 03:26:20 PM,08/15/2015 12:03:44 AM,"15460 Eastburn Detroit, Michigan",42.445244,-82.962038,"(42.445244, -82.962038)",,POINT (11350.22694166889 7256.603173570079)
3,1525214,City of Detroit,Clogged Drain,Acknowledged,Citizen called DWSD to report clogged drain,3,,03/11/2015 03:35:02 PM,03/11/2015 03:22:42 PM,06/07/2015 10:07:48 PM,"17541 Mendota St Detroit, Michigan",42.421043,-83.166194,"(42.421043, -83.166194)",,POINT (-5447.921799185874 4561.222001907441)
4,1525142,City of Detroit,Clogged Drain,Acknowledged,@ THE CORNER OF GRIGGS & MARGARETA,2,,03/11/2015 03:04:59 PM,03/11/2015 02:53:23 PM,03/11/2015 03:04:59 PM,"Griggs Detroit, Michigan",42.402033,-83.162874,"(42.4020334, -83.1628741)",,POINT (-5176.248798845747 2449.404132262129)


In [8]:
def join_with_parcel_nos(gdf, parcels, radius=0):
    """Pair every geometry in `gdf` with each parcel whose buffered shape contains it.

    Parameters
    ----------
    gdf : GeoDataFrame
        Geometries to locate.
    parcels : GeoDataFrame
        Parcels; their index is carried into the result as a column.
    radius : number, default 0
        Buffer distance applied to every parcel geometry before the join.

    Returns
    -------
    GeoDataFrame
        Inner spatial join (``within``) of `gdf` against the buffered parcels.
    """
    # Keep only the geometry; reset_index() turns the parcel index into a
    # regular column so that it is available after the join.
    buffered_parcels = parcels[['geometry']].reset_index()
    buffered_parcels.geometry = buffered_parcels.buffer(radius)
    return gpd.sjoin(gdf, buffered_parcels, how='inner', op='within')

In [9]:
# How often each 311 issue type occurs.
calls['issue_type'].value_counts()

Illegal Dumping / Illegal Dump Sites                                                         3584
Tree Issue                                                                                   3546
Running Water in a Home or Building                                                          2655
Clogged Drain                                                                                2490
Potholes                                                                                     2399
Traffic Sign Issue                                                                           1030
Water Main Break                                                                              778
Fire Hydrant Issue                                                                            678
Abandoned Vehicle                                                                             638
Manhole Cover Issue                                                                           546
Traffic Signal Issue

In [10]:
def get_issue_types(calls, min_occurences=100):
    """Map every frequent issue type to a compact feature name.

    Parameters
    ----------
    calls : DataFrame
        Must have an ``issue_type`` column.
    min_occurences : int, default 100
        Issue types seen fewer times than this are left out.

    Returns
    -------
    Series
        Indexed by the original issue type; each value is that label with
        everything from the first `` -`` or `` /`` onward removed, then
        title-cased and stripped of spaces
        (e.g. ``'Illegal Dumping / Illegal Dump Sites'`` -> ``'IllegalDumping'``).
    """
    counts = calls.issue_type.value_counts()
    issue_types = pd.Series(counts[counts >= min_occurences].index)
    issue_types.index = issue_types
    # regex=True must be explicit: since pandas 2.0 str.replace treats the
    # pattern as a literal string by default, which would strip nothing here.
    issue_types = issue_types.str.replace(r' [-/].*', '', regex=True)
    issue_types = issue_types.str.title().str.replace(' ', '', regex=False)
    return issue_types

# Preview: original issue type next to the feature name derived from it.
get_issue_types(calls).rename('FeatureName').reset_index()

Unnamed: 0,index,FeatureName
0,Illegal Dumping / Illegal Dump Sites,IllegalDumping
1,Tree Issue,TreeIssue
2,Running Water in a Home or Building,RunningWaterInAHomeOrBuilding
3,Clogged Drain,CloggedDrain
4,Potholes,Potholes
5,Traffic Sign Issue,TrafficSignIssue
6,Water Main Break,WaterMainBreak
7,Fire Hydrant Issue,FireHydrantIssue
8,Abandoned Vehicle,AbandonedVehicle
9,Manhole Cover Issue,ManholeCoverIssue


In [11]:
# Buffer distances (in metres) around each parcel at which events are counted.
radiuses = [
    0,     # inside the parcel itself
    20,    # the street in front of the building
    1000,  # the neighborhood (1 km)
]

def get_call_features(calls, parcels):
    """Count 311 calls per parcel for every radius in ``radiuses``.

    For each radius two kinds of columns are produced: ``Calls<radius>``
    (all calls) and ``<FeatureName><radius>`` for every issue type returned
    by ``get_issue_types(calls)``.

    Parameters
    ----------
    calls : GeoDataFrame
        Needs ``issue_type`` and ``geometry`` columns.
    parcels : GeoDataFrame
        Parcels to count around.

    Returns
    -------
    DataFrame
        Integer counts indexed by ``ParcelNo``; missing combinations are 0.
        Only parcels matched by at least one join appear in the index.
    """
    features = []
    issue_types = get_issue_types(calls)
    for radius in radiuses:
        print('radius:', radius, 'meters')
        with_parcel_nos = join_with_parcel_nos(calls[['issue_type', 'geometry']], parcels, radius=radius)
        features.append(with_parcel_nos.ParcelNo.value_counts().to_frame(name='Calls{}'.format(radius)))
        # Series.items() replaces Series.iteritems(), which pandas 2.0 removed.
        for issue_type, feature_name in issue_types.items():
            pred = with_parcel_nos.issue_type == issue_type
            feature = with_parcel_nos[pred].ParcelNo.value_counts().to_frame(name=feature_name + str(radius))
            features.append(feature)
    print('done joining')
    features = pd.concat(features, axis=1)
    # Parcels missing from an individual count come back as NaN: that is a zero count.
    return features.fillna(0).astype(int)

# Call-count features for the train and test parcels (cached after the first run).
call_features_train = maybe_pickle(
    'call_features_train',
    lambda: get_call_features(calls, parcels_train),
)
call_features_test = maybe_pickle(
    'call_features_test',
    lambda: get_call_features(calls, parcels_test),
)

generating call_features_train
radius: 0 meters
radius: 20 meters
radius: 1000 meters
done joining
finished generating call_features_train; 0:00:32.633052 elapsed
generating call_features_test
radius: 0 meters
radius: 20 meters
radius: 1000 meters
done joining
finished generating call_features_test; 0:00:11.186021 elapsed


In [12]:
def extract_lat_lon_violations(df):
    """Parse the ``(lat, lon)`` pair embedded in ``ViolationAddress``.

    Returns a float frame with ``Lat`` and ``Lon`` columns; rows whose address
    has no parenthesised pair come back as NaN.
    """
    pattern = r'\((?P<Lat>.+)?,\s*(?P<Lon>.+)?\)'
    coords = df['ViolationAddress'].str.extract(pattern, expand=True)
    return coords.astype(float)

def _read_violations():
    """Load the blight violations CSV as projected points."""
    return read_csv('data/detroit-blight-violations.csv', extract_lat_lon_violations)

# Blight violations (cached after the first run).
violations = maybe_pickle('violations', _read_violations)
violations.head()

generating violations




finished generating violations; 0:00:40.146549 elapsed


Unnamed: 0,TicketID,TicketNumber,AgencyName,ViolName,ViolationStreetNumber,ViolationStreetName,MailingStreetNumber,MailingStreetName,MailingCity,MailingState,MailingZipCode,NonUsAddressCode,Country,TicketIssuedDT,TicketIssuedTime,HearingDT,CourtTime,ViolationCode,ViolDescription,Disposition,FineAmt,AdminFee,LateFee,StateFee,CleanUpCost,JudgmentAmt,PaymentStatus,Void,ViolationCategory,ViolationAddress,MailingAddress,geometry
0,26288,05000001DAH,Department of Public Works,"Group, LLC, Grand Holding",2566,GRAND BLVD,743,"Beaubien, Ste. 201",Detroit,MI,48226,,,01/01/38440 12:00:00 AM,12:00:00,01/01/38474 12:00:00 AM,9:00AM,22-2-20,Burning solid waste in open fires,Responsible By Determination,$1500.00,$20.00,$150.00,$10.00,$0.00,$1680.00,PAID IN FULL,0.0,0,"2566 GRAND BLVD\nDetroit, MI\n(42.363182370000...","743 Beaubien\nDetroit, MI 48226\n(42.333730630...",POINT (685.6549288663025 -1868.077256153116)
1,19800,05000025DAH,Department of Public Works,"JACKSON, RAECHELLE",19014,ASHTON,20501,HEYDEN,DETROIT,MI,48219,,,01/01/38383 12:00:00 AM,10:15:00,01/01/38425 12:00:00 AM,1:30PM,22-2-22,Bulk solid waste deposited more than 24 hours ...,Not responsible By Determination,$100.00,$20.00,$10.00,$10.00,$0.00,$140.00,NO PAYMENT APPLIED,0.0,0,"19014 ASHTON\nDetroit, MI\n(42.429390762000025...","20501 HEYDEN\nDETROIT, MI 48219\n(42.442177633...",POINT (-9907.358050033987 5493.400880467548)
2,19804,05000026DAH,Department of Public Works,"TALTON, CAROL ANN",18735,STAHELIN,18735,STAHELI N,DETROIT,MI,48219,,,01/01/38383 12:00:00 AM,10:35:00,01/01/38425 12:00:00 AM,1:30PM,22-2-22,Bulk solid waste deposited more than 24 hours ...,Responsible By Determination,$100.00,$20.00,$10.00,$10.00,$0.00,$140.00,PAID IN FULL,0.0,0,"18735 STAHELIN\nDetroit, MI\n(42.4287074590000...","18735 STAHELI N\nDETROIT, MI 48219\n(42.428707...",POINT (-10496.22700333466 5418.357455531896)
3,20208,05000027DAH,Department of Public Works,"BONNER, DARRYL E.",20125,MONICA,25335,PEEKSKILL,SOUTHFIELD,MI,48043,,,01/01/38385 12:00:00 AM,10:45:00,01/01/38422 12:00:00 AM,1:30PM,22-2-45,Violation of time limit for approved container...,Responsible By Default,$100.00,$20.00,$10.00,$10.00,$0.00,$140.00,NO PAYMENT APPLIED,0.0,0,"20125 MONICA\nDetroit, MI\n(42.44169828400004,...","25335 PEEKSKILL\nSOUTHFIELD, MI 48043\n(42.475...",POINT (-3703.889260257796 6854.504793171093)
4,20211,05000028DAH,Department of Public Works,"GREGORY, JAMES LEE",17397,PRAIRIE,17397,PRAIRIE,DETROIT,MI,48221,,,01/01/38385 12:00:00 AM,11:10:00,01/01/38422 12:00:00 AM,1:30PM,22-2-22,Bulk solid waste deposited more than 24 hours ...,Responsible By Default,$100.00,$20.00,$10.00,$10.00,$0.00,$140.00,PAID IN FULL,0.0,0,"17397 PRAIRIE\nDetroit, MI\n(42.42031769500005...","17397 PRAIRIE\nDETROIT, MI 48221\n(42.42031769...",POINT (-3730.658950740272 4479.527240656317)


In [13]:
def get_violation_features(violations, parcels):
    """Count blight violations per parcel for every radius in ``radiuses``.

    Returns an integer DataFrame indexed by ``ParcelNo`` with one
    ``Violations<radius>`` column per radius; missing counts are 0.
    """
    def count_within(radius):
        # Number of violations joined to each parcel at this buffer distance.
        print('radius:', radius, 'meters')
        joined = join_with_parcel_nos(violations[['geometry']], parcels, radius=radius)
        return joined.ParcelNo.value_counts().to_frame(name='Violations' + str(radius))

    per_radius = [count_within(radius) for radius in radiuses]
    print('done joining')
    return pd.concat(per_radius, axis=1).fillna(0).astype(int)

# Violation-count features for the train and test parcels (cached after the first run).
violation_features_train = maybe_pickle(
    'violation_features_train',
    lambda: get_violation_features(violations, parcels_train),
)
violation_features_test = maybe_pickle(
    'violation_features_test',
    lambda: get_violation_features(violations, parcels_test),
)

generating violation_features_train
radius: 0 meters
radius: 20 meters
radius: 1000 meters
done joining
finished generating violation_features_train; 0:06:43.412692 elapsed
generating violation_features_test
radius: 0 meters
radius: 20 meters
radius: 1000 meters
done joining
finished generating violation_features_test; 0:02:38.310166 elapsed


In [14]:
def extract_lat_lon_crimes(df):
    """Return ``LAT`` / ``LON`` as ``Lat`` / ``Lon``, blanking impossible coordinates.

    A row whose latitude is outside [-90, 90] or whose longitude is outside
    [-180, 180] has both values replaced by NaN.
    """
    coords = pd.DataFrame({'Lat': df['LAT'], 'Lon': df['LON']})
    out_of_range = (
        (coords.Lat < -90)
        | (coords.Lat > 90)
        | (coords.Lon < -180)
        | (coords.Lon > 180)
    )
    coords[out_of_range] = np.nan
    return coords

# Crime incidents as projected points (cached after the first run).
crimes = maybe_pickle(
    'crimes',
    lambda: read_csv('data/detroit-crime.csv', extract_lat_lon_crimes),
)
crimes.head()

generating crimes




finished generating crimes; 0:00:14.526138 elapsed


Unnamed: 0,ROWNUM,CASEID,INCINO,CATEGORY,OFFENSEDESCRIPTION,STATEOFFENSEFILECLASS,INCIDENTDATE,HOUR,SCA,PRECINCT,COUNCIL,NEIGHBORHOOD,CENSUSTRACT,ADDRESS,LON,LAT,LOCATION,geometry
0,53256,1953933,1506030028.1,ASSAULT,ASSAULT AND BATTERY/SIMPLE ASSAULT,13001.0,06/03/2015 12:00:00 AM,2,1007.0,10.0,City Council District 5,PETOSKEY-OTSEGO,5334.0,09100 PETOSKEY,-83.1221,42.3678,"PETOSKEY\n09100\n(42.3676, -83.1219)",POINT (-1820.420055835194 -1354.946072085274)
1,17631,1917717,1503010158.1,LARCENY,LARCENY - PARTS AND ACCESSORIES FROM VEHICLE,23006.0,03/01/2015 12:00:00 AM,9,608.0,6.0,City Council District 7,GRANDALE,5452.0,00 PLYMOUTH AND MANSFIELD,-83.2035,42.3724,"00 PLYMOUTH AND MANSFIELD\n(42.3725, -83.2033)",POINT (-8524.872550281949 -839.0228010952841)
2,11207,1910955,1502080223.1,STOLEN VEHICLE,VEHICLE THEFT,24001.0,02/08/2015 12:00:00 AM,18,1105.0,11.0,City Council District 3,OUTER DRIVE VAN DYKE,5051.0,00 E 7 MILE VAN DYKE,-83.0241,42.4338,"00 E 7 MILE VAN DYKE\n(42.4339, -83.0241)",POINT (6245.482136322492 5978.957355653335)
3,116589,2018186,1511090188.1,WEAPONS OFFENSES,WEAPONS OFFENSE (OTHER),52003.0,11/09/2015 12:00:00 AM,12,210.0,2.0,City Council District 6,TIREMAN,,06600 BARTON,-83.1381,42.3496,"BARTON\n06600\n(42.3494, -83.1379)",POINT (-3139.277059820476 -3376.139909471202)
4,85790,1986862,1508239803.1,LARCENY,LARCENY - PARTS AND ACCESSORIES FROM VEHICLE,23006.0,08/14/2015 12:00:00 AM,7,309.0,3.0,City Council District 6,WEST CANFIELD,5204.0,00900 W WILLIS,-83.0692,42.3481,"W WILLIS\n00900\n(42.3481, -83.0693)",POINT (2537.848657958498 -3543.003720249387)


In [15]:
def get_crime_categories(crimes, min_occurences=100):
    """Map every frequent crime category to a compact feature name.

    Categories occurring fewer than `min_occurences` times are dropped. The
    returned Series is indexed by the original ``CATEGORY`` value; each value
    is that label without the ``TRAFFIC VIOLATIONS-`` prefix, title-cased and
    with spaces removed.
    """
    frequency = crimes.CATEGORY.value_counts()
    frequent = frequency[frequency >= min_occurences].index
    names = pd.Series(frequent)
    names.index = names
    return (
        names
        .str.replace('TRAFFIC VIOLATIONS-', '')
        .str.title()
        .str.replace(' ', '')
    )

# Preview: original crime category next to the feature name derived from it.
get_crime_categories(crimes).rename('FeatureName').reset_index()

Unnamed: 0,index,FeatureName
0,TRAFFIC VIOLATIONS-MOTORCYCLE VIOLATIONS,MotorcycleViolations
1,ASSAULT,Assault
2,LARCENY,Larceny
3,DAMAGE TO PROPERTY,DamageToProperty
4,AGGRAVATED ASSAULT,AggravatedAssault
5,BURGLARY,Burglary
6,STOLEN VEHICLE,StolenVehicle
7,TRAFFIC VIOLATIONS-DRIVING ON SUSPENDED,DrivingOnSuspended
8,FRAUD,Fraud
9,ROBBERY,Robbery


In [16]:
def get_crime_features(crimes, parcels):
    """Count crimes per parcel for every radius in ``radiuses``.

    For each radius two kinds of columns are produced: ``Crimes<radius>``
    (all crimes) and ``<FeatureName><radius>`` for every category returned
    by ``get_crime_categories(crimes)``.

    Parameters
    ----------
    crimes : GeoDataFrame
        Needs ``CATEGORY`` and ``geometry`` columns.
    parcels : GeoDataFrame
        Parcels to count around.

    Returns
    -------
    DataFrame
        Integer counts indexed by ``ParcelNo``; missing combinations are 0.
        Only parcels matched by at least one join appear in the index.
    """
    features = []
    categories = get_crime_categories(crimes)
    for radius in radiuses:
        print('radius:', radius)
        with_parcel_nos = join_with_parcel_nos(crimes[['CATEGORY', 'geometry']], parcels, radius=radius)
        features.append(with_parcel_nos.ParcelNo.value_counts().to_frame(name='Crimes{}'.format(radius)))
        # Series.items() replaces Series.iteritems(), which pandas 2.0 removed.
        for category, feature_name in categories.items():
            pred = with_parcel_nos.CATEGORY == category
            feature = with_parcel_nos[pred].ParcelNo.value_counts().to_frame(name=feature_name + str(radius))
            features.append(feature)
    print('done joining')
    features = pd.concat(features, axis=1)
    # Parcels missing from an individual count come back as NaN: that is a zero count.
    return features.fillna(0).astype(int)

# Crime-count features for the train and test parcels (cached after the first run).
crime_features_train = maybe_pickle(
    'crime_features_train',
    lambda: get_crime_features(crimes, parcels_train),
)
crime_features_test = maybe_pickle(
    'crime_features_test',
    lambda: get_crime_features(crimes, parcels_test),
)

generating crime_features_train
radius: 0
radius: 20
radius: 1000
done joining
finished generating crime_features_train; 0:03:07.617084 elapsed
generating crime_features_test
radius: 0
radius: 20
radius: 1000
done joining
finished generating crime_features_test; 0:01:09.334629 elapsed


In [17]:
# NOTE(review): as visible here this definition has no return statement, and a
# later cell in this file defines `create_dataset(parcels, *features)` under the
# same name; in file order that later definition replaces this one. Confirm
# whether this cell can be deleted.
def create_dataset(parcels, features):
    # Join the feature frames side by side on their index.
    features = pd.concat(features, axis=1)
    # Replace the NaN left by the join with 0 and cast everything to int.
    features = features.fillna(0).astype(int)

Unnamed: 0,Crimes0,MotorcycleViolations0,Assault0,Larceny0,DamageToProperty0,AggravatedAssault0,Burglary0,StolenVehicle0,DrivingOnSuspended0,Fraud0,Robbery0,DangerousDrugs0,ObstructingJudiciary0,WeaponsOffenses0,Escape0,Solicitation0,Bribery0,Arson0,OuilDisposeOfVehicleToAvoidForfeiture0,Runaway0,StolenProperty0,ObstructingThePolice0,FamilyOffense0,Homicide0,OtherBurglary0,Health-Safety0,Kidnaping0,Forgery0,Extortion0,PublicPeace0,Environment0,Crimes20,MotorcycleViolations20,Assault20,Larceny20,DamageToProperty20,AggravatedAssault20,Burglary20,StolenVehicle20,DrivingOnSuspended20,Fraud20,Robbery20,DangerousDrugs20,ObstructingJudiciary20,WeaponsOffenses20,Escape20,Solicitation20,Bribery20,Arson20,OuilDisposeOfVehicleToAvoidForfeiture20,Runaway20,StolenProperty20,ObstructingThePolice20,FamilyOffense20,Homicide20,OtherBurglary20,Health-Safety20,Kidnaping20,Forgery20,Extortion20,PublicPeace20,Environment20,Crimes1000,MotorcycleViolations1000,Assault1000,Larceny1000,DamageToProperty1000,AggravatedAssault1000,Burglary1000,StolenVehicle1000,DrivingOnSuspended1000,Fraud1000,Robbery1000,DangerousDrugs1000,ObstructingJudiciary1000,WeaponsOffenses1000,Escape1000,Solicitation1000,Bribery1000,Arson1000,OuilDisposeOfVehicleToAvoidForfeiture1000,Runaway1000,StolenProperty1000,ObstructingThePolice1000,FamilyOffense1000,Homicide1000,OtherBurglary1000,Health-Safety1000,Kidnaping1000,Forgery1000,Extortion1000,PublicPeace1000,Environment1000
1000001.0,78,19,4,25,1,4,1,4,6,3,2,1,0,0,3,0,1,0,0,0,1,0,0,0,0,2,0,0,0,0,0,105,27,6,32,2,4,1,5,8,3,5,1,0,0,3,2,1,0,0,0,1,0,0,0,0,3,0,0,0,0,0,3112,787,295,865,222,85,34,145,244,90,73,32,34,14,30,45,32,0,9,2,13,11,5,3,0,10,0,8,3,2,0
1000042.86,11,3,1,5,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,37,15,3,11,2,0,0,1,1,1,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3092,772,295,876,220,80,34,143,239,90,74,32,34,14,29,45,29,0,10,2,13,10,5,3,0,10,0,9,3,2,0
1000618.04,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,3,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2272,564,250,496,186,73,56,96,193,60,71,29,35,6,34,28,21,1,16,0,5,11,5,2,1,4,2,9,2,3,1
1000618.046,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2291,592,307,474,184,91,55,101,122,59,63,46,34,7,32,22,24,1,13,1,4,13,8,2,2,6,2,8,2,3,2
1000618.051,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2309,590,314,480,185,92,57,103,121,61,63,46,34,7,32,22,24,1,13,1,4,13,8,2,2,6,2,8,2,3,2


In [68]:
def create_dataset(parcels, *features):
    """Combine feature frames and blight labels into a ``Bunch``.

    Parameters
    ----------
    parcels : DataFrame
        Provides the row order and the ``IsBlighted`` label column.
    *features : DataFrame
        Feature frames indexed like `parcels`; they are joined column-wise.

    Returns
    -------
    Bunch
        ``data`` (2-D array), ``feature_names`` (column index), ``target``
        (labels as floats) and ``target_names`` (name of the label column).

    Raises
    ------
    AssertionError
        If any feature value or label is missing.
    """
    dataset = pd.concat(features, axis=1)
    # Select rows in the order of `parcels` so that row i of `data` and entry i
    # of `target` always describe the same parcel, instead of relying on how
    # concat happens to order the joined index.
    dataset = dataset.loc[parcels.index]
    labels = parcels.IsBlighted.astype(float)
    assert not dataset.isnull().any().any()
    assert not labels.isnull().any()
    print('data', dataset.shape)
    print('target', labels.shape)
    # `.values` replaces `.as_matrix()`, which pandas 1.0 removed.
    return Bunch(
        data=dataset.values,
        feature_names=dataset.columns,
        target=labels.values,
        target_names=labels.name,
    )

# The final datasets are regenerated on every run (force=True).
train_features = (
    parcel_features_train,
    violation_features_train,
    crime_features_train,
    call_features_train,
)
train = maybe_pickle(
    'train',
    lambda: create_dataset(parcels_train, *train_features),
    force=True,
)

test_features = (
    parcel_features_test,
    violation_features_test,
    crime_features_test,
    call_features_test,
)
test = maybe_pickle(
    'test',
    lambda: create_dataset(parcels_test, *test_features),
    force=True,
)

generating train
data (10444, 146)
target (10444,)
finished generating train; 0:00:00.032448 elapsed
generating test
data (2000, 146)
target (2000,)
finished generating test; 0:00:00.008785 elapsed
