In [7]:
import pandas as pd
import matplotlib.pyplot as plt
# Allow up to 200 columns to be shown when a DataFrame is displayed.
pd.options.display.max_columns = 200
# Render matplotlib figures inline in the notebook output area.
%matplotlib inline

In [31]:
# Load the master dataset from the repo-relative data/ directory.
# low_memory=False asks pandas to parse the file in one pass instead of in
# chunks, which avoids mixed-dtype inference on columns (see read_csv docs).
# NOTE(review): the preview of this frame shows `Unnamed: 0` / `Unnamed: 0.1`
# columns, which look like previously saved index columns -- confirm and
# consider dropping them at load time.
df = pd.read_csv('data/masterdf_20170920.csv', low_memory = False)

In [8]:
# Preview the first five rows of the loaded frame.
df.head()

Unnamed: 0.1,Unnamed: 0,Incident Date,EAS,Incident_Year,Incident_Cat,Incident_Dummy,Neighborhood,Location_y,Address,Building_Cat,Yr_Property_Built,Num_Bathrooms,Num_Bedrooms,Num_Rooms,Num_Stories,Num_Units,Perc_Ownership,Land_Value,Property_Area,Assessed_Improvement_Val,Tot_Rooms,landval_psqft,count potential fire control,count all complaints,count all complaints not corrected,count potential fire control not corrected,count fire emergency safety,count potential fire cause,count fire emergency safety not corrected,count potential fire cause not corrected
0,0,2015-06-20,451005.0,2015.0,COOKING FIRE,1.0,SUNSET/PARKSIDE,"(37.7543289339354, -122.480327187833)",1532 NORIEGA ST,COMMERCIAL USE,1989.0,0.0,0.0,11.0,3.0,2.777778,1.0,438434.3,4135.0,262181.666667,11.0,106.030069,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,2010-11-28,360149.0,2010.0,COOKING FIRE,1.0,MISSION,"(37.7645472195468, -122.418358468789)",135 CAPP ST,APARTMENT,1908.0,0.0,0.0,36.0,3.0,12.0,1.0,1365665.0,9318.0,566375.428571,36.0,146.56203,3.0,3.0,1.0,1.0,0.0,0.0,0.0,0.0
2,2,2011-04-26,360149.0,2011.0,COOKING FIRE,1.0,MISSION,"(37.7645472195468, -122.418358468789)",135 CAPP ST,APARTMENT,1908.0,0.0,0.0,36.0,3.0,12.0,1.0,1365665.0,9318.0,566375.428571,36.0,146.56203,3.0,3.0,1.0,1.0,0.0,0.0,0.0,0.0
3,3,2006-03-09,360149.0,2006.0,BUILDING FIRE,1.0,MISSION,"(37.7645472195468, -122.418358468789)",135 CAPP ST,APARTMENT,1908.0,0.0,0.0,36.0,3.0,12.0,1.0,1365665.0,9318.0,566375.428571,36.0,146.56203,3.0,3.0,1.0,1.0,0.0,0.0,0.0,0.0
4,4,2004-05-28,360149.0,2004.0,OUTDOOR FIRE,1.0,MISSION,"(37.7645472195468, -122.418358468789)",135 CAPP ST,APARTMENT,1908.0,0.0,0.0,36.0,3.0,12.0,1.0,1365665.0,9318.0,566375.428571,36.0,146.56203,3.0,3.0,1.0,1.0,0.0,0.0,0.0,0.0


In [6]:
# Count rows whose bathroom + bedroom counts add up exactly to Tot_Rooms.
mask = (
    df['Num_Bathrooms']
    .add(df['Num_Bedrooms'])
    .eq(df['Tot_Rooms'])
)
mask.sum()

3982

In [10]:
# Count rows where Num_Rooms and Tot_Rooms carry the same value.
mask = df['Num_Rooms'].eq(df['Tot_Rooms'])
mask.sum()

138370

In [32]:
def data_preprocessing(df, target_col='Incident_Cat', drop_cat=False,
                       reference_year=2016):
    '''
    Build a model-ready frame: numeric features, an engineered building age,
    a boolean target and one-hot encoded categorical columns.

    The input frame is NOT modified; all work happens on a new frame, so the
    function can be called repeatedly on the same df with the same result.

    INPUTS:
    df - Pandas DataFrame, including target and the feature columns
         ('Num_Stories', 'Num_Units', 'Land_Value', 'Property_Area',
         'Assessed_Improvement_Val', 'Tot_Rooms', 'Yr_Property_Built',
         'Building_Cat', 'Neighborhood')
    target_col - str, column name of target. Non-null values become True
                 (incident), NaN becomes False (no incident)
    drop_cat - bool, False if keeping all categories in dummification,
               True to drop the first level of each categorical column
    reference_year - int, year that 'Yr_Property_Built' is subtracted from
                     to obtain 'age' (defaults to 2016)

    OUTPUTS:
    df_final - Pandas DataFrame with the numeric columns, 'age', the boolean
               target column, then the dummified categorical columns
    '''
    quant_cols = ['Num_Stories', 'Num_Units', 'Land_Value',
                  'Property_Area', 'Assessed_Improvement_Val', 'Tot_Rooms']
    cat_cols = ['Building_Cat', 'Neighborhood']

    # Copy the selected columns so the assignments below never touch the
    # caller's frame (the previous in-place writes made a second call label
    # every row True, because the target had already been turned into bools).
    features = df.loc[:, quant_cols].copy()

    # feature engineering
    features['age'] = reference_year - df['Yr_Property_Built']

    # target creation: any recorded incident category counts as an incident
    features[target_col] = df[target_col].notnull()

    # dummification
    dummies = pd.get_dummies(df[cat_cols], drop_first=drop_cat)

    # final df
    return pd.concat([features, dummies], axis=1)

In [30]:
# Preview the first five rows of df.
# NOTE(review): the saved output for this cell has an `age` column and no
# `Incident_Cat` column, so it appears to come from a kernel state in which
# df had already been altered -- re-run from a fresh kernel to confirm.
df.head()

Unnamed: 0.1,Unnamed: 0,Incident Date,EAS,Incident_Year,Incident_Dummy,Neighborhood,Location_y,Address,Building_Cat,Yr_Property_Built,Num_Bathrooms,Num_Bedrooms,Num_Rooms,Num_Stories,Num_Units,Perc_Ownership,Land_Value,Property_Area,Assessed_Improvement_Val,Tot_Rooms,landval_psqft,count potential fire control,count all complaints,count all complaints not corrected,count potential fire control not corrected,count fire emergency safety,count potential fire cause,count fire emergency safety not corrected,count potential fire cause not corrected,age
0,0,2015-06-20,451005.0,2015.0,1.0,SUNSET/PARKSIDE,"(37.7543289339354, -122.480327187833)",1532 NORIEGA ST,COMMERCIAL USE,1989.0,0.0,0.0,11.0,3.0,2.777778,1.0,438434.3,4135.0,262181.666667,11.0,106.030069,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,27.0
1,1,2010-11-28,360149.0,2010.0,1.0,MISSION,"(37.7645472195468, -122.418358468789)",135 CAPP ST,APARTMENT,1908.0,0.0,0.0,36.0,3.0,12.0,1.0,1365665.0,9318.0,566375.428571,36.0,146.56203,3.0,3.0,1.0,1.0,0.0,0.0,0.0,0.0,108.0
2,2,2011-04-26,360149.0,2011.0,1.0,MISSION,"(37.7645472195468, -122.418358468789)",135 CAPP ST,APARTMENT,1908.0,0.0,0.0,36.0,3.0,12.0,1.0,1365665.0,9318.0,566375.428571,36.0,146.56203,3.0,3.0,1.0,1.0,0.0,0.0,0.0,0.0,108.0
3,3,2006-03-09,360149.0,2006.0,1.0,MISSION,"(37.7645472195468, -122.418358468789)",135 CAPP ST,APARTMENT,1908.0,0.0,0.0,36.0,3.0,12.0,1.0,1365665.0,9318.0,566375.428571,36.0,146.56203,3.0,3.0,1.0,1.0,0.0,0.0,0.0,0.0,108.0
4,4,2004-05-28,360149.0,2004.0,1.0,MISSION,"(37.7645472195468, -122.418358468789)",135 CAPP ST,APARTMENT,1908.0,0.0,0.0,36.0,3.0,12.0,1.0,1365665.0,9318.0,566375.428571,36.0,146.56203,3.0,3.0,1.0,1.0,0.0,0.0,0.0,0.0,108.0


In [33]:
# Build the modelling frame; the target column and dummy setting are the
# function defaults, spelled out here for the reader.
df_final = data_preprocessing(df, target_col='Incident_Cat', drop_cat=False)

In [34]:
# Preview the first five rows of the preprocessed frame.
df_final.head()

Unnamed: 0,Num_Stories,Num_Units,Land_Value,Property_Area,Assessed_Improvement_Val,Tot_Rooms,age,Incident_Cat,Building_Cat_APARTMENT,Building_Cat_COMMERCIAL USE,"Building_Cat_CONDO, ETC.",Building_Cat_DWELLING,Building_Cat_FLATS AND DUPLEX,Building_Cat_INDUSTRIAL USE,Building_Cat_OFFICE,Building_Cat_OTHER,Neighborhood_BAYVIEW HUNTERS POINT,Neighborhood_BERNAL HEIGHTS,Neighborhood_CASTRO/UPPER MARKET,Neighborhood_CHINATOWN,Neighborhood_EXCELSIOR,Neighborhood_FINANCIAL DISTRICT/SOUTH BEACH,Neighborhood_GLEN PARK,Neighborhood_HAIGHT ASHBURY,Neighborhood_HAYES VALLEY,Neighborhood_INNER RICHMOND,Neighborhood_INNER SUNSET,Neighborhood_JAPANTOWN,Neighborhood_LAKESHORE,Neighborhood_LINCOLN PARK,Neighborhood_LONE MOUNTAIN/USF,Neighborhood_MARINA,Neighborhood_MCLAREN PARK,Neighborhood_MISSION,Neighborhood_MISSION BAY,Neighborhood_NOB HILL,Neighborhood_NOE VALLEY,Neighborhood_NORTH BEACH,Neighborhood_OCEANVIEW/MERCED/INGLESIDE,Neighborhood_OUTER MISSION,Neighborhood_OUTER RICHMOND,Neighborhood_PACIFIC HEIGHTS,Neighborhood_PORTOLA,Neighborhood_POTRERO HILL,Neighborhood_PRESIDIO,Neighborhood_PRESIDIO HEIGHTS,Neighborhood_RUSSIAN HILL,Neighborhood_SEACLIFF,Neighborhood_SOUTH OF MARKET,Neighborhood_SUNSET/PARKSIDE,Neighborhood_TENDERLOIN,Neighborhood_TWIN PEAKS,Neighborhood_VISITACION VALLEY,Neighborhood_WEST OF TWIN PEAKS,Neighborhood_WESTERN ADDITION
0,3.0,2.777778,438434.3,4135.0,262181.666667,11.0,27.0,True,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1,3.0,12.0,1365665.0,9318.0,566375.428571,36.0,108.0,True,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,3.0,12.0,1365665.0,9318.0,566375.428571,36.0,108.0,True,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,3.0,12.0,1365665.0,9318.0,566375.428571,36.0,108.0,True,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,3.0,12.0,1365665.0,9318.0,566375.428571,36.0,108.0,True,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [36]:
# Class balance of the boolean target (True = an incident was recorded).
# Read it from df_final, where the boolean target is created explicitly,
# rather than from df, whose 'Incident_Cat' is only boolean if
# data_preprocessing happened to overwrite it in place.
df_final.groupby('Incident_Cat').size()

Incident_Cat
False    170541
True      24767
dtype: int64

In [47]:

# Rows reporting fewer than one unit, broken down by unit value and by the
# boolean incident target. df_final carries both columns and shares df's
# index, so the breakdown no longer depends on df having been modified in
# place by data_preprocessing.
mask = df_final['Num_Units'] < 1
df_final[mask].groupby(['Num_Units', 'Incident_Cat']).size()

Num_Units  Incident_Cat
0.000000   False            156
           True             152
0.003378   True               1
0.067138   False              1
0.071429   False              1
0.086957   False              3
0.088235   False              1
0.091954   True               1
0.097826   False              1
0.099631   True              10
0.104651   True              10
0.109489   True               3
0.110224   True               1
0.110741   False              1
0.111111   False           4132
           True            2217
0.114754   False              2
0.115385   False              1
0.118519   False              1
0.118644   True               2
0.123156   False              1
0.123519   False              1
0.124124   True               2
0.124153   True               4
0.125000   False            417
           True              68
0.125275   True               2
0.126079   False              1
0.126285   True              12
0.126531   True              16
                

In [48]:
# Number of rows selected by the most recently defined mask.
mask.sum()

11215