In [1]:
# Pre-processing
import collections
import numpy as np
import pandas as pd
import math

# General visualization 
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Geospatial visualization
import geopandas as gpd
import shapely
from shapely.geometry import Point
from geopy.geocoders import GoogleV3
from scipy import ndimage

In [272]:
# Display otpions
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
pd.set_option('max_colwidth', 100)
np.set_printoptions(linewidth=100)

In [273]:
# 10 primary (more important) tables
primaryTables = [
    'nibrs_arrestee', 'nibrs_bias_motivation', 'nibrs_incident', 'nibrs_offender', 'nibrs_offense',
    'nibrs_property_desc', 'nibrs_property', 'nibrs_victim_injury', 'nibrs_victim_offender_rel', 'nibrs_victim'
]

# 24 lookup tables as a dictionary of {table_name: (oldCol, newCol)} format
lookupTables = {
    'nibrs_activity_type': ('activity_type_id', 'activity_type_name'),
    'nibrs_age': ('age_id', 'age_name'),
    'nibrs_arrest_type': ('arrest_type_id', 'arrest_type_name'), 
    'nibrs_assignment_type': ('assignment_type_id', 'assignment_type_name'),
    'nibrs_bias_list': ('bias_id', 'bias_name'),
    'nibrs_circumstances': ('circumstances_id', 'circumstances_name'),
    'nibrs_cleared_except': ('cleared_except_id', 'cleared_except_name'),
    'nibrs_criminal_act_type': ('criminal_act_id', 'criminal_act_name'),
    'nibrs_drug_measure_type': ('drug_measure_type_id', 'drug_measure_name'),
    'nibrs_ethnicity': ('ethnicity_id', 'ethnicity_name'),
    'nibrs_injury': ('injury_id', 'injury_name'),
    'nibrs_justifiable_force': ('justifiable_force_id', 'justifiable_force_name'),
    'nibrs_location_type': ('location_id', 'location_name'),
    'nibrs_month': ('nibrs_month_id', 'month_num'),
    'nibrs_offense_type' : ('offense_type_id', 'offense_name'),
    'nibrs_prop_desc_type': ('prop_desc_id', 'prop_desc_name'),
    'nibrs_prop_loss_type': ('prop_loss_id', 'prop_loss_name'),
    'nibrs_relationship': ('relationship_id', 'relationship_name'),
    'nibrs_suspected_drug_type': ('suspected_drug_type_id', 'suspected_drug_name'),
    'nibrs_using_list': ('suspect_using_id', 'suspect_using_name'),
    'nibrs_victim_type': ('victim_type_id', 'victim_type_name'),
    'nibrs_weapon_type': ('weapon_id', 'weapon_name'),
    'ref_race': ('race_id', 'race_desc'),
    'ref_state': ('state_id', 'state_name')
}

# 9 secondary (less important) tables
secondaryTables = [
    'agency_participation', 'cde_agencies', 'nibrs_arrestee_weapon', 'nibrs_criminal_act', 'nibrs_suspected_drug',
    'nibrs_suspect_using', 'nibrs_victim_circumstances', 'nibrs_victim_offense', 'nibrs_weapon'
]

In [274]:
def readCSVs(dataset):
    """
    Read DataFrames into a dictionary with table names as keys
    Args:
        dataset (string): NIBRS dataset name, e.g., PA-2016
    Return:
        dic (dictionary): dictionary of DataFrames
    """
    print('Reading all CSVs for %s dataset...' % dataset)
    dic = {}
    for table in primaryTables + list(lookupTables.keys()) + secondaryTables:
        dic[table] = pd.read_csv("%s/%s.csv" % (dataset, table))
    print('Finished reading %d CSVs' % len(dic))
    return dic

PA2016 = readCSVs('PA-2016')
# SC2016 = readCSVs('SC-2016')

Reading all CSVs for PA-2016 dataset...
Finished reading 43 CSVs


In [275]:
# Replace a col with a specified mapping dict:
# https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.replace.html
# https://stackoverflow.com/questions/20250771/remap-values-in-pandas-column-with-a-dict
def createMappings(dfs):
    """
    Create a nested dictionary of mappings 
    Args:
        dfs (dictionary): dictionary of DataFrames previously read
    Return:
        mappings (nested dictionary): {(oldCol, newCol): {oldValue: newValue}}
    """
    print('Creating lookup table mappings...')
    mappings = {}
    for table in lookupTables:
        # Only handle those with defined mapping
        if lookupTables[table] is not None:
            oldCol, newCol = lookupTables[table]
            mappings[(oldCol, newCol)] = dict(zip(dfs[table][oldCol], dfs[table][newCol]))
    print('Finished creating mappings from %d lookup tables' % len(mappings))
    return mappings

# maps = createMappings(SC2016)
maps = createMappings(PA2016)

Creating lookup table mappings...
Finished creating mappings from 24 lookup tables


In [276]:
def replaceAndRename(df, lookupTableNames):
    """
    Replace and rename DataFrame columns in place.
    Args:
        df (DataFrame): DataFrame to be replaced
        lookupTableNames (list): List of lookup table names for replacing and renaming
    """
    # Call df.replace() and df.rename() once for each changing column
    for lookupTableName in lookupTableNames:
        oldCol, newCol = lookupTables[lookupTableName]
        # dic: the nested dictionary required by df.replace()
        dic = {oldCol: maps[(oldCol, newCol)]}
        df.replace(dic, inplace=True)
        df.rename(columns={oldCol: newCol}, inplace=True)

print('Replacing and Renaming columns...')
replaceAndRename(PA2016['nibrs_incident'], ['nibrs_month'])
replaceAndRename(PA2016['nibrs_offense'], ['nibrs_location_type', 'nibrs_offense_type'])
replaceAndRename(PA2016['nibrs_offender'], ['ref_race', 'nibrs_ethnicity'])
replaceAndRename(PA2016['nibrs_arrestee'], ['nibrs_offense_type', 'ref_race', 'nibrs_ethnicity'])
replaceAndRename(PA2016['nibrs_victim'], ['nibrs_victim_type', 'ref_race', 'nibrs_ethnicity'])
replaceAndRename(PA2016['nibrs_victim_offender_rel'], ['nibrs_relationship'])
replaceAndRename(PA2016['nibrs_property'], ['nibrs_prop_loss_type'])
replaceAndRename(PA2016['nibrs_property_desc'], ['nibrs_prop_desc_type'])
print('Done')

Replacing and Renaming columns...
Done


Next we will create a dataframe containing all the columns with meaningful information associated with incident, victim, offender and offense.

In [277]:
# Select valid ages and rename some column so as not to mess up with offender's information
def formalizePersonalInfo(rawDF, identity):
    return rawDF[rawDF['age_id'] == 5.0].drop('age_id', axis=1)\
.rename(columns={"age_num": identity + "_age","sex_code": identity + "_sex", "race_desc": identity + "_race", \
                "ethnicity_name": identity + "_ethnicity", "resident_status_code": identity + "_resident_status"})

# First select the data frame with victim_id and offender_id, and merge other dataframes into it by matching victim_id.
dataframe = formalizePersonalInfo(PA2016["nibrs_victim_offender_rel"].merge(PA2016['nibrs_victim'], how='left', on='victim_id')\
                                  .merge(PA2016['nibrs_incident'], how='left', on='incident_id')\
                                  [['victim_id', 'offender_id', 'incident_id', 'relationship_name',\
                                    'victim_seq_num', 'victim_type_name','age_id', 'age_num', 'sex_code','month_num', \
                                    'race_desc', 'ethnicity_name', 'resident_status_code', 'incident_date']], 'victim')

# Continue merge dataframes to diversify the information
dataframe = formalizePersonalInfo(dataframe.merge(PA2016['nibrs_victim_injury'], how = 'left', on = 'victim_id')
                                  .merge(PA2016['nibrs_injury'], how = 'left', on = 'injury_id')\
                                  .drop(['injury_id', 'injury_code'], axis=1)\
                                  .merge(PA2016['nibrs_offender'], how = 'left',\
                                         on = ['incident_id', 'offender_id'])\
                                  .drop(['ff_line_number', 'age_range_low_num', 'age_range_high_num'], axis = 1)\
                                  .merge(PA2016['nibrs_offense'], how = 'left', on = 'incident_id')\
                                  .drop(['attempt_complete_flag', 'location_name', 'num_premises_entered',\
                                         'method_entry_code', 'ff_line_number'], axis=1)\
                                  .merge(PA2016['nibrs_bias_motivation'], how = 'left', on = 'offense_id')\
                                  .merge(PA2016['nibrs_bias_list'], how = 'left', on = 'bias_id')\
                                  .drop(['victim_id', 'offender_id', 'incident_id', 'offense_id', 'bias_id',\
                                         'bias_code'], axis=1), 'offender')

dataframe.tail()

Unnamed: 0,relationship_name,victim_seq_num,victim_type_name,victim_age,victim_sex,month_num,victim_race,victim_ethnicity,victim_resident_status,incident_date,injury_name,offender_seq_num,offender_age,offender_sex,offender_race,offender_ethnicity,offense_name,bias_name
1415,Relationship Unknown,1,Law Enforcement Officer,28.0,M,11,White,Not Hispanic or Latino,R,2016-11-29 00:00:00,,1,23.0,F,Black or African American,,Simple Assault,
1416,Victim Was Acquaintance,1,Individual,13.0,F,11,White,Not Hispanic or Latino,R,2016-11-01 00:00:00,,1,38.0,M,White,,Rape,
1417,Victim Was Acquaintance,1,Individual,22.0,F,12,White,Not Hispanic or Latino,R,2016-12-18 00:00:00,,1,24.0,M,White,,False Pretenses/Swindle/Confidence Game,
1418,Victim Was Acquaintance,1,Individual,22.0,F,12,White,Not Hispanic or Latino,R,2016-12-18 00:00:00,,1,24.0,M,White,,Simple Assault,
1419,Victim Was Acquaintance,1,Individual,33.0,M,11,White,Not Hispanic or Latino,N,2016-11-29 00:00:00,,1,54.0,M,White,,Intimidation,


Now we get the large dataframe that includes all the data needed to predict the offender.
In order to fit the dataframe into machine learning models, it's necessary to scale the different numerial feature and convert categorial features into One Hot Encoding features.

In [None]:
relationship_list = ['Victim Was Acquaintance' 'Victim Was Babysittee' 'Victim Was Boyfriend/Girlfriend'
 'Victim Was Child of Boyfriend or Girlfriend' 'Victim Was Child' 'Victim Was Common-Law Spouse'
 'Victim was Employee' 'Victim was Employer' 'Victim Was Friend' 'Victim Was Grandchild'
 'Victim Was Grandparent' 'Homosexual Relationship' 'Victim Was In-law' 'Victim Was Neighbor'
 'Victim Was Other Family Member' 'Victim was Otherwise Known' 'Victim Was Parent'
 'Relationship Unknown' 'Victim Was Sibling' 'Victim Was Stepchild' 'Victim Was Spouse'
 'Victim Was Stepparent' 'Victim Was Stepsibling' 'Victim Was Stranger' 'Victim Was Offender'
 'Victim was Ex-Spouse']

victim_type_name_list = ['Business' 'Financial Institution' 'Government' 'Individual' 'Law Enforcement Officer' 'Other'
 'Religious Organization' 'Society/Public' 'Unknown']

sex_list = ['F' 'nan' 'M' 'U']

race_list = ['Unknown' 'White' 'Black or African American' 'American Indian or Alaska Native' 'Asian'
 'Asian, Native Hawaiian, or Other Pacific Islander' 'Chinese' 'Japanese'
 'Native Hawaiian or Other Pacific Islander' 'Other' 'Multiple' 'Not Specified']

ethnicity_list = ['Multiple' 'Hispanic or Latino' 'Not Hispanic or Latino' 'Unknown']

resident_status_list = ['R' 'nan' 'N' 'U']

injury_name_list = ['Apparent Broken Bones' 'Possible Internal Injury' 'Severe Laceration' 'Minor Injury' 'None'
 'Other Major Injury' 'Loss of Teeth' 'Unconscious']

offense_name_list = ['Not Specified' 'Justifiable Homicide' 'False Pretenses/Swindle/Confidence Game' 'Statutory Rape'
 'Sexual Assault With An Object' 'Destruction/Damage/Vandalism of Property'
 'Family Offenses, Nonviolent' 'Theft of Motor Vehicle Parts or Accessories'
 'Pornography/Obscene Material' 'Sports Tampering' 'Driving Under the Influence'
 'Counterfeiting/Forgery' 'Welfare Fraud' 'Pocket-picking' 'Theft From Motor Vehicle'
 'Assisting or Promoting Prostitution' 'Drug/Narcotic Violations' 'Wire Fraud' 'Purse-snatching'
 'Runaway' 'Arson' 'Motor Vehicle Theft' 'Drunkenness' 'Shoplifting'
 'Operating/Promoting/Assisting Gambling' 'Bad Checks' 'Extortion/Blackmail' 'Aggravated Assault'
 'Stolen Property Offenses' 'Kidnapping/Abduction' 'Prostitution' 'Betting/Wagering'
 'Murder and Nonnegligent Manslaughter' 'Peeping Tom' 'Trespass of Real Property'
 'Drug Equipment Violations' 'Rape' 'Embezzlement' 'Negligent Manslaughter' 'Weapon Law Violations'
 'Robbery' 'Credit Card/Automated Teller Machine Fraud' 'Curfew/Loitering/Vagrancy Violations'
 'Sodomy' 'Intimidation' 'All Other Larceny' 'Impersonation' 'Theft From Building'
 'All Other Offenses' 'Burglary/Breaking & Entering' 'Theft From Coin-Operated Machine or Device'
 'Simple Assault' 'Liquor Law Violations' 'Disorderly Conduct' 'Gambling Equipment Violation'
 'Incest' 'Fondling' 'Bribery' 'Human Trafficking, Commercial Sex Acts'
 'Human Trafficking, Involuntary Servitude' 'Purchasing Prostitution' 'Identity Theft'
 'Hacking/Computer Invasion' 'Animal Cruelty']

bias_name_list = ['Anti-Native Hawaiian or Other Pacific Islander' 'Anti-Physical Disability'
 'Anti-Mental Disability' 'Anti-Male' 'Anti-Female' 'Anti-Transgender' 'Anti-Gender Non-Conforming'
 'Anti-White' 'Anti-Black or African American' 'Anti-American Indian or Alaska Native' 'Anti-Asian'
 'Anti-Multi-Racial Group' 'Anti-Jewish' 'Anti-Catholic' 'Anti-Protestant' 'Anti-Islamic (Muslem)'
 'Anti-Other Religion' 'Anti-Multi-Religious Group' 'Anti-Atheist/Agnosticism' 'Anti-Arab'
 'Anti-Hispanic or Latino' 'Anti-Not Hispanic or Latino' 'Anti-Male Homosexual (Gay)'
 'Anti-Female Homosexual (Lesbian)'
 'Anti-Lesbian, Gay, Bisexual, or Transgender, Mixed Group (LGBT)' 'Anti-Heterosexual'
 'Anti-Bisexual' 'None' 'Unknown' 'Anti-Mormon' "Anti-Jehovah's Witness" 'Anti-Eastern Orthodox'
 'Anti-Other Christian' 'Anti-Buddhist' 'Anti-Hindu' 'Anti-Sikh']

In [278]:
import datetime

# Convert incident_date to week_of_day
dataframe['incident_date'] = pd.to_datetime(dataframe['incident_date'])
dataframe['week_of_day'] = dataframe['incident_date'].apply(lambda x: x.weekday())

dataframe = dataframe.drop(['incident_date'], axis=1)

# Scale numerial features to 0~1.
def scaleFeature(rawDF, columnName):
    rawDF[columnName] = rawDF[columnName].apply(lambda x: x / dataframe[columnName].max())
    return rawDF

# Convert Categorical features into multiple binary values
def OHE(rawDF, columnName, columnList):
    return rawDF.join(pd.get_dummies(dataframe[columnName], columns=columnList), lsuffix='v', rsuffix='o').drop(columnName, axis=1)

# First deal with intput numerical features.
dataframe = scaleFeature(scaleFeature(scaleFeature(scaleFeature(dataframe, 'victim_seq_num'),\
                                                   'victim_age'), 'month_num'), 'week_of_day')

# Then convert input categorical features.
dataframe = OHE(OHE(OHE(OHE(OHE(OHE(OHE(dataframe, 'victim_type_name', victim_type_name_list), 'victim_sex', sex_list), 'victim_race', race_list), 'victim_ethnicity', ethnicity_list),\
                'victim_resident_status', resident_status_list), 'injury_name', injury_name_list), 'offense_name', offense_name_list)

# Repeat the same steps for output features.
dataframe = scaleFeature(scaleFeature(dataframe, 'offender_seq_num'), 'offender_age')

dataframe = OHE(OHE(OHE(OHE(OHE(dataframe, 'relationship_name', relationship_list), 'offender_sex', sex_list), 'offender_race', race_list), 'offender_ethnicity', ethnicity_list),\
                'bias_name', bias_name_list)

dataframe.tail()

Unnamed: 0,victim_seq_num,victim_age,month_num,offender_seq_num,offender_age,week_of_day,Individual,Law Enforcement Officer,Fv,Mv,Uv,American Indian or Alaska Native,Asianv,Black or African Americanv,Unknownv,Whitev,Hispanic or Latino,Not Hispanic or Latino,Unknowno,N,R,Uo,Apparent Broken Bones,Minor Injury,Nonev,Other Major Injury,Possible Internal Injury,Severe Laceration,Aggravated Assault,All Other Larceny,Burglary/Breaking & Entering,Credit Card/Automated Teller Machine Fraud,Destruction/Damage/Vandalism of Property,Drug Equipment Violations,Drug/Narcotic Violations,False Pretenses/Swindle/Confidence Game,Fondling,Intimidation,Kidnapping/Abduction,Murder and Nonnegligent Manslaughter,Negligent Manslaughter,Rape,Robbery,Sexual Assault With An Object,Shoplifting,Simple Assault,Sodomy,Statutory Rape,Theft From Building,Theft From Motor Vehicle,Weapon Law Violations,Homosexual Relationship,Relationship Unknown,Victim Was Acquaintance,Victim Was Babysittee,Victim Was Boyfriend/Girlfriend,Victim Was Child,Victim Was Child of Boyfriend or Girlfriend,Victim Was Common-Law Spouse,Victim Was Friend,Victim Was Grandchild,Victim Was Grandparent,Victim Was In-law,Victim Was Neighbor,Victim Was Offender,Victim Was Other Family Member,Victim Was Parent,Victim Was Sibling,Victim Was Spouse,Victim Was Stepchild,Victim Was Stepparent,Victim Was Stepsibling,Victim Was Stranger,Victim was Employee,Victim was Ex-Spouse,Victim was Otherwise Known,Fo,Mo,U,Asiano,Black or African Americano,Unknown,Whiteo,Anti-White,Noneo
1415,0.2,0.294737,0.916667,0.25,0.27381,0.166667,0,1,0,1,0,0,0,0,0,1,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1
1416,0.2,0.136842,0.916667,0.25,0.452381,0.166667,1,0,1,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1
1417,0.2,0.231579,1.0,0.25,0.285714,1.0,1,0,1,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1
1418,0.2,0.231579,1.0,0.25,0.285714,1.0,1,0,1,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1
1419,0.2,0.347368,0.916667,0.25,0.642857,0.166667,1,0,0,1,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1


Finally we have transformed the data well fitted into models! The next step is to split the input features and output features from the dataframe.

In [279]:
inputFeature = dataframe.iloc[:, :51]
outputFeature = dataframe.iloc[:, 51:]
print(inputFeature.tail())
print(outputFeature.tail())

      victim_seq_num  victim_age  month_num  offender_seq_num  offender_age  \
1415             0.2    0.294737   0.916667              0.25      0.273810   
1416             0.2    0.136842   0.916667              0.25      0.452381   
1417             0.2    0.231579   1.000000              0.25      0.285714   
1418             0.2    0.231579   1.000000              0.25      0.285714   
1419             0.2    0.347368   0.916667              0.25      0.642857   

      week_of_day  Individual  Law Enforcement Officer  Fv  Mv  Uv  \
1415     0.166667           0                        1   0   1   0   
1416     0.166667           1                        0   1   0   0   
1417     1.000000           1                        0   1   0   0   
1418     1.000000           1                        0   1   0   0   
1419     0.166667           1                        0   0   1   0   

      American Indian or Alaska Native  Asianv  Black or African Americanv  \
1415                      

Try multi output classifier with random forest classifier...

In [280]:
print(inputFeature.as_matrix().shape)
print(outputFeature.as_matrix().shape)

from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(n_estimators=100, random_state=1)
multi_target_forest = MultiOutputClassifier(forest, n_jobs=-1)
multi_target_forest.fit(inputFeature.as_matrix(), outputFeature.as_matrix())

(1412, 51)
(1412, 34)


MultiOutputClassifier(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=1, verbose=0, warm_start=False),
           n_jobs=-1)

In [282]:
multi_target_forest.predict(inputFeature.iloc[:10, :].as_matrix())

array([[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 1, 0, 1],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 1, 0, 1],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 1, 0, 1],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 1, 0, 1],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 1, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 1, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 1, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 1, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0

Try multi output classifier with naive Bayes...

In [283]:
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
multi_target_gaussian = MultiOutputClassifier(clf, n_jobs=-1)
multi_target_gaussian.fit(inputFeature.as_matrix(), outputFeature.as_matrix())
multi_target_gaussian.predict(inputFeature.iloc[:10, :].as_matrix())

array([[0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
        1, 0, 0, 1],
       [0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1,
        1, 0, 0, 1],
       [1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1,
        1, 0, 0, 1],
       [1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1,
        1, 0, 0, 1],
       [1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1,
        1, 0, 0, 1],
       [1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1,
        1, 0, 0, 1],
       [1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1,
        1, 0, 0, 1],
       [0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1,
        1, 0, 0, 1],
       [0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0

Explore the accuracy and balance of data...


Plot ROC...