# Homicide Dataset

## Loading and clean up

In [70]:
import numpy as np
import pandas as pd

In [71]:
# Load the raw homicide reports. pandas reports mixed types in the
# 'Perpetrator Age' column while parsing; that column is converted to numeric below.
homicide = pd.read_csv('homicide-reports.csv')
homicide.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,Record ID,Agency Code,Agency Name,Agency Type,City,State,Year,Month,Incident,Crime Type,...,Victim Ethnicity,Perpetrator Sex,Perpetrator Age,Perpetrator Race,Perpetrator Ethnicity,Relationship,Weapon,Victim Count,Perpetrator Count,Record Source
0,1,AK00101,Anchorage,Municipal Police,Anchorage,Alaska,1980,January,1,Murder or Manslaughter,...,Unknown,Male,15,Native American/Alaska Native,Unknown,Acquaintance,Blunt Object,0,0,FBI
1,2,AK00101,Anchorage,Municipal Police,Anchorage,Alaska,1980,March,1,Murder or Manslaughter,...,Unknown,Male,42,White,Unknown,Acquaintance,Strangulation,0,0,FBI
2,3,AK00101,Anchorage,Municipal Police,Anchorage,Alaska,1980,March,2,Murder or Manslaughter,...,Unknown,Unknown,0,Unknown,Unknown,Unknown,Unknown,0,0,FBI
3,4,AK00101,Anchorage,Municipal Police,Anchorage,Alaska,1980,April,1,Murder or Manslaughter,...,Unknown,Male,42,White,Unknown,Acquaintance,Strangulation,0,0,FBI
4,5,AK00101,Anchorage,Municipal Police,Anchorage,Alaska,1980,April,2,Murder or Manslaughter,...,Unknown,Unknown,0,Unknown,Unknown,Unknown,Unknown,0,1,FBI


A warning was raised indicating that column 16 (Perpetrator Age) contains mixed types. To resolve this, I'll convert the entire column to numeric and coerce non-numeric values to NaN.

In [72]:
homicide['Perpetrator Age'] = pd.to_numeric(homicide['Perpetrator Age'], errors='coerce')
print(homicide['Perpetrator Age'].dtype)

float64


I actually want ints, not floats. I want to convert floats to int, but cannot because there is a null value somewhere.

In [73]:
print('Perpetrator Age:', homicide['Perpetrator Age'].isnull().any())
print('Homicide DF:', homicide.isnull().any())

Perpetrator Age: True
Homicide DF: Record ID                False
Agency Code              False
Agency Name              False
Agency Type              False
City                     False
State                    False
Year                     False
Month                    False
Incident                 False
Crime Type               False
Crime Solved             False
Victim Sex               False
Victim Age               False
Victim Race              False
Victim Ethnicity         False
Perpetrator Sex          False
Perpetrator Age           True
Perpetrator Race         False
Perpetrator Ethnicity    False
Relationship             False
Weapon                   False
Victim Count             False
Perpetrator Count        False
Record Source            False
dtype: bool


Only Perpetrator Age has a null value

In [74]:
# Discard every row that still holds a missing value, then confirm no nulls remain.
homicide = homicide.dropna()
homicide.isnull().any()

Record ID                False
Agency Code              False
Agency Name              False
Agency Type              False
City                     False
State                    False
Year                     False
Month                    False
Incident                 False
Crime Type               False
Crime Solved             False
Victim Sex               False
Victim Age               False
Victim Race              False
Victim Ethnicity         False
Perpetrator Sex          False
Perpetrator Age          False
Perpetrator Race         False
Perpetrator Ethnicity    False
Relationship             False
Weapon                   False
Victim Count             False
Perpetrator Count        False
Record Source            False
dtype: bool

In [75]:
homicide['Perpetrator Age'] = homicide['Perpetrator Age'].astype(int)
homicide['Perpetrator Age'].dtype

dtype('int64')

In [76]:
homicide.head()

Unnamed: 0,Record ID,Agency Code,Agency Name,Agency Type,City,State,Year,Month,Incident,Crime Type,...,Victim Ethnicity,Perpetrator Sex,Perpetrator Age,Perpetrator Race,Perpetrator Ethnicity,Relationship,Weapon,Victim Count,Perpetrator Count,Record Source
0,1,AK00101,Anchorage,Municipal Police,Anchorage,Alaska,1980,January,1,Murder or Manslaughter,...,Unknown,Male,15,Native American/Alaska Native,Unknown,Acquaintance,Blunt Object,0,0,FBI
1,2,AK00101,Anchorage,Municipal Police,Anchorage,Alaska,1980,March,1,Murder or Manslaughter,...,Unknown,Male,42,White,Unknown,Acquaintance,Strangulation,0,0,FBI
2,3,AK00101,Anchorage,Municipal Police,Anchorage,Alaska,1980,March,2,Murder or Manslaughter,...,Unknown,Unknown,0,Unknown,Unknown,Unknown,Unknown,0,0,FBI
3,4,AK00101,Anchorage,Municipal Police,Anchorage,Alaska,1980,April,1,Murder or Manslaughter,...,Unknown,Male,42,White,Unknown,Acquaintance,Strangulation,0,0,FBI
4,5,AK00101,Anchorage,Municipal Police,Anchorage,Alaska,1980,April,2,Murder or Manslaughter,...,Unknown,Unknown,0,Unknown,Unknown,Unknown,Unknown,0,1,FBI


In [77]:
homicide.shape

(638453, 24)

## Exploration

In [78]:
ma_homicides = homicide[homicide['State'] == 'Massachusetts']
print(ma_homicides.shape)
ma_homicides.head()

(6036, 24)


Unnamed: 0,Record ID,Agency Code,Agency Name,Agency Type,City,State,Year,Month,Incident,Crime Type,...,Victim Ethnicity,Perpetrator Sex,Perpetrator Age,Perpetrator Race,Perpetrator Ethnicity,Relationship,Weapon,Victim Count,Perpetrator Count,Record Source
10230,10231,MA00102,Bourne,Municipal Police,Barnstable,Massachusetts,1980,January,1,Murder or Manslaughter,...,Unknown,Unknown,0,Unknown,Unknown,Unknown,Knife,0,1,FBI
10231,10232,MA00102,Bourne,Municipal Police,Barnstable,Massachusetts,1980,August,1,Murder or Manslaughter,...,Not Hispanic,Male,15,White,Not Hispanic,Family,Blunt Object,0,0,FBI
10232,10233,MA00109,Mashpee,Municipal Police,Barnstable,Massachusetts,1980,September,1,Murder or Manslaughter,...,Not Hispanic,Male,21,White,Not Hispanic,Acquaintance,Blunt Object,0,1,FBI
10233,10234,MA00201,Adams,Municipal Police,Berkshire,Massachusetts,1980,August,1,Murder or Manslaughter,...,Not Hispanic,Male,22,White,Not Hispanic,Stranger,Blunt Object,0,1,FBI
10234,10235,MA00212,Lanesboro,Municipal Police,Berkshire,Massachusetts,1980,June,1,Murder or Manslaughter,...,Not Hispanic,Unknown,0,Unknown,Unknown,Unknown,Strangulation,0,0,FBI


In [79]:
print(homicide.columns)
len(homicide.columns)

Index(['Record ID', 'Agency Code', 'Agency Name', 'Agency Type', 'City',
       'State', 'Year', 'Month', 'Incident', 'Crime Type', 'Crime Solved',
       'Victim Sex', 'Victim Age', 'Victim Race', 'Victim Ethnicity',
       'Perpetrator Sex', 'Perpetrator Age', 'Perpetrator Race',
       'Perpetrator Ethnicity', 'Relationship', 'Weapon', 'Victim Count',
       'Perpetrator Count', 'Record Source'],
      dtype='object')


24

In [80]:
# Keep only the incident-level columns, dropping the record/agency identifiers.
# Selecting by label instead of by position keeps this cell safe to re-run:
# a positional slice would strip four more columns on every execution.
homicide = homicide.loc[:, 'City':'Record Source']
homicide.head()

Unnamed: 0,City,State,Year,Month,Incident,Crime Type,Crime Solved,Victim Sex,Victim Age,Victim Race,Victim Ethnicity,Perpetrator Sex,Perpetrator Age,Perpetrator Race,Perpetrator Ethnicity,Relationship,Weapon,Victim Count,Perpetrator Count,Record Source
0,Anchorage,Alaska,1980,January,1,Murder or Manslaughter,Yes,Male,14,Native American/Alaska Native,Unknown,Male,15,Native American/Alaska Native,Unknown,Acquaintance,Blunt Object,0,0,FBI
1,Anchorage,Alaska,1980,March,1,Murder or Manslaughter,Yes,Male,43,White,Unknown,Male,42,White,Unknown,Acquaintance,Strangulation,0,0,FBI
2,Anchorage,Alaska,1980,March,2,Murder or Manslaughter,No,Female,30,Native American/Alaska Native,Unknown,Unknown,0,Unknown,Unknown,Unknown,Unknown,0,0,FBI
3,Anchorage,Alaska,1980,April,1,Murder or Manslaughter,Yes,Male,43,White,Unknown,Male,42,White,Unknown,Acquaintance,Strangulation,0,0,FBI
4,Anchorage,Alaska,1980,April,2,Murder or Manslaughter,No,Female,30,Native American/Alaska Native,Unknown,Unknown,0,Unknown,Unknown,Unknown,Unknown,0,1,FBI


In [81]:
columns = homicide.columns
columns

Index(['City', 'State', 'Year', 'Month', 'Incident', 'Crime Type',
       'Crime Solved', 'Victim Sex', 'Victim Age', 'Victim Race',
       'Victim Ethnicity', 'Perpetrator Sex', 'Perpetrator Age',
       'Perpetrator Race', 'Perpetrator Ethnicity', 'Relationship', 'Weapon',
       'Victim Count', 'Perpetrator Count', 'Record Source'],
      dtype='object')

In [82]:
homicide['Crime Type'].value_counts()

Murder or Manslaughter        629337
Manslaughter by Negligence      9116
Name: Crime Type, dtype: int64

In [83]:
homicide['Crime Solved'].value_counts()

Yes    448172
No     190281
Name: Crime Solved, dtype: int64

In [84]:
# Share of unsolved crimes (as a fraction), computed from the data so it stays
# correct if the upstream cleaning steps change the row counts.
print((homicide['Crime Solved'] == 'No').mean())

0.29803446768986913


## Exploring unsolved crimes

In [85]:
unsolved = homicide[homicide['Crime Solved'] == 'No']
print(unsolved.shape)
unsolved.head()

(190281, 20)


Unnamed: 0,City,State,Year,Month,Incident,Crime Type,Crime Solved,Victim Sex,Victim Age,Victim Race,Victim Ethnicity,Perpetrator Sex,Perpetrator Age,Perpetrator Race,Perpetrator Ethnicity,Relationship,Weapon,Victim Count,Perpetrator Count,Record Source
2,Anchorage,Alaska,1980,March,2,Murder or Manslaughter,No,Female,30,Native American/Alaska Native,Unknown,Unknown,0,Unknown,Unknown,Unknown,Unknown,0,0,FBI
4,Anchorage,Alaska,1980,April,2,Murder or Manslaughter,No,Female,30,Native American/Alaska Native,Unknown,Unknown,0,Unknown,Unknown,Unknown,Unknown,0,1,FBI
8,Anchorage,Alaska,1980,June,2,Murder or Manslaughter,No,Male,32,White,Unknown,Unknown,0,Unknown,Unknown,Unknown,Firearm,0,0,FBI
10,Anchorage,Alaska,1980,July,1,Murder or Manslaughter,No,Male,36,Native American/Alaska Native,Unknown,Unknown,0,Unknown,Unknown,Unknown,Unknown,0,1,FBI
14,Anchorage,Alaska,1980,August,2,Murder or Manslaughter,No,Male,48,White,Unknown,Unknown,0,Unknown,Unknown,Unknown,Handgun,0,0,FBI


In [86]:
unsolved['Victim Ethnicity'].value_counts()

Unknown         113271
Not Hispanic     51918
Hispanic         25092
Name: Victim Ethnicity, dtype: int64

In [87]:
unsolved['Weapon'].value_counts()

Handgun          100545
Firearm           22578
Knife             20204
Blunt Object      15833
Unknown           14783
Shotgun            4682
Strangulation      3701
Rifle              3401
Fire               2034
Suffocation         990
Gun                 882
Drowning            205
Drugs               192
Explosives          149
Poison               78
Fall                 24
Name: Weapon, dtype: int64

In [88]:
unsolved_victims = unsolved.loc[:,'Victim Sex':'Victim Ethnicity']
unsolved_victims.head(10)

Unnamed: 0,Victim Sex,Victim Age,Victim Race,Victim Ethnicity
2,Female,30,Native American/Alaska Native,Unknown
4,Female,30,Native American/Alaska Native,Unknown
8,Male,32,White,Unknown
10,Male,36,Native American/Alaska Native,Unknown
14,Male,48,White,Unknown
26,Male,27,Black,Unknown
32,Male,25,White,Unknown
33,Male,24,Native American/Alaska Native,Unknown
34,Male,33,White,Unknown
38,Female,22,White,Unknown


### Victim Sex

In [89]:
unsolved_victims['Victim Sex'].value_counts()

Male       156552
Female      33076
Unknown       653
Name: Victim Sex, dtype: int64

### Victim Age

In [90]:
# Check whether any victim ages are below 1. Does 0 mean a baby under 1 year old, or an unknown age?
check_victim_ages = unsolved_victims[unsolved_victims['Victim Age'] < 1]
print(check_victim_ages.shape)
check_victim_ages.head()

(1025, 4)


Unnamed: 0,Victim Sex,Victim Age,Victim Race,Victim Ethnicity
604,Female,0,White,Unknown
1511,Female,0,White,Not Hispanic
1958,Male,0,White,Not Hispanic
2945,Male,0,White,Not Hispanic
3491,Female,0,White,Unknown


In [91]:
# Looking at one of these rows, there's really not much information to tell me anything about this age 0 victim.
# 1958 is an index *label* taken from check_victim_ages, so look it up with .loc:
# rows were dropped from `homicide` earlier, which means positions and labels can differ.
homicide.loc[1958]

City                                Los Angeles
State                                California
Year                                       1980
Month                                  November
Incident                                      1
Crime Type               Murder or Manslaughter
Crime Solved                                 No
Victim Sex                                 Male
Victim Age                                    0
Victim Race                               White
Victim Ethnicity                   Not Hispanic
Perpetrator Sex                         Unknown
Perpetrator Age                               0
Perpetrator Race                        Unknown
Perpetrator Ethnicity                   Unknown
Relationship                            Unknown
Weapon                             Blunt Object
Victim Count                                  0
Perpetrator Count                             0
Record Source                               FBI
Name: 1958, dtype: object

### Victim Race

In [92]:
unsolved_victims['Victim Race'].value_counts()

Black                            101124
White                             82236
Unknown                            3084
Asian/Pacific Islander             2917
Native American/Alaska Native       920
Name: Victim Race, dtype: int64

### Victim Ethnicity

In [93]:
unsolved_victims['Victim Ethnicity'].value_counts()

Unknown         113271
Not Hispanic     51918
Hispanic         25092
Name: Victim Ethnicity, dtype: int64

## Data Preprocessing

In [451]:
solved = homicide[homicide['Crime Solved'] == 'Yes']
solved.head()

Unnamed: 0,City,State,Year,Month,Incident,Crime Type,Crime Solved,Victim Sex,Victim Age,Victim Race,Victim Ethnicity,Perpetrator Sex,Perpetrator Age,Perpetrator Race,Perpetrator Ethnicity,Relationship,Weapon,Victim Count,Perpetrator Count,Record Source
0,Anchorage,Alaska,1980,January,1,Murder or Manslaughter,Yes,Male,14,Native American/Alaska Native,Unknown,Male,15,Native American/Alaska Native,Unknown,Acquaintance,Blunt Object,0,0,FBI
1,Anchorage,Alaska,1980,March,1,Murder or Manslaughter,Yes,Male,43,White,Unknown,Male,42,White,Unknown,Acquaintance,Strangulation,0,0,FBI
3,Anchorage,Alaska,1980,April,1,Murder or Manslaughter,Yes,Male,43,White,Unknown,Male,42,White,Unknown,Acquaintance,Strangulation,0,0,FBI
5,Anchorage,Alaska,1980,May,1,Murder or Manslaughter,Yes,Male,30,White,Unknown,Male,36,White,Unknown,Acquaintance,Rifle,0,0,FBI
6,Anchorage,Alaska,1980,May,2,Murder or Manslaughter,Yes,Female,42,Native American/Alaska Native,Unknown,Male,27,Black,Unknown,Wife,Knife,0,0,FBI


In [452]:
# Replace 'Unknown' with NaN, then drop those rows
solved = solved.replace('Unknown', np.nan)
solved.isnull().values.any()

True

In [453]:
# Remove every row that has at least one NaN (former 'Unknown') value, then verify.
solved = solved.dropna()
solved.isnull().values.any()

False

In [454]:
solved.shape

(155853, 20)

In [455]:
# There's still plenty of rows to work with after dropping all the Unknowns!
solved.head()

Unnamed: 0,City,State,Year,Month,Incident,Crime Type,Crime Solved,Victim Sex,Victim Age,Victim Race,Victim Ethnicity,Perpetrator Sex,Perpetrator Age,Perpetrator Race,Perpetrator Ethnicity,Relationship,Weapon,Victim Count,Perpetrator Count,Record Source
44,Juneau,Alaska,1980,November,4,Manslaughter by Negligence,Yes,Female,26,Native American/Alaska Native,Not Hispanic,Male,26,Native American/Alaska Native,Not Hispanic,Wife,Drowning,0,0,FBI
52,Jefferson,Alabama,1980,July,1,Murder or Manslaughter,Yes,Male,23,Black,Not Hispanic,Male,32,Black,Not Hispanic,Acquaintance,Handgun,0,0,FBI
53,Jefferson,Alabama,1980,July,2,Murder or Manslaughter,Yes,Male,42,White,Not Hispanic,Female,40,White,Not Hispanic,Husband,Handgun,0,0,FBI
54,Jefferson,Alabama,1980,July,3,Murder or Manslaughter,Yes,Male,33,Black,Not Hispanic,Male,26,Black,Not Hispanic,Acquaintance,Handgun,0,1,FBI
58,Jefferson,Alabama,1980,September,41,Murder or Manslaughter,Yes,Male,46,White,Not Hispanic,Male,22,White,Not Hispanic,Father,Rifle,0,0,FBI


In [456]:
# Keep only the columns from 'Victim Sex' through 'Weapon', which drops the following columns:
# City, State, Year, Month, Incident, Crime Type, Crime Solved, Victim Count, Perpetrator Count, Record Source
solved = solved.loc[:,'Victim Sex':'Weapon']
solved.head()

Unnamed: 0,Victim Sex,Victim Age,Victim Race,Victim Ethnicity,Perpetrator Sex,Perpetrator Age,Perpetrator Race,Perpetrator Ethnicity,Relationship,Weapon
44,Female,26,Native American/Alaska Native,Not Hispanic,Male,26,Native American/Alaska Native,Not Hispanic,Wife,Drowning
52,Male,23,Black,Not Hispanic,Male,32,Black,Not Hispanic,Acquaintance,Handgun
53,Male,42,White,Not Hispanic,Female,40,White,Not Hispanic,Husband,Handgun
54,Male,33,Black,Not Hispanic,Male,26,Black,Not Hispanic,Acquaintance,Handgun
58,Male,46,White,Not Hispanic,Male,22,White,Not Hispanic,Father,Rifle


In [457]:
solved['Relationship'].value_counts()

Acquaintance            50723
Stranger                42520
Friend                  11540
Wife                     9878
Girlfriend               6460
Husband                  4348
Family                   3808
Son                      3500
Neighbor                 3078
Boyfriend                2882
Daughter                 2643
Brother                  2470
In-Law                   2023
Father                   1815
Mother                   1675
Common-Law Wife          1195
Common-Law Husband       1021
Ex-Wife                   855
Stepfather                659
Boyfriend/Girlfriend      639
Stepson                   540
Sister                    479
Stepdaughter              312
Ex-Husband                303
Employer                  241
Employee                  159
Stepmother                 87
Name: Relationship, dtype: int64

In [458]:
# Label Encoding
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [459]:
# Label-encode several columns at once, remembering each fitted encoder
le_variables = {}
def label_encoder(df, df_columns):
    """Add an integer-coded ``LE_<column>`` column to ``df`` for each given column.

    ``df`` is modified in place. The fitted ``LabelEncoder`` for every column is
    stored in the module-level ``le_variables`` dict under the original column
    name, so the class names can be looked up later.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to encode.
    df_columns : iterable of str
        Names of the columns in ``df`` to label-encode.
    """
    for name in df_columns:
        encoder = LabelEncoder()
        codes = encoder.fit_transform(df[name])
        # Register the encoder before writing the new column (same order as before).
        le_variables[name] = encoder
        df['LE_' + name] = codes

In [460]:
# Label encode all categorical columns 
columns_to_exclude = ['Victim Age', 'Perpetrator Age']
columns_to_encode = [column for column in solved.columns if column not in columns_to_exclude]
label_encoder(solved, columns_to_encode)
solved.head()

Unnamed: 0,Victim Sex,Victim Age,Victim Race,Victim Ethnicity,Perpetrator Sex,Perpetrator Age,Perpetrator Race,Perpetrator Ethnicity,Relationship,Weapon,LE_Victim Sex,LE_Victim Race,LE_Victim Ethnicity,LE_Perpetrator Sex,LE_Perpetrator Race,LE_Perpetrator Ethnicity,LE_Relationship,LE_Weapon
44,Female,26,Native American/Alaska Native,Not Hispanic,Male,26,Native American/Alaska Native,Not Hispanic,Wife,Drowning,0,2,1,1,2,1,26,1
52,Male,23,Black,Not Hispanic,Male,32,Black,Not Hispanic,Acquaintance,Handgun,1,1,1,1,1,1,0,8
53,Male,42,White,Not Hispanic,Female,40,White,Not Hispanic,Husband,Handgun,1,3,1,0,3,1,15,8
54,Male,33,Black,Not Hispanic,Male,26,Black,Not Hispanic,Acquaintance,Handgun,1,1,1,1,1,1,0,8
58,Male,46,White,Not Hispanic,Male,22,White,Not Hispanic,Father,Rifle,1,3,1,1,3,1,12,11


In [461]:
# Function for one hot encoding
def oh_encoder(df, df_columns):
    """One-hot encode label-encoded columns and append the indicator columns to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the ``LE_``-prefixed columns to encode.
    df_columns : iterable of str
        Names of the ``LE_`` columns to encode. Each name minus its ``LE_`` prefix
        must be a key in the module-level ``le_variables`` dict; the fitted encoder
        stored there supplies the class names used as the new column headers.

    Returns
    -------
    pandas.DataFrame
        A new frame made of ``df`` (index reset to 0..n-1) followed by one
        indicator column per class of each encoded column. When ``df_columns``
        is empty, ``df`` itself is returned unchanged.
    """
    encoded_frames = []
    for column in df_columns:
        ohe = OneHotEncoder()
        feature_array = ohe.fit_transform(df[[column]]).toarray()

        # Prepend 'Victim' or 'Perpetrator' if the column name starts with either.
        # Required to differentiate victim and perpetrator sex, race, ethnicity once encoded.
        column_names = list(le_variables[column[3:]].classes_)
        if column.startswith('LE_Victim'):
            column_names = ['Victim ' + name for name in column_names]
        elif column.startswith('LE_Perpetrator'):
            column_names = ['Perpetrator ' + name for name in column_names]

        encoded_frames.append(pd.DataFrame(feature_array, columns=column_names))

    if not encoded_frames:
        return df

    # Concatenate once at the end instead of re-copying the growing frame on every
    # iteration. The index is reset so rows line up positionally with the encoded
    # frames, which all carry a fresh 0..n-1 index.
    return pd.concat([df.reset_index(drop=True)] + encoded_frames, axis=1)

In [462]:
columns_to_oh_encode = [column for column in solved.columns if column.startswith('LE_')]
solved = oh_encoder(solved, columns_to_oh_encode)
solved.head()

Unnamed: 0,Victim Sex,Victim Age,Victim Race,Victim Ethnicity,Perpetrator Sex,Perpetrator Age,Perpetrator Race,Perpetrator Ethnicity,Relationship,Weapon,...,Fire,Firearm,Gun,Handgun,Knife,Poison,Rifle,Shotgun,Strangulation,Suffocation
0,Female,26,Native American/Alaska Native,Not Hispanic,Male,26,Native American/Alaska Native,Not Hispanic,Wife,Drowning,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Male,23,Black,Not Hispanic,Male,32,Black,Not Hispanic,Acquaintance,Handgun,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Male,42,White,Not Hispanic,Female,40,White,Not Hispanic,Husband,Handgun,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Male,33,Black,Not Hispanic,Male,26,Black,Not Hispanic,Acquaintance,Handgun,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Male,46,White,Not Hispanic,Male,22,White,Not Hispanic,Father,Rifle,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [463]:
solved.columns

Index(['Victim Sex', 'Victim Age', 'Victim Race', 'Victim Ethnicity',
       'Perpetrator Sex', 'Perpetrator Age', 'Perpetrator Race',
       'Perpetrator Ethnicity', 'Relationship', 'Weapon', 'LE_Victim Sex',
       'LE_Victim Race', 'LE_Victim Ethnicity', 'LE_Perpetrator Sex',
       'LE_Perpetrator Race', 'LE_Perpetrator Ethnicity', 'LE_Relationship',
       'LE_Weapon', 'Victim Female', 'Victim Male',
       'Victim Asian/Pacific Islander', 'Victim Black',
       'Victim Native American/Alaska Native', 'Victim White',
       'Victim Hispanic', 'Victim Not Hispanic', 'Perpetrator Female',
       'Perpetrator Male', 'Perpetrator Asian/Pacific Islander',
       'Perpetrator Black', 'Perpetrator Native American/Alaska Native',
       'Perpetrator White', 'Perpetrator Hispanic', 'Perpetrator Not Hispanic',
       'Acquaintance', 'Boyfriend', 'Boyfriend/Girlfriend', 'Brother',
       'Common-Law Husband', 'Common-Law Wife', 'Daughter', 'Employee',
       'Employer', 'Ex-Husband', 'Ex-Wif

# Supervised Learning

## K-Nearest Neighbors

The following fits a K-Nearest Neighbors classifier on the solved dataframe, with all categorical columns encoded.

In [312]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

In [480]:
# Reset index to prevent columns being converted to NaN values
X1 = solved[['Victim Age']].reset_index(drop=True)
X2 = solved.loc[:,'Victim Female':'Victim Not Hispanic'].reset_index(drop=True)
X3 = solved.loc[:,'Blunt Object':]
X = pd.concat([X1, X2, X3], axis=1)
display(X.head())

y = solved.loc[:,'Perpetrator Female':'Wife']
y.head()

Unnamed: 0,Victim Age,Victim Female,Victim Male,Victim Asian/Pacific Islander,Victim Black,Victim Native American/Alaska Native,Victim White,Victim Hispanic,Victim Not Hispanic,Blunt Object,...,Fire,Firearm,Gun,Handgun,Knife,Poison,Rifle,Shotgun,Strangulation,Suffocation
0,26,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,23,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,42,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,33,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,46,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


Unnamed: 0,Perpetrator Female,Perpetrator Male,Perpetrator Asian/Pacific Islander,Perpetrator Black,Perpetrator Native American/Alaska Native,Perpetrator White,Perpetrator Hispanic,Perpetrator Not Hispanic,Acquaintance,Boyfriend,...,Mother,Neighbor,Sister,Son,Stepdaughter,Stepfather,Stepmother,Stepson,Stranger,Wife
0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [481]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.3,random_state = 42)

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [482]:
knn.score(X_test, y_test)

0.13469928993070407

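WOW, that is a really terrible accuracy score!!!!
Accuracy: 13.47%

Note that `y` is a multi-column one-hot target, so `knn.score` reports *subset accuracy*: a prediction only counts as correct when every perpetrator column (sex, race, ethnicity and relationship) matches at once. This is a much harsher metric than the accuracy of any single attribute.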

## K-Nearest Neighbors - Take 2

Let's try that again, but this time, the ages will be categorized instead

In [471]:
solved2 = solved.copy()
solved2.head()

Unnamed: 0,Victim Sex,Victim Age,Victim Race,Victim Ethnicity,Perpetrator Sex,Perpetrator Age,Perpetrator Race,Perpetrator Ethnicity,Relationship,Weapon,...,Fire,Firearm,Gun,Handgun,Knife,Poison,Rifle,Shotgun,Strangulation,Suffocation
0,Female,26,Native American/Alaska Native,Not Hispanic,Male,26,Native American/Alaska Native,Not Hispanic,Wife,Drowning,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Male,23,Black,Not Hispanic,Male,32,Black,Not Hispanic,Acquaintance,Handgun,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Male,42,White,Not Hispanic,Female,40,White,Not Hispanic,Husband,Handgun,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Male,33,Black,Not Hispanic,Male,26,Black,Not Hispanic,Acquaintance,Handgun,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Male,46,White,Not Hispanic,Male,22,White,Not Hispanic,Father,Rifle,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [472]:
# Age ranges: < 15, 15-24, 25-34, 35-44, 45-54, 55-64, 65+
def categorize_age(age):
    """Map a numeric age to its age-range label.

    Each range is bounded by a strict upper limit, so fractional ages
    (e.g. 24.5) land in a bucket instead of falling between two closed
    integer ranges. Whole-number ages get the same labels as before.

    Parameters
    ----------
    age : int or float
        Age in years.

    Returns
    -------
    str or None
        One of '<15', '15-24', '25-34', '35-44', '45-54', '55-64', '65+',
        or None when ``age`` is NaN (every comparison is then False).
    """
    if age < 15:
        return '<15'
    if age < 25:
        return '15-24'
    if age < 35:
        return '25-34'
    if age < 45:
        return '35-44'
    if age < 55:
        return '45-54'
    if age < 65:
        return '55-64'
    # Explicit comparison (rather than a bare fall-through) so NaN yields None.
    if age >= 65:
        return '65+'
    return None

In [473]:
# Replace the numeric victim and perpetrator ages with the labels
# returned by categorize_age.

solved2['Victim Age'] = solved2['Victim Age'].apply(categorize_age)
solved2['Perpetrator Age'] = solved2['Perpetrator Age'].apply(categorize_age)

solved2.head()

Unnamed: 0,Victim Sex,Victim Age,Victim Race,Victim Ethnicity,Perpetrator Sex,Perpetrator Age,Perpetrator Race,Perpetrator Ethnicity,Relationship,Weapon,...,Fire,Firearm,Gun,Handgun,Knife,Poison,Rifle,Shotgun,Strangulation,Suffocation
0,Female,25-34,Native American/Alaska Native,Not Hispanic,Male,25-34,Native American/Alaska Native,Not Hispanic,Wife,Drowning,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Male,15-24,Black,Not Hispanic,Male,25-34,Black,Not Hispanic,Acquaintance,Handgun,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Male,35-44,White,Not Hispanic,Female,35-44,White,Not Hispanic,Husband,Handgun,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Male,25-34,Black,Not Hispanic,Male,25-34,Black,Not Hispanic,Acquaintance,Handgun,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Male,45-54,White,Not Hispanic,Male,15-24,White,Not Hispanic,Father,Rifle,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [474]:
label_encoder(solved2, ['Victim Age', 'Perpetrator Age'])
solved2 = oh_encoder(solved2, ['LE_Victim Age', 'LE_Perpetrator Age'])
print(solved2.columns)
solved2.head()

Index(['Victim Sex', 'Victim Age', 'Victim Race', 'Victim Ethnicity',
       'Perpetrator Sex', 'Perpetrator Age', 'Perpetrator Race',
       'Perpetrator Ethnicity', 'Relationship', 'Weapon', 'LE_Victim Sex',
       'LE_Victim Race', 'LE_Victim Ethnicity', 'LE_Perpetrator Sex',
       'LE_Perpetrator Race', 'LE_Perpetrator Ethnicity', 'LE_Relationship',
       'LE_Weapon', 'Victim Female', 'Victim Male',
       'Victim Asian/Pacific Islander', 'Victim Black',
       'Victim Native American/Alaska Native', 'Victim White',
       'Victim Hispanic', 'Victim Not Hispanic', 'Perpetrator Female',
       'Perpetrator Male', 'Perpetrator Asian/Pacific Islander',
       'Perpetrator Black', 'Perpetrator Native American/Alaska Native',
       'Perpetrator White', 'Perpetrator Hispanic', 'Perpetrator Not Hispanic',
       'Acquaintance', 'Boyfriend', 'Boyfriend/Girlfriend', 'Brother',
       'Common-Law Husband', 'Common-Law Wife', 'Daughter', 'Employee',
       'Employer', 'Ex-Husband', 'Ex-Wif

Unnamed: 0,Victim Sex,Victim Age,Victim Race,Victim Ethnicity,Perpetrator Sex,Perpetrator Age,Perpetrator Race,Perpetrator Ethnicity,Relationship,Weapon,...,Victim 55-64,Victim 65+,Victim <15,Perpetrator 15-24,Perpetrator 25-34,Perpetrator 35-44,Perpetrator 45-54,Perpetrator 55-64,Perpetrator 65+,Perpetrator <15
0,Female,25-34,Native American/Alaska Native,Not Hispanic,Male,25-34,Native American/Alaska Native,Not Hispanic,Wife,Drowning,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,Male,15-24,Black,Not Hispanic,Male,25-34,Black,Not Hispanic,Acquaintance,Handgun,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,Male,35-44,White,Not Hispanic,Female,35-44,White,Not Hispanic,Husband,Handgun,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,Male,25-34,Black,Not Hispanic,Male,25-34,Black,Not Hispanic,Acquaintance,Handgun,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,Male,45-54,White,Not Hispanic,Male,15-24,White,Not Hispanic,Father,Rifle,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [485]:
victim_1 = solved2.loc[:,'Victim 15-24':'Victim <15'].reset_index(drop=True)
victim_2 = solved2.loc[:,'Victim Female':'Victim Not Hispanic'].reset_index(drop=True)
victim_3 = solved2.loc[:,'Blunt Object':'Suffocation']
X2 = pd.concat([victim_1, victim_2, victim_3], axis=1)
display(X2.head())

perp_1 = solved2.loc[:,'Perpetrator Female':'Wife']
perp_2 = solved2.loc[:, 'Perpetrator 15-24':]
y2 = pd.concat([perp_1, perp_2], axis=1)
y2.head()

Unnamed: 0,Victim 15-24,Victim 25-34,Victim 35-44,Victim 45-54,Victim 55-64,Victim 65+,Victim <15,Victim Female,Victim Male,Victim Asian/Pacific Islander,...,Fire,Firearm,Gun,Handgun,Knife,Poison,Rifle,Shotgun,Strangulation,Suffocation
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


Unnamed: 0,Perpetrator Female,Perpetrator Male,Perpetrator Asian/Pacific Islander,Perpetrator Black,Perpetrator Native American/Alaska Native,Perpetrator White,Perpetrator Hispanic,Perpetrator Not Hispanic,Acquaintance,Boyfriend,...,Stepson,Stranger,Wife,Perpetrator 15-24,Perpetrator 25-34,Perpetrator 35-44,Perpetrator 45-54,Perpetrator 55-64,Perpetrator 65+,Perpetrator <15
0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [488]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size = 0.3, random_state = 42)

knn2 = KNeighborsClassifier(n_neighbors=15)
knn2.fit(X_train2, y_train2)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=15, p=2,
           weights='uniform')

In [489]:
knn2.score(X_test2, y_test2)

0.043096073231243051