# Team *Memento Mori* - be mindful of death!

In this project, we plan to analyze CDC Data to predict the likelihood of an individual dying from a natural cause or an unnatural cause.

In [1]:
%matplotlib notebook
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

sns.set_style('whitegrid')

In [2]:
## Import the data
from pathlib import Path

# Directory that holds the 2015 files -- change this single value if the data
# are stored somewhere else.
DATA_DIR = Path('.')

### Codes
json2015 = pd.read_json(DATA_DIR / '2015_codes.json')

### Data
df2015 = pd.read_csv(DATA_DIR / '2015_data.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
## Keep a reproducible 10% random sample so that we aren't lugging around a 1.6gB array
df_sample = df2015.sample(random_state = 33, frac = 0.1)

List the columns of the DataFrame `df2015`.

## Natural/Unnatural encoding

We now encode the deaths which we rule unnatural.  By an **unnatural death**, we mean a death which is caused by one of the following causes:
1. Accident: 
    1. unintentional injuries
    2. transportation
    3. motor vehicle, land/water/air/space
    4. nontransport accidents
    5. Falls
    6. Accidental discharge of firearms
    7. Accidental drowning
    8. Accidental exposure to smoke, fire, flames
    9. Accidental poisoning
2. Homicide:
    1. Assault, not by the below causes
    2. Assault with firearms
    3. Assault with unspecified means
3. Suicide:
    1. Suicide by means other than the causes below
    2. Suicide by discharge of firearm
    3. Suicide by unspecified means
4. Execution
5. Undetermined intent
    1. U.I. except those listed below
    2. Discharge of firearms, undetermined intent
    3. Other
6. Operations of war
7. Complications from medical and surgical care

These correspond to cause of death codes 112-135 in the feature `113_cause_recode`.

In [None]:
# Obtain the 113-cause recode labels from the 2015 json file, dropping the
# rows of the json frame that carry no entry for this recode.
causes_113 = json2015.loc[json2015['113_cause_recode'].notna(), '113_cause_recode'].copy()
causes_113.index = causes_113.index.map(int)
causes_113.name = '113_cause_recode_json'

# Extract the unnatural cause codes (112 and above) by *value* rather than by
# position, so the selection stays correct even if the json index is not
# sorted or is missing a code.
FIRST_UNNATURAL_CODE = 112
unnatural_causes = causes_113.index[causes_113.index >= FIRST_UNNATURAL_CODE]

Now label each instance as a natural -- $0$ -- or unnatural -- $1$ -- death.

In [None]:
def class_label(row):
    """Attach the binary target `unnatural` to a single record.

    The row's `113_cause_recode` is looked up in the module-level
    `unnatural_causes`; the new entry is 1 on a hit and 0 otherwise.
    The (mutated) row is returned so the function can be used with
    `DataFrame.apply(..., axis=1)`.
    """
    is_unnatural = row['113_cause_recode'] in unnatural_causes
    row['unnatural'] = 1 if is_unnatural else 0
    return row

In [None]:
# Vectorised labelling: 1 when the cause code is one of the unnatural causes,
# 0 otherwise.  This avoids a Python-level call per row of the sample.
df_sample = df_sample.assign(
    unnatural = df_sample['113_cause_recode'].isin(unnatural_causes).astype(int))

## `education`: cleaning the 1989_recoding problem

In the feature `education_2003_revision`, some of the instances have `nan` values; these are the records which were coded with the 1989 revision instead (feature `education_1989_revision`), where unstated education levels are encoded with a `99`.  A `nan` value in `education_2003_revision` is indicated with a 0 in the `education_reporting_flag` feature.  We use the feature `education_reporting_flag` to create a new feature, `education`, which returns the 2003 education revision code and recodes the instances which use the 1989 revision.  Values of `9`, which are unreported, are retained.

In [None]:
def education_clean(row):
    """Add a unified `education` entry (2003-revision scale) to one record.

    * `education_reporting_flag == 1`: the 2003 code is copied over as an int.
    * `education_reporting_flag == 0`: the 1989 code is recoded onto the 2003
      scale using the ranges below.
    * Anything else (another flag value, a missing code, a 1989 code outside
      the table, or the 1989 "not stated" code 99) becomes 9, the "unknown"
      level, so every returned row carries an `education` value.

    Returns the mutated row, for use with `DataFrame.apply(..., axis=1)`.
    """
    unknown = 9
    # (lowest 1989 code, highest 1989 code, 2003 code)
    recode_1989 = (
        (0, 8, 1),
        (9, 11, 2),
        (12, 12, 3),
        (13, 13, 4),
        (14, 15, 5),
        (16, 16, 6),
        # some fudging is going on here -- not sure if >=5 years of college
        # corresponds to a Master's degree or PhD/Professional degree...
        (17, 17, 7),
    )

    education = unknown
    flag = row['education_reporting_flag']
    if flag == 0:
        revis_1989 = row['education_1989_revision']
        # a NaN code fails every comparison and therefore stays "unknown"
        for low, high, code_2003 in recode_1989:
            if low <= revis_1989 <= high:
                education = code_2003
                break
    elif flag == 1:
        revis_2003 = row['education_2003_revision']
        if not pd.isna(revis_2003):
            education = int(revis_2003)

    row['education'] = education
    return row

In [None]:
# Recode education record by record (one call of education_clean per row)
df_sample = df_sample.apply(education_clean, axis = 'columns')

In [None]:
# We are missing about 4.5% of the education data -- use a simple imputer?
# (fraction of rows whose education level is the "unknown" code 9)
float((df_sample.education == 9).sum() / len(df_sample))

Let us examine the distribution of the feature `education`, splitting into two plots based on `unnatural`.  
- The feature `education` is an ordinal variable, since it measures the amount of formal education the decedent has received.
- 


## `age`
There needs to be some cleaning of the data to determine the age of the decedent, particularly in the case of the death of an infant.  I'm not sure what's going on with the feature `detail_age` or if that has been condensed down to an integer representing number of years lived.  

Note that ages which are missing are encoded as 999.  There aren't many, so we can just fill them with the median age for the particular value of `sex`.

In [None]:
# Number of records whose age carries the missing-value code 999
int((df_sample.detail_age == 999).sum())

In [None]:
# Median recorded age per sex, ignoring the missing-value code 999
_age_known = df_sample.detail_age != 999
M_med_age = df_sample.loc[_age_known & (df_sample.sex == "M"), 'detail_age'].median()
F_med_age = df_sample.loc[_age_known & (df_sample.sex == "F"), 'detail_age'].median()

def fill_missing_age(row):
    """Replace a missing age (999) by the median age of the record's sex.

    Rows with a recorded age, or with a sex other than 'M'/'F', are
    returned unchanged.
    """
    if row['detail_age'] != 999:
        return row
    median_by_sex = {'M': M_med_age, 'F': F_med_age}
    sex = row['sex']
    if sex in median_by_sex:
        row['detail_age'] = median_by_sex[sex]
    return row

In [None]:
# Impute the missing ages record by record, working on a copy of the sample
df_sample = df_sample.copy().apply(fill_missing_age, axis = 'columns')

The number of instances when the age is recorded as 1, but the child died between the time of birth and the age of 1 is tiny compared to the number of observations: about $0.25\%$ of the instances correspond to this scenario.

In [None]:
# Largest age left in the sample after the imputation above
df_sample['detail_age'].max()

In [None]:
fig, ax = plt.subplots(1, 2, figsize = (8,4))

# One panel per class: natural deaths on the left, unnatural on the right.
# Values of 999 (missing age) are dropped from the data and from the bin range.
for panel, (flag, kind) in zip(ax, [(0, 'natural'), (1, 'unnatural')]):
    recorded = df_sample[(df_sample.detail_age != 999)&(df_sample.unnatural == flag)]
    age_bins = np.arange(1,df_sample.detail_age[df_sample.detail_age < 999].max())
    sns.histplot(data = recorded,
                 x = 'detail_age',
                 ax = panel,
                 hue = 'sex',
                 stat = 'probability',
                 hue_order = ['F','M'],
                 bins = age_bins)
    panel.set_title(f'Age of {kind} deaths,\ncolored by sex of decedent')
    panel.set_xticks(range(0,115,10))
fig.tight_layout()

Note the difference in the $y$-scales.

It looks like there is a considerable difference in the distribution of `detail_age` based on whether or not the death was natural.  Moreover, the `sex` of the decedent changes the age distribution of unnatural deaths considerably.

In [None]:
fig, ax = plt.subplots(1, 2, figsize = (8,4))

# One panel per class: natural deaths on the left, unnatural on the right.
# Both panels share the same bins and the same ordering of marital statuses.
for panel, (flag, kind) in zip(ax, [(0, 'natural'), (1, 'unnatural')]):
    sns.histplot(data = df_sample[(df_sample.unnatural == flag)],
                 x = 'detail_age',
                 ax = panel,
                 hue = 'marital_status',
                 hue_order = ['W','M','S','D','U'],
                 stat = 'probability',
                 bins = np.arange(1,df_sample.detail_age.max()))
    panel.set_title(f'Age of {kind} deaths,\ncolored by marital status of decedent')
    panel.set_xticks(range(0,115,10))
fig.tight_layout()

Natural deaths occur more frequently later in life for married decedents (M) or widow/widower decedents (W).  Single decedents (S) have a much higher probability of dying due to an unnatural cause than a natural cause at an early age.  It is interesting to note that the distribution of age for married decedents from unnatural causes looks quite symmetric, as does the distribution for divorced decedents from unnatural causes.

## `manner_of_death`

## `race`

Do some aggregation on `df_sample` to compute probability of unnatural death for each race.

In [None]:
# Race labels, listed in the order of the sorted race codes found in the sample
race_labels = ["White","Black", "American Indian", "Chinese",
               "Japanese", "Hawaiian", "Filipino","Asian Indian",
               "Korean", "Samoan", "Vietnamese", "Guamanian",
               "Other Asian\nor P.I.","Combined \no. Asian or P.I."]
race_codes = sorted(df_sample.race.unique())
# zip() would silently truncate and attach the wrong label to every code after
# a gap, so refuse to continue when the codes and labels do not pair up.
if len(race_codes) != len(race_labels):
    raise ValueError(
        "Found {} distinct race codes in df_sample but {} labels; "
        "the code-to-label pairing would be wrong.".format(len(race_codes),
                                                           len(race_labels)))
race_dct = dict(zip(race_codes, race_labels))

# Per race: number of unnatural deaths, number of deaths, and their ratio
unnat_by_race = df_sample.groupby('race')['unnatural'].agg(
    unnatural_deaths = 'sum',
    total_deaths = 'size')
unnat_by_race['cond_rel_freq'] = (unnat_by_race['unnatural_deaths']
                                  / unnat_by_race['total_deaths'])
# The group index holds the race codes; translate them to readable labels
unnat_by_race['race'] = unnat_by_race.index.map(race_dct)

In [None]:
# Display the per-race summary table built in the previous cell
unnat_by_race

Let $R$ be the random variable indicating a decedent's race, and $U$ be the random variable indicating the type of death (unnatural = 1, natural = 0).  We plot the distribution $P(U = 1|R = r)$ for each race $r\in \mathtt{race\underline{\,\,\,}dct.values}$.

In [None]:
fig, ax = plt.subplots(2,1,figsize = (8,8))

# Top panel: conditional relative frequency of an unnatural death, per race
ax[0].bar(x = unnat_by_race.race,
          height = unnat_by_race.cond_rel_freq,
          alpha = 0.7)
ax[0].set_ylabel('$P(U = 1|R = r)$')
ax[0].set_title('Probability of unnatural death, by race')

# Bottom panel: log10 of the *total* number of deaths per race, which shows
# how much data stands behind each bar of the top panel
ax[1].bar(x = unnat_by_race.race,
          height = np.log10(unnat_by_race.total_deaths.astype(float)),
          alpha = 0.7)
# raw string: "\l" and "\m" are not valid Python escape sequences
ax[1].set_ylabel(r'$\log_{10}(\mathtt{count})$')
ax[1].set_title('Log total deaths, by race')

# Rotate the existing category tick labels instead of overwriting them with
# set_xticklabels, which requires the tick positions to be fixed first
for panel in ax:
    panel.tick_params(axis = 'x', labelrotation = 70)
fig.tight_layout()

# Trying to predict `unnatural`: plot things with this as hue

## Features to do EDA on: `detail_age`, `marital_status`, `education`, `sex`, `resident_status`, `race`.

### `detail_age`

In [None]:
fig, ax = plt.subplots(1,1,figsize = (6,4))

# Age histogram with a KDE overlay, one colour per value of `unnatural`
hist_options = dict(x = 'detail_age',
                    hue = 'unnatural',
                    stat = 'probability',
                    kde = True)
sns.histplot(data = df_sample, ax = ax, **hist_options);
ax.set_xlabel('decedent age $x$', fontsize = 14)
ax.set_ylabel('probability of decedent\ndying at age $x$', fontsize = 14)
fig.tight_layout()

In [None]:
from matplotlib.patches import Patch

# Legend entries are built by hand so that they name the two classes
class_colors = {1: "red", 0: "blue"}
color_patches = [
    Patch(facecolor = class_colors[1], label = "unnatural"),
    Patch(facecolor = class_colors[0], label = "natural")]

fig, ax = plt.subplots(1,1,figsize = (6,4))

# Horizontal letter-value plot of age, one box per value of `unnatural`
sns.boxenplot(data = df_sample,
              x = "detail_age", y = "unnatural",
              orient = "h",
              palette = class_colors,
              ax = ax)

ax.set_ylabel('')
ax.legend(handles = color_patches);



#### Conclusions:
1. Different means and different skewness
2. Many outliers for the `unnatural = 0` group.
3. This makes sense -- natural deaths tend to happen later in life, while there are certainly outliers which happen early, due to childhood/adolescent disease, as well as disease in early adulthood.  Unnatural deaths are more likely to occur when a person is out in the world exposed to danger -- this more frequently happens to middle-aged adults as they go about their business and work.


A printout of the summary statistics of the distribution of `detail_age`, by `unnatural`:

In [None]:
age_group = df_sample.groupby('unnatural')

def quantile(x,q):
    """Return the q-th quantile of x (thin wrapper around np.quantile)."""
    return np.quantile(x,q)

# (column label, aggregation) pairs, in the order they appear in the table
age_statistics = [("mean", np.mean),
                  ("min", np.min),
                  ("q_25", lambda x: quantile(x,0.25)),
                  ('median', np.median),
                  ("q_75", lambda x: quantile(x,0.75)),
                  ('max', np.max)]
age_group_agg = age_group.agg({"detail_age": age_statistics})['detail_age']

for line in ('Summary of the feature detail_age, by unnatural:',
             '====================================================',
             age_group_agg):
    print(line)

### `education`

The variable `education_2003_revision` has been cleaned up and is stored in the variable `education`.

In [None]:
fig, ax = plt.subplots(1,1,figsize = (6,4))

# Bin edges 0.5, 1.5, ..., 8.5: each bin is centred on one of the codes 1-8
education_bins = np.arange(1,10)-0.5
# Counts are shown; pass stat = 'probability' for relative frequencies instead
sns.histplot(data = df_sample,
             x = 'education',
             hue = 'unnatural',
             bins = education_bins,
             ax = ax);
fig.tight_layout()

Unnatural deaths increase as education level increases until a person has completed high school or obtained a GED, then they begin to decrease once a person has completed some college.  

For natural deaths, there is a slight decrease from pre-high school to some high-school, but this can be explained by variable `detail_age`.

### `marital_status`

In [None]:
fig, ax = plt.subplots(1,1,figsize = (6,4))

# Count of decedents per marital status, one colour per value of `unnatural`
# (pass stat = 'probability' for relative frequencies instead of counts)
sns.histplot(data = df_sample,
             x = 'marital_status',
             hue = 'unnatural',
             ax = ax);
fig.tight_layout()

In [None]:
fig, ax = plt.subplots(1, 2, figsize = (10,6))

# One panel per class: natural deaths on the left, unnatural on the right.
# Both panels share the same bins and the same ordering of marital statuses.
for panel, (flag, kind) in zip(ax, [(0, 'natural'), (1, 'unnatural')]):
    sns.histplot(data = df_sample[(df_sample.unnatural == flag)],
                 x = 'detail_age',
                 ax = panel,
                 hue = 'marital_status',
                 hue_order = ['W','M','S','D','U'],
                 stat = 'probability',
                 bins = np.arange(1,df_sample.detail_age.max()))
    panel.set_title(f'Age of {kind} deaths,\ncolored by marital status of decedent')
    panel.set_xticks(range(0,115,10))
fig.tight_layout()

### `resident_status`

In [None]:
fig, ax = plt.subplots(1,1,figsize = (6,4))

# Four unit-width bins with edges 1..5; counts are shown
# (pass stat = 'probability' for relative frequencies instead)
status_levels = list(range(1,5))
sns.histplot(data = df_sample,
             x = 'resident_status',
             hue = 'unnatural',
             bins = [1,2,3,4,5],
             ax = ax);
# Put each tick in the middle of its bin and label it with the status code
ax.set_xticks([level + 0.5 for level in status_levels])
ax.set_xticklabels(status_levels)
fig.tight_layout()

In [None]:
# Number of records per resident status code
df_sample['resident_status'].value_counts()

# Checking Ramazan's work:
Ramazan used the features `['education_2003_revision', 'month_of_death', 'age_recode_52', 'detail_age', 'day_of_week_of_death', 'activity_code']`, but I am going to replace `education_2003_revision` with the cleaned education feature `education`.

In [None]:
# Candidate model features ('age_recode_52' is deliberately left out)
feats_to_use = [
    'education',
    'month_of_death',
    'detail_age',
    'day_of_week_of_death',
    'activity_code',
    'marital_status',
    'sex',
]

In [None]:
# List the distinct values taken by each candidate feature
for x in feats_to_use:
    levels = sorted(df_sample[x].unique())
    print("Unique features of {}:{} ".format(x, levels))
    print()

In [None]:
# Make a custom transformer to code the features marital_status and sex
# as well as scale the detail_age
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler

class CustomTransformer(BaseEstimator, TransformerMixin):
    """Scale `detail_age` and one-hot encode `marital_status` and `sex`.

    `fit` learns the scaler and both encoders from the given frame;
    `transform` returns a copy of its input in which `detail_age` is
    standardised and to which one indicator column per marital status
    (named after the category) plus the single column `sex_M` are joined.
    """

    def __init__(self):
        self.StandardScaler = StandardScaler()    # scales detail_age
        self.OneHotEncoderMS = OneHotEncoder()    # encodes marital_status
        self.OneHotEncoderSex = OneHotEncoder()   # encodes sex

    def fit(self, X, y = None):
        """Fit each sub-estimator on its own column of X and return self."""
        fit_plan = ((self.StandardScaler, 'detail_age'),
                    (self.OneHotEncoderMS, 'marital_status'),
                    (self.OneHotEncoderSex, 'sex'))
        for estimator, column in fit_plan:
            estimator.fit(X[[column]])
        return self

    def transform(self, X, y = None):
        """Return a transformed copy of X; X itself is left untouched."""
        out = X.copy()

        # standardise the age of the decedent
        out['detail_age'] = self.StandardScaler.transform(out[['detail_age']])

        # marital status: one indicator column per category
        ms_dense = self.OneHotEncoderMS.transform(out[['marital_status']]).toarray()
        ms_encoded = pd.DataFrame(ms_dense,
                                  columns = self.OneHotEncoderMS.categories_[0],
                                  index = out.index)

        # sex: encode, then keep only sex_M (sex_M = 1 - sex_F)
        sex_dense = self.OneHotEncoderSex.transform(out[['sex']]).toarray()
        sex_columns = ["sex_{}".format(level)
                       for level in self.OneHotEncoderSex.categories_[0]]
        sex_encoded = pd.DataFrame(sex_dense,
                                   columns = sex_columns,
                                   index = out.index)
        sex_encoded = sex_encoded.copy()[['sex_M']]

        # attach both encoded frames to the scaled copy
        return out.join([ms_encoded, sex_encoded])
        

In [None]:
# train_test_split is imported here because this cell runs before the cell
# that imports the rest of the scikit-learn tools; without it a fresh
# "Restart & Run All" stops with a NameError.
from sklearn.model_selection import train_test_split

# make a train_test_split to df_sample, stratified on the target so that both
# parts keep the same share of unnatural deaths
train, test = train_test_split(df_sample, test_size = 0.2, 
                              random_state = 1907, 
                              shuffle = True,
                              stratify = df_sample.unnatural)

#Now transform the ***training*** data; the transformer is fit on it alone
data_transformer = CustomTransformer()
data_transformer.fit(train)
train = data_transformer.transform(train.copy())

#And transform the test data with the transformer fit on the training data
test = data_transformer.transform(test.copy())

In [None]:
# Inspect the column labels of the transformed training frame
train.columns

In [None]:
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import recall_score, f1_score, confusion_matrix, classification_report, roc_curve

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [None]:
updated_ftu = ['education',
              'month_of_death',
              #  'age_recode_52',
              'detail_age',
              'day_of_week_of_death',
              #'activity_code',
               'D', 'M', 'S', 'U', 'W', 'sex_M']

n_splits = 10
n_models = 5
kfold = StratifiedKFold(n_splits = n_splits,
                        shuffle = True, 
                        random_state = 1907)
#axis 0: index of the kfold 
#axis 1: for the score type: 0 -> f1, 1 -> recall
#axis 2: for the model number
results = np.zeros((n_splits, 2, n_models))

#might use this, but probably not.
model_lst = [LogisticRegression(max_iter = 10000),
            LogisticRegression(max_iter = 10000, class_weight = 'balanced'),
            RandomForestClassifier(n_estimators = 100, max_depth = 8),
            RandomForestClassifier(n_estimators=100, max_depth=8 ,class_weight = 'balanced')]


def _record_scores(fold, model_ix, y_true, y_pred):
    """Store f1 and recall of one model on one fold and print its report.

    The metrics take the true labels first and the predictions second; with
    the arguments the other way round recall_score returns the precision.
    zero_division = 0 scores a class without predictions as 0 without warning.
    """
    results[fold, 0, model_ix] = f1_score(y_true, y_pred, zero_division = 0)
    results[fold, 1, model_ix] = recall_score(y_true, y_pred, zero_division = 0)
    print(f'Model {model_ix} {fold}th validation \n', 
          classification_report(y_true, y_pred, zero_division = 0),'\n')


for i, (tt_ix, ho_ix) in enumerate(kfold.split(train, train.unnatural)):
    
    #make the train and holdout sets
    train_tt, train_ho = train.iloc[tt_ix], train.iloc[ho_ix]
    y_ho = train_ho['unnatural'].values
    
    #fit the five models
    
    #Model 0: Classify as all zeros (baseline)
    all_zeros = np.zeros(y_ho.shape[0])
    _record_scores(i, 0, y_ho, all_zeros)
    
    #Model 1: Logistic Regression: no weights
    lr1 = LogisticRegression(max_iter = 10000)
    lr1.fit(train_tt[updated_ftu], train_tt.unnatural)
    lr1_pred = lr1.predict(train_ho[updated_ftu])
    _record_scores(i, 1, y_ho, lr1_pred)
    
    #Model 2: Logistic Regression: weights
    # same max_iter as Model 1, so the two differ only in class_weight
    lr2 = LogisticRegression(max_iter = 10000, class_weight = 'balanced')
    lr2.fit(train_tt[updated_ftu],train_tt.unnatural)
    lr2_pred = lr2.predict(train_ho[updated_ftu])
    _record_scores(i, 2, y_ho, lr2_pred)
    
    #Model 3: Random Forest: no weights
    # seeded so that a re-run of the notebook reproduces the scores
    rf1 = RandomForestClassifier(random_state = 1907)
    rf1.fit(train_tt[updated_ftu],train_tt.unnatural)
    rf1_pred = rf1.predict(train_ho[updated_ftu])
    _record_scores(i, 3, y_ho, rf1_pred)
    print(rf1.feature_importances_)
    
    #Model 4: Random Forest: weights
    rf2 = RandomForestClassifier(class_weight='balanced', random_state = 1907)
    rf2.fit(train_tt[updated_ftu],train_tt.unnatural)
    rf2_pred = rf2.predict(train_ho[updated_ftu])
    _record_scores(i, 4, y_ho, rf2_pred)
    print(rf2.feature_importances_)
    
    #Model 5
    

In [None]:
# Average over axis 0 (the folds).  In the displayed array, row 0 holds the
# values stored from f1_score and row 1 those stored from recall_score, with
# one column per model.
results.mean(axis= 0)