# IMPORT MODULES

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from scipy.stats import chi2_contingency, pointbiserialr
from imblearn.under_sampling import RandomUnderSampler

# DATA UNDERSTANDING

### READING DATA and IDENTIFYING TARGET VARIABLES

In [None]:
data_frame = pd.read_csv('Mortality Dataset 2021 11 20.csv')
pd.set_option('display.max_columns', 70)
data_frame.head()

In [None]:
data_frame.shape

### Target Variable

In [None]:
data_frame['mortality'] = data_frame['DEATH_5']+data_frame['DEATH_10']
data_frame.shape

In [None]:
print(data_frame['mortality'].unique())

## EXPLORATORY DATA ANALYSIS

## ABT REPORT

In [None]:
categorical = data_frame.select_dtypes(include = 'object')
categorical.columns

In [None]:
len(categorical.columns)

In [None]:
numerical = data_frame.select_dtypes(include = 'number')
numerical.columns

In [None]:
len(numerical.columns)

### Continuous Features

In [None]:
# Empty report table for the continuous features; one column per statistic,
# filled in by the cells that follow.
abt_cont = pd.DataFrame(
    columns=[
        'count',
        'percent_missing',
        'cardinality',
        'Min_val',
        '1st_Quartile',
        'mean_val',
        'median_val',
        '3rd_Quartile',
        'Max_val',
        'StdDev',
    ]
)

In [None]:
abt_cont['mean_val']=numerical.mean()
# print(type(numerical.mean()))

In [None]:
abt_cont['median_val']=numerical.median()

In [None]:
abt_cont['count']=numerical.count()

In [None]:
abt_cont['Min_val']=numerical.min()

In [None]:
abt_cont['Max_val']=numerical.max()

In [None]:
abt_cont['StdDev']=numerical.std()

In [None]:
abt_cont['cardinality']=numerical.nunique()

In [None]:
abt_cont['percent_missing'] = (numerical.isna().sum()/275190)*100

In [None]:
abt_cont['1st_Quartile'] = numerical.quantile(0.25)

In [None]:
abt_cont['3rd_Quartile'] = numerical.quantile(0.75)

In [None]:
pd.set_option('display.max_rows', 71)
abt_cont

### Categorical Variables

In [None]:
# Empty report table for the categorical features; one column per statistic,
# filled in by the cells that follow.
abt_cat = pd.DataFrame(
    columns=[
        'count',
        'percent_missing',
        'cardinality',
        '1st_Mode',
        '1st_Mode_percent',
        '1st_Mode_Freq',
    ]
)

In [None]:
abt_cat['count'] = categorical.count()

In [None]:
abt_cat['cardinality'] = categorical.nunique()

In [None]:
abt_cat['percent_missing'] = (categorical.isna().sum()/275190)*100

In [None]:
# `mode()` returns one row per tied mode, so the table can have several rows.
# Row 0 holds the first mode of every column; taking it as a Series keeps the
# assignment to a single report column valid even when some column has ties.
mode_table = categorical.mode()
if not mode_table.empty:
    abt_cat['1st_Mode'] = mode_table.iloc[0]

# Frequency of the first mode, and its share of all rows (missing included).
row_total = len(categorical)
for col in categorical.columns:
    top_value = abt_cat.at[col, '1st_Mode']
    # A column with no non-missing values has no mode to look up.
    if pd.isna(top_value):
        continue
    top_freq = categorical[col].value_counts()[top_value]
    abt_cat.at[col, '1st_Mode_Freq'] = top_freq
    abt_cat.at[col, '1st_Mode_percent'] = (top_freq / row_total) * 100

In [None]:
# Cap rendered tables at 5 rows, then display the categorical report.
pd.options.display.max_rows = 5
abt_cat

### VISUALIZATIONS

### Continuous

In [None]:
# One box plot per numerical feature, laid out on a grid of `num_cols` columns.
feature_names = list(numerical.columns)
num_cols = 5
# Ceiling division: the grid always has enough rows for every feature,
# whatever the number of numerical columns (at least one row for subplots()).
num_rows = max(1, -(-len(feature_names) // num_cols))
# 5 inches of height per row; squeeze=False keeps `axes` two-dimensional even
# when the grid has a single row.
fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, 5 * num_rows), squeeze=False)
flat_axes = axes.ravel()  # row-major order: feature i sits at (i // num_cols, i % num_cols)

for ax, col in zip(flat_axes, feature_names):
    # Missing values are dropped first: box statistics computed on data that
    # contains NaN come out as NaN and the box is drawn empty.
    ax.boxplot(numerical[col].dropna())
    ax.set_title("Plot for {}".format(col))

# Hide the unused panels at the end of the grid.
for ax in flat_axes[len(feature_names):]:
    ax.set_visible(False)
    

### Categorical

In [None]:
print(categorical.columns)

In [None]:
for column in categorical.columns:
    value_counts = categorical[column].value_counts()
    # print(value_counts)
    # Create figure and axis for the bar plot
    plt.figure(figsize=(8, 6))
    plt.bar(value_counts.index, value_counts.values)
    
    # # Add labels and title
    plt.xlabel(column)
    plt.ylabel('Frequency')
    plt.title(f'Bar Plot for {column}')


## DATA QUALITY PLAN

### MISSING VALUES

In [None]:
# Missing values per column of the raw frame, largest first.
pd.options.display.max_rows = 71
null_count = data_frame.isna().sum()
print(null_count.sort_values(ascending=False))

### DUPLICATES

In [None]:
# Rows that repeat an earlier row exactly (the first occurrence is not flagged).
is_repeat = data_frame.duplicated()
duplicates = data_frame.loc[is_repeat]
print(duplicates)

### REMOVING DUPLICATES

In [None]:
# Drop exact duplicate rows, keeping the last occurrence of each.
# .copy() makes data_frame1 an independent frame: later cells assign new
# columns into it, which on pandas versions without copy-on-write triggers
# chained-assignment warnings when the frame is a filtered view of data_frame.
data_frame1 = data_frame.drop_duplicates(keep='last').copy()
# A bare expression in the middle of a cell is not rendered, so print the shape.
print(data_frame1.shape)
data_frame1.head()

### HANDLING OUTLIERS

In [None]:
data_frame1['mortality'].head()

In [None]:
data_frame1.shape
data_frame1['SYSTOLIC'].unique()

In [None]:
sns.boxplot(x='mortality', y='SYSTOLIC', data=data_frame1)

# Add labels and title
plt.xlabel('Mortality')
plt.ylabel('Systolic')
plt.title('Box Plot of Value by Category')

# Show plot
plt.show()

In [None]:
data_frame1['SYSTOLIC']= pd.cut(x=data_frame1['SYSTOLIC'], bins = [0, 119, 129, 139, 179, 200], right=False, labels=["<120", "120-129", "130-139", "140-179", ">=180"], include_lowest=True)


In [None]:
bin_counts = data_frame1['SYSTOLIC'].value_counts().sort_index()
plt.bar(bin_counts.index, bin_counts.values)

# Add labels and title
plt.xlabel('Systolic Range')
plt.ylabel('Frequency')
plt.title('Histogram of Systolic Range')

# Show plot
plt.show()

In [None]:
print(data_frame1['mortality'].unique())

sns.boxplot(x='mortality', y='AGE', data=data_frame1)

# Add labels and title
plt.xlabel('Mortality')
plt.ylabel('AGE')
plt.title('Box Plot of Value by Category')

# Show plot
plt.show()

In [None]:
data_frame1['AGE'] = pd.cut(x=data_frame1['AGE'], bins=[65, 70, 75, 80, 85, 90, 95, 100],
                     right = False, labels=['65-69', '70-74', '75-79',
                            '80-84', '85-89', '90-94', '>=95'],include_lowest = True)

In [None]:
bin_counts = data_frame1['AGE'].value_counts().sort_index()
plt.bar(bin_counts.index, bin_counts.values)

# Add labels and title
plt.xlabel('Age Range')
plt.ylabel('Frequency')
plt.title('Histogram of AGE')

# Show plot
plt.show()

In [None]:
sns.boxplot(x='mortality', y='BMI', data=data_frame1)

# Add labels and title
plt.xlabel('Mortality')
plt.ylabel('BMI')
plt.title('Box Plot of Value by Category')

# Show plot
plt.show()

In [None]:
data_frame1.loc[:, 'BMI'] = pd.cut(x=data_frame1['BMI'], bins=[10, 18.4, 24.9, 39.9, 49.9,
np.inf] ,right = False, labels=['<18.5','18.5-24.9','25-39.9','40-49.9', '>=50' ],include_lowest = True)

In [None]:
bin_counts = data_frame1['BMI'].value_counts().sort_index()
plt.bar(bin_counts.index, bin_counts.values)

# Add labels and title
plt.xlabel('BMI')
plt.ylabel('Frequency')
plt.title('Histogram of BMI Range')

# Show plot
plt.show()

In [None]:
sns.boxplot(x='mortality', y='A1C', data=data_frame1)

# Add labels and title
plt.xlabel('Mortality')
plt.ylabel('A1C')
plt.title('Box Plot of Value by Category')

# Show plot
plt.show()

In [None]:

data_frame1['A1C']= pd.cut(x=data_frame1['A1C'], bins = [-np.inf, 7.9, 9.0, np.inf] , right=False, labels=["<8", "8-9", ">9" ], include_lowest=True)

In [None]:
bin_counts = data_frame1['A1C'].value_counts().sort_index()
plt.bar(bin_counts.index, bin_counts.values)

# Add labels and title
plt.xlabel('A1C')
plt.ylabel('Frequency')
plt.title('Histogram of A1C')

# Show plot
plt.show()

In [None]:
sns.boxplot(x='mortality', y='SERUMALB', data=data_frame1)

# Add labels and title
plt.xlabel('Mortality')
plt.ylabel('SERUMALB')
plt.title('Box Plot of Value by Category')

# Show plot
plt.show()

In [None]:

data_frame1['SERUMALB']= pd.cut(x=data_frame1['SERUMALB'], bins = [-np.inf, 3.49, np.inf] , right=False, labels=["<3.5", ">=3.5" ], include_lowest=True)

In [None]:
bin_counts = data_frame1['SERUMALB'].value_counts().sort_index()
plt.bar(bin_counts.index, bin_counts.values)

# Add labels and title
plt.xlabel('SERUMALB')
plt.ylabel('Frequency')
plt.title('Histogram of SERUMALB Range')

# Show plot
plt.show()

In [None]:
sns.boxplot(x='mortality', y='SERUMCRE', data=data_frame1)

# Add labels and title
plt.xlabel('Mortality')
plt.ylabel('SERUMCRE')
plt.title('Box Plot of Value by Category')

# Show plot
plt.show()

In [None]:

data_frame1['SERUMCRE']= pd.cut(x=data_frame1['SERUMCRE'], bins = [-np.inf, 1.49, 3.00, np.inf] , right=False, labels=["<1.5", "1.5-3.0", ">3.0"], include_lowest=True)

In [None]:
bin_counts = data_frame1['SERUMCRE'].value_counts().sort_index()
plt.bar(bin_counts.index, bin_counts.values)

# Add labels and title
plt.xlabel('SERUMCRE')
plt.ylabel('Frequency')
plt.title('Histogram of SERUMCRE Range')

# Show plot
plt.show()

In [None]:
sns.boxplot(x='mortality', y='N_IP', data=data_frame1)

# Add labels and title
plt.xlabel('Mortality')
plt.ylabel('N_IP')
plt.title('Box Plot of Value by Category')

# Show plot
plt.show()

In [None]:
data_frame1['N_IP']= pd.cut(x=data_frame1['N_IP'], bins = [0, 5, np.inf]  , right=False, labels=["0-5", ">5"  ], include_lowest=True)

In [None]:
bin_counts = data_frame1['N_IP'].value_counts().sort_index()
plt.bar(bin_counts.index, bin_counts.values)

# Add labels and title
plt.xlabel('N_IP')
plt.ylabel('Frequency')
plt.title('Histogram of N_IP Range')

# Show plot
plt.show()

In [None]:
sns.boxplot(x='mortality', y='N_OP', data=data_frame1)

# Add labels and title
plt.xlabel('Mortality')
plt.ylabel('N_OP')
plt.title('Box Plot of Value by Category')

# Show plot
plt.show()

In [None]:

data_frame1['N_OP']= pd.cut(x=data_frame1['N_OP'], bins = [0, 5, 30, np.inf] , right=False, labels=["0-5", "6-30", ">30" ], include_lowest=True)

In [None]:
bin_counts = data_frame1['N_OP'].value_counts().sort_index()
plt.bar(bin_counts.index, bin_counts.values)

# Add labels and title
plt.xlabel('N_OP')
plt.ylabel('Frequency')
plt.title('Histogram of N_OP Range')

# Show plot
plt.show()

In [None]:
sns.boxplot(x='mortality', y='DIASTOLIC', data=data_frame1)

# Add labels and title
plt.xlabel('Mortality')
plt.ylabel('DIASTOLIC')
plt.title('Box Plot of Value by Category')

# Show plot
plt.show()

In [None]:

data_frame1['DIASTOLIC']= pd.cut(x=data_frame1['DIASTOLIC'], bins = [-np.inf, 79, 89, np.inf] , right=False, labels=["<80", "80-89", ">=90"], include_lowest=True)

In [None]:
bin_counts = data_frame1['DIASTOLIC'].value_counts().sort_index()
plt.bar(bin_counts.index, bin_counts.values)

# Add labels and title
plt.xlabel('DIASTOLIC')
plt.ylabel('Frequency')
plt.title('Histogram of DIASTOLIC Range')

# Show plot
plt.show()

In [None]:
sns.boxplot(x='mortality', y='TRI', data=data_frame1)

# Add labels and title
plt.xlabel('Mortality')
plt.ylabel('TRI')
plt.title('Box Plot of Value by Category')

# Show plot
plt.show()

In [None]:

data_frame1['TRI']= pd.cut(x=data_frame1['TRI'], bins = [-np.inf, 149.99, 199.99, np.inf] , right=False, labels=["<150", "150-199.99", ">=200"], include_lowest=True)

In [None]:
bin_counts = data_frame1['TRI'].value_counts().sort_index()
plt.bar(bin_counts.index, bin_counts.values)

# Add labels and title
plt.xlabel('TRI')
plt.ylabel('Frequency')
plt.title('Histogram of TRI Range')

# Show plot
plt.show()

In [None]:
sns.boxplot(x='mortality', y='LDL', data=data_frame1)

# Add labels and title
plt.xlabel('Mortality')
plt.ylabel('LDL')
plt.title('Box Plot of Value by Category')

# Show plot
plt.show()

In [None]:

data_frame1['LDL']= pd.cut(x=data_frame1['LDL'], bins = [-np.inf, 99.99, 129.99, 159.99, 189.99, np.inf] , right=False, labels=["<100", "100-129.99","130-159.99","160-189.99",">=190"], include_lowest=True)

In [None]:
bin_counts = data_frame1['LDL'].value_counts().sort_index()
plt.bar(bin_counts.index, bin_counts.values)

# Add labels and title
plt.xlabel('LDL')
plt.ylabel('Frequency')
plt.title('Histogram of LDL Range')

# Show plot
plt.show()

In [None]:
sns.boxplot(x='mortality', y='HDL', data=data_frame1)

# Add labels and title
plt.xlabel('Mortality')
plt.ylabel('HDL')
plt.title('Box Plot of Value by Category')

# Show plot
plt.show()

In [None]:

data_frame1['HDL']= pd.cut(x=data_frame1['HDL'], bins = [-np.inf, 39.99, 59.99, np.inf] , right=False, labels=["<40", "40-59.99", ">=60" ], include_lowest=True)

In [None]:
bin_counts = data_frame1['HDL'].value_counts().sort_index()
plt.bar(bin_counts.index, bin_counts.values)

# Add labels and title
plt.xlabel('HDL')
plt.ylabel('Frequency')
plt.title('Histogram of HDL Range')

# Show plot
plt.show()

In [None]:
sns.boxplot(x='mortality', y='FRAILTY', data=data_frame1)

# Add labels and title
plt.xlabel('Mortality')
plt.ylabel('FRAILTY')
plt.title('Box Plot of Value by Category')

# Show plot
plt.show()

In [None]:

data_frame1['FRAILTY']= pd.cut(x=data_frame1['FRAILTY'], bins = [0.00, 0.10, 0.20, 0.30, 0.40,np.inf] , right=False, labels=["Non-frail", "Pre-frail", "Mild", "Moderate", "Severe"], include_lowest=True)

In [None]:
bin_counts = data_frame1['FRAILTY'].value_counts().sort_index()
plt.bar(bin_counts.index, bin_counts.values)

# Add labels and title
plt.xlabel('FRAILTY')
plt.ylabel('Frequency')
plt.title('Histogram of FRAILTY Range')

# Show plot
plt.show()

In [None]:
data_frame1.info()

### HANDLING NULL VALUES

In [None]:
data_frame1 = data_frame1.drop('MICROALB',axis=1)

In [None]:
data_frame1['LDL'] = data_frame1['LDL'].cat.add_categories('Missing')

data_frame1['HDL'] = data_frame1['HDL'].cat.add_categories('Missing')

data_frame1['SERUMALB'] = data_frame1['SERUMALB'].cat.add_categories('Missing')

data_frame1['TRI'] = data_frame1['TRI'].cat.add_categories('Missing')

data_frame1['SERUMCRE'] = data_frame1['SERUMCRE'].cat.add_categories('Missing')


# LDL "<100", "100-129.99","130-159.99","160-189.99",">=190"
# HDL "<40", "40-59.99", ">=60"
# SERUMALB "<3.5", ">=3.5"
# TRI "<150", "150-199.99", ">=200"
# SERUMCRE"<1.5", "1.5-3.0", ">3.0"
data_frame1['LDL'].unique()

In [None]:
data_frame1['HDL'].unique()

In [None]:
data_frame1['LDL'] = data_frame1['LDL'].fillna('Missing')
data_frame1['HDL'] = data_frame1['HDL'].fillna('Missing')
data_frame1['SERUMALB'] = data_frame1['SERUMALB'].fillna('Missing')
data_frame1['TRI'] = data_frame1['TRI'].fillna('Missing')
data_frame1['SERUMCRE'] = data_frame1['SERUMCRE'].fillna('Missing')

In [None]:
null_count = data_frame1.isnull().sum()
pd.set_option('display.max_rows', 70)
print(null_count.sort_values(ascending=False))

### DECODING VARIABLES

In [None]:
# Map the numeric RACE codes to their text labels; values that are not a key
# of the mapping are left unchanged by replace().
dict_race = {1: "White", 2: "Black", 3: "Other"}

race_decoded = data_frame1['RACE'].replace(to_replace=dict_race)
data_frame1['RACE'] = race_decoded
print(data_frame1.head())

### HANDLING IRREGULAR CARDINALITY

In [None]:
data_frame1.info()
# Take a real copy: plain assignment would only bind a second name to the same
# frame, so the column casts applied to data_frame1 afterwards would change
# data_frame_copy as well.
data_frame_copy = data_frame1.copy()

In [None]:
# Cast indicator columns to bool. Only columns whose every value is 0 or 1 are
# converted: casting any other column (binned categories, text labels, columns
# with missing entries) would turn every value into True and erase its
# information.
for col in data_frame1.columns:
    column_values = data_frame1[col]
    if column_values.isin([0, 1]).all():
        data_frame1[col] = column_values.astype(bool)

data_frame1.info()

In [None]:
# Print the value frequencies of every column, framed by separator lines.
for col in data_frame1.columns:
    frequencies = data_frame1[col].value_counts()
    print("##########", frequencies, "##########", sep="\n")