In [None]:
!pip install pgmpy

# Necessary Library Imports

In [None]:
import numpy as np 
import pandas as pd
import os

import matplotlib.pyplot as plt
import seaborn as sns
from pgmpy.models import BayesianModel

Metric libraries

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

Read CSV data file

In [None]:
# Show the contents of the input directory that holds framingham.csv.
os.listdir("../input/framingham/")

In [None]:
# Relative path of the CSV file that is loaded into `df` below.
data_link = "../input/framingham/framingham.csv"

In [None]:
# Load the CSV into the working DataFrame used by every later cell.
df = pd.read_csv(data_link)

Properties

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# Summary statistics of the numeric columns.
df.describe()

In [None]:
# Column names, non-null counts and dtypes.
df.info()

In [None]:
# (rows, columns) of the raw frame.
df.shape

In [None]:
# dtype of each column.
df.dtypes

In [None]:
# Count the missing values of every column and chart the counts.
nulls = df.isna().sum()
nulls.plot.bar()


In [None]:
# Same per-column missing-value counts, as plain numbers.
print(nulls)

In [None]:
# Show every row that has a missing value in at least one column.
df.loc[df.isna().any(axis="columns")]

582 rows contain at least one null value.

In [None]:
# Fill the gaps in the continuous measurements with the column mean.
for col in ("BMI", "cigsPerDay", "heartRate", "totChol"):
    df[col] = df[col].fillna(df[col].mean())

# BPMeds is filled with its most frequent value instead of a mean.
df["BPMeds"] = df["BPMeds"].fillna(df["BPMeds"].mode()[0])

In [None]:
# Store age with pandas' categorical dtype.
df['age'] = df['age'].astype("category")

In [None]:
# Recount the missing values after imputation: bar chart first, then the raw counts.
nulls = df.isna().sum()
nulls.plot.bar()
print(nulls)

Drop the rows that still contain null values

In [None]:
# Discard every row that still has a missing value in any column.
df = df.dropna()

In [None]:
# (rows, columns) left after dropping the incomplete rows.
df.shape

DATA CLEANING

In [None]:
def systolic_bp_eval(sys_bp):
    """Bucket a systolic blood-pressure reading (mm Hg) into a risk class.

    Returns:
        0 -- normal: below 120
        1 -- at risk: from 120 up to, but not including, 140
        2 -- hypertension: 140 or higher (also the result for any value
             that fails both comparisons, such as NaN)
    """
    if sys_bp < 120:
        return 0
    if sys_bp < 140:
        return 1
    return 2

In [None]:
def diast_bp_eval(d_bp):
    """Bucket a diastolic blood-pressure reading into a risk class.

    Returns:
        0 -- below 80
        1 -- from 80 up to, but not including, 90
        2 -- 90 or higher (also the result for any value that fails both
             comparisons, such as NaN)
    """
    if d_bp < 80:
        return 0
    if d_bp < 90:
        return 1
    return 2

In [None]:
def total_cholestrol_eval(t_chol):
    """Flag a total-cholesterol reading: 1 when it is 240 or above, otherwise 0."""
    return 1 if t_chol >= 240 else 0

In [None]:
def to_range(x, interval=10):
    """Return the label of the fixed-width bin that contains ``x``.

    The lower bound is ``int(x / interval) * interval`` (truncated toward
    zero) and the upper bound is ``interval - 1`` above it, so with the
    default width 45 maps to "40-49".

    Args:
        x: numeric value to bin.
        interval: width of each bin.

    Returns:
        The string "<lower>-<upper>".
    """
    lower = int(x / interval) * interval
    upper = lower + (interval - 1)
    return f"{lower}-{upper}"

In [None]:
# Replace the clinical measurements with their risk class.
df['sysBP'] = df['sysBP'].apply(systolic_bp_eval)
df['diaBP'] = df['diaBP'].apply(diast_bp_eval)
df['totChol'] = df['totChol'].apply(total_cholestrol_eval)

# Replace the remaining continuous columns with "<lower>-<upper>" bin labels (default bin width).
for col in ("age", "cigsPerDay", "glucose", "heartRate", "BMI"):
    df[col] = df[col].apply(to_range)

In [None]:
# Preview the frame after discretisation.
df.head()

## String encoder

In [None]:
from sklearn import preprocessing
from collections import defaultdict

# One LabelEncoder per column, created on first access and kept in `d` under the column name.
d = defaultdict(preprocessing.LabelEncoder)
# Replace every column of df (not only the string-valued ones) with its integer label codes.
df = df.apply(lambda x: d[x.name].fit_transform(x))

In [None]:
# Preview the label-encoded frame.
df.head()

# Feature Selection

We will use the best 10 features for modelling

In [None]:
# Identify the features with the most importance for the outcome variable Heart Disease

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Separate the candidate predictors from the outcome.
# NOTE(review): the slice below keeps only the first 14 columns, so any column located
# between position 14 and the last column is excluded from the ranking — confirm this is intended.
X = df.iloc[:,0:14]  # candidate predictor columns
y = df.iloc[:,-1]    # outcome: the last column of df

# Fit a chi-squared SelectKBest (k=10); only its per-feature scores are used below
best_features = SelectKBest(score_func=chi2, k=10)
fit = best_features.fit(X,y)
df_scores = pd.DataFrame(fit.scores_)
df_columns = pd.DataFrame(X.columns)

# Put feature names and scores side by side in one table
feature_scores = pd.concat([df_columns,df_scores],axis=1)
feature_scores.columns = ['Features','Score']  # name the two columns
print(feature_scores.nlargest(11,'Score'))  # show the 11 highest-scoring features

In [None]:
# Order the score table from the highest to the lowest score.
feature_scores = feature_scores.sort_values(by='Score', ascending=False)

In [None]:
# Display the sorted score table.
feature_scores

In [None]:
# Bar chart of the chi-squared score of every candidate feature.
plt.figure(figsize=(20, 5))
sns.barplot(data=feature_scores, x='Features', y='Score', palette="GnBu_d")
plt.box(False)
plt.title('Feature importance', fontsize=16)
plt.xlabel('\n Features', fontsize=14)
plt.ylabel('Importance \n', fontsize=14)
plt.xticks(rotation=70, fontsize=12)
plt.yticks(fontsize=12)
plt.show()

In [None]:
# Names of the first ten rows of the score table (sorted by descending score above).
features_ten = feature_scores["Features"].head(10).tolist()
features_ten

In [None]:
# Add the target to the selected columns. The membership check keeps the cell idempotent:
# re-running it must not append the name twice, which would duplicate the column in df[features_ten].
if "TenYearCHD" not in features_ten:
    features_ten.append("TenYearCHD")


In [None]:
# Selected feature names followed by the target column.
features_ten

In [None]:

# Keep only the selected features plus the target.
# .copy() makes the result an independent frame, so the column assignment in the
# following cell does not write into a slice of the previous df.
df = df[features_ten].copy()

In [None]:
# Cast any float64 column to int.
# np.float64 replaces the former `np.float` alias, which current NumPy releases no longer
# provide (the alias resolved to the builtin float, i.e. float64, so the selection is unchanged).
float_vals = df.dtypes[df.dtypes == np.float64].index.tolist()
if float_vals:
    # astype on the whole frame returns a new DataFrame instead of assigning into a slice.
    df = df.astype({col: int for col in float_vals})

Handling Data Imbalance

In [None]:
# Shuffle df
shuffled_df = df.sample(frac=1,random_state=4)

# Put all the fraud class in a separate dataset.
CHD_df = shuffled_df.loc[shuffled_df['TenYearCHD'] == 1]

#Randomly select 492 observations from the non-fraud (majority class)
non_CHD_df = shuffled_df.loc[shuffled_df['TenYearCHD'] == 0].sample(n=611,random_state=42)

# Concatenate both dataframes again
normalized_df = pd.concat([CHD_df, non_CHD_df])

# check new class counts
normalized_df.TenYearCHD.value_counts()


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rebalanced frame for testing; the fixed seed keeps the split repeatable.
train, test = train_test_split(normalized_df,test_size=0.30,random_state = 340)

# Bayesian Belief Networks

In [None]:
# Columns available for the network structure defined next.
features_ten

In [None]:
# Edge list of the Bayesian network, handed to BayesianModel below.
# pgmpy reads each tuple as a directed edge (parent, child).
architecture = [
    ('age', 'TenYearCHD'),
    ('prevalentHyp', 'age'),
    ('sysBP', 'age'),
    ('diaBP', 'age'),
    ('diabetes', 'age'),
    ('BPMeds', 'age'),
    ('cigsPerDay', 'age'),
    ('prevalentStroke', 'BPMeds'),
]


In [None]:
# Build the network structure from the edge list; its parameters are learned later by model.fit.
# NOTE(review): newer pgmpy releases appear to expose this class under the name BayesianNetwork —
# confirm against the installed pgmpy version.
model = BayesianModel(architecture)

In [None]:
import networkx as nx

# Draw the network's directed graph with node labels.
# `plt` is the matplotlib.pyplot module imported at the top of the notebook. It is no longer
# rebound to `pylab` here, which would silently change what `plt` means for every later cell.
nx.draw(model, with_labels=True)
plt.show()

In [None]:
# Flatten the edge list into the node names it mentions (duplicates are removed in the next cell).
columns = [node for edge in architecture for node in edge]


In [None]:
# De-duplicate while keeping first-seen order. A set would give a column order that can
# change between kernel sessions (string hashing is randomised), which makes the
# column order fed to the models non-reproducible.
columns = list(dict.fromkeys(columns))
columns

In [None]:
import time

In [None]:
# Learn the network parameters from the training rows and time the fit.
# perf_counter is a monotonic, high-resolution clock meant for measuring durations;
# time.time() follows the wall clock and can jump if the system time is adjusted.
start = time.perf_counter()
model.fit(train[columns])
end = time.perf_counter()

bayes_duration = end - start

In [None]:
# Keep only the predictor columns. Filtering (instead of list.remove) makes the cell safe
# to re-run: remove() raises ValueError once the target is already gone.
columns = [col for col in columns if col != 'TenYearCHD']

In [None]:
# Predict the target for the first 200 test rows (predictor columns only).
samples = test.loc[:, columns].head(200)
predictions = model.predict(samples)

In [None]:
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score, accuracy_score

# Score the Bayesian predictions against the true labels of the sampled rows.
locs = list(samples.index)
precision, recall, f_score, _ = precision_recall_fscore_support(
    test.loc[locs, 'TenYearCHD'], predictions, average='macro'
)
accuracy = accuracy_score(test.loc[locs, 'TenYearCHD'], predictions)

# Macro-averaged metrics plus the fit duration, keyed by metric name.
results_dict = dict(
    precision=precision,
    recall=recall,
    f_score=f_score,
    accuracy=accuracy,
    time=bayes_duration,
)

In [None]:
# Collect the metrics of every model under the model's name, starting with the Bayesian network.
overall_results = {'Bayesian': results_dict}

In [None]:
# ROC AUC of `predictions` against the true labels of the sampled test rows.
roc_auc_score(test.loc[locs,'TenYearCHD'], predictions)

In [None]:
# Raw ROC curve arrays (false-positive rates, true-positive rates, thresholds) for the same predictions.
roc_curve(test.loc[locs,'TenYearCHD'], predictions)

In [None]:
# ROC curve of the Bayesian network predictions.
# The ROC curve plots the true-positive rate against the false-positive rate, and the area
# under it (AUC) measures performance: the closer the score is to 1, the better the model
# distinguishes between the two outcome classes.

fpr, tpr, _ = roc_curve(test.loc[locs,'TenYearCHD'], predictions)
auc = roc_auc_score(test.loc[locs,'TenYearCHD'], predictions)
plt.plot(fpr,tpr,label="data 1, auc="+str(auc))
plt.legend(loc=4)
plt.box(False)
# The title used to say "KNN"; at this point `predictions` comes from the Bayesian network.
plt.title('ROC CURVE Bayesian Network')
plt.show()

# Format as a percentage directly: round(auc, 3) * 100 can print float artefacts
# such as 61.199999999999996.
print(f"The score for the AUC ROC Curve is: {auc * 100:.1f}%")

# Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest so the fitted trees, and the metrics reported below, are identical on every run.
rfc_model = RandomForestClassifier(random_state=42)

In [None]:
# Fit the random forest on the same predictor columns and time the fit.
# perf_counter is a monotonic clock meant for durations. The target is selected by name
# rather than by position, matching how every other cell refers to it.
start = time.perf_counter()
rfc_model.fit(train[columns], train['TenYearCHD'])
end = time.perf_counter()

forest_duration = end - start



In [None]:
# Predict the target for the same first 200 test rows with the random forest.
samples = test.loc[:, columns].head(200)
predictions = rfc_model.predict(samples)

In [None]:
# Score the random-forest predictions against the true labels of the sampled rows.
locs = list(samples.index)
precision, recall, f_score, _ = precision_recall_fscore_support(
    test.loc[locs, 'TenYearCHD'], predictions, average='macro'
)
accuracy = accuracy_score(test.loc[locs, 'TenYearCHD'], predictions)

# Macro-averaged metrics plus the fit duration, keyed by metric name.
results_dict = dict(
    precision=precision,
    recall=recall,
    f_score=f_score,
    accuracy=accuracy,
    time=forest_duration,
)

In [None]:
# Store the random-forest metrics next to the Bayesian ones.
overall_results['random forest'] = results_dict

In [None]:
# Transpose so that each model is a row and each metric a column.
results_df = pd.DataFrame(overall_results).T

In [None]:
# One bar chart per metric, each drawn on its own figure and saved as "<metric>.png".
# Without a fresh figure per iteration every Series.plot call reuses the current axes, so the
# bars of earlier metrics pile up in the later charts and in the saved files.
for col in list(results_df.columns):
    fig, ax = plt.subplots()
    results_df[col].plot(kind="bar", ax=ax)
    ax.set_title(col)
    fig.savefig(col+".png")
plt.show()

In [None]:
# Final comparison table of both models.
results_df