**UCD_PA Specialist Certificate in Data Analytics Essentials -  Machine Learning (ML) Assignment**
# Assignment Title: Predicting the 10-Year Risk of Future Coronary Heart Disease (CHD) with Machine Learning

## DataSet Description

DataSet Source: [framingham-heart-study-dataset](https://www.kaggle.com/datasets/aasheesh200/framingham-heart-study-dataset)

## Import Libraries

In [3]:
# import libraries

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# data visualisation
import seaborn as sns   #Seaborn for data visualisation
import matplotlib.pyplot as plt
#setting the standard color code ..styling
sns.set(color_codes=True) 
# sklearn libraries
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

# imblearn libraries
from imblearn.over_sampling import RandomOverSampler


# import warnings
# warnings.simplefilter(action="ignore", category=FutureWarning)

## Custom Python Functions

In [3]:
## plot distributions of features in the supplied data set
def plot_feature_distributions(dataframe, figsize=(20, 10)):
    """
    Plot a grid of histograms for the columns of the dataframe.

    Parameters:
        dataframe: Pandas dataframe
        figsize: (width, height) of the whole figure, passed straight to
            ``DataFrame.hist``. Defaults to (20, 10).

    Returns:
        None. Displays the histogram grid.
    """
    dataframe.hist(figsize=figsize)
    # Keep subplot titles and tick labels from overlapping in the grid
    plt.tight_layout()
    plt.show()


In [None]:
def get_model_results(df, target, sampler, scaler, model):
    """
    Build and evaluate a model
    - prepare and split dataframe into X and y (via prep_data)
    - split into training and test data, test size=0.3, random_state=42
    - resample the training data only (the test split is left untouched)
    - build and fit a scaler + model pipeline
    - evaluate and print model scores
        - ROC AUC score (only when the pipeline exposes predict_proba)
        - Classification Report
        - Confusion Matrix

    Parameters:
        df: dataframe
        target: label / classifier column name
        sampler: resampling object exposing fit_resample(X, y)
        scaler: scaling step placed first in the pipeline
        model: ML algorithm (e.g. sklearn LogisticRegression)

    Returns:
        None. Prints the model scores.

    """
    # Prep Data
    X, y = prep_data(df=df, target=target)

    # Split Data into training and test data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Resample the training split only, so the test scores reflect the
    # original class balance
    X_resampled, y_resampled = sampler.fit_resample(X_train, y_train)

    # Create pipeline: scaler first, then the estimator
    pipeline = make_pipeline(scaler, model)

    # Fit the pipeline on the resampled training set
    pipeline.fit(X_resampled, y_resampled)

    # Obtain the predicted labels for the test set
    predicted = pipeline.predict(X_test)

    # Only the predict_proba call is guarded: an estimator without probability
    # output is reported and skipped, while errors in the scoring itself
    # still surface.
    try:
        probs = pipeline.predict_proba(X_test)
    except AttributeError:
        print('ROC Score: not available (model does not provide predict_proba)')
    else:
        print('ROC Score:')
        # Column 1 holds the probability of the second (positive) class
        print(roc_auc_score(y_test, probs[:, 1]))

    # Print the classification report and confusion matrix
    print('\nClassification Report:')
    print(classification_report(y_test, predicted))
    print('\nConfusion Matrix:')
    print(confusion_matrix(y_test, predicted))

In [1]:
# Evaluate model
def evaluate_LR(y_test, y_pred):
    """
    Evaluate a binary classification model
     - Print the classification report
     - Calculate accuracy, precision and F1
     - Calculate and plot the confusion matrix

    Parameters:
        y_test: true labels of the test data
        y_pred: predicted labels for the test data

    Returns:
        cm: confusion matrix (rows = true labels, columns = predicted labels)
        scores: tuple of (accuracy, precision, f1)

    Side effects:
        Prints the classification report and displays the confusion matrix
        heatmap.

    """

    # Print classification report
    print(classification_report(y_test, y_pred))

    # Calculate the accuracy
    acc = accuracy_score(y_test, y_pred)

    # Calculate the precision
    precision = precision_score(y_test, y_pred)

    # Calculate the F1 score
    f1 = f1_score(y_test, y_pred)

    scores = (acc, precision, f1)

    # Show confusion matrix.
    # sklearn lays the matrix out with true labels on the rows and predicted
    # labels on the columns, in sorted label order. For 0/1 labels that gives
    # [[TN, FP], [FN, TP]].
    cm = confusion_matrix(y_test, y_pred)
    annotations = [
        [f"TN={cm[0][0]:.0f}", f"FP={cm[0][1]:.0f}"],
        [f"FN={cm[1][0]:.0f}", f"TP={cm[1][1]:.0f}"],
    ]
    ax = sns.heatmap(cm, annot=annotations, fmt='', cmap="Blues")
    ax.set_xlabel("Predicted label")
    ax.set_ylabel("True label")
    plt.show()
    return cm, scores

## Import Dataset

In [5]:
# Read the Framingham CSV file into a pandas dataframe
df = pd.read_csv(filepath_or_buffer='DataSets/framingham.csv')

# Report the dataframe dimensions as (rows, columns)
print(df.shape)


(4238, 16)


Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


## Exploratory Data Analysis

In [6]:
# Preview the first 5 rows of the dataframe
df.head(n=5)

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [7]:
# Summary statistics, transposed so that each feature is a row
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
male,4238.0,0.429212,0.495022,0.0,0.0,0.0,1.0,1.0
age,4238.0,49.584946,8.57216,32.0,42.0,49.0,56.0,70.0
education,4133.0,1.97895,1.019791,1.0,1.0,2.0,3.0,4.0
currentSmoker,4238.0,0.494101,0.500024,0.0,0.0,0.0,1.0,1.0
cigsPerDay,4209.0,9.003089,11.920094,0.0,0.0,0.0,20.0,70.0
BPMeds,4185.0,0.02963,0.169584,0.0,0.0,0.0,0.0,1.0
prevalentStroke,4238.0,0.005899,0.076587,0.0,0.0,0.0,0.0,1.0
prevalentHyp,4238.0,0.310524,0.462763,0.0,0.0,0.0,1.0,1.0
diabetes,4238.0,0.02572,0.158316,0.0,0.0,0.0,0.0,1.0
totChol,4188.0,236.721585,44.590334,107.0,206.0,234.0,263.0,696.0


In [9]:
# Show each column's dtype and non-null count, plus overall memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4238 entries, 0 to 4237
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   male             4238 non-null   int64  
 1   age              4238 non-null   int64  
 2   education        4133 non-null   float64
 3   currentSmoker    4238 non-null   int64  
 4   cigsPerDay       4209 non-null   float64
 5   BPMeds           4185 non-null   float64
 6   prevalentStroke  4238 non-null   int64  
 7   prevalentHyp     4238 non-null   int64  
 8   diabetes         4238 non-null   int64  
 9   totChol          4188 non-null   float64
 10  sysBP            4238 non-null   float64
 11  diaBP            4238 non-null   float64
 12  BMI              4219 non-null   float64
 13  heartRate        4237 non-null   float64
 14  glucose          3850 non-null   float64
 15  TenYearCHD       4238 non-null   int64  
dtypes: float64(9), int64(7)
memory usage: 529.9 KB


In [10]:
# Count rows that exactly repeat an earlier row
df.duplicated(keep='first').sum()

0

In [11]:
# Count missing values in each column
df.isnull().sum()

male                 0
age                  0
education          105
currentSmoker        0
cigsPerDay          29
BPMeds              53
prevalentStroke      0
prevalentHyp         0
diabetes             0
totChol             50
sysBP                0
diaBP                0
BMI                 19
heartRate            1
glucose            388
TenYearCHD           0
dtype: int64