In [1]:
### Author : Prasad Meesala
# Importing the necessary modules

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model, svm, ensemble, model_selection

In [7]:
# Collecting the data

df = pd.read_csv(r"C:\Users\meesa\Desktop\VSC-ML\Datasets\framingham.csv")
print(df.info())
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4238 entries, 0 to 4237
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   male             4238 non-null   int64  
 1   age              4238 non-null   int64  
 2   education        4133 non-null   float64
 3   currentSmoker    4238 non-null   int64  
 4   cigsPerDay       4209 non-null   float64
 5   BPMeds           4185 non-null   float64
 6   prevalentStroke  4238 non-null   int64  
 7   prevalentHyp     4238 non-null   int64  
 8   diabetes         4238 non-null   int64  
 9   totChol          4188 non-null   float64
 10  sysBP            4238 non-null   float64
 11  diaBP            4238 non-null   float64
 12  BMI              4219 non-null   float64
 13  heartRate        4237 non-null   float64
 14  glucose          3850 non-null   float64
 15  TenYearCHD       4238 non-null   int64  
dtypes: float64(9), int64(7)
memory usage: 529.9 KB
None


Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [3]:
# Data preprocessing and data wrangling

df.dropna(inplace = True)
print(df.info())
df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3656 entries, 0 to 4237
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   male             3656 non-null   int64  
 1   age              3656 non-null   int64  
 2   education        3656 non-null   float64
 3   currentSmoker    3656 non-null   int64  
 4   cigsPerDay       3656 non-null   float64
 5   BPMeds           3656 non-null   float64
 6   prevalentStroke  3656 non-null   int64  
 7   prevalentHyp     3656 non-null   int64  
 8   diabetes         3656 non-null   int64  
 9   totChol          3656 non-null   float64
 10  sysBP            3656 non-null   float64
 11  diaBP            3656 non-null   float64
 12  BMI              3656 non-null   float64
 13  heartRate        3656 non-null   float64
 14  glucose          3656 non-null   float64
 15  TenYearCHD       3656 non-null   int64  
dtypes: float64(9), int64(7)
memory usage: 485.6 KB
None


Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [5]:
# Training the models with the data

X = df.drop("TenYearCHD", axis = 1).values
y = df['TenYearCHD'].values

X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size = 0.3)

lg = linear_model.LogisticRegression(max_iter = 10000)
svmc = svm.SVC(kernel = "linear")
rf = ensemble.RandomForestClassifier(n_estimators = 50)

lg.fit(X_train, y_train)
svmc.fit(X_train, y_train)
rf.fit(X_train, y_train)

print(lg.score(X_test, y_test), svmc.score(X_test, y_test), rf.score(X_test, y_test))

0.8623518687329079 0.8587055606198724 0.8559708295350957


In [6]:
# Splitting the data into folds and evaluating the model performance

avg = lambda x : sum(x) / len(x)

lg_scores = []
svmc_scores, rf_scores = lg_scores.copy(), lg_scores.copy()

folds = model_selection.StratifiedKFold(n_splits = 4)

for train_index, test_index in folds.split(X, y):
    X_train, X_test, y_train, y_test = X[train_index], X[test_index], y[train_index], y[test_index]

    lg = linear_model.LogisticRegression(max_iter = 10000)
    svmc = svm.SVC(kernel = "linear")
    rf = ensemble.RandomForestClassifier(n_estimators = 50)

    lg.fit(X_train, y_train)
    svmc.fit(X_train, y_train)
    rf.fit(X_train, y_train)

    lg_scores.append(lg.score(X_test, y_test))
    svmc_scores.append(svmc.score(X_test, y_test))
    rf_scores.append(rf.score(X_test, y_test))

print(avg(lg_scores), avg(svmc_scores), avg(rf_scores))

0.8536652078774617 0.8476477024070022 0.8471006564551423
