In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

In [3]:
data = pd.read_csv('framingham.csv')
data.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [4]:
# renaming TenYearCHD to CHD
data.rename(columns={"TenYearCHD": "CHD"}, inplace=True)#inplace=True returns None,the data is renamed in place
                                        #inplace=False returns a copy of the object with the operation performed. 

In [5]:
X_train, X_test, y_train, y_test = train_test_split(data.iloc[:,:-1], data.iloc[:,-1], test_size=0.14, random_state=0)

train_data = pd.concat([X_train, y_train], axis=1)
test_data = pd.concat([X_test, y_test], axis=1)

train_data.to_csv('./heart_study_train.csv') #create csv file of each
test_data.to_csv('./heart_study_test.csv')

In [6]:
train_data.iloc[:,:5] #dataframe.iloc[row, column]

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay
1073,1,48,1.0,0,0.0
953,0,39,1.0,1,5.0
2584,0,43,1.0,1,1.0
3068,0,46,3.0,1,30.0
1826,0,38,2.0,0,0.0
...,...,...,...,...,...
1033,0,44,2.0,0,0.0
3264,0,51,1.0,1,2.0
1653,1,39,3.0,1,20.0
2607,0,57,1.0,0,0.0


In [11]:
#From pairplot and heatmap we see that sysBP and diaBP are highly correlated
#And currentSmoker and cigsPerDay are highly correlated
# dropping features which are highly correlated
features_to_drop = ['currentSmoker', 'diaBP']

train_data.drop(features_to_drop, axis=1, inplace=True)

In [12]:
train_data.head()

Unnamed: 0,male,age,education,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,BMI,heartRate,glucose,CHD
1073,1,48,1.0,0.0,0.0,0,0,0,181.0,153.0,29.34,103.0,88.0,0
953,0,39,1.0,5.0,0.0,0,0,0,170.0,137.5,27.35,67.0,70.0,0
2584,0,43,1.0,1.0,0.0,0,0,0,256.0,129.0,25.89,96.0,72.0,0
3068,0,46,3.0,30.0,0.0,0,0,0,196.0,114.0,21.01,60.0,69.0,0
1826,0,38,2.0,0.0,0.0,0,0,0,167.0,102.5,22.58,60.0,57.0,0


In [13]:
#it will impute or drop missing values for all features, whether some features are needed or not
#the steps best for model preparation is : EDA -> Preprocessing (Missing values, Outliers, Normalise etc.) -> Model Fitting and Prediction
#EDA->swarmplot,violinplot,countplot,pairplot,heatmap

missing_values_count = train_data.isnull().sum()
missing_values_count = missing_values_count[missing_values_count > 0]
missing_values_percent = (missing_values_count * 100) / (train_data.shape[0])

print(max(missing_values_percent))

9.133296763576523


In [15]:
#Maximum missing percentage is 9% approx so imputation will be done
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='most_frequent')
#Simple imputer mean is a strategy to handle missing data in a dataset.
#It replaces the missing values with the mean of the column

In [16]:
new_train_data = pd.DataFrame(imputer.fit_transform(train_data))
new_train_data.columns = train_data.columns
new_train_data.index = train_data.index

In [17]:
new_train_data.isnull().sum()

male               0
age                0
education          0
cigsPerDay         0
BPMeds             0
prevalentStroke    0
prevalentHyp       0
diabetes           0
totChol            0
sysBP              0
BMI                0
heartRate          0
glucose            0
CHD                0
dtype: int64

In [18]:
train_data = new_train_data.copy()

In [19]:
#Conclusion of Boxplot :Outliers found in features named ['totChol', 'sysBP', 'BMI','heartRate', 'glucose']
# Outliers handling
#Outliers are those data points that are significantly different from the rest of the dataset. 
print('Number of training examples to be deleted for outliers removal is ',len(train_data[train_data['sysBP'] > 220]) + len(train_data[train_data['BMI'] > 43]) + len(
    train_data[train_data['heartRate'] > 125]) + len(train_data[train_data['glucose'] > 200]) + len(
    train_data[train_data['totChol'] > 450]))

Number of training examples to be deleted for outliers removal is  51


In [20]:
# deleting outliers

train_data = train_data[~(train_data['sysBP'] > 220)]
train_data = train_data[~(train_data['BMI'] > 43)]
train_data = train_data[~(train_data['heartRate'] > 125)]
train_data = train_data[~(train_data['glucose'] > 200)]
train_data = train_data[~(train_data['totChol'] > 450)]
print(train_data.shape)

(3597, 14)


In [21]:
# Standardise some features
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
cols_to_standardise = ['age','totChol','sysBP','BMI', 'heartRate', 'glucose', 'cigsPerDay']#from graph of Normalisation Checking
train_data[cols_to_standardise] = scaler.fit_transform(train_data[cols_to_standardise])

In [22]:
#Test Data Preprocessing Similar to Train Data
# dropping unwanted features as done in train data
test_data.drop(features_to_drop, axis=1, inplace=True)

# imputing missing values if any
imputer = SimpleImputer(strategy='most_frequent')
new_test_data = pd.DataFrame(imputer.fit_transform(test_data))
new_test_data.columns = test_data.columns
new_test_data.index = test_data.index

test_data = new_test_data.copy()


In [23]:
# Standardising features
scaler = StandardScaler()
test_data[cols_to_standardise] = scaler.fit_transform(test_data[cols_to_standardise])

In [None]:
#Logistic Regression

In [24]:
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression()

In [25]:
X_train = train_data.loc[:,train_data.columns != 'CHD']
y_train = train_data.loc[:,'CHD']
X_test = test_data.loc[:, test_data.columns !='CHD']
y_test = test_data.loc[:, 'CHD']

In [35]:
log_reg.fit(X_train, y_train)
y_pred_log = log_reg.predict(X_test)

In [27]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [28]:
log_reg_accuracy = accuracy_score(y_pred_log, y_test) * 100
print('Accuracy Score for logistic regression is %f'%log_reg_accuracy)

Accuracy Score for logistic regression is 85.353535


In [29]:
log_train_score = log_reg.score(X_train, y_train) * 100
print('Train score for Logistic Regression is %f'%log_train_score)
#The score method returns the mean accuracy of the model across all classes
#the accuracy_score method calculates the accuracy for each class separately and returns the average accuracy across all classes

Train score for Logistic Regression is 85.682513


In [30]:
print('Difference between train and test score for Logistic Regression is %f'%(log_train_score - log_reg_accuracy))

Difference between train and test score for Logistic Regression is 0.328978


In [31]:
confusion_matrix(y_pred_log, y_test)
#represents the prediction summary in matrix form
#The matrix compares the actual target values with those predicted by the machine learning model.

array([[500,  87],
       [  0,   7]], dtype=int64)

In [32]:
print(classification_report(y_pred_log, y_test))

              precision    recall  f1-score   support

         0.0       1.00      0.85      0.92       587
         1.0       0.07      1.00      0.14         7

    accuracy                           0.85       594
   macro avg       0.54      0.93      0.53       594
weighted avg       0.99      0.85      0.91       594



In [36]:
#Decision Tree Classifier

In [37]:
from sklearn.tree import DecisionTreeClassifier


In [38]:
dt_clf = DecisionTreeClassifier(min_samples_split=40, random_state=0) 
#min_samples_split->The minimum number of samples required to split a node. 
# for min_samples_split as 180 I got a better accuracy and train score and difference was less
# but f1 score was very bad for positive class
# and setting min_samples_split as 40, we got good results for all metrics

In [39]:
dt_clf.fit(X_train, y_train)
y_pred_dt = dt_clf.predict(X_test)

In [40]:
dt_accuracy = accuracy_score(y_pred_dt, y_test)*100
print('Accuracy score for Decision tree is %f'%dt_accuracy)

Accuracy score for Decision tree is 82.828283


In [41]:
dt_train_score = dt_clf.score(X_train, y_train)*100
print('Train score for Decision tree is %f'%dt_train_score)

Train score for Decision tree is 87.823186


In [42]:
print('Difference between train and test scores for Decision tree is : %f'%(dt_train_score - dt_accuracy))

Difference between train and test scores for Decision tree is : 4.994903


In [43]:
confusion_matrix(y_pred_dt, y_test)

array([[478,  80],
       [ 22,  14]], dtype=int64)

In [44]:
print(classification_report(y_pred_dt, y_test))

              precision    recall  f1-score   support

         0.0       0.96      0.86      0.90       558
         1.0       0.15      0.39      0.22        36

    accuracy                           0.83       594
   macro avg       0.55      0.62      0.56       594
weighted avg       0.91      0.83      0.86       594



In [46]:
# Exporting the tree in text format
from sklearn.tree import export_text
dt_text_format = export_text(dt_clf, feature_names=list(train_data.columns[:13]))
print('Decision tree in text format : \n%s'%dt_text_format)

Decision tree in text format : 
|--- age <= -0.12
|   |--- sysBP <= 2.82
|   |   |--- cigsPerDay <= 0.05
|   |   |   |--- sysBP <= -0.22
|   |   |   |   |--- heartRate <= -1.11
|   |   |   |   |   |--- heartRate <= -1.20
|   |   |   |   |   |   |--- totChol <= -0.69
|   |   |   |   |   |   |   |--- totChol <= -0.79
|   |   |   |   |   |   |   |   |--- BMI <= -1.03
|   |   |   |   |   |   |   |   |   |--- class: 0.0
|   |   |   |   |   |   |   |   |--- BMI >  -1.03
|   |   |   |   |   |   |   |   |   |--- class: 0.0
|   |   |   |   |   |   |   |--- totChol >  -0.79
|   |   |   |   |   |   |   |   |--- class: 0.0
|   |   |   |   |   |   |--- totChol >  -0.69
|   |   |   |   |   |   |   |--- class: 0.0
|   |   |   |   |   |--- heartRate >  -1.20
|   |   |   |   |   |   |--- class: 0.0
|   |   |   |   |--- heartRate >  -1.11
|   |   |   |   |   |--- glucose <= -0.87
|   |   |   |   |   |   |--- age <= -0.70
|   |   |   |   |   |   |   |--- totChol <= -1.52
|   |   |   |   |   |   |   |   |

In [47]:
#Random Forest Classifier

In [48]:
from sklearn.ensemble import RandomForestClassifier

In [49]:
rf_clf = RandomForestClassifier(n_estimators = 150,min_samples_split=10,random_state=0)
#n_estimators->represents the number of trees in the forest.

In [50]:
rf_clf.fit(X_train, y_train)
y_pred_rf = rf_clf.predict(X_test)

In [51]:
rf_accuracy = accuracy_score(y_pred_rf, y_test)*100
print('Accuracy score for Random Forest is %f'%rf_accuracy)

Accuracy score for Random Forest is 84.680135


In [52]:
rf_train_score = rf_clf.score(X_train, y_train)*100
print('Train score for Random Forest is %f'%rf_train_score)

Train score for Random Forest is 91.631915


In [53]:
print('Difference between train and test scores for Random Forest is : %f'%(rf_train_score - rf_accuracy))

Difference between train and test scores for Random Forest is : 6.951781


In [54]:
confusion_matrix(y_pred_rf, y_test)

array([[499,  90],
       [  1,   4]], dtype=int64)

In [55]:
print(classification_report(y_pred_rf, y_test))

              precision    recall  f1-score   support

         0.0       1.00      0.85      0.92       589
         1.0       0.04      0.80      0.08         5

    accuracy                           0.85       594
   macro avg       0.52      0.82      0.50       594
weighted avg       0.99      0.85      0.91       594



In [None]:
#Conclusion:
#Logistic Regression model has least difference(0.328978) in train and test scores and has accuracy of 85% approx So, it does not overfit the data. The F1-score is not good for positive class
#So, Decision Tree Classifier was taken and we saw for min_samples_split as 40, we got almost good accuracy as logistic regression but better f1-score than logistic regression.
#We also tried Random Forest Classifier with n_estimators as 150 and min_samples_split as 10. We got f1-score as the worst in all three models.
#The hyperparameters(A hyperparameter is a parameter whose value is set before the learning process begins.) were tested and tuned repeatedly to get better accuracy with better f1-score.
#Thus, Decision Tree model is recommended for our dataset.