In [8]:
from google.colab import drive
drive.mount('/content/drive', timeout_ms=300000)  # Increased timeout to 5 minutes (300,000 milliseconds)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
import pandas as pd
df= pd.read_csv('/content/drive/MyDrive/datasets_4123_6408_framingham.csv')
df.head()


Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [10]:
df.shape


(4240, 16)

In [11]:
df.drop('education', axis=1, inplace=True)

In [12]:
df.isnull().sum()

male                 0
age                  0
currentSmoker        0
cigsPerDay          29
BPMeds              53
prevalentStroke      0
prevalentHyp         0
diabetes             0
totChol             50
sysBP                0
diaBP                0
BMI                 19
heartRate            1
glucose            388
TenYearCHD           0
dtype: int64

In [13]:
#defining binary columns
bin_cols=["male", "currentSmoker", "prevalentStroke", "prevalentHyp", "diabetes"]

#filling missing values for binary features with most frequent value(mode)

for col in bin_cols:
    mode_values = df[col].mode()[0]
    df[col].fillna(mode_values, inplace=True)



In [14]:
df.isnull().sum()

male                 0
age                  0
currentSmoker        0
cigsPerDay          29
BPMeds              53
prevalentStroke      0
prevalentHyp         0
diabetes             0
totChol             50
sysBP                0
diaBP                0
BMI                 19
heartRate            1
glucose            388
TenYearCHD           0
dtype: int64

In [15]:
import numpy as np
#filling missing values for numeric features

numeric_cols=["cigsPerDay", "BPMeds", "totChol", "BMI", "heartRate", "glucose"]

for col in numeric_cols:
    median_val=df[col].median()
    df[col].fillna(median_val, inplace=True)

df.isnull().sum()

male               0
age                0
currentSmoker      0
cigsPerDay         0
BPMeds             0
prevalentStroke    0
prevalentHyp       0
diabetes           0
totChol            0
sysBP              0
diaBP              0
BMI                0
heartRate          0
glucose            0
TenYearCHD         0
dtype: int64

In [16]:
df["TenYearCHD"].value_counts()
#biased data as more value is of no

TenYearCHD
0    3596
1     644
Name: count, dtype: int64

In [17]:
from sklearn.utils import resample

#separating majority and minority classes

df_majority = df[df["TenYearCHD"]==0]
df_minority = df[df["TenYearCHD"]==1]

#upsample minority class

df_minority_upsampled = resample(df_minority, replace=True, n_samples=len(df_majority), random_state=42)

#combining majority class with upsampled minority class

df_balanced = pd.concat([df_majority, df_minority_upsampled])

In [18]:
df_balanced["TenYearCHD"].value_counts()

TenYearCHD
0    3596
1    3596
Name: count, dtype: int64

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

#separate features (X) and target variable(y)

X = df_balanced.drop(columns=["TenYearCHD"])
y = df_balanced["TenYearCHD"]

#Split data into training and testing sets (80% train, 20% test)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [20]:
X_train

Unnamed: 0,male,age,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose
1905,0,64,0,0.0,0.0,0,1,0,229.0,145.0,85.0,29.67,70.0,74.0
2075,0,37,1,20.0,0.0,0,0,0,166.0,112.0,73.5,21.64,75.0,93.0
1128,0,63,1,10.0,0.0,0,1,0,236.0,189.0,103.0,27.91,60.0,74.0
1782,0,65,0,0.0,0.0,0,1,0,245.0,171.0,89.0,23.07,82.0,93.0
241,1,65,1,15.0,0.0,0,1,0,219.0,148.0,90.0,29.35,77.0,97.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
624,1,63,1,20.0,0.0,0,1,0,269.0,180.0,101.0,24.42,72.0,84.0
485,1,54,1,40.0,0.0,0,0,0,230.0,145.0,90.0,25.72,75.0,85.0
4232,1,68,0,0.0,0.0,0,1,0,176.0,168.0,97.0,23.14,60.0,79.0
952,1,66,1,30.0,0.0,0,0,1,234.0,114.5,62.5,28.62,75.0,216.0


In [21]:
#Initialize standard scaler

scaler = StandardScaler()

#Fit scaler to training data and transform both training and testing data

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [22]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier()
rf_model.fit(X_train_scaled, y_train)
y_pred = rf_model.predict(X_test_scaled)


In [23]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

accuracy_score(y_test, y_pred)



0.9749826268241835

In [24]:
confusion_matrix(y_test, y_pred)

array([[704,  31],
       [  5, 699]])

In [25]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.96      0.98       735
           1       0.96      0.99      0.97       704

    accuracy                           0.97      1439
   macro avg       0.98      0.98      0.97      1439
weighted avg       0.98      0.97      0.97      1439



In [26]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

In [27]:
classifiers = [RandomForestClassifier(), AdaBoostClassifier(), GradientBoostingClassifier(), LogisticRegression(), SVC(), KNeighborsClassifier(), DecisionTreeClassifier(), GaussianNB(), XGBClassifier()]

#create a dictionary to store the result

results = {}
results_df = pd.DataFrame(columns=['Model', 'Accuracy', 'F1-Score', 'Precision', 'Recall']) # Initialize results_df

#Train and evaluate each classifier

for clf in classifiers:
    clf_name = clf.__class__.__name__
    clf.fit(X_train_scaled, y_train)
    y_pred = clf.predict(X_test_scaled)

    #calculate accuracy
    accuracy =  accuracy_score(y_test, y_pred)
    print(f"{clf_name} Accuracy: {accuracy}")

    #classification report
    print(f"Classification Report for {clf_name}:")
    print(classification_report(y_test, y_pred))

    #confusion matrix

    print(f"Confusion Matrix for {clf_name}:")
    print(confusion_matrix(y_test, y_pred))
    print("="*50)

    #calculate evaluation metrics

    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)
    f1_score = report['weighted avg']['f1-score']
    precision = report['weighted avg']['precision']
    recall = report['weighted avg']['recall']

    #append results to dataframe
    results_df = pd.concat([results_df, pd.DataFrame([{'Model': clf_name, 'Accuracy': accuracy, 'F1-Score': f1_score, 'Precision': precision, 'Recall': recall}])], ignore_index=True)

results_df

RandomForestClassifier Accuracy: 0.9715079916608756
Classification Report for RandomForestClassifier:
              precision    recall  f1-score   support

           0       0.99      0.95      0.97       735
           1       0.95      0.99      0.97       704

    accuracy                           0.97      1439
   macro avg       0.97      0.97      0.97      1439
weighted avg       0.97      0.97      0.97      1439

Confusion Matrix for RandomForestClassifier:
[[699  36]
 [  5 699]]


  results_df = pd.concat([results_df, pd.DataFrame([{'Model': clf_name, 'Accuracy': accuracy, 'F1-Score': f1_score, 'Precision': precision, 'Recall': recall}])], ignore_index=True)


AdaBoostClassifier Accuracy: 0.6719944405837387
Classification Report for AdaBoostClassifier:
              precision    recall  f1-score   support

           0       0.69      0.66      0.67       735
           1       0.66      0.68      0.67       704

    accuracy                           0.67      1439
   macro avg       0.67      0.67      0.67      1439
weighted avg       0.67      0.67      0.67      1439

Confusion Matrix for AdaBoostClassifier:
[[486 249]
 [223 481]]
GradientBoostingClassifier Accuracy: 0.7289784572619875
Classification Report for GradientBoostingClassifier:
              precision    recall  f1-score   support

           0       0.76      0.69      0.72       735
           1       0.70      0.77      0.74       704

    accuracy                           0.73      1439
   macro avg       0.73      0.73      0.73      1439
weighted avg       0.73      0.73      0.73      1439

Confusion Matrix for GradientBoostingClassifier:
[[508 227]
 [163 541]]
Logist

Unnamed: 0,Model,Accuracy,F1-Score,Precision,Recall
0,RandomForestClassifier,0.971508,0.971508,0.97241,0.971508
1,AdaBoostClassifier,0.671994,0.672015,0.672474,0.671994
2,GradientBoostingClassifier,0.728978,0.728702,0.73132,0.728978
3,LogisticRegression,0.658791,0.65883,0.659053,0.658791
4,SVC,0.683113,0.683126,0.683656,0.683113
5,KNeighborsClassifier,0.787352,0.783833,0.812481,0.787352
6,DecisionTreeClassifier,0.917999,0.917638,0.928593,0.917999
7,GaussianNB,0.583044,0.530092,0.635597,0.583044
8,XGBClassifier,0.906185,0.905977,0.912148,0.906185


In [28]:
# # from sklearn.ensemble import RandomForestClassifier
# # from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# # from sklearn.model_selection import train_test_split # import train_test_split
# # from sklearn.preprocessing import StandardScaler # import StandardScaler


# # Assuming 'data' is your pandas DataFrame and 'target_variable' is the name of your target variable column
# X = data.drop('target_variable', axis=1)  # Replace 'target_variable' with the actual column name
# y = data['target_variable']
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) # Split the dataset



# # Create an instance of StandardScaler
# scaler = StandardScaler()

# X_train_scaled = scaler.fit_transform(X_train) # Scale training data
# X_test_scaled = scaler.transform(X_test) # Scale testing data


#instantiate the RandomForestClassifier
rf_classifier = RandomForestClassifier()


#train the randomForestClassifier
rf_classifier.fit(X_train_scaled, y_train)


#predict on the test set
y_pred_rf= rf_classifier.predict(X_test_scaled)


#calculate accuracy
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print("Random Forest Classifier Accuracy: ", accuracy_rf)



#classification report
print("Classification Report for Random Forest Classifier: ")
print(classification_report(y_test, y_pred_rf))


#confusion matrix
print("Confusion Matrix for Random Forest Classifier: ")
print(confusion_matrix(y_test, y_pred_rf))

Random Forest Classifier Accuracy:  0.9728978457261988
Classification Report for Random Forest Classifier: 
              precision    recall  f1-score   support

           0       0.99      0.95      0.97       735
           1       0.95      0.99      0.97       704

    accuracy                           0.97      1439
   macro avg       0.97      0.97      0.97      1439
weighted avg       0.97      0.97      0.97      1439

Confusion Matrix for Random Forest Classifier: 
[[701  34]
 [  5 699]]


In [29]:
#test1
print("predicted class ", rf_classifier.predict(X_test_scaled[10].reshape(1,-1))[0])
print("actual class", y_test.iloc[10])

#test2
print("predicted class", rf_classifier.predict(X_test_scaled[200].reshape(1,-1))[0])
print("actual class", y_test.iloc[200])

#test3
print("predicted class", rf_classifier.predict(X_test_scaled[200].reshape(1,-1))[0])
print("actual class", y_test.iloc[179])

predicted class  0
actual class 0
predicted class 1
actual class 1
predicted class 1
actual class 1


In [30]:
import pickle
import os

os.makedirs("models", exist_ok=True)

# Make sure this cell has been executed before running this code
pickle.dump(rf_classifier, open("models/rf_classifier.pkl", 'wb'))
pickle.dump(scaler, open("models/scaler.pkl", 'wb'))

In [31]:
#load the RandomForestClassifier model

with open("models/rf_classifier.pkl", "rb") as file:
    rf_classifier = pickle.load(file)

#load the scaler

with open("models/scaler.pkl", "rb") as file:
    scaler = pickle.load(file)

In [34]:
def pred(rf_classifier, scaler, male, age, currentSmoker, cigsPerDay, BPMeds, prevalentStroke, prevalentHyp, diabetes, totChol, sysBP, diaBP, BMI, heartRate, glucose):

    #encoding categorical variable

    male_encoded = 1 if male.lower() == "male" else 0
    currentSmoker_encoded = 1 if currentSmoker.lower() =="yes" else 0
    BPMeds_encoded = 1 if BPMeds.lower() == "yes" else 0
    prevalentStroke_encoded = 1 if prevalentStroke.lower() == "yes" else 0
    prevalentHyp_encoded = 1 if prevalentHyp.lower() == "yes" else 0
    diabetes_encoded = 1 if diabetes.lower() == "yes" else 0


    #prepare features array
    features = [[male_encoded, age, currentSmoker_encoded, cigsPerDay, BPMeds_encoded, prevalentStroke_encoded, prevalentHyp_encoded, diabetes_encoded, totChol, sysBP, diaBP, BMI, heartRate, glucose]]

    #scalling
    scaled_features = scaler.transform(features)

    #predict by model
    result = rf_classifier.predict(scaled_features)

    return result[0]

In [36]:
#test 1

male = "female"
age = 56.00
currentSmoker = "yes"
cigsPerDay = 0.00
BPMeds = "yes"
prevalentStroke = "no"
prevalentHyp = "yes"
diabetes = "no"
totChol = 280.00
sysBP = 145.00
diaBP = 100.90
BMI = 26.00
heartRate = 80.00
glucose = 78.0

result = pred(rf_classifier, scaler, male, age, currentSmoker, cigsPerDay, BPMeds, prevalentStroke, prevalentHyp, diabetes, totChol, sysBP, diaBP, BMI, heartRate, glucose )

if result==1:
    print("This patient has heart disease...")
else:
    print("This patient has no heart disease...")


This patient has no heart disease...




In [38]:
#test 2

male = "male"
age = 50.00
currentSmoker = "yes"
cigsPerDay = 7.00
BPMeds = "yes"
prevalentStroke = "yes"
prevalentHyp = "yes"
diabetes = "yes"
totChol = 280.00
sysBP = 145.00
diaBP = 100.90
BMI = 26.00
heartRate = 80.00
glucose = 78.0

result = pred(rf_classifier, scaler, male, age, currentSmoker, cigsPerDay, BPMeds, prevalentStroke, prevalentHyp, diabetes, totChol, sysBP, diaBP, BMI, heartRate, glucose )

if result==1:
    print("This patient has heart disease...")
else:
    print("This patient has no heart disease...")


This patient has no heart disease...


