In [58]:
from sklearn.feature_selection import mutual_info_regression
import os
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd
import plotly.express as px
%matplotlib inline

sns.set_style('darkgrid')
matplotlib.rcParams['font.size'] = 14
matplotlib.rcParams['figure.figsize'] = (10, 6)
matplotlib.rcParams['figure.facecolor'] = '#00000000'

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, confusion_matrix, classification_report)
from sklearn.exceptions import NotFittedError

from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier,
                              GradientBoostingClassifier, BaggingClassifier, HistGradientBoostingClassifier)
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier

from imblearn.under_sampling import RandomUnderSampler, TomekLinks, ClusterCentroids, NearMiss, EditedNearestNeighbours
from imblearn.over_sampling import ADASYN, SMOTE, RandomOverSampler
from imblearn.pipeline import Pipeline

In [59]:
data= pd.read_csv('/kaggle/input/framingham/framingham.csv')
data

Unnamed: 0,Sex,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,male,39,4.0,No,0.0,0.0,0,0,No,195.0,106.0,70.0,26.97,80.0,77.0,0
1,female,46,2.0,No,0.0,0.0,0,0,No,250.0,121.0,81.0,28.73,95.0,76.0,0
2,male,48,1.0,Yes,20.0,0.0,0,0,No,245.0,127.5,80.0,25.34,75.0,70.0,0
3,female,61,3.0,Yes,30.0,0.0,0,1,No,225.0,150.0,95.0,28.58,65.0,103.0,1
4,female,46,3.0,Yes,23.0,0.0,0,0,No,285.0,130.0,84.0,23.10,85.0,85.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4235,female,48,2.0,Yes,20.0,,0,0,No,248.0,131.0,72.0,22.00,84.0,86.0,0
4236,female,44,1.0,Yes,15.0,0.0,0,0,No,210.0,126.5,87.0,19.16,86.0,,0
4237,female,52,2.0,No,0.0,0.0,0,0,No,269.0,133.5,83.0,21.47,80.0,107.0,0
4238,male,40,3.0,No,0.0,0.0,0,1,No,185.0,141.0,98.0,25.60,67.0,72.0,0


In [60]:
negative_class = data[data['TenYearCHD'] == 0]
positive_class = data[data['TenYearCHD'] == 1]

negative_sample = negative_class.sample(n=60, random_state=42)
positive_sample = positive_class.sample(n=40, random_state=42)  

df = pd.concat([negative_sample, positive_sample])
data = data.drop(df.index)  
df.head(100)

Unnamed: 0,Sex,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
3615,male,38,3.0,Yes,20.0,0.0,0,0,No,270.0,120.0,75.0,23.76,83.0,96.0,0
1488,male,37,4.0,Yes,60.0,0.0,0,0,No,254.0,122.5,82.5,23.87,88.0,83.0,0
3824,female,39,3.0,Yes,8.0,0.0,0,0,No,192.0,109.0,61.0,23.36,64.0,84.0,0
1267,female,46,3.0,No,0.0,0.0,0,0,No,190.0,128.0,74.0,23.01,95.0,78.0,0
2022,female,45,2.0,Yes,9.0,0.0,0,0,No,208.0,137.0,82.0,24.35,85.0,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1297,female,55,2.0,Yes,20.0,0.0,0,0,No,250.0,138.0,87.0,25.33,95.0,,1
619,female,51,1.0,Yes,20.0,0.0,0,0,No,264.0,139.5,89.0,29.38,70.0,76.0,1
2404,male,51,1.0,Yes,15.0,0.0,0,0,No,220.0,125.0,82.0,24.10,60.0,73.0,1
1298,male,40,1.0,No,0.0,0.0,0,1,No,175.0,173.0,59.0,27.99,70.0,75.0,1


In [61]:
data

Unnamed: 0,Sex,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,male,39,4.0,No,0.0,0.0,0,0,No,195.0,106.0,70.0,26.97,80.0,77.0,0
1,female,46,2.0,No,0.0,0.0,0,0,No,250.0,121.0,81.0,28.73,95.0,76.0,0
2,male,48,1.0,Yes,20.0,0.0,0,0,No,245.0,127.5,80.0,25.34,75.0,70.0,0
3,female,61,3.0,Yes,30.0,0.0,0,1,No,225.0,150.0,95.0,28.58,65.0,103.0,1
4,female,46,3.0,Yes,23.0,0.0,0,0,No,285.0,130.0,84.0,23.10,85.0,85.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4235,female,48,2.0,Yes,20.0,,0,0,No,248.0,131.0,72.0,22.00,84.0,86.0,0
4236,female,44,1.0,Yes,15.0,0.0,0,0,No,210.0,126.5,87.0,19.16,86.0,,0
4237,female,52,2.0,No,0.0,0.0,0,0,No,269.0,133.5,83.0,21.47,80.0,107.0,0
4238,male,40,3.0,No,0.0,0.0,0,1,No,185.0,141.0,98.0,25.60,67.0,72.0,0


In [62]:
import pandas as pd
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import EditedNearestNeighbours
from collections import Counter

data['Sex'] = data['Sex'].map({'male': 1, 'female': 0})
data['currentSmoker'] = data['currentSmoker'].map({'Yes': 1, 'No': 0})
data['diabetes'] = data['diabetes'].map({'Yes': 1, 'No': 0})
data['cigsPerDay'] = data['cigsPerDay'].fillna(data['cigsPerDay'].mean())
data['BPMeds'] = data['BPMeds'].fillna(data['BPMeds'].median())
data['totChol'] = data['totChol'].fillna(data['totChol'].mean())
data['BMI'] = data['BMI'].fillna(data['BMI'].median())
data['heartRate'] = data['heartRate'].fillna(data['heartRate'].median())
data['glucose'] = data['glucose'].fillna(data['glucose'].median())
data['education'] = data['education'].fillna(data['education'].mode()[0])

data['totChol'] = data['totChol'].apply(lambda x: 300 if x > 300 else x)
data['cigsPerDay'] = data['cigsPerDay'].apply(lambda x: 30 if x > 30 else x)
data['sysBP'] = data['sysBP'].apply(lambda x: 170 if x > 170 else x)
data['glucose'] = data['glucose'].apply(lambda x: 200 if x > 200 else x)

target = data['TenYearCHD']
features = data.drop(['TenYearCHD'], axis=1)
X = features.values
y = target.values

initial_counts = Counter(y)
majority_count = initial_counts[0]

ros = RandomOverSampler(sampling_strategy={1: majority_count})
X_over, y_over = ros.fit_resample(X, y)
X_over = pd.DataFrame(X_over).fillna(method='ffill').values

enn = EditedNearestNeighbours()
X_resampled, y_resampled = enn.fit_resample(X_over, y_over)

x_train, x_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, stratify=y_resampled, random_state=42
)

model = ExtraTreesClassifier(random_state=42, class_weight={0: 1, 1: 6.9})
model.fit(x_train, y_train)
train_preds = model.predict(x_train)
train_accuracy = accuracy_score(y_train, train_preds)
print("Training Accuracy: {:.2f}%".format(train_accuracy * 100))

negative_class = data[data['TenYearCHD'] == 0]
positive_class = data[data['TenYearCHD'] == 1]
negative_sample = negative_class.sample(n=60, random_state=42)
positive_sample = positive_class.sample(n=40, random_state=42)
df = pd.concat([negative_sample, positive_sample])
data = data.drop(df.index)

df_features = df.drop(['TenYearCHD'], axis=1)
df_target = df['TenYearCHD']
df_predictions = model.predict(df_features)

df_results = pd.DataFrame({
    'Actual': df_target.values,
    'Predicted': df_predictions
})
df_results['Correct'] = df_results['Actual'] == df_results['Predicted']
correct_predictions_count = df_results['Correct'].sum()

summary_row = pd.DataFrame({
    'Actual': ['-'],
    'Predicted': ['-'],
    'Correct': [f'{correct_predictions_count} out of 100']
})
df_results = pd.concat([df_results, summary_row], ignore_index=True)

df_accuracy = accuracy_score(df_target, df_predictions)
df_results.loc[df_results.index[-1], 'Accuracy'] = f"{df_accuracy * 100:.2f}%"
print(df_results)

  X_over = pd.DataFrame(X_over).fillna(method='ffill').values


Training Accuracy: 100.00%
    Actual Predicted        Correct Accuracy
0        0         0           True      NaN
1        0         0           True      NaN
2        0         0           True      NaN
3        0         0           True      NaN
4        0         0           True      NaN
..     ...       ...            ...      ...
96       1         1           True      NaN
97       1         0          False      NaN
98       1         1           True      NaN
99       1         1           True      NaN
100      -         -  98 out of 100   98.00%

[101 rows x 4 columns]


