In [4]:
import pandas as pd
import numpy as np

In [5]:
df = pd.read_csv('heart.csv')

In [6]:
df.shape

(746, 12)

In [7]:
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,56,M,ASY,120,85,0,Normal,140,N,0.0,Up,0
1,52,M,ATA,140,100,0,Normal,138,Y,0.0,Up,0
2,56,M,ASY,120,100,0,Normal,120,Y,1.5,Flat,1
3,69,M,ASY,140,110,1,Normal,109,Y,1.5,Flat,1
4,76,M,NAP,104,113,0,LVH,120,N,3.5,Down,1


In [8]:
df.nunique()

Age                49
Sex                 2
ChestPainType       4
RestingBP          63
Cholesterol       221
FastingBS           2
RestingECG          3
MaxHR             109
ExerciseAngina      2
Oldpeak            43
ST_Slope            3
HeartDisease        2
dtype: int64

In [9]:
df.isnull().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [10]:
df.duplicated().sum()

0

## Label Encoder 

In [11]:
from sklearn.preprocessing import LabelEncoder

In [12]:
le = LabelEncoder()
df['Sex'] = le.fit_transform(df['Sex'])

In [13]:
df['ChestPainType'] = le.fit_transform(df['ChestPainType'])

In [14]:
df['RestingECG'] = le.fit_transform(df['RestingECG'])

In [15]:
df['ExerciseAngina'] = le.fit_transform(df['ExerciseAngina'])

In [16]:
df['ST_Slope'] = le.fit_transform(df['ST_Slope'])

In [17]:
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,56,1,0,120,85,0,1,140,0,0.0,2,0
1,52,1,1,140,100,0,1,138,1,0.0,2,0
2,56,1,0,120,100,0,1,120,1,1.5,1,1
3,69,1,0,140,110,1,1,109,1,1.5,1,1
4,76,1,2,104,113,0,0,120,0,3.5,0,1


In [18]:
df.describe()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
count,746.0,746.0,746.0,746.0,746.0,746.0,746.0,746.0,746.0,746.0,746.0,746.0
mean,52.882038,0.756032,0.840483,133.022788,244.635389,0.16756,0.931635,140.226542,0.384718,0.901609,1.410188,0.477212
std,9.505888,0.429762,0.958198,17.28275,59.153524,0.373726,0.631939,24.524107,0.486855,1.072861,0.598076,0.499816
min,28.0,0.0,0.0,92.0,85.0,0.0,0.0,69.0,0.0,-0.1,0.0,0.0
25%,46.0,1.0,0.0,120.0,207.25,0.0,1.0,122.0,0.0,0.0,1.0,0.0
50%,54.0,1.0,1.0,130.0,237.0,0.0,1.0,140.0,0.0,0.5,1.0,0.0
75%,59.0,1.0,2.0,140.0,275.0,0.0,1.0,160.0,1.0,1.5,2.0,1.0
max,77.0,1.0,3.0,200.0,603.0,1.0,2.0,202.0,1.0,6.2,2.0,1.0


In [19]:
from sklearn.model_selection import train_test_split

In [20]:
X = df.drop('HeartDisease', axis=1)

In [21]:
y = df['HeartDisease']

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

## Neural

In [23]:
from sklearn.metrics import accuracy_score


In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

# prepare configuration for cross validation test harness
seed = 7
# prepare models
models = []
models.append(('LR', LogisticRegression()))  # 4
models.append(('SVM', SVC(kernel='linear'))) # 3
models.append(('AB', AdaBoostClassifier())) # 2
models.append(('GBM', GradientBoostingClassifier())) # 1

# evaluate each model in turn
results = {}
for name, model in models:
    model.fit(X_train, y_train)
    accuracy0 = model.score(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    results[name] = acc

    print("train %s: %.3f%%" % (name, accuracy0 * 100))
    print("test %s: %.3f%%" % (name, acc * 100))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


train LR: 86.590%
test LR: 83.482%
train SVM: 87.165%
test SVM: 83.036%
train AB: 92.146%
test AB: 82.589%
train GBM: 97.510%
test GBM: 83.036%


## Normalization


In [25]:
# min max scaler
from sklearn.preprocessing import MinMaxScaler

In [26]:

# normalize data between 0 and 1
scaler = MinMaxScaler()
data_normalized = scaler.fit_transform(df)

In [27]:
data_normalized

array([[0.57142857, 1.        , 0.        , ..., 0.01587302, 1.        ,
        0.        ],
       [0.48979592, 1.        , 0.33333333, ..., 0.01587302, 1.        ,
        0.        ],
       [0.57142857, 1.        , 0.        , ..., 0.25396825, 0.5       ,
        1.        ],
       ...,
       [0.08163265, 1.        , 0.        , ..., 0.01587302, 0.5       ,
        1.        ],
       [0.79591837, 0.        , 0.66666667, ..., 0.26984127, 0.5       ,
        0.        ],
       [0.53061224, 1.        , 0.        , ..., 0.17460317, 0.5       ,
        1.        ]])

In [28]:
df_normalized = pd.DataFrame(data_normalized, columns=df.columns)

In [29]:
df_normalized.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,0.571429,1.0,0.0,0.259259,0.0,0.0,0.5,0.533835,0.0,0.015873,1.0,0.0
1,0.489796,1.0,0.333333,0.444444,0.028958,0.0,0.5,0.518797,1.0,0.015873,1.0,0.0
2,0.571429,1.0,0.0,0.259259,0.028958,0.0,0.5,0.383459,1.0,0.253968,0.5,1.0
3,0.836735,1.0,0.0,0.444444,0.048263,1.0,0.5,0.300752,1.0,0.253968,0.5,1.0
4,0.979592,1.0,0.666667,0.111111,0.054054,0.0,0.0,0.383459,0.0,0.571429,0.0,1.0


In [30]:
X_train, X_test, y_train, y_test = train_test_split(df_normalized.drop('HeartDisease', axis=1), df_normalized['HeartDisease'], test_size=0.3, random_state=42)

In [31]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

# prepare configuration for cross validation test harness
seed = 7
# prepare models
models = []
models.append(('LR', LogisticRegression()))  # 4
models.append(('SVM', SVC(kernel='linear'))) # 3
models.append(('AB', AdaBoostClassifier())) # 2
models.append(('GBM', GradientBoostingClassifier())) # 1


# evaluate each model in turn
results = {}
for name, model in models:
    model.fit(X_train, y_train)
    accuracy0 = model.score(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    results[name] = acc

    print("train %s: %.3f%%" % (name, accuracy0 * 100))
    print("test %s: %.3f%%" % (name, acc * 100))


train LR: 86.207%
test LR: 83.929%
train SVM: 86.782%
test SVM: 82.589%
train AB: 92.146%
test AB: 82.589%
train GBM: 97.510%
test GBM: 83.036%


## use Feature Selection

In [32]:
from sklearn.feature_selection import chi2,SelectKBest

In [33]:
selector = SelectKBest(chi2, k=10)
df_new = selector.fit_transform(df_normalized.drop(['HeartDisease'], axis=1), df_normalized['HeartDisease'])

In [34]:
# show the selected features
features = df_normalized.drop(['HeartDisease'], axis=1).columns[selector.get_support()]
features.values

array(['Age', 'Sex', 'ChestPainType', 'RestingBP', 'Cholesterol',
       'FastingBS', 'MaxHR', 'ExerciseAngina', 'Oldpeak', 'ST_Slope'],
      dtype=object)

In [35]:
df_featuresSelected = pd.DataFrame(df_new, columns=features)

In [36]:
df_featuresSelected.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
0,0.571429,1.0,0.0,0.259259,0.0,0.0,0.533835,0.0,0.015873,1.0
1,0.489796,1.0,0.333333,0.444444,0.028958,0.0,0.518797,1.0,0.015873,1.0
2,0.571429,1.0,0.0,0.259259,0.028958,0.0,0.383459,1.0,0.253968,0.5
3,0.836735,1.0,0.0,0.444444,0.048263,1.0,0.300752,1.0,0.253968,0.5
4,0.979592,1.0,0.666667,0.111111,0.054054,0.0,0.383459,0.0,0.571429,0.0


In [37]:
X_train, X_test, y_train, y_test = train_test_split(df_featuresSelected, df_normalized['HeartDisease'], test_size=0.3, random_state=42)

In [38]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

# prepare configuration for cross validation test harness
seed = 7
# prepare models
models = []
models.append(('LR', LogisticRegression()))  # 4
models.append(('SVM', SVC(kernel='linear'))) # 3
models.append(('AB', AdaBoostClassifier())) # 2
models.append(('GBM', GradientBoostingClassifier())) # 1


# evaluate each model in turn
results = {}
for name, model in models:
    model.fit(X_train, y_train)
    accuracy0 = model.score(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    results[name] = acc

    print("train %s: %.3f%%" % (name, accuracy0 * 100))
    print("test %s: %.3f%%" % (name, acc * 100))


train LR: 86.207%
test LR: 84.375%
train SVM: 86.973%
test SVM: 82.589%
train AB: 91.379%
test AB: 83.036%
train GBM: 97.126%
test GBM: 82.589%


## Outlier

In [39]:
import matplotlib.pyplot as plt

# Define columns of interest
columns = df.columns.tolist()

# Calculate the IQR for each column
Q1 = df[columns].quantile(0.25)
Q3 = df[columns].quantile(0.75)
IQR = Q3 - Q1

# Define outlier threshold
outlier_threshold = 1.5

# Identify outliers in each column
outliers = (df[columns] < (Q1 - outlier_threshold * IQR)) | (df[columns] > (Q3 + outlier_threshold * IQR))

# Count the number of outliers in each column before removing
num_outliers_before = outliers.sum()

# Filter the DataFrame to remove outliers
df_filtered = df[~outliers.any(axis=1)]

# Calculate the IQR for each column after removing outliers
Q1_filtered = df_filtered[columns].quantile(0.25)
Q3_filtered = df_filtered[columns].quantile(0.75)
IQR_filtered = Q3_filtered - Q1_filtered

# Identify outliers in each column after removing
outliers_filtered = (df_filtered[columns] < (Q1_filtered - outlier_threshold * IQR_filtered)) | (df_filtered[columns] > (Q3_filtered + outlier_threshold * IQR_filtered))

# Count the number of outliers in each column after removing
num_outliers_after = outliers_filtered.sum()

# Create a boxplot for each column after removing outliers
fig, axs = plt.subplots(1, len(columns), figsize=(20, 4))

# Iterate over each column and plot the boxplot
for i, column in enumerate(columns):
    axs[i].boxplot(df_filtered[column])
    axs[i].set_title(column)
    axs[i].set_xlabel('Column')
    axs[i].set_ylabel('Value')
    axs[i].text(0.85, 0.85, f'Outliers: {num_outliers_after[column]}', transform=axs[i].transAxes)

plt.tight_layout()
plt.show()

# Output the number of outliers before and after removing
print("Number of Outliers Before Removing:", num_outliers_before)
print("Number of Outliers After Removing:", num_outliers_after)

df = df_filtered
print(df.shape)


ModuleNotFoundError: No module named 'matplotlib._docstring'

In [None]:
q1 = df.quantile(0.25)
q3 = df.quantile(0.75)
iqr = q3 - q1

  q1 = df.quantile(0.25)
  q3 = df.quantile(0.75)


In [None]:
df = df[~((df < (q1 - 1.5 * iqr)) | (df > (q3 + 1.5 * iqr))).any(axis=1)]

  df = df[~((df < (q1 - 1.5 * iqr)) | (df > (q3 + 1.5 * iqr))).any(axis=1)]


In [None]:
df.shape

(580, 12)

In [None]:
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
4,76,M,NAP,104,113,0,LVH,120,N,3.5,Down,1
5,38,M,ASY,92,117,0,Normal,134,Y,2.5,Flat,1
7,59,M,ASY,130,126,0,Normal,125,N,0.0,Flat,1
9,50,M,ASY,140,129,0,Normal,135,N,0.0,Up,0
10,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1


In [None]:
le = LabelEncoder()
df['Sex'] = le.fit_transform(df['Sex'])
df['ChestPainType'] = le.fit_transform(df['ChestPainType'])
df['RestingECG'] = le.fit_transform(df['RestingECG'])
df['ExerciseAngina'] = le.fit_transform(df['ExerciseAngina'])
df['ST_Slope'] = le.fit_transform(df['ST_Slope'])

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(['HeartDisease'], axis=1), df['HeartDisease'], test_size=0.3, random_state=42)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

# prepare configuration for cross validation test harness
seed = 7
# prepare models
models = []
models.append(('LR', LogisticRegression()))  # 4
models.append(('SVM', SVC(kernel='linear'))) # 3
models.append(('AB', AdaBoostClassifier())) # 2
models.append(('GBM', GradientBoostingClassifier())) # 1


# evaluate each model in turn
results = {}
for name, model in models:
    model.fit(X_train, y_train)
    accuracy0 = model.score(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    results[name] = acc

    print("train %s: %.3f%%" % (name, accuracy0 * 100))
    print("test %s: %.3f%%" % (name, acc * 100))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


train LR: 85.468%
test LR: 84.483%
train LDA: 84.729%
test LDA: 84.483%
train KNN: 77.340%
test KNN: 67.241%
train CART: 100.000%
test CART: 77.586%
train NB: 84.483%
test NB: 86.207%
train SVM: 85.468%
test SVM: 85.057%
train RF: 100.000%
test RF: 87.356%
train ET: 100.000%
test ET: 87.356%
train AB: 90.394%
test AB: 80.460%
train GBM: 97.291%
test GBM: 86.207%
train Qua: 54.433%
test Qua: 52.299%




## use a NNS

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Assuming you have your dataset loaded into X (input features) and y (target variable)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df.drop(['HeartDisease'], axis=1), df['HeartDisease'], test_size=0.3, random_state=42)

# Standardize the input features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Create an MLP classifier with three hidden layers, each containing 100 neurons
mlp = MLPClassifier(hidden_layer_sizes=(20, 20, 20), max_iter=300)

# Train the MLP classifier on the training data
mlp.fit(X_train, y_train)

# Predict the target variable for the test data
y_pred = mlp.predict(X_test)

# Evaluate the accuracy of the classifier
print("Accuracy:", mlp.score(X_train, y_train))
accuracy = mlp.score(X_test, y_test)
print("Accuracy:", accuracy)

Accuracy: 0.9789272030651341
Accuracy: 0.8035714285714286


