In [None]:
#LIBRARIES

In [48]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score



In [65]:
import warnings
from sklearn.exceptions import ConvergenceWarning

# Suppress ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)


#LOAD THE DATASET

In [66]:
# Load the dataset from CSV file
df = pd.read_csv("heart.csv")

# Display the first few rows of the dataset to understand its structure
print(df.head())


   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   52    1   0       125   212    0        1      168      0      1.0      2   
1   53    1   0       140   203    1        0      155      1      3.1      0   
2   70    1   0       145   174    0        1      125      1      2.6      0   
3   61    1   0       148   203    0        1      161      0      0.0      2   
4   62    0   0       138   294    1        1      106      0      1.9      1   

   ca  thal  target  
0   2     3       0  
1   0     3       0  
2   0     3       0  
3   1     3       0  
4   3     2       0  


#DATA EXPLORATION

In [67]:
# Get information about the dataset (number of rows, columns, data types, etc.)
print(df.info())

# Summary statistics of numerical columns
print(df.describe())

# Check for any missing values
print(df.isnull().sum())

# Check the distribution of the target variable
print(df['target'].value_counts())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1025 entries, 0 to 1024
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1025 non-null   int64  
 1   sex       1025 non-null   int64  
 2   cp        1025 non-null   int64  
 3   trestbps  1025 non-null   int64  
 4   chol      1025 non-null   int64  
 5   fbs       1025 non-null   int64  
 6   restecg   1025 non-null   int64  
 7   thalach   1025 non-null   int64  
 8   exang     1025 non-null   int64  
 9   oldpeak   1025 non-null   float64
 10  slope     1025 non-null   int64  
 11  ca        1025 non-null   int64  
 12  thal      1025 non-null   int64  
 13  target    1025 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 112.2 KB
None
               age          sex           cp     trestbps        chol  \
count  1025.000000  1025.000000  1025.000000  1025.000000  1025.00000   
mean     54.434146     0.695610     0.942439   131.611707   246.0

In [None]:
# DATA PREPROCESSING

In [68]:
# Separate features and target variable
X = df.drop('target', axis=1)
y = df['target']

In [69]:
# Identify numerical and categorical features
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns  # Assuming categorical columns are of type 'object'


In [70]:
# Pipeline for numerical features
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),  # Replace missing values with the mean of the column
    ('scaler', StandardScaler())  # Scale numerical features
])

In [71]:
# Pipeline for categorical features
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Replace missing values with the most frequent value of the column
    ('encoder', OneHotEncoder())  # Encode categorical variables
])

In [72]:
# Column transformer to apply different preprocessing steps to different columns
preprocessor = ColumnTransformer([
    ('num', numerical_pipeline, numerical_features),
    ('cat', categorical_pipeline, categorical_features)
])


In [73]:
# Transform the features
X_preprocessed = preprocessor.fit_transform(X)


In [74]:
# Display the shape of the preprocessed features
print("Shape of preprocessed features:", X_preprocessed.shape)


Shape of preprocessed features: (1025, 13)


In [None]:
#FEATURE SELECTION

In [75]:
# Initialize Random Forest Classifier
rf_classifier = RandomForestClassifier(random_state=42)


In [76]:
# Fit Random Forest Classifier on preprocessed features
rf_classifier.fit(X_preprocessed, y)

In [77]:
# Get feature importances
feature_importances = rf_classifier.feature_importances_


In [78]:
# Create a DataFrame to display feature importances
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)


In [61]:
# Display the top N most important features
top_n = 10
print("Top", top_n, "most important features:")
print(feature_importance_df.head(top_n))


Top 10 most important features:
     Feature  Importance
2         cp    0.134201
7    thalach    0.120473
11        ca    0.116755
9    oldpeak    0.116151
12      thal    0.097043
0        age    0.089313
4       chol    0.078930
3   trestbps    0.074253
8      exang    0.059592
10     slope    0.048738


In [None]:
#MODEL SELECTION 

In [79]:
# Initialize classifiers
classifiers = {
    "Logistic Regression": LogisticRegression(random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM": SVC(random_state=42),
    "Neural Network": MLPClassifier(random_state=42)
}


In [80]:
# Initialize MLPClassifier with a higher max_iter value
mlp_classifier = MLPClassifier(random_state=42, max_iter=500)


In [81]:
# Evaluate classifiers using cross-validation
for clf_name, clf in classifiers.items():
    scores = cross_val_score(clf, X_preprocessed, y, cv=5, scoring='accuracy')
    print(f"{clf_name}: Accuracy: {scores.mean():.4f} (+/- {scores.std() * 2:.4f})")

# You can choose the best-performing classifier based on the accuracy scores


Logistic Regression: Accuracy: 0.8459 (+/- 0.0557)
Decision Tree: Accuracy: 1.0000 (+/- 0.0000)
Random Forest: Accuracy: 0.9971 (+/- 0.0117)
SVM: Accuracy: 0.9220 (+/- 0.0638)
Neural Network: Accuracy: 0.9473 (+/- 0.0285)


In [82]:
# Choose the best-performing classifier based on the accuracy scores
best_classifier_name = "Decision Tree"  # Replace this with the best classifier name based on your evaluation


In [83]:
# Initialize the best-performing classifier
best_classifier = classifiers[best_classifier_name]

In [84]:
# Train the classifier on the entire dataset
best_classifier.fit(X_preprocessed, y)

In [85]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Assuming you have true labels for the new data stored in a variable called 'true_labels'
# Calculate predictions on the training data to evaluate the model performance
train_predictions = best_classifier.predict(X_preprocessed)

In [86]:
# Evaluate the model using various evaluation metrics
accuracy = accuracy_score(y, train_predictions)
precision = precision_score(y, train_predictions)
recall = recall_score(y, train_predictions)
f1 = f1_score(y, train_predictions)
roc_auc = roc_auc_score(y, train_predictions)

In [87]:
# Print the evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC Score:", roc_auc)

Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0
ROC AUC Score: 1.0
