In [1]:
import pandas as pd 
from scipy.stats import gaussian_kde
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Load the CSV file
df = pd.read_csv('heart_2022_cleaned.csv')

In [2]:
from sklearn import metrics, model_selection, preprocessing

In [3]:
# Checking the columns in df
print(df.columns)

# Separating features and target variable
x = df.drop(columns=['HadHeartAttack'])  # Drop the target column
y = df['HadHeartAttack']                # Access the target column

# Printing shapes of x and y
print(x.shape, y.shape)

Index(['State', 'Sex', 'GeneralHealth', 'PhysicalHealthDays',
       'MentalHealthDays', 'LastCheckupTime', 'PhysicalActivities',
       'SleepHours', 'RemovedTeeth', 'HadHeartAttack', 'HadAngina',
       'HadStroke', 'HadAsthma', 'HadSkinCancer', 'HadCOPD',
       'HadDepressiveDisorder', 'HadKidneyDisease', 'HadArthritis',
       'HadDiabetes', 'DeafOrHardOfHearing', 'BlindOrVisionDifficulty',
       'DifficultyConcentrating', 'DifficultyWalking',
       'DifficultyDressingBathing', 'DifficultyErrands', 'SmokerStatus',
       'ECigaretteUsage', 'ChestScan', 'RaceEthnicityCategory', 'AgeCategory',
       'HeightInMeters', 'WeightInKilograms', 'BMI', 'AlcoholDrinkers',
       'HIVTesting', 'FluVaxLast12', 'PneumoVaxEver', 'TetanusLast10Tdap',
       'HighRiskLastYear', 'CovidPos'],
      dtype='object')
(444975, 39) (444975,)


In [4]:
# Normalizing the feature data
x_normalize = preprocessing.MinMaxScaler()
x_norm = x_normalize.fit_transform(x)

# Printing shapes of normalized data and original data
print(x_norm.shape, x.shape, y.shape)

(444975, 39) (444975, 39) (444975,)


In [5]:
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    x_norm, y, test_size=0.1, random_state=42, stratify=y
)

# Printing shapes of train and test sets
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(400477, 39) (44498, 39) (400477,) (44498,)


In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# Assuming you've already preprocessed your data as in your previous steps
# x = features, y = target variable

# Initialize the RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Initialize KFold with 5 splits (you can change this value)
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Apply K-Fold Cross-Validation with Random Forest Classifier
cv_scores = cross_val_score(rf, x_norm, y, cv=kf, scoring='accuracy')

# Print the accuracy for each fold
print(f'Accuracy per fold: {cv_scores}')
print(f'Mean accuracy: {cv_scores.mean():.4f}')
print(f'Standard deviation of accuracy: {cv_scores.std():.4f}')

Accuracy per fold: [0.94110905 0.94044609 0.94107534 0.93877184 0.94067082]
Mean accuracy: 0.9404
Standard deviation of accuracy: 0.0009


In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# Assuming you've already preprocessed your data as in previous steps
# x = features, y = target variable

# Initialize Logistic Regression
logreg = LogisticRegression(random_state=42)

# Initialize KFold with 5 splits (you can change this value)
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Apply K-Fold Cross-Validation with Logistic Regression
cv_scores = cross_val_score(logreg, x_norm, y, cv=kf, scoring='accuracy')

# Print the accuracy for each fold
print(f'Accuracy per fold: {cv_scores}')
print(f'Mean accuracy: {cv_scores.mean():.4f}')
print(f'Standard deviation of accuracy: {cv_scores.std():.4f}')


Accuracy per fold: [0.9380527  0.93742345 0.93827743 0.93718748 0.93888421]
Mean accuracy: 0.9380
Standard deviation of accuracy: 0.0006


In [9]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, KFold
import numpy as np

# Assuming you've already preprocessed your data as in previous steps
# x = features, y = target variable
# x_norm = normalized features (e.g., using MinMaxScaler)

# Initialize KFold with 5 splits
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Set n_neighbors to 13
n_neighbors = 13

# Initialize KNN classifier with n_neighbors=13
knn = KNeighborsClassifier(n_neighbors=n_neighbors)

# Perform cross-validation
cv_scores = cross_val_score(knn, x_norm, y, cv=kf, scoring='accuracy')

# Calculate mean accuracy and standard deviation
mean_accuracy = cv_scores.mean()
std_deviation = cv_scores.std()

# Print the accuracy for each fold
print(f"Accuracy for each fold (n_neighbors={n_neighbors}): {cv_scores}")
print(f"Mean accuracy (n_neighbors={n_neighbors}): {mean_accuracy:.4f}")
print(f"Standard deviation (n_neighbors={n_neighbors}): {std_deviation:.4f}")


Accuracy for each fold (n_neighbors=13): [0.93850216 0.93773808 0.93907523 0.93635598 0.93813136]
Mean accuracy (n_neighbors=13): 0.9380
Standard deviation (n_neighbors=13): 0.0009


In [14]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, KFold
import numpy as np

# Assuming you've already preprocessed your data as in previous steps
# x = features, y = target variable

# Initialize KFold with 5 splits
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# List to store mean cross-validation scores for each value of C (regularization parameter)
C_range = [0.1, 1, 10, 100, 1000]  # Regularization strength, we will test various values of C

# List to store mean scores and standard deviations
mean_scores = []
std_scores = []

# Iterate over different values of C (regularization parameter)
for C in C_range:
    # Initialize SVC classifier with current C value
    svc = SVC(C=C, kernel='linear', random_state=42)  # You can try other kernels as well
    
    # Calculate cross-validation scores for the current C value
    cv_scores = cross_val_score(svc, x_norm, y, cv=kf, scoring='accuracy')
    
    # Calculate and store the mean and standard deviation of the cross-validation scores
    mean_scores.append(cv_scores.mean())
    std_scores.append(cv_scores.std())
    
    # Print the accuracy for each fold for the current C value
    print(f'Accuracy for each fold (C={C}): {cv_scores}')
    print(f'Mean accuracy (C={C}): {cv_scores.mean():.4f}')
    print(f'Standard deviation (C={C}): {cv_scores.std():.4f}')
    print('-' * 50)

# Find the optimal C based on the highest mean score
optimal_C = C_range[np.argmax(mean_scores)]
optimal_score = max(mean_scores)
optimal_std = std_scores[np.argmax(mean_scores)]

# Print the optimal C and its corresponding mean accuracy and standard deviation
print(f'Optimal regularization parameter C: {optimal_C}')
print(f'Best cross-validation accuracy: {optimal_score:.4f}')
print(f'Standard deviation of accuracy: {optimal_std:.4f}')


ValueError: Found input variables with inconsistent numbers of samples: [444975, 1000]

In [11]:
import xgboost as xgb
from sklearn.model_selection import cross_val_score, KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# Assuming you've already preprocessed your data as in previous steps
# x = features, y = target variable

# Initialize XGBoost classifier
xg_clf = xgb.XGBClassifier(random_state=42)

# Initialize KFold with 5 splits (you can change this value)
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Apply K-Fold Cross-Validation with XGBoost Classifier
cv_scores = cross_val_score(xg_clf, x_norm, y, cv=kf, scoring='accuracy')

# Print the accuracy for each fold
print(f'Accuracy per fold: {cv_scores}')
print(f'Mean accuracy: {cv_scores.mean():.4f}')
print(f'Standard deviation of accuracy: {cv_scores.std():.4f}')


Accuracy per fold: [0.94123265 0.94103039 0.94130007 0.93971571 0.94113152]
Mean accuracy: 0.9409
Standard deviation of accuracy: 0.0006


In [13]:
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score, KFold
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from tqdm import tqdm
import numpy as np

# Sample data (replace with your actual preprocessed data)
from sklearn.datasets import make_classification
x, y = make_classification(n_samples=1000, n_features=10, random_state=42)

# Initialize individual models
svc = SVC(kernel="linear", probability=True, random_state=42)
xgboost = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
log_reg = LogisticRegression(max_iter=1000, random_state=42)
knn = KNeighborsClassifier(n_neighbors=13)
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)

# Create a hybrid model using VotingClassifier
hybrid_model = VotingClassifier(
    estimators=[
        ('svc', svc),
        ('xgboost', xgboost),
        ('log_reg', log_reg),
        ('knn', knn),
        ('random_forest', random_forest)
    ],
    voting='soft'  # 'soft' for probabilities, 'hard' for majority voting
)

# Initialize K-Fold for cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Cross-validation with progress tracking
cv_scores = []

print("Performing cross-validation for the hybrid model:")
for train_idx, test_idx in tqdm(kf.split(x), total=kf.get_n_splits(), desc="Folds"):
    x_train, x_test = x[train_idx], x[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    
    # Fit the hybrid model on the training data
    hybrid_model.fit(x_train, y_train)
    
    # Evaluate the hybrid model on the test data
    score = hybrid_model.score(x_test, y_test)  # Accuracy for the fold
    cv_scores.append(score)

# Compute mean accuracy and standard deviation
mean_accuracy = np.mean(cv_scores)
std_deviation = np.std(cv_scores)

# Display the results
print("\nHybrid Model Cross-Validation Results:")
for i, score in enumerate(cv_scores, 1):
    print(f"Fold {i}: Accuracy = {score:.4f}")
print(f"\nMean Accuracy: {mean_accuracy:.4f}")
print(f"Standard Deviation: {std_deviation:.4f}")


Performing cross-validation for the hybrid model:


Folds: 100%|█████████████████████████████████████████████████████████████████████████████| 5/5 [00:01<00:00,  3.23it/s]


Hybrid Model Cross-Validation Results:
Fold 1: Accuracy = 0.8650
Fold 2: Accuracy = 0.9050
Fold 3: Accuracy = 0.9200
Fold 4: Accuracy = 0.9000
Fold 5: Accuracy = 0.9050

Mean Accuracy: 0.8990
Standard Deviation: 0.0183



