In [3]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Load the dataset
df = pd.read_csv('/content/heart (3).csv')

# Display the first few rows of the dataset
print(df.head())

# Assuming the target column is 'target' and the rest are features
# Split dataset into features (X) and target (y)
X = df.drop('Cholesterol', axis=1)
y = df['Cholesterol']

# Step 1: Convert text columns to numbers using label encoding and one hot encoding
# Identify categorical columns (e.g., if they are object types or have string values)
categorical_columns = X.select_dtypes(include=['object']).columns

# Apply Label Encoding for binary or ordinal categorical columns
label_encoder = LabelEncoder()
for col in categorical_columns:
    X[col] = label_encoder.fit_transform(X[col])

# Step 2: Apply Scaling (StandardScaler)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 3: Build classification models and check the best accuracy

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Initialize models
models = {
    'SVM': SVC(),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(random_state=42)
}

# Train models and evaluate accuracy
accuracies = {}
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracies[model_name] = accuracy
    print(f'{model_name} Accuracy: {accuracy:.4f}')

# Step 4: Apply PCA to reduce dimensions and retrain the models

# Apply PCA (choose the number of components based on explained variance)
pca = PCA(n_components=0.95)  # Keep 95% of the variance

X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Retrain models using PCA-transformed data
pca_accuracies = {}
for model_name, model in models.items():
    model.fit(X_train_pca, y_train)
    y_pred_pca = model.predict(X_test_pca)
    accuracy_pca = accuracy_score(y_test, y_pred_pca)
    pca_accuracies[model_name] = accuracy_pca
    print(f'{model_name} Accuracy with PCA: {accuracy_pca:.4f}')

# Step 5: Comparison of accuracies
print("\n--- Model Performance Comparison ---")
for model_name in models.keys():
    print(f'{model_name} Accuracy without PCA: {accuracies[model_name]:.4f}')
    print(f'{model_name} Accuracy with PCA: {pca_accuracies[model_name]:.4f}')


   Age Sex ChestPainType  RestingBP  Cholesterol  FastingBS RestingECG  MaxHR  \
0   40   M           ATA        140          289          0     Normal    172   
1   49   F           NAP        160          180          0     Normal    156   
2   37   M           ATA        130          283          0         ST     98   
3   48   F           ASY        138          214          0     Normal    108   
4   54   M           NAP        150          195          0     Normal    122   

  ExerciseAngina  Oldpeak ST_Slope  HeartDisease  
0              N      0.0       Up             0  
1              N      1.0     Flat             1  
2              N      0.0       Up             0  
3              Y      1.5     Flat             1  
4              N      0.0       Up             0  
SVM Accuracy: 0.1848
Logistic Regression Accuracy: 0.1685
Random Forest Accuracy: 0.1848
SVM Accuracy with PCA: 0.1848
Logistic Regression Accuracy with PCA: 0.1739
Random Forest Accuracy with PCA: 0.1739

-