In [12]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [13]:
# Step 1: Load Data
data = pd.read_csv('heart.csv')  # Replace with the path to your Titanic dataset
print(data.head())  # To display first few rows of the data

   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   52    1   0       125   212    0        1      168      0      1.0      2   
1   53    1   0       140   203    1        0      155      1      3.1      0   
2   70    1   0       145   174    0        1      125      1      2.6      0   
3   61    1   0       148   203    0        1      161      0      0.0      2   
4   62    0   0       138   294    1        1      106      0      1.9      1   

   ca  thal  target  
0   2     3       0  
1   0     3       0  
2   0     3       0  
3   1     3       0  
4   3     2       0  


In [16]:
# Step 2: Preprocess Data
def preprocess_data(data):
    # Handle missing values
    data['Age'].fillna(data['Age'].median(), inplace=True)
    data['Embarked'].fillna(data['Embarked'].mode()[0], inplace=True)
    data['Fare'].fillna(data['Fare'].median(), inplace=True)
    
    # Drop irrelevant columns
    data.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'], inplace=True)
    
    # Encode categorical columns (Sex and Embarked)
    data = pd.get_dummies(data, columns=['Sex', 'Embarked'], drop_first=True)
    
    # Separate features (X) and target (y)
    X = data.drop(columns=['Survived'])
    y = data['Survived']
    
    # Standardize features using StandardScaler
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    return X_scaled, y

# Preprocess the data and split into features and target
X, y = preprocess_data(data)

KeyError: 'Age'

In [18]:
# Step 3: Apply Recursive Feature Elimination (RFE)
model = RandomForestClassifier()  # You can replace this with any model of your choice
selector = RFE(model, n_features_to_select=5)  # Select the top 5 features using RFE
selector = selector.fit(X, y)
# Get the selected features
selected_features = X[:, selector.support_]
print("Selected Features: ", selector.support_)


NameError: name 'X' is not defined

In [19]:
# Step 4: Train the model using only the selected features
model.fit(selected_features, y)
y_pred_selected = model.predict(selected_features)
print(f"Model accuracy with selected features: {accuracy_score(y, y_pred_selected):.4f}")

NameError: name 'selected_features' is not defined

In [20]:
# Step 5: Train model with all features (without RFE)
model.fit(X, y)
y_pred_all = model.predict(X)
print(f"Model accuracy with all features: {accuracy_score(y, y_pred_all):.4f}")

NameError: name 'X' is not defined

In [21]:
# Step 6: Confusion Matrix for both models
# For model with selected features
cm_selected = confusion_matrix(y, y_pred_selected)

# For model with all features
cm_all = confusion_matrix(y, y_pred_all)

# Plotting Confusion Matrices
fig, axes = plt.subplots(1, 2, figsize=(12, 6))
sns.heatmap(cm_selected, annot=True, fmt="d", cmap="Blues", ax=axes[0])
axes[0].set_title('Confusion Matrix (Selected Features)')
sns.heatmap(cm_all, annot=True, fmt="d", cmap="Blues", ax=axes[1])
axes[1].set_title('Confusion Matrix (All Features)')

plt.show()

NameError: name 'y' is not defined

In [22]:
# Step 7: Classification Report
print("Classification Report for Model with Selected Features:")
print(classification_report(y, y_pred_selected))

print("Classification Report for Model with All Features:")
print(classification_report(y, y_pred_all))

Classification Report for Model with Selected Features:


NameError: name 'y' is not defined

In [23]:
# Step 8: Feature Importance Visualization
# Using Random Forest to get feature importance
model = RandomForestClassifier()
model.fit(X, y)

importances = model.feature_importances_
feature_names = data.drop(columns=['Survived']).columns

# Visualizing Feature Importance
plt.figure(figsize=(10, 6))
plt.barh(feature_names, importances)
plt.xlabel('Feature Importance')
plt.title('Feature Importance using RandomForest')
plt.show()

NameError: name 'X' is not defined

In [24]:
# Step 9: Splitting the Data into Train and Test Sets
from sklearn.model_selection import train_test_split

# Split the dataset into train and test sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

NameError: name 'X' is not defined

In [25]:
# Step 10: Train Model with Selected Features
# Fit the RandomForest model with selected features
model_selected = RandomForestClassifier(random_state=42)
model_selected.fit(X_train[:, selector.support_], y_train)

# Predict on test set with selected features
y_pred_selected_test = model_selected.predict(X_test[:, selector.support_])

NameError: name 'X_train' is not defined

In [26]:
# Step 11: Train Model with All Features
# Fit the RandomForest model with all features
model_all = RandomForestClassifier(random_state=42)
model_all.fit(X_train, y_train)

# Predict on test set with all features
y_pred_all_test = model_all.predict(X_test)

NameError: name 'X_train' is not defined

In [27]:
# Step 12: Evaluate Model Performance (Accuracy, Confusion Matrix, Classification Report)
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Evaluate model with selected features
accuracy_selected = accuracy_score(y_test, y_pred_selected_test)
print(f"Test Accuracy with selected features: {accuracy_selected:.4f}")
print("Classification Report for Model with Selected Features:")
print(classification_report(y_test, y_pred_selected_test))

# Evaluate model with all features
accuracy_all = accuracy_score(y_test, y_pred_all_test)
print(f"Test Accuracy with all features: {accuracy_all:.4f}")
print("Classification Report for Model with All Features:")
print(classification_report(y_test, y_pred_all_test))

NameError: name 'y_test' is not defined

In [28]:
# Step 13: Plot Confusion Matrices for both models
cm_selected_test = confusion_matrix(y_test, y_pred_selected_test)
cm_all_test = confusion_matrix(y_test, y_pred_all_test)

fig, axes = plt.subplots(1, 2, figsize=(12, 6))
sns.heatmap(cm_selected_test, annot=True, fmt="d", cmap="Blues", ax=axes[0])
axes[0].set_title('Confusion Matrix (Selected Features)')
sns.heatmap(cm_all_test, annot=True, fmt="d", cmap="Blues", ax=axes[1])
axes[1].set_title('Confusion Matrix (All Features)')
plt.show()

NameError: name 'y_test' is not defined

In [29]:
# Step 14: Hyperparameter Tuning with GridSearchCV
from sklearn.model_selection import GridSearchCV

# Hyperparameter tuning for the RandomForest model
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42), param_grid=param_grid, cv=3, verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best hyperparameters and the best score
print("Best Hyperparameters found by GridSearchCV:")
print(grid_search.best_params_)
print(f"Best Cross-validation score: {grid_search.best_score_:.4f}")


NameError: name 'X_train' is not defined

In [30]:
# Step 15: Evaluate the Best Model from GridSearchCV
best_model = grid_search.best_estimator_

# Evaluate on test set
y_pred_best = best_model.predict(X_test)

# Accuracy
accuracy_best = accuracy_score(y_test, y_pred_best)
print(f"Test Accuracy with Best Model: {accuracy_best:.4f}")

# Confusion Matrix for Best Model
cm_best = confusion_matrix(y_test, y_pred_best)
sns.heatmap(cm_best, annot=True, fmt="d", cmap="Blues")
plt.title('Confusion Matrix (Best Model from GridSearchCV)')
plt.show()


AttributeError: 'GridSearchCV' object has no attribute 'best_estimator_'

In [31]:
# Step 16: Feature Importance from Best Model
importances_best = best_model.feature_importances_

# Visualizing Feature Importance for the Best Model
plt.figure(figsize=(10, 6))
plt.barh(X.columns, importances_best)
plt.xlabel('Feature Importance')
plt.title('Feature Importance from Best Model')
plt.show()


NameError: name 'best_model' is not defined