In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.utils.class_weight import compute_class_weight

# Step 1: Define the column names
# The data file carries no header row, so the 14 column names are supplied
# here: 13 feature columns followed by the 'target' label column.
columns = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target']

# Step 2: Load the dataset
# header=None keeps the first record as data instead of consuming it as the
# header line (which silently dropped one sample); na_values='?' converts the
# file's missing-value marker to NaN while parsing.
df = pd.read_csv(
    "/content/processed.cleveland.data",
    header=None,
    names=columns,
    na_values='?',
)

# Step 3: Check the first few rows to understand the structure of the data
print("First few rows of the dataset:")
print(df.head())

# Step 4: Coerce to numeric and handle missing values
# Any remaining non-numeric token becomes NaN so it is imputed below.
df = df.apply(pd.to_numeric, errors='coerce')

# A row without a label cannot be used for supervised training; drop it rather
# than inventing a label via median imputation.
df = df.dropna(subset=['target']).reset_index(drop=True)

# Impute missing feature values (features only) with each column's median.
feature_columns = columns[:-1]
df[feature_columns] = df[feature_columns].fillna(df[feature_columns].median())

# Step 5: Split the frame into the label vector and the feature matrix
target_column = 'target'  # Name of the label column used for classification

y = df[target_column]
X = df.drop(columns=[target_column])

# Step 6: Standardize the features
# NOTE(review): the scaler is fitted on every row, i.e. before any
# cross-validation split is made further down the script.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Step 7: Per-class weights to counter class imbalance
# 'balanced' weights are computed over the distinct labels present in y and
# stored as a {label: weight} mapping for the classifier.
classes = np.unique(y)
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y)
class_weight_dict = {label: weight for label, weight in zip(classes, class_weights)}

# Step 8: Random forest classifier with conservative hyperparameters
# The settings below are chosen to limit overfitting: few, shallow trees with
# minimum sample counts for splits and leaves.
rf_params = {
    'random_state': 42,
    'class_weight': class_weight_dict,
    'n_estimators': 50,       # Reduced number of trees
    'max_depth': 5,           # Reduced depth of trees
    'min_samples_split': 10,  # Minimum samples required to split a node
    'min_samples_leaf': 5,    # Minimum samples required in a leaf
    'max_features': 'sqrt',   # Features considered at each split
}
model = RandomForestClassifier(**rf_params)

# Score the model with shuffled, stratified 5-fold cross-validation
# (stratification keeps the class proportions similar in every fold).
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(
    estimator=model,
    X=X_scaled,
    y=y,
    scoring='accuracy',
    cv=cv,
)

print(f"\nCross-validation accuracy scores: {cv_scores}")
print(f"Mean cross-validation accuracy: {cv_scores.mean()}")

# Step 9: Make out-of-fold predictions
# Predicting on the same rows the model was trained on reports training
# performance, which is far more optimistic than the cross-validation scores.
# cross_val_predict instead predicts each row with a model that never saw it
# during fitting, and still returns one prediction per row (aligned with y).
from sklearn.model_selection import cross_val_predict

y_pred = cross_val_predict(model, X_scaled, y, cv=cv)

# Step 10: Evaluate the model on the out-of-fold predictions
# zero_division=0 avoids warnings when a rare class is never predicted.
print("\nClassification Report (out-of-fold predictions):")
print(classification_report(y, y_pred, zero_division=0))

print("\nOut-of-fold accuracy score on the dataset:", accuracy_score(y, y_pred))

# Step 11: Train the final model on the entire dataset
# cross_val_predict fits clones only, so `model` itself is fitted here and is
# ready for use on new data.
model.fit(X_scaled, y)


First few rows of the dataset:
   63.0  1.0  1.0.1  145.0  233.0  1.0.2  2.0  150.0  0.0  2.3  3.0 0.0.1  \
0  67.0  1.0    4.0  160.0  286.0    0.0  2.0  108.0  1.0  1.5  2.0   3.0   
1  67.0  1.0    4.0  120.0  229.0    0.0  2.0  129.0  1.0  2.6  2.0   2.0   
2  37.0  1.0    3.0  130.0  250.0    0.0  0.0  187.0  0.0  3.5  3.0   0.0   
3  41.0  0.0    2.0  130.0  204.0    0.0  2.0  172.0  0.0  1.4  1.0   0.0   
4  56.0  1.0    2.0  120.0  236.0    0.0  0.0  178.0  0.0  0.8  1.0   0.0   

   6.0  0  
0  3.0  2  
1  7.0  1  
2  3.0  0  
3  3.0  0  
4  3.0  0  

Cross-validation accuracy scores: [0.54098361 0.50819672 0.6        0.68333333 0.55      ]
Mean cross-validation accuracy: 0.5765027322404371

Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.89      0.90       163
           1       0.69      0.53      0.60        55
           2       0.76      0.86      0.81        36
           3       0.76      0.83      0.79       

# New Section