<a href="https://colab.research.google.com/github/purvamarkam/ML_LAB/blob/main/Assignment10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score

# Seed NumPy's global RNG so the bootstrap draws made with np.random.choice
# in the tree-training functions of this file are repeatable across runs.
np.random.seed(42)


# Load & Preprocess

def load_and_preprocess_data(filepath):
    """Read the CSV at *filepath* and split it into features and a 0/1 target.

    Steps, in order:
      1. Load the file with ``pd.read_csv`` and print its shape.
      2. Drop the ``id`` column when it is present.
      3. Print the per-column count of missing values.
      4. Drop feature columns that contain no values at all (for example the
         unnamed column pandas creates when every row ends with a trailing
         delimiter). Such a column carries no information and makes
         estimators that reject NaN fail.
      5. Encode ``diagnosis`` as ``M -> 1`` and ``B -> 0``.

    Args:
        filepath: Path (or anything ``pd.read_csv`` accepts) of the CSV file.
            The file must contain a ``diagnosis`` column.

    Returns:
        A tuple ``(X, y)`` where ``X`` is the DataFrame of remaining feature
        columns and ``y`` is the encoded ``diagnosis`` Series.
    """
    data = pd.read_csv(filepath)
    print(f" Loaded data! Shape: {data.shape}")

    if 'id' in data.columns:
        data = data.drop(columns=['id'])
        print(" Dropped 'id' column.")

    print("\nChecking for missing values:")
    print(data.isnull().sum().to_string())

    # Only look for empty columns when there are rows: on a zero-row frame
    # every column is vacuously "all missing". The target is never dropped
    # here so a broken 'diagnosis' column still surfaces as its own error.
    if len(data) > 0:
        empty_columns = [
            col for col in data.columns
            if col != 'diagnosis' and data[col].isnull().all()
        ]
    else:
        empty_columns = []
    if empty_columns:
        data = data.drop(columns=empty_columns)
        print(f" Dropped empty column(s): {empty_columns}")

    data['diagnosis'] = data['diagnosis'].map({'M': 1, 'B': 0})
    print(" Encoded 'diagnosis': M → 1, B → 0")

    X = data.drop(columns=['diagnosis'])
    y = data['diagnosis']
    print(f" Features: {X.shape}, Target: {y.shape}")
    return X, y


# Skewed Split

def skew_train_test_split(X, y, n_move=120, random_state=42):
    """Make a stratified 80/20 split, then shift malignant rows into the test set.

    After the initial ``train_test_split`` (``test_size=0.2``, stratified on
    ``y``), ``n_move`` training rows whose ``diagnosis`` equals 1 are sampled
    without replacement, removed from the training data and appended to the
    test data.

    Args:
        X: Feature DataFrame.
        y: Target Series; it must be named ``diagnosis`` because the rows are
            filtered through that column name.
        n_move: How many malignant (label 1) training rows to relocate.
        random_state: Seed used for both the split and the row sampling.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` after the relocation.
    """
    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=random_state
    )

    # Glue features and labels back together so whole rows can be relocated.
    train_frame = pd.concat([X_tr, y_tr], axis=1)
    test_frame = pd.concat([X_te, y_te], axis=1)

    is_malignant = train_frame['diagnosis'] == 1
    relocated = train_frame[is_malignant].sample(n=n_move, random_state=random_state)

    train_frame = train_frame.drop(relocated.index)
    test_frame = pd.concat([test_frame, relocated])

    print(f" Moved {n_move} malignant cases to test set.")
    print(f"Train size: {train_frame.shape[0]},  Test size: {test_frame.shape[0]}")

    return (
        train_frame.drop(columns=['diagnosis']),
        test_frame.drop(columns=['diagnosis']),
        train_frame['diagnosis'],
        test_frame['diagnosis'],
    )


#  Train Ensemble

def train_decision_trees(X_train, y_train, n_trees=10):
    """Fit ``n_trees`` decision trees, each on its own bootstrap resample.

    Every tree is trained on ``len(X_train)`` index labels drawn with
    replacement from ``X_train.index`` (via NumPy's global RNG) and is then
    scored on the full, un-resampled training data.

    Args:
        X_train: Training feature DataFrame.
        y_train: Training labels, indexed like ``X_train``.
        n_trees: Number of trees to fit; tree ``k`` uses ``random_state=k``.

    Returns:
        ``(trees, accuracies, importances)``: three parallel lists holding
        the fitted trees, their training-set accuracies and their
        ``feature_importances_`` arrays.
    """
    fitted, scores, feature_weights = [], [], []
    n_rows = len(X_train)

    for seed in range(n_trees):
        # Bootstrap: sample row labels with replacement.
        drawn = np.random.choice(X_train.index, size=n_rows, replace=True)

        model = DecisionTreeClassifier(
            max_features='sqrt', class_weight='balanced', random_state=seed
        )
        model.fit(X_train.loc[drawn], y_train.loc[drawn])

        score = accuracy_score(y_train, model.predict(X_train))
        print(f" Tree {seed + 1} → Accuracy: {score:.4f}")

        fitted.append(model)
        scores.append(score)
        feature_weights.append(model.feature_importances_)

    return fitted, scores, feature_weights


#  Select Top Features

def get_top_features_simple(importances, feature_names, top_n=10):
    """Rank features by mean importance and return the best ``top_n`` names.

    Args:
        importances: Sequence of per-model importance vectors, all of the
            same length.
        feature_names: Names indexable by position, aligned with the
            importance vectors.
        top_n: Maximum number of names to return.

    Returns:
        List of feature names ordered from highest to lowest mean importance.
        The ranking is also printed.
    """
    mean_scores = np.array(importances).mean(axis=0)
    # argsort is ascending, so reverse it to put the largest scores first.
    ranked_positions = np.argsort(mean_scores)[::-1]
    top_features = [feature_names[pos] for pos in ranked_positions[:top_n]]

    print(f"\nTop {top_n} Features:")
    for rank, name in enumerate(top_features, start=1):
        print(f"{rank}. {name}")
    return top_features


#  Retrain on Top Features

def retrain_decision_trees_on_selected_features(X_train_selected, y_train, n_trees=10):
    """Fit a fresh set of bootstrap-trained trees on the reduced feature set.

    Each tree is trained on ``len(X_train_selected)`` index labels drawn with
    replacement (NumPy's global RNG) and scored on the full selected
    training data.

    Args:
        X_train_selected: Training DataFrame restricted to the chosen features.
        y_train: Training labels, indexed like ``X_train_selected``.
        n_trees: Number of trees; tree ``k`` uses ``random_state=100 + k``.

    Returns:
        ``(trees, accuracies)``: the fitted trees and their training-set
        accuracies, in fitting order.
    """
    fitted, scores = [], []
    n_rows = len(X_train_selected)

    for k in range(n_trees):
        # Bootstrap: sample row labels with replacement.
        drawn = np.random.choice(X_train_selected.index, size=n_rows, replace=True)

        model = DecisionTreeClassifier(
            max_features='sqrt', class_weight='balanced', random_state=100 + k
        )
        model.fit(X_train_selected.loc[drawn], y_train.loc[drawn])

        score = accuracy_score(y_train, model.predict(X_train_selected))
        print(f"Retrained Tree {k + 1} Accuracy: {score:.4f}")

        fitted.append(model)
        scores.append(score)

    return fitted, scores

# Train Final Models

def train_final_models(retrained_trees, X_train_selected, y_train):
    """Fit two meta-models on the features plus every tree's prediction.

    The training matrix is ``X_train_selected`` (index reset) with one extra
    column per tree, ``tree_1`` ... ``tree_N``, holding that tree's predicted
    label for each training row.

    Args:
        retrained_trees: Fitted models exposing ``predict``.
        X_train_selected: Training DataFrame with the selected features.
        y_train: Training labels, in the same row order as
            ``X_train_selected``.

    Returns:
        ``(log_model, master_tree)``: a fitted ``LogisticRegression`` and a
        fitted ``DecisionTreeClassifier``.
    """
    votes = np.column_stack([model.predict(X_train_selected) for model in retrained_trees])
    vote_columns = [f"tree_{k}" for k in range(1, len(retrained_trees) + 1)]
    votes_frame = pd.DataFrame(votes, columns=vote_columns)

    # Reset the index so the feature rows line up with the 0..n-1 index of
    # the freshly built prediction frame.
    base_features = X_train_selected.reset_index(drop=True)
    X_combined = pd.concat([base_features, votes_frame], axis=1)

    log_model = LogisticRegression(class_weight='balanced', max_iter=1000)
    log_model.fit(X_combined, y_train)

    master_tree = DecisionTreeClassifier(
        class_weight='balanced', max_features='sqrt', random_state=200
    )
    master_tree.fit(X_combined, y_train)

    return log_model, master_tree


# Evaluate Final Models

def evaluate_final_models(retrained_trees, log_model, master_tree, X_test_selected, y_test):
    """Score both meta-models on the test data and print the results.

    The test matrix is built the same way as at training time: the selected
    features (index reset) followed by one ``tree_k`` column per tree with
    that tree's predictions.

    Args:
        retrained_trees: Fitted models exposing ``predict``.
        log_model: Fitted logistic-regression meta-model.
        master_tree: Fitted decision-tree meta-model.
        X_test_selected: Test DataFrame with the selected features.
        y_test: True test labels, in the same row order as ``X_test_selected``.

    Returns:
        ``(acc_log, recall_log, acc_tree, recall_tree)``.
    """
    votes = np.column_stack([model.predict(X_test_selected) for model in retrained_trees])
    vote_columns = [f"tree_{k}" for k in range(1, len(retrained_trees) + 1)]
    X_test_combined = pd.concat(
        [X_test_selected.reset_index(drop=True), pd.DataFrame(votes, columns=vote_columns)],
        axis=1,
    )

    # Same order as the report below: logistic regression first, then the tree.
    results = []
    for model in (log_model, master_tree):
        predicted = model.predict(X_test_combined)
        results.append((accuracy_score(y_test, predicted), recall_score(y_test, predicted)))
    (acc_log, recall_log), (acc_tree, recall_tree) = results

    print("\nFinal Model Evaluation:")
    print(f"🔹 Logistic Regression: Accuracy = {acc_log:.4f}, Recall = {recall_log:.4f}")
    print(f"🔸 Master Tree: Accuracy = {acc_tree:.4f}, Recall = {recall_tree:.4f}")

    return acc_log, recall_log, acc_tree, recall_tree


def main_pipeline(file_path='Cancer_Data.csv'):
    """Run the full workflow: load, skewed split, feature selection, stacking.

    Stages:
      1. Load and preprocess the CSV.
      2. Build the skewed train/test split.
      3. Train the first ensemble of trees and pick the 10 features with the
         highest mean importance.
      4. Retrain trees on those features only.
      5. Train the two final models on the selected features plus the tree
         predictions, then evaluate them on the test set.

    Args:
        file_path: Location of the input CSV. Defaults to ``'Cancer_Data.csv'``
            so existing zero-argument calls keep working; pass another path
            instead of editing this function.

    Returns:
        None. All results are printed by the called stages.
    """
    X, y = load_and_preprocess_data(file_path)
    X_train, X_test, y_train, y_test = skew_train_test_split(X, y)

    # Only the importances are needed from the first ensemble.
    _, _, importances = train_decision_trees(X_train, y_train)
    top_features = get_top_features_simple(importances, X_train.columns, top_n=10)

    X_train_selected = X_train[top_features]
    X_test_selected = X_test[top_features]

    retrained_trees, _ = retrain_decision_trees_on_selected_features(X_train_selected, y_train)
    log_model, master_tree = train_final_models(retrained_trees, X_train_selected, y_train)
    evaluate_final_models(retrained_trees, log_model, master_tree, X_test_selected, y_test)
# Run the whole pipeline as soon as this cell/script executes.
main_pipeline()

 Loaded data! Shape: (569, 33)
 Dropped 'id' column.

Checking for missing values:
diagnosis                    0
radius_mean                  0
texture_mean                 0
perimeter_mean               0
area_mean                    0
smoothness_mean              0
compactness_mean             0
concavity_mean               0
concave points_mean          0
symmetry_mean                0
fractal_dimension_mean       0
radius_se                    0
texture_se                   0
perimeter_se                 0
area_se                      0
smoothness_se                0
compactness_se               0
concavity_se                 0
concave points_se            0
symmetry_se                  0
fractal_dimension_se         0
radius_worst                 0
texture_worst                0
perimeter_worst              0
area_worst                   0
smoothness_worst             0
compactness_worst            0
concavity_worst              0
concave points_worst         0
symmetry_worst    