In [None]:
# Q4 (Sheet 4) – Models Comparison
#
# Fits a fully grown decision tree, a random forest and a gradient boosting
# classifier on one shared train/test split of the breast-cancer dataset,
# then prints train/test accuracy for every model and the top-5 feature
# importances of the two ensembles.

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score
import pandas as pd


def _train_test_accuracy(model, X_tr, y_tr, X_te, y_te):
    """Return ``(train_accuracy, test_accuracy)`` for an already fitted model."""
    return (
        accuracy_score(y_tr, model.predict(X_tr)),
        accuracy_score(y_te, model.predict(X_te)),
    )


def _top_features(model, feature_names, k=5):
    """Return the ``k`` largest ``feature_importances_`` of ``model``.

    The result is a DataFrame with columns "Feature" and "Importance",
    sorted by descending importance.
    """
    return (
        pd.DataFrame({
            "Feature": feature_names,
            "Importance": model.feature_importances_,
        })
        .sort_values("Importance", ascending=False)
        .head(k)
    )


# Load Dataset (kept as a DataFrame so the feature names travel with the data)

data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = data.target

# Train/Test split (80/20, fixed seed so every model sees the same rows)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


# 1) Decision Tree (Full) – no depth limit

dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)

dt_train_acc, dt_test_acc = _train_test_accuracy(
    dt, X_train, y_train, X_test, y_test
)


# 2) Random Forest

rf = RandomForestClassifier(
    n_estimators=100,
    random_state=42
)
rf.fit(X_train, y_train)

rf_train_acc, rf_test_acc = _train_test_accuracy(
    rf, X_train, y_train, X_test, y_test
)

# Top 5 features
rf_importances = _top_features(rf, X.columns)


# 3) Gradient Boosting

gb = GradientBoostingClassifier(random_state=42)
gb.fit(X_train, y_train)

gb_train_acc, gb_test_acc = _train_test_accuracy(
    gb, X_train, y_train, X_test, y_test
)

# Top 5 features
gb_importances = _top_features(gb, X.columns)


# Report – without this the cell computes everything but shows nothing

q4_summary = pd.DataFrame(
    {
        "Train accuracy": [dt_train_acc, rf_train_acc, gb_train_acc],
        "Test accuracy": [dt_test_acc, rf_test_acc, gb_test_acc],
    },
    index=["Decision Tree (Full)", "Random Forest", "Gradient Boosting"],
)

print("=== Q4 - Model Comparison ===")
print(q4_summary.to_string(float_format="{:.4f}".format))

print("\nRandom Forest - top 5 features:")
print(rf_importances.to_string(index=False))

print("\nGradient Boosting - top 5 features:")
print(gb_importances.to_string(index=False))

In [None]:
# Assignment (Sheet 4)
#
# Compares an unrestricted decision tree, a depth-3 tree, a random forest and
# a gradient boosting classifier on the same breast-cancer train/test split
# and prints train vs. test accuracy for each.

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score

# Dataset as plain arrays
data = load_breast_cancer()
X, y = data.data, data.target

# 80/20 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)


# --- Fit the four models (fit() returns the estimator itself) ---

# 1) Decision Tree (FULL): grown without a depth limit
dt_full = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# 2) Decision Tree (Pruned): capped at depth 3
dt_pruned = DecisionTreeClassifier(random_state=42, max_depth=3).fit(X_train, y_train)

# 3) Random Forest with 100 trees
rf = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# 4) Gradient Boosting with library defaults
gb = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)


# --- Score every model on both partitions ---

dt_full_train, dt_full_test = (
    accuracy_score(y_train, dt_full.predict(X_train)),
    accuracy_score(y_test, dt_full.predict(X_test)),
)

dt_pruned_train, dt_pruned_test = (
    accuracy_score(y_train, dt_pruned.predict(X_train)),
    accuracy_score(y_test, dt_pruned.predict(X_test)),
)

rf_train, rf_test = (
    accuracy_score(y_train, rf.predict(X_train)),
    accuracy_score(y_test, rf.predict(X_test)),
)

gb_train, gb_test = (
    accuracy_score(y_train, gb.predict(X_train)),
    accuracy_score(y_test, gb.predict(X_test)),
)


# --- Report ---

print("\n=== Model Comparison ===")
print(f"Decision Tree (Full)     -> Train: {dt_full_train:.4f} | Test: {dt_full_test:.4f}")
print(f"Decision Tree (Pruned)   -> Train: {dt_pruned_train:.4f} | Test: {dt_pruned_test:.4f}")
print(f"Random Forest            -> Train: {rf_train:.4f} | Test: {rf_test:.4f}")
print(f"Gradient Boosting        -> Train: {gb_train:.4f} | Test: {gb_test:.4f}")


In [None]:
# Q5 (Sheet 3) – Effect of Decision Threshold
#
# Fits a logistic regression on the breast-cancer data, then converts the
# predicted probability of label 1 into hard predictions at several cut-offs
# and prints accuracy and the confusion matrix for each cut-off.

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
import numpy as np

# Load dataset
data = load_breast_cancer()
X, y = data.data, data.target

# Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train logistic regression.
# NOTE(review): the features are fed in unscaled; with the previous cap of 500
# iterations the default solver is expected to stop before converging
# (ConvergenceWarning). The cap is raised so the probabilities thresholded
# below come from a converged fit. It has no effect if the solver stops earlier.
model = LogisticRegression(max_iter=10_000)
model.fit(X_train, y_train)

# Predict probabilities: column 1 is the probability of label 1 ("benign")
prob = model.predict_proba(X_test)[:, 1]

# Try different thresholds: a sample is predicted 1 only when prob >= threshold
thresholds = [0.5, 0.6, 0.7, 0.8]

for t in thresholds:
    y_pred_t = (prob >= t).astype(int)
    acc = accuracy_score(y_test, y_pred_t)
    # labels=[0, 1] pins the layout: rows = true label, columns = predicted
    # label, always 2x2 regardless of which labels a threshold produces.
    cm = confusion_matrix(y_test, y_pred_t, labels=[0, 1])

    print(f"\n=== Threshold: {t} ===")
    print(f"Accuracy: {acc:.4f}")
    print("Confusion Matrix:")
    print(cm)


In [None]:
# Task (Sheet 3) Q1 – Complete Pipeline with Preprocessing
#
# Chains feature standardisation and logistic regression into one estimator,
# fits it on the training partition and prints its test accuracy.

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Dataset: feature matrix and labels
data = load_breast_cancer()
X = data.data
y = data.target

# Hold out 20% of the rows for testing (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Two-step pipeline; the step names are used to look the steps up later
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('model', LogisticRegression(max_iter=500)),
])

# Fit the scaler and the classifier on the training partition only
pipeline.fit(X_train, y_train)

# Score on the held-out partition
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Pipeline Test Accuracy: {accuracy:.4f}")


In [None]:
# Task (Sheet 3) Q2 – Feature Importance for Logistic Regression
#
# Ranks features by the absolute value of the logistic-regression coefficient.
# Uses `pipeline` and `data` from the Q1 pipeline cell, so run that cell first.
# (A verbatim second copy of Q1 + Q2 that used to follow here was removed: it
# refit the same pipeline and printed the same results a second time.)

import pandas as pd
import numpy as np

# Extract model coefficients: coef_ has one row for this binary problem
coeffs = pipeline.named_steps['model'].coef_[0]

# Put into dataframe, largest absolute coefficient first
feature_importance = pd.DataFrame({
    'Feature': data.feature_names,
    'Importance': np.abs(coeffs)
}).sort_values('Importance', ascending=False)

# Show top 5 features
print(feature_importance.head())
