In [None]:
# ===============================================
# ADABOOST FULL ASSIGNMENT (Q1 + Q2 + Q3)
# All datasets auto-download — NO upload needed
# ===============================================

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
import requests, re, string


# ====================================================
# HELPER FUNCTIONS
# ====================================================
def clean_text(s):
    """Normalise a message for vectorisation.

    The input is coerced to ``str`` and lower-cased, then URL-like tokens
    (anything starting with ``http`` or ``www`` up to the next whitespace)
    are removed, and finally every ASCII punctuation character is deleted.

    Args:
        s: Any value; it is converted with ``str()`` first.

    Returns:
        The cleaned string. Whitespace is left untouched, so removing a
        token can leave consecutive spaces behind.
    """
    lowered = str(s).lower()
    without_links = re.sub(r"http\S+|www\S+", "", lowered)
    # A translation table with an empty mapping and a "delete" set strips
    # all punctuation in one pass.
    punctuation_table = str.maketrans("", "", string.punctuation)
    return without_links.translate(punctuation_table)


def manual_adaboost(X_train, y_train, X_test, y_test, T=20):
    """Run a hand-written AdaBoost loop with depth-1 decision trees.

    Labels equal to 1 are treated as the positive class (+1); every other
    label value is treated as -1. Each round fits a weighted stump, derives
    its vote weight ``alpha`` from the weighted training error, re-weights
    the training samples, and adds the stump's signed votes to a running
    score for both the training and the test set. A sample is predicted as
    class 1 when its running score is >= 0, otherwise as class 0.

    One progress line is printed per round.

    Args:
        X_train: Training feature matrix accepted by ``DecisionTreeClassifier``.
        y_train: Training labels (compared against 1).
        X_test: Test feature matrix.
        y_test: Test labels, used only for accuracy reporting.
        T: Number of boosting rounds.

    Returns:
        dict with keys ``"train_scores"``, ``"test_scores"``, ``"alphas"``,
        ``"errors"`` (one entry per round) and ``"final_pred"`` (0/1 test
        predictions after the last round).
    """

    def _as_signed(labels):
        # 1 -> +1, anything else -> -1
        return np.where(labels == 1, 1, -1)

    n_samples = len(y_train)
    weights = np.ones(n_samples) / n_samples
    signed_train = _as_signed(y_train)

    history = {
        "train_scores": [],
        "test_scores": [],
        "alphas": [],
        "errors": [],
    }

    # Running alpha-weighted sums of the stumps' signed votes.
    score_train = np.zeros(n_samples)
    score_test = np.zeros(len(y_test))

    for t in range(1, T + 1):
        learner = DecisionTreeClassifier(max_depth=1)
        learner.fit(X_train, y_train, sample_weight=weights)

        vote_train = _as_signed(learner.predict(X_train))
        vote_test = _as_signed(learner.predict(X_test))

        # Weighted training error, clamped away from 0 and 1 so that the
        # logarithm below stays finite.
        err = np.sum(weights[vote_train != signed_train])
        err = min(err, 0.999)
        err = max(err, 1e-10)

        alpha = 0.5 * np.log((1 - err) / err)

        # Up-weight misclassified samples, down-weight correct ones, then
        # renormalise so the weights sum to 1.
        weights = weights * np.exp(-alpha * signed_train * vote_train)
        weights = weights / np.sum(weights)

        score_train += alpha * vote_train
        score_test += alpha * vote_test

        train_acc = accuracy_score(y_train, (score_train >= 0).astype(int))
        test_acc = accuracy_score(y_test, (score_test >= 0).astype(int))

        history["train_scores"].append(train_acc)
        history["test_scores"].append(test_acc)
        history["alphas"].append(alpha)
        history["errors"].append(err)

        print(f"Iteration {t} | train={train_acc:.4f} | test={test_acc:.4f} | error={err:.4f} | alpha={alpha:.4f}")

    history["final_pred"] = (score_test >= 0).astype(int)
    return history


# ====================================================
# Q1 — SMS SPAM (AdaBoost Manual + Sklearn)
# ====================================================

print("\n=========== Q1: SMS SPAM AdaBoost ===========")

# Tab-separated file read without a header row: first column is the
# "ham"/"spam" label, second column is the message text.
url = "https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv"
df = pd.read_csv(url, sep="\t", names=["label", "text"])
df["label_bin"] = df["label"].map({"ham": 0, "spam": 1})
df["clean"] = df["text"].apply(clean_text)

y = df["label_bin"].values

# Split the raw text first and fit the vectoriser on the training part only,
# so the test messages do not influence the vocabulary or IDF weights.
text_train, text_test, y_train, y_test = train_test_split(
    df["clean"], y, test_size=0.2, random_state=42, stratify=y
)

vec = TfidfVectorizer(max_features=4000)
X_train = vec.fit_transform(text_train).toarray()
X_test = vec.transform(text_test).toarray()

# np.bincount is indexed by label value, so the order is [ham (0), spam (1)].
print("Dataset Loaded — Ham / Spam Counts:", np.bincount(y))


# Manual AdaBoost
manual_result = manual_adaboost(X_train, y_train, X_test, y_test, T=15)

print("\nManual AdaBoost Final Accuracy:", accuracy_score(y_test, manual_result["final_pred"]))
print("Confusion Matrix:\n", confusion_matrix(y_test, manual_result["final_pred"]))


# Sklearn AdaBoost
# The weak learner is passed positionally: the keyword was `base_estimator`
# in older scikit-learn and is `estimator` in current releases, where the old
# name raises TypeError. The first positional slot is the same in both.
adb = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    n_estimators=100,
    learning_rate=0.6,
    random_state=42,  # makes stump tie-breaking repeatable between runs
)

adb.fit(X_train, y_train)
print("\nSklearn AdaBoost Accuracy:", adb.score(X_test, y_test))
print("Confusion Matrix:\n", confusion_matrix(y_test, adb.predict(X_test)))


# ====================================================
# Q2 — HEART DISEASE AdaBoost
# ====================================================

print("\n=========== Q2: HEART DISEASE AdaBoost ===========")

heart_url = "https://raw.githubusercontent.com/plotly/datasets/master/heart.csv"
heart = pd.read_csv(heart_url)

y = heart["target"].values
X = heart.drop(columns=["target"])

# One-hot encode any non-numeric columns (numeric columns pass through).
X = pd.get_dummies(X)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training split only and reuse its statistics for the
# test split, so test data does not leak into preprocessing.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Baseline stump
stump = DecisionTreeClassifier(max_depth=1)
stump.fit(X_train, y_train)
print("Stump Test Accuracy:", stump.score(X_test, y_test))


# Grid Search
# NOTE(review): models are compared on the test split, so the reported best
# accuracy is optimistically biased; a validation split or cross-validation
# would give an unbiased estimate.
best_acc = -1
best_model = None
for n in [10, 25, 50, 100]:
    for lr in [0.1, 0.5, 1.0]:
        # Weak learner passed positionally: the `base_estimator` keyword is
        # rejected by current scikit-learn (renamed to `estimator`), while the
        # first positional slot is the base learner in old and new versions.
        model = AdaBoostClassifier(
            DecisionTreeClassifier(max_depth=1),
            n_estimators=n,
            learning_rate=lr,
            random_state=42,  # repeatable results across grid runs
        )
        model.fit(X_train, y_train)
        acc = model.score(X_test, y_test)
        print(f"n={n}, lr={lr} → acc={acc:.4f}")

        # Strict ">" keeps the first configuration that reaches the best score.
        if acc > best_acc:
            best_acc = acc
            best_model = model

print("\nBest AdaBoost Model Accuracy:", best_acc)
print("Confusion Matrix:\n", confusion_matrix(y_test, best_model.predict(X_test)))


# ====================================================
# Q3 — WISDM ACTIVITY AdaBoost
# ====================================================

print("\n=========== Q3: WISDM AdaBoost ===========")

wisdm_url = (
    "https://raw.githubusercontent.com/ankitaggarwal011/Activity-Recognition-WISDM/master/WISDM_ar_v1.1_raw.txt"
)

# parse_row() splits each record on "," itself, so the "raw" column must hold
# the complete, unsplit line. pd.read_csv with its default "," delimiter
# consumes the commas, which would leave every row unparseable. Fetch the
# text directly and keep one non-blank line per row instead.
response = requests.get(wisdm_url, timeout=60)
response.raise_for_status()

# A trailing ";" is stripped so the last numeric field converts cleanly
# (assumes records are ";"-terminated as in the published WISDM raw format —
# harmless if they are not).
df = pd.DataFrame(
    {
        "raw": [
            line.strip().rstrip(";")
            for line in response.text.splitlines()
            if line.strip()
        ]
    }
)

def parse_row(row):
    """Parse one comma-separated record into ``(activity, x, y, z)``.

    The activity name is taken from field 1 and the three numeric readings
    from fields 3, 4 and 5 (0-based). Surrounding whitespace and a trailing
    ``;`` record terminator are ignored.

    Args:
        row: The raw record text.

    Returns:
        ``(activity, x, y, z)`` with ``x``, ``y``, ``z`` as floats, or
        ``(None, None, None, None)`` when the row is not a string, has too
        few fields, or contains a non-numeric reading.
    """
    try:
        # Drop the optional ";" terminator so it does not end up glued to
        # the last number and break float().
        parts = row.strip().rstrip(";").split(",")
        act = parts[1].strip()
        x, y, z = float(parts[3]), float(parts[4]), float(parts[5])
    except (AttributeError, IndexError, TypeError, ValueError):
        # Malformed rows are expected in raw sensor dumps; signal them with
        # an all-None tuple so the caller can drop them.
        return None, None, None, None
    return act, x, y, z

# Parse every raw line; rows that fail to parse come back as all-None tuples
# and are removed by dropna().
parsed = df["raw"].apply(parse_row)
parsed = pd.DataFrame(parsed.tolist(), columns=["act", "x", "y", "z"])
parsed = parsed.dropna()

if parsed.empty:
    # Fail with a clear message instead of an obscure sampling error below.
    raise ValueError("No WISDM rows could be parsed from the downloaded data")

# Binary target: 1 when the activity name contains "Jog" or "Up", else 0.
parsed["label"] = parsed["act"].str.contains("Jog|Up").astype(int)

X = parsed[["x", "y", "z"]].values
y = parsed["label"].values

# downsample (dataset is HUGE)
# Seeded generator keeps the subsample reproducible; the size is capped at
# the number of available rows so sampling without replacement cannot fail.
rng = np.random.default_rng(42)
sample_size = min(30000, len(X))
idx = rng.choice(len(X), size=sample_size, replace=False)
X, y = X[idx], y[idx]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Manual AdaBoost
wisdm_res = manual_adaboost(X_train, y_train, X_test, y_test, T=20)

print("\nManual AdaBoost WISDM Accuracy:", accuracy_score(y_test, wisdm_res["final_pred"]))
print("Confusion Matrix:\n", confusion_matrix(y_test, wisdm_res["final_pred"]))


# Sklearn AdaBoost
# Weak learner passed positionally: the `base_estimator` keyword raises
# TypeError on current scikit-learn (renamed to `estimator`), while the first
# positional slot is the base learner in both old and new versions.
adb2 = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    n_estimators=100,
    learning_rate=1.0,
    random_state=42,  # repeatable stump tie-breaking
)

adb2.fit(X_train, y_train)
print("\nSklearn AdaBoost WISDM Accuracy:", adb2.score(X_test, y_test))
print("Confusion Matrix:\n", confusion_matrix(y_test, adb2.predict(X_test)))


# ====================================================
print("\n============= ALL QUESTIONS DONE SUCCESSFULLY =============")



Dataset Loaded — Spam / Ham Counts: [4825  747]
Iteration 1 | train=0.8833 | test=0.8960 | error=0.1167 | alpha=1.0122
Iteration 2 | train=0.8833 | test=0.8960 | error=0.2582 | alpha=0.5278
Iteration 3 | train=0.8833 | test=0.8960 | error=0.2981 | alpha=0.4281
Iteration 4 | train=0.9150 | test=0.9256 | error=0.3796 | alpha=0.2457
Iteration 5 | train=0.8948 | test=0.9058 | error=0.3861 | alpha=0.2319
Iteration 6 | train=0.9266 | test=0.9291 | error=0.3907 | alpha=0.2221
Iteration 7 | train=0.9138 | test=0.9193 | error=0.3910 | alpha=0.2216
Iteration 8 | train=0.9289 | test=0.9300 | error=0.3949 | alpha=0.2133
Iteration 9 | train=0.9190 | test=0.9229 | error=0.3749 | alpha=0.2555
Iteration 10 | train=0.9145 | test=0.9202 | error=0.4577 | alpha=0.0848
Iteration 11 | train=0.9338 | test=0.9354 | error=0.3902 | alpha=0.2233
Iteration 12 | train=0.9264 | test=0.9265 | error=0.4175 | alpha=0.1665
Iteration 13 | train=0.9282 | test=0.9309 | error=0.4557 | alpha=0.0889
Iteration 14 | train=0.9

TypeError: AdaBoostClassifier.__init__() got an unexpected keyword argument 'base_estimator'