
# Classification Task — Predicting Tax Avoidance Categories
**ML2 Course — Extra Points Assignment**  
**Primary metric: Macro F1-Score**

> Complete this notebook and push to GitHub. Save the best model(s) in `models/`.


In [None]:

# === Imports & Config ===
import os, warnings, joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, accuracy_score, f1_score, precision_recall_fscore_support

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# Folders used throughout the notebook: inputs are read from DATA_DIR,
# fitted models are written to MODEL_DIR (created here if it is missing).
DATA_DIR = "data"
MODEL_DIR = "models"
os.makedirs(MODEL_DIR, exist_ok=True)

# Suppress all warnings and give every figure the same default size.
warnings.filterwarnings("ignore")
plt.rcParams.update({"figure.figsize": (7,5)})


## Part 1 — Data Preparation

In [None]:

# Locate the prepared train/test CSV files under DATA_DIR
train_path, test_path = (
    os.path.join(DATA_DIR, csv_name) for csv_name in ("train_fe.csv", "test_fe.csv")
)

# Stop early, with a hint about where the file belongs, if either one is absent
assert os.path.exists(train_path), f"Missing file: {train_path}. Put train_fe.csv in data/"
assert os.path.exists(test_path),  f"Missing file: {test_path}. Put test_fe.csv in data/"

train, test = pd.read_csv(train_path), pd.read_csv(test_path)

# Quick sanity check: (rows, columns) of each split, then a preview of train
print(train.shape, test.shape)
train.head()


In [None]:

# Create target variable from ETR thresholds
def label_from_etr(etr):
    """Map a single ETR value to a tax-avoidance class.

    Parameters
    ----------
    etr : float
        ETR value of one observation (presumably the effective tax rate).

    Returns
    -------
    int
        0 (Low Tax Avoidance)    if ``etr > 0.25``
        1 (Medium Tax Avoidance) if ``0.15 < etr <= 0.25``
        2 (High Tax Avoidance)   if ``etr <= 0.15``

    Raises
    ------
    ValueError
        If ``etr`` is missing (NaN / None). Every comparison against NaN is
        False, so without this guard a missing value would silently fall
        through to class 2 and be counted as "High Tax Avoidance".
    """
    if pd.isna(etr):
        raise ValueError("Cannot assign a tax-avoidance class to a missing ETR value.")
    if etr > 0.25:
        return 0  # Low Tax Avoidance
    if etr > 0.15:
        return 1  # Medium Tax Avoidance
    return 2  # High Tax Avoidance

# Label both splits in place; each one must still carry the raw ETR column.
for split_df in (train, test):
    if "ETR" in split_df.columns:
        split_df["target"] = split_df["ETR"].apply(label_from_etr)
    else:
        raise ValueError("Expected 'ETR' column in the dataset.")

# Rows per class in the training split, ordered by class id
train["target"].value_counts().sort_index()


In [None]:

# Check class balance: bar chart of training rows per class
class_counts = train["target"].value_counts().sort_index()
ax = class_counts.plot(kind="bar")
ax.set(
    title="Class Distribution (train)",
    xlabel="Class (0=Low, 1=Medium, 2=High)",
    ylabel="Count",
)
plt.show()


In [None]:

# Select features: keep every column except the target, the ETR column it is
# derived from, and identifier-like columns (matched case-insensitively).
id_like = [col for col in train.columns if col.lower() in {"id", "date", "firm", "ticker"}]
drop_cols = list({"ETR", "target", *id_like})

feature_cols = [col for col in train.columns if col not in drop_cols]
print(f"{len(feature_cols)} features:")
print(feature_cols[:20], "..." if len(feature_cols) > 20 else "")

# Work on copies so later steps cannot modify `train` / `test` through these objects.
# The test split is indexed with the column list taken from train.
X_train, y_train = train[feature_cols].copy(), train["target"].copy()
X_test, y_test = test[feature_cols].copy(), test["target"].copy()


In [None]:

# Time-series aware CV
# TimeSeriesSplit builds 5 folds in which the validation rows always come
# after the training rows; it relies purely on row position.
# NOTE(review): this assumes the training rows are already in chronological
# order — confirm against how train_fe.csv was produced.
tscv = TimeSeriesSplit(n_splits=5)

# Grids
# Keys use the "clf__" prefix because each estimator is tuned as the "clf"
# step of a Pipeline.

# Logistic regression: regularisation strength x solver, L2 penalty only.
logreg_grid = {
    "clf__C": [0.01, 0.1, 1, 10],
    "clf__penalty": ["l2"],
    "clf__solver": ["lbfgs", "newton-cg", "saga"],
    "clf__max_iter": [2000]
}

# k-NN: neighbourhood size x vote weighting x Minkowski power (1 or 2).
knn_grid = {
    "clf__n_neighbors": [3,5,9,15],
    "clf__weights": ["uniform", "distance"],
    "clf__p": [1,2]
}

# SVC: one sub-grid per kernel. `gamma` only affects the RBF kernel here; the
# linear kernel ignores it, so crossing gamma with "linear" would fit every
# linear candidate twice for an identical score (12 candidates instead of 9).
svc_grid = [
    {
        "clf__kernel": ["rbf"],
        "clf__C": [0.1, 1, 10],
        "clf__gamma": ["scale", "auto"]
    },
    {
        "clf__kernel": ["linear"],
        "clf__C": [0.1, 1, 10]
    }
]


In [None]:

def tune_and_score(name, estimator, param_grid, X, y, cv, scoring="f1_macro"):
    """Grid-search ``estimator`` behind a StandardScaler and report the best CV result.

    The scaler and the estimator are wrapped in a Pipeline (steps "scaler" and
    "clf"), so scaling statistics are re-estimated on the training part of
    every CV fold.

    Parameters
    ----------
    name : str
        Label used in the printed report and in the returned dict.
    estimator : estimator object
        Unfitted classifier placed in the "clf" step.
    param_grid : dict or list of dict
        Grid handed to GridSearchCV; keys must use the "clf__" prefix.
    X, y : feature matrix and target
        Training data passed to ``GridSearchCV.fit``.
    cv : cross-validation splitter
        Splitting strategy passed to GridSearchCV.
    scoring : str, default "f1_macro"
        Metric used to rank the candidates.

    Returns
    -------
    dict
        Keys: "name", "search" (the fitted GridSearchCV), "best_score"
        (mean CV score of the best candidate) and "best_estimator"
        (the best pipeline, refit on all of ``X``/``y``).
    """
    # Imported here because scipy is not among the notebook's top-level imports.
    from scipy import sparse as sp_sparse

    # Mean-centering is not supported on sparse input, so fall back to
    # variance-only scaling there. issparse() recognises SciPy sparse matrices
    # (which have no `.sparse` attribute); the hasattr() check keeps covering
    # pandas objects that expose the `.sparse` accessor.
    is_sparse = sp_sparse.issparse(X) or hasattr(X, "sparse")
    pipe = Pipeline([
        ("scaler", StandardScaler(with_mean=not is_sparse)),
        ("clf", estimator)
    ])
    gs = GridSearchCV(pipe, param_grid, cv=cv, scoring=scoring, n_jobs=-1, refit=True, verbose=0)
    gs.fit(X, y)
    print(f"=== {name} ===")
    print("Best params:", gs.best_params_)
    print(f"CV best {scoring}: {gs.best_score_:.4f}")
    return {
        "name": name,
        "search": gs,
        "best_score": gs.best_score_,
        "best_estimator": gs.best_estimator_
    }


## Part 2 — Model Training & Validation (TimeSeriesSplit + GridSearchCV)

In [None]:

# Fixed seed so the stochastic "saga" solver in the logistic-regression grid
# produces the same result on every Restart & Run All.
SEED = 42

# `multi_class` is no longer passed to LogisticRegression: "auto" was its
# default, and the argument has been deprecated since scikit-learn 1.5.
results = []
results.append(tune_and_score("LogisticRegression", LogisticRegression(random_state=SEED), logreg_grid, X_train, y_train, tscv))
results.append(tune_and_score("KNN", KNeighborsClassifier(), knn_grid, X_train, y_train, tscv))
results.append(tune_and_score("SVC", SVC(probability=False), svc_grid, X_train, y_train, tscv))

# Compare: one row per model, best cross-validated macro F1 first
summary = pd.DataFrame({
    "Model": [r["name"] for r in results],
    "CV_MacroF1": [r["best_score"] for r in results]
}).sort_values("CV_MacroF1", ascending=False).reset_index(drop=True)
summary


## Part 3 — Final Evaluation on Test Set

In [None]:

# Winner = highest cross-validated macro F1 (the first such entry if scores tie)
best = max(results, key=lambda res: res["best_score"])
best_name, best_est = best["name"], best["best_estimator"]
print("Selected best model:", best_name)

# Fit on the full training split, then predict the held-out test split
best_est.fit(X_train, y_train)
y_pred = best_est.predict(X_test)

# Headline metrics
acc = accuracy_score(y_test, y_pred)
macro_f1 = f1_score(y_test, y_pred, average="macro")
print(f"Test Accuracy: {acc:.4f}")
print(f"Test Macro F1: {macro_f1:.4f}\n")

# Per-class precision / recall / F1
print("Classification report (per class):")
print(classification_report(y_test, y_pred, digits=4))

# Confusion matrix with a fixed 0/1/2 label order for both counts and tick labels
class_labels = [0,1,2]
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
disp.plot(values_format='d')
plt.title(f"Confusion Matrix — {best_name}")
plt.show()


In [None]:

# Persist the selected pipeline; the scaler step is stored with the classifier
# because both live in the same Pipeline object.
best_model_path = os.path.join(MODEL_DIR, "best_model.pkl")
joblib.dump(value=best_est, filename=best_model_path)
print("Saved:", best_model_path)



### Brief Interpretation
- We selected the model with the highest **CV Macro F1** using **TimeSeriesSplit** to respect time ordering.  
- Final test results report **macro F1**, accuracy, and the confusion matrix for classes 0/1/2.  
- If the confusion matrix shows frequent misclassification between classes 1 and 2, consider adding class weights, trying different ETR thresholds, or engineering additional features.
