In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, roc_curve, confusion_matrix, ConfusionMatrixDisplay
)
from tabulate import tabulate
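# TODO: add learning curves, evaluated on a validation set.
# TODO: choose proper hyperparameters, especially for Gradient Boosting and Random Forest,
#       and also for the decision tree (weak trees for GB; "cp_alpha", presumably ccp_alpha pruning).
#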
#_________________________________
#1 Load and prepare the data


# OpenML is used so that no local files are needed: easier to run, more reproducible, more portable.
# The spambase CSV is also included in the archive on Moodle.
df = fetch_openml(name="spambase", version=1, as_frame=True).frame

# Check for missing values; rows containing any are dropped.
nr_valori_lipsa = df.isnull().sum().sum()
if nr_valori_lipsa == 0:
    print("0 valori lipsa, continua rularea\n")
else:
    print("valori lipsa gasite, se elimina...")
    df = df.dropna()


# Target is the "class" column cast to int; every other column is a feature.
y = df["class"].astype(int)
X = df.drop(columns="class")

# 70/30 train/test split with a fixed seed.
date_train, date_test, etichete_train, etichete_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Scaling statistics are fitted on the training split only and then applied to both splits.
scaler = StandardScaler().fit(date_train)
date_train = scaler.transform(date_train)
date_test = scaler.transform(date_test)

#______________________
#2 Model definitions

# Display name -> unfitted estimator. Insertion order is the order used later for
# training and plotting. All estimators share random_state=42.
modele = {}
modele["Regresie Logistica"] = LogisticRegression(max_iter=200, random_state=42)
modele["Arbore de Decizie"] = DecisionTreeClassifier(
    max_depth=10,
    min_samples_split=5,
    random_state=42,
)
modele["Random Forest"] = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=42,
)
modele["Gradient Boosting"] = GradientBoostingClassifier(
    n_estimators=150,
    learning_rate=0.1,
    random_state=42,
)

#___________________________________
#3 Train and evaluate the models

# Accumulators filled once per model:
#   rezultate        - one dict of metrics per model (becomes the results table)
#   predictii        - name -> (train predictions, test predictions)
#   probabilitati    - name -> predicted probability of class 1 on the test split
#   matrici_confuzie - name -> (true test labels, test predictions)
rezultate, predictii, probabilitati, matrici_confuzie = [], {}, {}, {}

# Test-set metrics that share the (y_true, y_pred) call signature, in table column order.
metrici_test = (
    ("Precizie", precision_score),
    ("Recall", recall_score),
    ("F1 Score", f1_score),
)

for nume, model in modele.items():
    print(f"Antrenare si evaluare model: {nume}")
    model.fit(date_train, etichete_train)

    pred_train = model.predict(date_train)
    pred_test = model.predict(date_test)
    # Column 1 of predict_proba is the probability of the positive class.
    prob_test = model.predict_proba(date_test)[:, 1]
    scor_auc = roc_auc_score(etichete_test, prob_test)

    rand = {
        "Model": nume,
        "Acuratete Antrenare": accuracy_score(etichete_train, pred_train),
        "Acuratete Testare": accuracy_score(etichete_test, pred_test),
    }
    for nume_metrica, functie_metrica in metrici_test:
        rand[nume_metrica] = functie_metrica(etichete_test, pred_test)
    rand["AUC"] = scor_auc
    rezultate.append(rand)

    # Keep predictions/probabilities around for the plots below.
    predictii[nume] = (pred_train, pred_test)
    probabilitati[nume] = prob_test
    matrici_confuzie[nume] = (etichete_test, pred_test)

#____________________________________
#4 Summary of the model results

rezultate_df = pd.DataFrame(rezultate)
# Best F1 first, renumbered 0..n-1 so the printed row index follows the ranking.
rezultate_df_sortat = rezultate_df.sort_values(
    "F1 Score", ascending=False, ignore_index=True
)

print("\nTabel Performanta Modele:\n")
tabel_text = tabulate(
    rezultate_df_sortat,
    headers="keys",
    tablefmt="fancy_grid",
    floatfmt=".4f",
)
print(tabel_text)

#________________________
#5 Generate and save the plots

print("\ngenerare si salvare grafice:")

# 5.1 Per-model confusion matrix and ROC curve, each saved to its own PNG
for nume in modele:
    # File-name prefix derived from the display name, e.g. "random_forest".
    prefix_fisier = nume.replace(' ', '_').lower()
    # True labels and test predictions were stored together during evaluation;
    # both plots below use this same pair of labels.
    etichete_test_cm, pred_test_cm = matrici_confuzie[nume]

    # from_predictions draws onto a new figure, which the plt.* calls below then target.
    ConfusionMatrixDisplay.from_predictions(etichete_test_cm, pred_test_cm)
    plt.title(f"{nume} - Matrice de Confuzie")
    plt.grid(False)
    plt.tight_layout()
    path_cm = f"{prefix_fisier}_matrice_confuzie.png"
    plt.savefig(path_cm)
    print(f"grafic salvat: {path_cm}")
    plt.close()

    prob_test = probabilitati[nume]
    fpr, tpr, _ = roc_curve(etichete_test_cm, prob_test)
    scor_auc = roc_auc_score(etichete_test_cm, prob_test)
    plt.figure(figsize=(6, 4))
    plt.plot(fpr, tpr, label=f"{nume} (AUC = {scor_auc:.2f})")
    # Diagonal reference line for a random classifier.
    plt.plot([0, 1], [0, 1], "k--", label="Random")
    plt.xlabel("Rata de false positives (FPR)")
    plt.ylabel("Rata de true positives (TPR)")
    plt.title(f"{nume} - Curba ROC")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    path_roc = f"{prefix_fisier}_roc.png"
    plt.savefig(path_roc)
    print(f"grafic salvat: {path_roc}")
    plt.close()

# 5.2 / 5.3 Horizontal bar plots comparing the models by F1 score and by AUC
def _salveaza_barplot_scor(date, coloana, paleta, titlu, fisier):
    """Draw one horizontal bar per model for a score column and save it as a PNG.

    Args:
        date: DataFrame with a "Model" column and the score column.
        coloana: Name of the score column plotted on the x axis.
        paleta: Seaborn palette name used to colour the bars.
        titlu: Title of the figure.
        fisier: Output path passed to ``plt.savefig``.
    """
    plt.figure(figsize=(10, 6))
    sns.barplot(
        data=date,
        x=coloana,
        y="Model",
        hue="Model",
        palette=paleta,
        dodge=False,
        legend=False,
    )
    plt.title(titlu)
    # The x axis is zoomed to [0.8, 1.0] so small differences are visible. If any
    # score is at or below 0.8 its bar would be invisible, so the lower bound is
    # widened only in that case.
    scor_minim = float(date[coloana].min())
    limita_stanga = 0.8 if scor_minim > 0.8 else max(0.0, scor_minim - 0.05)
    plt.xlim(limita_stanga, 1.0)
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(fisier)
    print(f"grafic salvat: {fisier}")
    plt.close()


# 5.2 Barplot F1
_salveaza_barplot_scor(
    rezultate_df_sortat,
    "F1 Score",
    "viridis",
    "Comparatie dupa scorul F1",
    "comparatie_f1_score.png",
)

# 5.3 Barplot AUC
_salveaza_barplot_scor(
    rezultate_df_sortat,
    "AUC",
    "mako",
    "Comparatie dupa scorul AUC",
    "comparatie_auc.png",
)

# 5.4 All ROC curves overlaid on a single figure
fig_roc, ax_roc = plt.subplots(figsize=(8, 6))
# probabilitati was filled in the same order as modele, so the legend order is unchanged.
for nume, prob_test in probabilitati.items():
    fpr, tpr, _ = roc_curve(etichete_test, prob_test)
    scor_auc = roc_auc_score(etichete_test, prob_test)
    ax_roc.plot(fpr, tpr, label=f"{nume} (AUC = {scor_auc:.2f})")

# Diagonal reference line for a random classifier.
ax_roc.plot([0, 1], [0, 1], "k--", label="Random")
ax_roc.set_xlabel("Rata de false positives (FPR)")
ax_roc.set_ylabel("Rata de true positives (TPR)")
ax_roc.set_title("Curbe ROC suprapuse - toate modelele")
ax_roc.legend()
ax_roc.grid(True)
fig_roc.tight_layout()
fig_roc.savefig("roc_all_models.png")
print("grafic salvat: roc_all_models.png")
plt.close(fig_roc)

# 5.5 Quick overfitting check: training vs. test accuracy side by side per model
coloane_acuratete = ["Acuratete Antrenare", "Acuratete Testare"]
ax_acc = rezultate_df_sortat.plot.bar(
    x="Model",
    y=coloane_acuratete,
    figsize=(10, 6),
)
ax_acc.set_title("Acuratete Antrenare vs Testare pentru fiecare model")
ax_acc.set_ylabel("Acuratete")
# Zoomed y axis so the train/test gap is visible.
ax_acc.set_ylim(0.8, 1.0)
ax_acc.grid(True)
plt.xticks(rotation=45)
plt.tight_layout()
fisier_acuratete = "acuratete_antrenare_vs_testare.png"
plt.savefig(fisier_acuratete)
print(f"grafic salvat: {fisier_acuratete}")
plt.close()

#______________________________________
#6 Final spot check on one random email from the test set

def _eticheta_text(valoare):
    """Map a class value to its printed label: 1 -> 'spam', anything else -> 'non-spam'."""
    if valoare == 1:
        return 'spam'
    return 'non-spam'


# Random row index into the scaled test matrix (unseeded, so it differs between runs).
i = np.random.randint(0, date_test.shape[0])
# Estimators expect 2-D input, so the single row is kept as a (1, n_features) slice.
email_vector = date_test[i:i + 1]


print("\nTest random: predictii pentru un email aleator din setul de test")

# Positional lookup of the label at the same position i as the selected row.
if hasattr(etichete_test, "iloc"):
    eticheta_reala = etichete_test.iloc[i]
else:
    eticheta_reala = list(etichete_test)[i]
print(f"Eticheta reala: {_eticheta_text(eticheta_reala)}")

for nume, model in modele.items():
    pred = model.predict(email_vector)[0]
    # Probability of class 1 (spam) for the single row.
    prob = model.predict_proba(email_vector)[0, 1]
    print(f"- {nume}: {_eticheta_text(pred)} (probabilitate spam: {prob:.2%})")
    


0 valori lipsa, continua rularea

Antrenare si evaluare model: Regresie Logistica
Antrenare si evaluare model: Arbore de Decizie
Antrenare si evaluare model: Random Forest
Antrenare si evaluare model: Gradient Boosting

Tabel Performanta Modele:

╒════╤════════════════════╤═══════════════════════╤═════════════════════╤════════════╤══════════╤════════════╤════════╕
│    │ Model              │   Acuratete Antrenare │   Acuratete Testare │   Precizie │   Recall │   F1 Score │    AUC │
╞════╪════════════════════╪═══════════════════════╪═════════════════════╪════════════╪══════════╪════════════╪════════╡
│  0 │ Gradient Boosting  │                0.9689 │              0.9500 │     0.9488 │   0.9307 │     0.9396 │ 0.9873 │
├────┼────────────────────┼───────────────────────┼─────────────────────┼────────────┼──────────┼────────────┼────────┤
│  1 │ Random Forest      │                0.9665 │              0.9457 │     0.9614 │   0.9064 │     0.9331 │ 0.9849 │
├────┼────────────────────┼──────