In [12]:
import torch, umap, optuna
import numpy as np
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt
from dags.utils.embedding_utils import get_embeddings
from sklearn.model_selection import train_test_split 
from sklearn.metrics import recall_score, confusion_matrix
from dags.utils.encode_utils import decode_zip
from dags.utils.payload_utils import decode_gmail_payload


print("PyTorch version:", torch.__version__)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("CUDA available:", torch.cuda.is_available()) 

model_name = "sentence-transformers/all-MiniLM-L6-v2"

PyTorch version: 2.9.0+cu130
CUDA available: True


In [9]:
path_1 = "data/imp_22-08-2025-03-23.json.gz"
path_2 = "data/unimp_22-08-2025-03-23.json.gz"
path_3 = "data/15-08-2025-10-41.json.gz"

decompressed_data_1 = decode_zip(path_1)
decompressed_data_2 = decode_zip(path_2)
decompressed_data_3 = decode_zip(path_3)

df_imp =pd.DataFrame(decompressed_data_1)[["Payload"]]
df_unimp =pd.DataFrame(decompressed_data_2)[["Payload"]]
df_unlb =pd.DataFrame(decompressed_data_3)[["Payload"]]

df_imp[["Subject", "Body"]] = df_imp["Payload"].apply(lambda row: pd.Series(decode_gmail_payload(row)[1:]))
df_imp["Important"] = 1
df_imp = df_imp.drop(["Payload"], axis=1) 

df_unimp[["Subject", "Body"]] = df_unimp["Payload"].apply(lambda row: pd.Series(decode_gmail_payload(row)[1:]))
df_unimp["Important"] = 0
df_unimp = df_unimp.drop(["Payload"], axis=1)

df_unlb[["Subject", "Body"]] = df_unlb["Payload"].apply(lambda row: pd.Series(decode_gmail_payload(row))[1:])
df_unlb = df_unlb.drop(["Payload"], axis=1)

In [10]:
train = pd.concat([df_imp, df_unimp])
train.isnull().sum()

Subject      0
Body         0
Important    0
dtype: int64

In [30]:
X = get_embeddings(train, model_name, device)
print(X.get_device())
y = torch.from_numpy(train.loc[:,"Important"].values).cuda()
#y = train.loc[:,"Important"].values
print(y.get_device())

0
0


In [31]:
len(X[0])

768

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, shuffle=True, random_state=42)

In [26]:
dtrain = xgb.DMatrix(X_train, label= y_train)
dtest = xgb.DMatrix(X_test)
y_test_np = y_test.cpu().numpy()

default_params = {
        "objective": "binary:logistic", 
        "eval_metric": "logloss",
        "tree_method": "hist",
        "device": "cuda",
        "random_state": 42,
}
def objective2(trial):
    params = {
        "max_depth": trial.suggest_int("max_depth", 3, 15),
        "eta": trial.suggest_float("eta", 0.01, 0.3),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "reg_alpha": trial.suggest_float("reg_alpha", 0, 5.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 5.0),
        **default_params
    }
    

    cls = xgb.train(params, dtrain, num_boost_round = 1500)    
    y_pred_proba = cls.predict(dtest)

    # Convert probabilities to binary predictions
    y_pred_binary = (y_pred_proba > 0.50).astype(int)
    
    score = recall_score(y_test_np, y_pred_binary)
    return score

In [27]:
# Run optimization
study = optuna.create_study(direction="maximize")
study.optimize(objective2, n_trials=25, n_jobs=5)

[I 2025-11-06 18:06:53,909] A new study created in memory with name: no-name-f5674152-3a6f-4606-b771-ec04a329e41b
[I 2025-11-06 18:07:04,469] Trial 0 finished with value: 0.825 and parameters: {'max_depth': 7, 'eta': 0.24607036723951145, 'subsample': 0.8894990870083058, 'colsample_bytree': 0.6933244850450013, 'gamma': 3.9838266759264696, 'reg_alpha': 1.0375946750319154, 'reg_lambda': 1.8274508970284042}. Best is trial 0 with value: 0.825.
[I 2025-11-06 18:07:04,637] Trial 2 finished with value: 0.875 and parameters: {'max_depth': 14, 'eta': 0.2071210797444219, 'subsample': 0.656157184698971, 'colsample_bytree': 0.8822217787577966, 'gamma': 1.8297526409683207, 'reg_alpha': 1.6852872146236737, 'reg_lambda': 4.058797840407122}. Best is trial 2 with value: 0.875.
[I 2025-11-06 18:07:04,832] Trial 1 finished with value: 0.85 and parameters: {'max_depth': 11, 'eta': 0.07777390595505697, 'subsample': 0.8729439534923629, 'colsample_bytree': 0.9997651028620271, 'gamma': 1.3299048697721965, 'reg

In [None]:
cls = xgb.train({**study.best_trial.params,
                 **default_params,}, dtrain, num_boost_round = 1500)

In [None]:
del dtest
import gc
gc.collect()

177

In [None]:
# # UMAP reduction to 2D
# umap_model = umap.UMAP(n_neighbors=10, min_dist=0.3, random_state=42, n_components=2)
# emb_2d = umap_model.fit_transform(embeddings)

# # Some Overlap is expected
# #High-dimensional separation may not be visible in 2d
# #Can try diff embedding/tokenizer


# # Scatterplot
# plt.figure(figsize=(6, 4))
# scatter = plt.scatter(emb_2d[:, 0], emb_2d[:, 1], c=labels, cmap="Spectral", alpha=0.8)
# plt.xlabel("UMAP-1")
# plt.ylabel("UMAP-2")
# plt.title("Sentence Embeddings with UMAP (colored by label)")
# plt.colorbar(scatter, label="Dependent Variable (0/1)")
# plt.show()

In [None]:
unlb_embeddings = get_embeddings(df_unlb, model_name)

In [None]:
epochs = 10
thresh = 0.95


optuna_params = study.best_trial.params

X_c, y_c = X_train.clone(), y_train.clone()
X_unlb = unlb_embeddings.clone()

In [None]:
for epoch in range(1,epochs):
    print(f"\n>>>>Epoch {epoch}<<<<")

    dtrain = xgb.DMatrix(X_c, label=y_c)
    model = xgb.train({**optuna_params,
                       **default_params}, dtrain, num_boost_round=1500)

    if len(X_unlb) == 0:
        print("No unlabeled data left.")
        break

    d_unlb = xgb.DMatrix(X_unlb)
    y_proba = model.predict(d_unlb)
    y_proba = torch.from_numpy(y_proba).cuda()

    mask = (y_proba > thresh) | (y_proba < 1 - thresh) # a tensor of True, False... of shape y_proba
    
    if mask.sum() == 0:                                # True==1, False==0
        print("No high-confidence samples found.")
        break

    X_pseudo = X_unlb[mask]
    y_pseudo = (y_proba[mask] > 0.5).int() # torch.int

    # move high confidence rows from unlabeled to training set
    X_c = torch.vstack([X_c, X_pseudo]) # stacks vertically. similar to 1d append but for high d's. 
    y_c = torch.cat([y_c, y_pseudo], axis =0)

    # Remove high confidence rows
    X_unlb = X_unlb[~mask]

    print(f"Added {len(X_pseudo)} pseudo samples. Labeled samples size: {len(y_c)}")


>>>>Epoch 1<<<<
Added 143 pseudo samples. Labeled samples size: 374

>>>>Epoch 2<<<<
Added 30 pseudo samples. Labeled samples size: 404

>>>>Epoch 3<<<<
Added 7 pseudo samples. Labeled samples size: 411

>>>>Epoch 4<<<<
Added 4 pseudo samples. Labeled samples size: 415

>>>>Epoch 5<<<<
Added 3 pseudo samples. Labeled samples size: 418

>>>>Epoch 6<<<<
Added 2 pseudo samples. Labeled samples size: 420

>>>>Epoch 7<<<<
Added 2 pseudo samples. Labeled samples size: 422

>>>>Epoch 8<<<<
Added 1 pseudo samples. Labeled samples size: 423

>>>>Epoch 9<<<<
Added 1 pseudo samples. Labeled samples size: 424


In [None]:
# Train final model
dfinal = xgb.DMatrix(X_c, label=y_c)
final_model = xgb.train({**optuna_params,
                 **default_params,}, dfinal, num_boost_round = 1500)

In [None]:
dunlabeled = xgb.DMatrix(X_test)
y_pred_proba = final_model.predict(dunlabeled)

# Convert probabilities to binary predictions
# Decrease threshold for negating False Negatives
y_pred_binary = (y_pred_proba >= 0.5).astype(int)

from sklearn.metrics import confusion_matrix
cm =confusion_matrix(y_test_np, y_pred_binary)
print(cm)

[[34  3]
 [ 3 37]]


In [None]:
print(recall_score(y_test_np, y_pred_binary))

0.925


In [None]:
final_model.save_model("data/XGBmodel.json")

In [None]:
sub = ""
body = """
"""

In [None]:
df_2 = pd.DataFrame({
    "Subject": [sub],
    "Body":[body]
})

embeddings_2 =get_embeddings(df_2, model_name)

dunlabeled_2 = xgb.DMatrix(embeddings_2)
y_pred_proba_2 = final_model.predict(dunlabeled_2)
print(y_pred_proba_2)

[0.7819592]


In [None]:
# xm= xgb.Booster()
# xm.load_model("data/XGBmodel.json")

# config = xm.save_config()
# print(config)

In [None]:
from datetime import datetime
start_date=datetime(2025,8,26)