In [1]:
import os, torch, umap
import numpy as np
import pandas as pd
import xgboost as xgb
from dotenv import load_dotenv
import matplotlib.pyplot as plt
from transformers import AutoModel, AutoTokenizer
from dags.utils.gm_main_utils import get_embeddings
from sklearn.model_selection import train_test_split
from dags.utils.gm_data_utils import decode_zip, extract_headers, decode_body

print("PyTorch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available()) 
# load_dotenv()
# # SCOPES: Gmail read-only
# SCOPES = ['https://www.googleapis.com/auth/gmail.readonly']
# token_path = os.getenv("token_path")
# credentials_path = os.getenv("credentials_path")

model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

PyTorch version: 2.8.0+cu129
CUDA available: True


In [2]:
path_1 = "data/imp_22-08-2025-03-23.json.gz"
path_2 = "data/unimp_22-08-2025-03-23.json.gz"
path_3 = "data/15-08-2025-10-41.json.gz"

decompressed_data_1 = decode_zip(path_1)
decompressed_data_2 = decode_zip(path_2)
decompressed_data_3 = decode_zip(path_3)

df_imp =pd.DataFrame(decompressed_data_1)[["Payload"]]
df_unimp =pd.DataFrame(decompressed_data_2)[["Payload"]]
df_unlb =pd.DataFrame(decompressed_data_3)[["Payload"]]

df_imp["Subject"] = df_imp["Payload"].apply(extract_headers)
df_imp["Body"] = df_imp["Payload"].apply(decode_body)
df_imp["Important"] = 1
df_imp = df_imp.drop(["Payload"], axis=1)

df_unimp["Subject"] = df_unimp["Payload"].apply(extract_headers)
df_unimp["Body"] = df_unimp["Payload"].apply(decode_body)
df_unimp["Important"] = 0
df_unimp = df_unimp.drop(["Payload"], axis=1)

df_unlb["Subject"] = df_unlb["Payload"].apply(extract_headers)
df_unlb["Body"] = df_unlb["Payload"].apply(decode_body)
df_unlb = df_unlb.drop(["Payload"], axis=1)

In [3]:
train = pd.concat([df_imp, df_unimp])
train.isnull().sum()

Subject       0
Body         53
Important     0
dtype: int64

In [4]:
train["Body"] = train["Body"].fillna(train["Subject"]*5)

In [5]:
X = get_embeddings(train, model_name)
y = torch.from_numpy(train.loc[:,"Important"].values)
#y = train.loc[:,"Important"].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, shuffle=True, random_state=42)

In [6]:
import optuna
from sklearn.metrics import recall_score

In [42]:
dtrain = xgb.DMatrix(X_train, label= y_train)
dtest = xgb.DMatrix(X_test)
def objective2(trial):
    params = {
        "max_depth": trial.suggest_int("max_depth", 3, 15),
        "eta": trial.suggest_float("eta", 0.01, 0.3),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "reg_alpha": trial.suggest_float("reg_alpha", 0, 5.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 5.0),
        "objective": "binary:logistic", 
        "eval_metric": "logloss",
        "tree_method": "hist",
        "device": "cuda",
        "random_state": 42,
    }
    

    cls = xgb.train(params, dtrain, num_boost_round = 1500)    
    y_pred_proba = cls.predict(dtest)

    # Convert probabilities to binary predictions
    # Decrease threshold for negating False Negatives
    y_pred_binary = (y_pred_proba > 0.50).astype(int)
    
    score = recall_score(y_test, y_pred_binary)
    return score

In [43]:
# Run optimization
study = optuna.create_study(direction="maximize")
study.optimize(objective2, n_trials=25)

[I 2025-08-24 18:29:31,521] A new study created in memory with name: no-name-879713b3-10ce-4fa8-a31a-1d3f899f9a9a
[I 2025-08-24 18:29:33,843] Trial 0 finished with value: 0.85 and parameters: {'max_depth': 10, 'eta': 0.2953843525095601, 'subsample': 0.9911081873116594, 'colsample_bytree': 0.56475886114318, 'gamma': 4.142897505182672, 'reg_alpha': 2.522754984354045, 'reg_lambda': 3.9680792319066267}. Best is trial 0 with value: 0.85.
[I 2025-08-24 18:29:36,675] Trial 1 finished with value: 0.85 and parameters: {'max_depth': 5, 'eta': 0.17607660681500664, 'subsample': 0.810958297859574, 'colsample_bytree': 0.7117333075391337, 'gamma': 0.1796378705140883, 'reg_alpha': 3.146550622924668, 'reg_lambda': 0.5959542185077749}. Best is trial 0 with value: 0.85.
[I 2025-08-24 18:29:38,680] Trial 2 finished with value: 0.9 and parameters: {'max_depth': 3, 'eta': 0.18954892595276174, 'subsample': 0.6464215011544552, 'colsample_bytree': 0.8140873451467547, 'gamma': 2.662248008680188, 'reg_alpha': 1.

In [23]:
# # UMAP reduction to 2D
# umap_model = umap.UMAP(n_neighbors=10, min_dist=0.3, random_state=42, n_components=2)
# emb_2d = umap_model.fit_transform(embeddings)

# # Some Overlap is expected
# #High-dimensional separation may not be visible in 2d
# #Can try diff embedding/tokenizer


# # Scatterplot
# plt.figure(figsize=(6, 4))
# scatter = plt.scatter(emb_2d[:, 0], emb_2d[:, 1], c=labels, cmap="Spectral", alpha=0.8)
# plt.xlabel("UMAP-1")
# plt.ylabel("UMAP-2")
# plt.title("Sentence Embeddings with UMAP (colored by label)")
# plt.colorbar(scatter, label="Dependent Variable (0/1)")
# plt.show()

In [25]:
sub = ""
body = """
"""

In [13]:
# df_2 = pd.DataFrame({
#     "Subject": [sub],
#     "Body":[body]
# })

# embeddings_2 =get_embeddings(df_2, model_name)

# dunlabeled_2 = xgb.DMatrix(embeddings_2)
# y_pred_proba_2 = model.predict(dunlabeled_2)
# print(y_pred_proba_2)

In [21]:
unlb_embeddings = get_embeddings(df_unlb, model_name)

In [56]:
epochs = 10
thresh = 0.95


optuna_params = study.best_trial.params

X_c, y_c = X_train.clone(), y_train.clone()
X_unlb = unlb_embeddings.clone()

In [57]:
for epoch in range(1,epochs):
    print(f"\n>>>>Epoch {epoch}<<<<")

    dtrain = xgb.DMatrix(X_c, label=y_c)
    model = xgb.train(optuna_params, dtrain, num_boost_round=1500)

    if len(X_unlb) == 0:
        print("No unlabeled data left.")
        break

    d_unlb = xgb.DMatrix(X_unlb)
    y_proba = model.predict(d_unlb)
    y_proba = torch.from_numpy(y_proba)

    mask = (y_proba > thresh) | (y_proba < 1 - thresh) # a tensor of True, False... of shape y_proba
    
    if mask.sum() == 0:                                # True==1, False==0
        print("No high-confidence samples found.")
        break

    X_pseudo = X_unlb[mask]
    y_pseudo = (y_proba[mask] > 0.5).int() # torch.int

    # move high confidence rows from unlabeled to training set
    X_c = torch.vstack([X_c, X_pseudo]) # stacks vertically. similar to 1d append but for high d's. 
    y_c = torch.cat([y_c, y_pseudo], axis =0)

    # Remove high confidence rows
    X_unlb = X_unlb[~mask]

    print(f"Added {len(X_pseudo)} pseudo samples. Labeled samples size: {len(y_c)}")


>>>>Epoch 1<<<<
Added 60 pseudo samples. Labeled samples size: 291

>>>>Epoch 2<<<<
Added 3 pseudo samples. Labeled samples size: 294

>>>>Epoch 3<<<<
Added 36 pseudo samples. Labeled samples size: 330

>>>>Epoch 4<<<<
No high-confidence samples found.


In [58]:
# Train final model
dfinal = xgb.DMatrix(X_c, label=y_c)
final_model = xgb.train(optuna_params, dfinal, num_boost_round=1500)

In [59]:
dunlabeled = xgb.DMatrix(X_test)
y_pred_proba = final_model.predict(dunlabeled)

# Convert probabilities to binary predictions
# Decrease threshold for negating False Negatives
y_pred_binary = (y_pred_proba >= 0.5).astype(int)

from sklearn.metrics import confusion_matrix
cm =confusion_matrix(y_test, y_pred_binary)
print(cm)

[[32  5]
 [ 4 36]]


In [60]:
print(recall_score(y_test, y_pred_binary))

0.9


In [68]:
final_model.save_model("data/XGBmodel.json")