In [1]:
import os, torch, umap
import numpy as np
import pandas as pd
from dotenv import load_dotenv
import matplotlib.pyplot as plt
from dags.utils import decode_zip, extract_headers, decode_body
from transformers import AutoModel, AutoTokenizer

# Environment report, so the recorded outputs can be matched to a setup.
print("PyTorch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())

# --- Gmail API credential setup: currently disabled, kept for reference ---
# load_dotenv()
# # SCOPES: Gmail read-only
# SCOPES = ['https://www.googleapis.com/auth/gmail.readonly']
# token_path = os.getenv("token_path")
# credentials_path = os.getenv("credentials_path")

# One checkpoint name drives both loads, so the tokenizer and the encoder
# always come from the same pretrained model.
model_name = "distilbert-base-uncased"
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_name),
    AutoModel.from_pretrained(model_name),
)

PyTorch version: 2.8.0+cu129
CUDA available: True


In [2]:
# Show which concrete tokenizer class AutoTokenizer resolved to for `model_name`.
type(tokenizer)

transformers.models.distilbert.tokenization_distilbert_fast.DistilBertTokenizerFast

In [3]:
def _build_email_frame(records, important):
    """Turn decoded email records into a Subject / Body / Important frame.

    Both datasets go through this single helper so they are guaranteed to be
    processed identically.

    Args:
        records: Records as returned by ``decode_zip``. Each record must
            provide the ``Id`` and ``Payload`` fields used below.
        important: Class label written to every row (1 for the important
            set, 0 for the unimportant set).

    Returns:
        A new DataFrame with ``Id`` and ``Payload`` dropped and the
        ``Subject``, ``Body`` and ``Important`` columns appended.
    """
    frame = pd.DataFrame(records)
    # extract_headers is applied directly so the column holds its return value
    # as-is; wrapping the result in pd.Series would make .apply build a
    # DataFrame instead of a Series.
    frame["Subject"] = frame["Payload"].apply(extract_headers)
    frame["Body"] = frame["Payload"].apply(decode_body)
    frame["Important"] = important
    return frame.drop(["Id", "Payload"], axis=1)


path_1 = "data/imp_22-08-2025-03-23.json.gz"
path_2 = "data/unimp_22-08-2025-03-23.json.gz"
decompressed_data_1 = decode_zip(path_1)
decompressed_data_2 = decode_zip(path_2)

df_imp = _build_email_frame(decompressed_data_1, important=1)
df_unimp = _build_email_frame(decompressed_data_2, important=0)
df_unimp.head(2)

Unnamed: 0,Subject,Body,Important
0,artificial intelligence engineer the reliable ...,the reliable jobs artificial intelligence engi...,0
1,get 7 days of unrestricted learning,hey sathwik have you ever felt that little spa...,0


In [4]:
# Peek at the first two rows of the important-email frame.
df_imp.iloc[:2]

Unnamed: 0,Subject,Body,Important
0,important scheduled maintenance affecting serv...,hello sathwik please note there will be essent...,1
1,urgent housing application expiry warning,hi sathwik your housing application will expir...,1


In [5]:
# Stack both classes into one training frame. Each source frame carries its
# own 0-based index, so ignore_index=True is needed to give every row a unique
# label; otherwise label-based lookups such as train.loc[0] would match one
# row from each class.
train = pd.concat([df_imp, df_unimp], ignore_index=True)

# Null count per column, to see what needs filling before tokenisation.
train.isnull().sum()

Subject       0
Body         53
Important     0
dtype: int64

In [6]:
# Emails with no body fall back to their subject text, so every row still has
# something to tokenise.
train = train.assign(Body=train["Body"].fillna(train["Subject"]))

In [7]:
""" Tokenizer takes input list[str], list[list[str]], not just str!!! """

# Way Faster and then doing .apply ele wise
sub_list = train.loc[:, "Subject"].astype(str).tolist() # turns object to string and returns list of strs
body_list = train.loc[:, "Body"].astype(str).tolist()

sub_tokenized = tokenizer(sub_list, truncation=True, max_length=50, padding=True, return_tensors="pt")
body_tokenized = tokenizer(body_list, truncation=True, max_length=512, padding=True, return_tensors="pt")

# LOL forgot about this
if torch.cuda.is_available():
    model.cuda()
    sub_tokenized = {k: v.cuda() for k, v in sub_tokenized.items()}
    body_tokenized = {k: v.cuda() for k, v in body_tokenized.items()}


In [8]:
# Inference only: no_grad() skips gradient tracking for both forward passes.
with torch.no_grad():
    sub_outputs = model(**sub_tokenized)
    body_outputs = model(**body_tokenized)

# Keep the final hidden state at sequence position 0 (the CLS slot) as the
# per-email vector for the subject and for the body.
sub_cls_embeddings_t = sub_outputs.last_hidden_state[:, 0]
body_cls_embeddings_t = body_outputs.last_hidden_state[:, 0]

# Join subject and body vectors along the feature axis, then bring the result
# back to host memory as a NumPy array.
embd = (
    torch.cat([sub_cls_embeddings_t, body_cls_embeddings_t], dim=1)
    .detach()
    .cpu()
    .numpy()
)

In [9]:
# Disabled exploratory cell: 2-D UMAP projection of the embeddings, coloured by label.
# NOTE(review): as written this would not run if re-enabled here. It reads
# `embeddings`, which is never defined (the embedding matrix is called `embd`),
# and `labels`, which is only assigned in a later cell.

# # UMAP reduction to 2D
# umap_model = umap.UMAP(n_neighbors=10, min_dist=0.3, random_state=42, n_components=2)
# emb_2d = umap_model.fit_transform(embeddings)

# # Some overlap between the classes is expected
# # High-dimensional separation may not be visible in 2D
# # Can try a different embedding model / tokenizer


# # Scatterplot
# plt.figure(figsize=(6, 4))
# scatter = plt.scatter(emb_2d[:, 0], emb_2d[:, 1], c=labels, cmap="Spectral", alpha=0.8)
# plt.xlabel("UMAP-1")
# plt.ylabel("UMAP-2")
# plt.title("Sentence Embeddings with UMAP (colored by label)")
# plt.colorbar(scatter, label="Dependent Variable (0/1)")
# plt.show()

In [10]:
# Target vector: the 0/1 importance flag of every email.
labels = train["Important"].to_numpy()

# Feature matrix and target under the conventional X / y names.
X, y = embd, labels

In [11]:
import xgboost as xgb
from sklearn.model_selection import train_test_split

# Hold out a quarter of the rows for evaluation; the fixed random_state keeps
# the shuffled split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
    shuffle=True,
)

# XGBoost's native API consumes its own DMatrix container.
dtrain = xgb.DMatrix(data=X_train, label=y_train)

params = {
    "objective": "binary:logistic",  # binary classifier
    "eval_metric": "logloss",
    "eta": 0.1,
    "max_depth": 15,
}

# NOTE(review): this rebinds `model`, which until now held the DistilBERT
# encoder. After this cell runs, the embedding cell cannot be re-run without
# loading the encoder again; a separate name for the booster would avoid that.
model = xgb.train(params=params, dtrain=dtrain, num_boost_round=200)

In [12]:
# Wrap the held-out features without labels and score them with the trained booster.
dunlabeled = xgb.DMatrix(data=X_test)
y_pred_proba = model.predict(data=dunlabeled)

In [13]:
# Turn scores into hard 0/1 predictions. The cut-off is deliberately lowered
# to 0.40 so fewer important emails are missed (fewer false negatives).
y_pred_binary = np.greater_equal(y_pred_proba, 0.40).astype(int)

In [14]:
from sklearn.metrics import confusion_matrix

# Compare the thresholded predictions with the held-out labels.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred_binary)

In [15]:
# Confusion matrix on the held-out split. By scikit-learn's convention rows are
# true labels and columns are predictions, in label order 0 then 1.
print(cm)

[[32  5]
 [ 2 38]]


In [16]:
"""[[34  3]
 [ 5 35]]"""

'[[34  3]\n [ 5 35]]'

In [17]:
# Sample email for a manual check. Presumably it is scored further down, but
# the rest of the cell is not visible here (confirm).
# The subject keeps its typographic quotes; the body contains only whitespace.
sub = "“artificial intelligence engineer”: Cloud 9 Solutions, LLC - Artificial Intelligence Engineer"
body = """ 

"""