In [1]:
import os, torch
import pandas as pd
from dotenv import load_dotenv
from dags.utils import decode_zip, extract_headers, decode_body
from transformers import AutoModel, AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments

print("PyTorch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available()) 
# load_dotenv()
# # SCOPES: Gmail read-only
# SCOPES = ['https://www.googleapis.com/auth/gmail.readonly']
# token_path = os.getenv("token_path")
# credentials_path = os.getenv("credentials_path")

model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels = 2)
model_2 = AutoModel.from_pretrained(model_name)

PyTorch version: 2.8.0+cu129
CUDA available: True


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
path_1 = "data/imp_22-08-2025-03-23.json.gz"
path_2 = "data/unimp_22-08-2025-03-23.json.gz"
decompressed_data_1 = decode_zip(path_1)
decompressed_data_2 = decode_zip(path_2)

df_imp =pd.DataFrame(decompressed_data_1)
df_unimp =pd.DataFrame(decompressed_data_2)

df_imp["Subject"] = df_imp["Payload"].apply(lambda x: pd.Series(extract_headers(x)))
df_imp["Body"] = df_imp["Payload"].apply(decode_body)
df_imp["Important"] = 1
df_imp = df_imp.drop(["Id", "Payload"], axis=1)

df_unimp["Subject"] = df_unimp["Payload"].apply(extract_headers)
df_unimp["Body"] = df_unimp["Payload"].apply(decode_body)
df_unimp["Important"] = 0
df_unimp = df_unimp.drop(["Id", "Payload"], axis=1)
df_unimp.head(2)

Unnamed: 0,Subject,Body,Important
0,artificial intelligence engineer the reliable ...,the reliable jobs artificial intelligence engi...,0
1,get 7 days of unrestricted learning,hey sathwik have you ever felt that little spa...,0


In [3]:
df_imp.head(2)

Unnamed: 0,Subject,Body,Important
0,important scheduled maintenance affecting serv...,hello sathwik please note there will be essent...,1
1,urgent housing application expiry warning,hi sathwik your housing application will expir...,1


In [4]:
train = pd.concat([df_imp, df_unimp])
train.isnull().sum()

Subject       0
Body         53
Important     0
dtype: int64

In [23]:
train["Body"] = train["Body"].fillna(train["Subject"])

train["subject_tokenized"] = train["Subject"].apply(lambda x: tokenizer(x, truncation=True, max_length=50, padding=True, return_tensors="pt"))
train["body_tokenized"] = train["Body"].apply(lambda x: tokenizer(x, truncation=True, max_length=512, padding=True, return_tensors="pt"))

In [24]:
train.head(2)

Unnamed: 0,Subject,Body,Important,subject_tokenized,body_tokenized
0,important scheduled maintenance affecting serv...,hello sathwik please note there will be essent...,1,"[input_ids, attention_mask]","[input_ids, attention_mask]"
1,urgent housing application expiry warning,hi sathwik your housing application will expir...,1,"[input_ids, attention_mask]","[input_ids, attention_mask]"


In [25]:
body_tokens = train.loc[:, "body_tokenized"].values
subject_tokens = train.loc[:, "subject_tokenized"].values
labels = train.loc[:,"Important"].values 

In [42]:
vv = train.loc[:, ["Important", "subject_tokenized"]].values

In [26]:
import umap
import numpy as np
import matplotlib.pyplot as plt
from dags.utils import get_embeddings_from_token

In [27]:
body_embd = get_embeddings_from_token(model_name, body_tokens)

In [12]:
subject_embd = get_embeddings_from_token(model_name, subject_tokens)

In [15]:
# # UMAP reduction to 2D
# umap_model = umap.UMAP(n_neighbors=10, min_dist=0.3, random_state=42, n_components=2)
# emb_2d = umap_model.fit_transform(embeddings)

# # Some Overlap is expected
# #High-dimensional separation may not be visible in 2d
# #Can try diff embedding/tokenizer


# # Scatterplot
# plt.figure(figsize=(6, 4))
# scatter = plt.scatter(emb_2d[:, 0], emb_2d[:, 1], c=labels, cmap="Spectral", alpha=0.8)
# plt.xlabel("UMAP-1")
# plt.ylabel("UMAP-2")
# plt.title("Sentence Embeddings with UMAP (colored by label)")
# plt.colorbar(scatter, label="Dependent Variable (0/1)")
# plt.show()

In [44]:
X = np.concatenate((body_embd, subject_embd,),axis = 1)
y = labels

In [47]:
X.shape

(308, 1536)

In [71]:
from sklearn.model_selection import train_test_split
import xgboost as xgb
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, shuffle=True, random_state=42)


dtrain = xgb.DMatrix(X_train, label=y_train)
params = {
    "objective": "binary:logistic", 
    "eval_metric": "logloss",
    "eta": 0.1,
    "max_depth": 6
}
model = xgb.train(params, dtrain, num_boost_round=200)

In [72]:
dunlabeled = xgb.DMatrix(X_test)
y_pred_proba = model.predict(dunlabeled)

In [73]:
# Convert probabilities to binary predictions
y_pred_binary = (y_pred_proba >= 0.45).astype(int)

In [74]:
from sklearn.metrics import confusion_matrix
cm =confusion_matrix(y_test, y_pred_binary)

In [75]:
print(cm)

[[33  4]
 [ 5 35]]


In [85]:
sub = "“artificial intelligence engineer”: Cloud 9 Solutions, LLC - Artificial Intelligence Engineer"
body = """Your job listings for 22 August 2025
Data Scientist
	Remote, India
	

ZENOFFI E-LEARNING LABB TRAINING SOLUTIONS PRIVATE LIMITED

Data Science Intern

Remote

₹25K (Employer Est.)
	

Easy Apply
	

Placement Hub

AI/ML Engineer

Remote

₹4L - ₹17L (Employer Est.)
	

Easy Apply
	

Norstella
	

3.6 ★

Associate Data Scientist

Remote

Provident Fund	

Engineering	

Full-time	

Remote
	
Hubooze

Data Science Intern

Remote	

Easy Apply


VWorker Solutions India Pvt Ltd

Data Scientist

Remote

₹30K (Employer Est.)
	
Easy Apply

Waayslive Solutions
	
2. ★

ML Engineer

Remote

₹14L (Employer Est.)

Easy Apply
TensorFlow	
Azure	
Kubernetes	
PyTorch
	
Crowd4Test
computer vision& Gen AI Intern
Remote
	
Easy Apply	
Group 8a
AI/ML Developer
Remote
	
Easy Apply	
AI Academia

AI/ML Mentor"""

In [86]:
from dags.utils import preprocess_email_body
body = preprocess_email_body(body)

In [90]:
body_t = tokenizer(body, truncation=True, max_length=50, padding=True, return_tensors="pt")
sub_t = tokenizer(sub, truncation=True, max_length=50, padding=True, return_tensors="pt")

In [92]:
body_t

{'input_ids': tensor([[  101,  2115,  3105, 26213,  2005,  2570,  2257, 16798,  2629,  2951,
          7155,  6556,  2634, 16729,  7245,  2072,  3449, 14644,  5582,  6845,
          2497,  2731,  7300,  2797,  3132,  2951,  2671, 25204,  6556,  1576,
         17788,  2243, 11194,  9765,  3733,  6611, 11073,  9594,  6614,  2140,
          3992,  6556,  1576,  2549,  2140,  1576, 16576,  2140, 11194,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1]])}

In [93]:
b_e = get_embeddings_from_token(model_name, [body_t])
s_e = get_embeddings_from_token(model_name, [sub_t])

In [95]:
xx = xgb.DMatrix(np.concatenate((b_e, s_e,),axis = 1))
y_pred_proba = model.predict(xx)

In [96]:
y_pred_proba

array([0.21637449], dtype=float32)