In [1]:
# --- 1. Import Libraries ---
import pandas as pd
import ast
import re
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
import joblib

# --- 2. Load Dataset & Clean Titles ---
# Read the raw problem table from the working directory; the steps below use
# its "title", "similar_questions" and "problem_description" columns.
df = pd.read_csv("leetcode.csv")
def clean_title(title):
    """Return ``title`` without its leading "<number>. " prefix.

    Missing values (anything ``pd.isna`` reports as null) become an empty
    string. Every other value is converted with ``str`` first, and
    surrounding whitespace is trimmed after the prefix is removed.
    """
    text = "" if pd.isna(title) else str(title)
    # Only a run of digits followed by a dot at the very start is removed, so
    # titles that merely begin with a digit (e.g. "3Sum") are left intact.
    return re.sub(r"^\d+\.\s*", "", text).strip()
# Add a "clean_title" column: each raw title with its leading "<number>. "
# prefix stripped by clean_title (missing titles become "").
df["clean_title"] = df["title"].apply(clean_title)

def clean_similar_questions(x):
    """Parse a stringified list of similar-question titles into a clean list.

    Args:
        x: A cell from the ``similar_questions`` column, normally a string
            holding a Python list literal such as ``"['1. Two Sum', '3Sum']"``.
            An already-parsed list, tuple or set is accepted as well.

    Returns:
        A list of titles with surrounding quotes/whitespace and any leading
        "<number>. " prefix removed. Missing, placeholder, unparseable or
        non-collection input yields ``[]``. Entries that are not strings, or
        that are empty once cleaned, are dropped.
    """
    if isinstance(x, (list, tuple, set)):
        items = x
    else:
        # NaN/None/other scalars and the usual "empty" placeholders carry no titles.
        if not isinstance(x, str) or x.strip() in ('', '[]', 'None', 'nan'):
            return []
        try:
            items = ast.literal_eval(x)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # These are the errors literal_eval raises for malformed input.
            return []
        # A literal that is not a collection (e.g. a bare quoted string) must
        # not be iterated: that would yield one "title" per character.
        if not isinstance(items, (list, tuple, set)):
            return []

    cleaned = []
    for item in items:
        if not isinstance(item, str):
            continue
        title = re.sub(r"^\d+\.\s*", "", item.strip(" '\"\n\t")).strip()
        # Filter after cleaning so entries like "1. " do not survive as "".
        if title:
            cleaned.append(title)
    return cleaned
# Parse every raw "similar_questions" cell into a list of bare titles.
df["clean_similar_questions"] = df["similar_questions"].apply(clean_similar_questions)

# Show the raw and cleaned columns side by side for a quick sanity check.
preview_columns = ["title", "clean_title", "similar_questions", "clean_similar_questions"]
print("Initial Shape:", df.shape)
print("\nSample cleaned data:")
print(df[preview_columns].head(10))

# Remove duplicate or empty titles
# Keep only rows with a non-empty cleaned title, one row per distinct title.
has_title = df["clean_title"] != ""
df = df[has_title].drop_duplicates(subset=["clean_title"]).reset_index(drop=True)

# Report similar-question titles that do not match any remaining row.
all_titles = set(df["clean_title"])
missing = {
    q
    for similar in df["clean_similar_questions"]
    for q in similar
    if q not in all_titles
}
print(f"\nNumber of missing similar questions: {len(missing)}")
if missing:
    print("Examples of missing titles:", list(missing)[:10])

# --- 3. Save Cleaned Dataset ---
# Write the de-duplicated table to disk without the positional index column.
# NOTE(review): the list-valued "clean_similar_questions" column is presumably
# written as text; confirm it is parsed back into lists wherever this file is
# reloaded.
df.to_csv("leetcode_cleaned.csv", index=False)
print("\n✅ Cleaned dataset saved as 'leetcode_cleaned.csv'")

# --- 4. Vectorization & Pair Preparation ---
# Reload the cleaned table from disk so this section works from the saved file.
df = pd.read_csv("leetcode_cleaned.csv")

# The CSV round-trip turns each list into its string form, e.g.
# "['Two Sum', '3Sum']". Parse the cells back into real lists; iterating the
# raw string would otherwise yield one "similar title" per character.
df["clean_similar_questions"] = [
    ast.literal_eval(cell) if isinstance(cell, str) else []
    for cell in df["clean_similar_questions"]
]

# Text the TF-IDF vocabulary is learned from: title plus problem statement.
df["full_text"] = df["clean_title"].fillna('') + " " + df["problem_description"].fillna('')

# Positive pairs (label 1): every problem with each of its listed similar titles.
pairs = []
for main, similar_titles in zip(df["clean_title"], df["clean_similar_questions"]):
    for sim in similar_titles:
        pairs.append((main, sim, 1))

# Negative pairs (label 0): one random pair per positive pair, so the two
# classes are balanced. The two indices are forced to differ, because a title
# paired with itself always has cosine similarity 1 and could never satisfy a
# "dissimilar" label.
# NOTE: a random pair is not checked against the positive pairs, so a few
# negatives may coincide with a genuinely similar pair.
titles = df["clean_title"].tolist()
num_positive = len(pairs)
num_titles = len(titles)
if num_titles > 1:
    for _ in range(num_positive):
        i = np.random.randint(num_titles)
        # Draw from the remaining num_titles - 1 slots and step over i.
        j = np.random.randint(num_titles - 1)
        if j >= i:
            j += 1
        pairs.append((titles[i], titles[j], 0))

pairs_df = pd.DataFrame(pairs, columns=["text1", "text2", "label"])
print("Total pairs:", len(pairs_df))

# Fit TF-IDF on the full texts, keeping at most 5000 terms.
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
all_texts = df["full_text"].tolist()
vectorizer.fit(all_texts)
def text_to_vec(text):
    """Return the TF-IDF representation of ``text`` as a float32 tensor.

    The module-level ``vectorizer`` transforms ``text`` as a one-document
    batch, and the sparse result is densified before conversion.
    """
    sparse_row = vectorizer.transform([text])
    dense_row = sparse_row.toarray()
    return torch.tensor(dense_row, dtype=torch.float32)

# --- 5. Embedding Model (class definition) ---
class TextEmbedder(nn.Module):
    """Feed-forward encoder: input -> 512 hidden units (ReLU) -> embedding.

    Args:
        input_dim: Width of the input feature vector.
        embed_dim: Width of the produced embedding (default 128).
    """

    def __init__(self, input_dim, embed_dim=128):
        super().__init__()
        hidden = nn.Linear(input_dim, 512)
        activation = nn.ReLU()
        projection = nn.Linear(512, embed_dim)
        # Stored under ``self.model`` as a Sequential; saved state_dict keys
        # ("model.0.*", "model.2.*") depend on this name and layer order.
        self.model = nn.Sequential(hidden, activation, projection)

    def forward(self, x):
        """Map a batch of input vectors to their embeddings."""
        return self.model(x)
embed_dim = 128
# Size the input layer from the fitted vocabulary instead of assuming 5000.
# max_features is only an upper bound, so a corpus with fewer distinct terms
# produces narrower vectors than a hard-coded 5000-wide first layer accepts.
input_dim = len(vectorizer.vocabulary_)
model = TextEmbedder(input_dim=input_dim, embed_dim=embed_dim)

# --- 6. Contrastive Loss Function ---
def cosine_loss(a, b, label):
    """Mean cosine-based pair loss.

    For each pair the penalty is ``1 - cos(a, b)`` when ``label`` is 1 and
    ``1 + cos(a, b)`` when ``label`` is 0. The per-pair penalties are
    averaged into a scalar.
    """
    similarity = nn.functional.cosine_similarity(a, b)
    positive_term = label * (1 - similarity)
    negative_term = (1 - label) * (1 + similarity)
    return (positive_term + negative_term).mean()

# --- 7. Train/Test Split ---
# Hold out 10% of the labelled pairs for validation; the fixed seed keeps the
# split reproducible.
train_df, val_df = train_test_split(pairs_df, test_size=0.1, random_state=42)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
epochs = 10

# Each epoch trains on a fresh random subset of the training pairs. Cap the
# subset at the number of available pairs: DataFrame.sample raises when asked
# for more rows than exist (sampling is without replacement by default).
samples_per_epoch = min(2000, len(train_df))

# --- 8. Training Loop ---
for epoch in range(epochs):
    model.train()
    total_loss = 0.0
    epoch_rows = train_df.sample(samples_per_epoch)
    # One optimizer step per pair (batch size 1).
    for row in epoch_rows.itertuples(index=False):
        t1 = text_to_vec(row.text1)
        t2 = text_to_vec(row.text2)
        label = torch.tensor(row.label, dtype=torch.float32)
        emb1 = model(t1)
        emb2 = model(t2)
        loss = cosine_loss(emb1, emb2, label)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Average over the pairs actually trained on this epoch, not the whole
    # training set; max() guards against an empty training set.
    mean_loss = total_loss / max(samples_per_epoch, 1)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {mean_loss:.4f}")

# --- 9. Validation ---
model.eval()
sims, labels = [], []
# Inference only: no_grad avoids recording an autograd graph for every
# validation pair, which saves memory and time without changing the results.
with torch.no_grad():
    for _, row in val_df.iterrows():
        t1 = text_to_vec(row["text1"])
        t2 = text_to_vec(row["text2"])
        emb1 = model(t1)
        emb2 = model(t2)
        sims.append(nn.functional.cosine_similarity(emb1, emb2).item())
        labels.append(row["label"])

# A pair is predicted "similar" when its cosine similarity exceeds 0.5.
preds = [1 if s > 0.5 else 0 for s in sims]
accuracy = np.mean(np.array(preds) == np.array(labels))
print(f"\n✅ Validation Accuracy: {accuracy*100:.2f}%")

# --- 10. SAVE: Use only state_dict for deployment! ---
# Persist only the learned parameters (the state_dict), not the module object.
# Code that loads this file must construct a TextEmbedder with the same layer
# sizes before calling load_state_dict.
trained_weights = model.state_dict()
torch.save(trained_weights, "leetcode_model.pt")

# The fitted vectorizer is stored alongside the weights.
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")
print("✅ Model weights and vectorizer saved successfully!")

# --- 11. Deployment/Test Load (always use this in deployment) ---
class TextEmbedder(nn.Module):
    """Encoder definition used when loading saved weights.

    Architecture: Linear(input_dim, 512) -> ReLU -> Linear(512, embed_dim),
    held in a Sequential under ``self.model``. ``load_state_dict`` matches
    parameters by those names ("model.0.*", "model.2.*").
    """

    def __init__(self, input_dim, embed_dim=128):
        super().__init__()
        layers = [
            nn.Linear(input_dim, 512),
            nn.ReLU(),
            nn.Linear(512, embed_dim),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Map a batch of input vectors to their embeddings."""
        return self.model(x)

# Restore the fitted TF-IDF vectorizer saved during training.
vectorizer = joblib.load("tfidf_vectorizer.pkl")

# Transform a dummy document: the number of output columns is the width the
# network's first layer has to accept.
probe = vectorizer.transform(['example'])
input_dim = probe.shape[1]

# Rebuild the architecture, then load the saved parameters onto the CPU.
model = TextEmbedder(input_dim=input_dim, embed_dim=128)
saved_state = torch.load("leetcode_model.pt", map_location=torch.device('cpu'))
model.load_state_dict(saved_state)
model.eval()


Initial Shape: (3000, 20)

Sample cleaned data:
                                               title  \
0                                         1. Two Sum   
1                                 2. Add Two Numbers   
2  3. Longest Substring Without Repeating Characters   
3                     4. Median of Two Sorted Arrays   
4                   5. Longest Palindromic Substring   
5                               6. Zigzag Conversion   
6                                 7. Reverse Integer   
7                        8. String to Integer (atoi)   
8                               9. Palindrome Number   
9                    10. Regular Expression Matching   

                                      clean_title  \
0                                         Two Sum   
1                                 Add Two Numbers   
2  Longest Substring Without Repeating Characters   
3                     Median of Two Sorted Arrays   
4                   Longest Palindromic Substring   
5                

TextEmbedder(
  (model): Sequential(
    (0): Linear(in_features=5000, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=128, bias=True)
  )
)