In [2]:
# --- 1. Import Libraries ---
import pandas as pd
import ast
import re

# --- 2. Load Dataset ---
# Replace 'leetcode_data.csv' with your file path
df = pd.read_csv("leetcode.csv")

# --- 3. Display Basic Info ---
print("Initial Shape:", df.shape)
df.head()

# --- 4. Clean the 'title' column ---
# Remove any numbering or prefixes like "1. Two Sum" → "Two Sum"
def clean_title(title):
    if pd.isna(title):
        return ""
    # Remove leading numbers, dots, spaces
    cleaned = re.sub(r"^\d+\.\s*", "", str(title)).strip()
    return cleaned

df["clean_title"] = df["title"].apply(clean_title)

# --- 5. Clean the 'similar_questions' column ---
def clean_similar_questions(x):
    if pd.isna(x) or x in ['[]', 'None', '', 'nan']:
        return []
    try:
        # Convert string that looks like a list into a real Python list
        lst = ast.literal_eval(x)
        # Remove extra quotes and spaces from each element
        lst = [s.strip(" '\"\n\t") for s in lst if isinstance(s, str) and s.strip()]
        # Clean titles similarly (remove numbering if they exist)
        lst = [re.sub(r"^\d+\.\s*", "", s).strip() for s in lst]
        return lst
    except Exception:
        return []

df["clean_similar_questions"] = df["similar_questions"].apply(clean_similar_questions)

# --- 6. Verify Cleaning ---
print("\nSample cleaned data:")
print(df[["title", "clean_title", "similar_questions", "clean_similar_questions"]].head(10))

# --- 7. (Optional) Remove Duplicate or Empty Titles ---
df = df[df["clean_title"] != ""].drop_duplicates(subset=["clean_title"]).reset_index(drop=True)

# --- 8. (Optional) Check for unmatched titles ---
# Helps you see which 'similar_questions' don't exist in the main title list
all_titles = set(df["clean_title"])
missing = set(q for lst in df["clean_similar_questions"] for q in lst if q not in all_titles)

print(f"\nNumber of missing similar questions: {len(missing)}")
if len(missing) > 0:
    print("Examples of missing titles:", list(missing)[:10])

# --- 9. Save Cleaned Dataset ---
df.to_csv("leetcode_cleaned.csv", index=False)
print("\n✅ Cleaned dataset saved as 'leetcode_cleaned.csv'")


Initial Shape: (3000, 18)

Sample cleaned data:
                                               title  \
0                                         1. Two Sum   
1                                 2. Add Two Numbers   
2  3. Longest Substring Without Repeating Characters   
3                     4. Median of Two Sorted Arrays   
4                   5. Longest Palindromic Substring   
5                               6. Zigzag Conversion   
6                                 7. Reverse Integer   
7                        8. String to Integer (atoi)   
8                               9. Palindrome Number   
9                    10. Regular Expression Matching   

                                      clean_title  \
0                                         Two Sum   
1                                 Add Two Numbers   
2  Longest Substring Without Repeating Characters   
3                     Median of Two Sorted Arrays   
4                   Longest Palindromic Substring   
5                

In [3]:
# --- 1. Import libraries ---
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity

# --- 2. Load cleaned dataset ---
df = pd.read_csv("leetcode_cleaned.csv")

# Combine title + problem_description for richer text
df["full_text"] = df["clean_title"].fillna('') + " " + df["problem_description"].fillna('')

# --- 3. Prepare Positive Pairs (duplicates / similar questions) ---
pairs = []
for _, row in df.iterrows():
    main = row["clean_title"]
    for sim in row["clean_similar_questions"]:
        pairs.append((main, sim, 1))  # Label 1 = duplicate/similar

# Negative sampling: random non-duplicates
titles = df["clean_title"].tolist()
np.random.shuffle(titles)
for i in range(len(pairs)):
    a = titles[np.random.randint(len(titles))]
    b = titles[np.random.randint(len(titles))]
    pairs.append((a, b, 0))  # Label 0 = not similar

pairs_df = pd.DataFrame(pairs, columns=["text1", "text2", "label"])
print("Total pairs:", len(pairs_df))

# --- 4. TF-IDF vectorization (build vocabulary from scratch) ---
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
all_texts = df["full_text"].tolist()
vectorizer.fit(all_texts)

def text_to_vec(text):
    return torch.tensor(vectorizer.transform([text]).toarray(), dtype=torch.float32)

# --- 5. Define a simple neural embedding model ---
class TextEmbedder(nn.Module):
    def __init__(self, input_dim, embed_dim=128):
        super(TextEmbedder, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.ReLU(),
            nn.Linear(512, embed_dim)
        )
    
    def forward(self, x):
        return self.model(x)

embed_dim = 128
model = TextEmbedder(input_dim=5000, embed_dim=embed_dim)

# --- 6. Contrastive loss function ---
def cosine_loss(a, b, label):
    cos = nn.functional.cosine_similarity(a, b)
    loss = torch.mean((1 - cos) * label + (1 + cos) * (1 - label))
    return loss

# --- 7. Train/test split ---
train_df, val_df = train_test_split(pairs_df, test_size=0.1, random_state=42)

optimizer = optim.Adam(model.parameters(), lr=1e-3)
epochs = 10

# --- 8. Training loop ---
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for _, row in train_df.sample(2000).iterrows():  # sample subset for speed
        t1 = text_to_vec(row["text1"])
        t2 = text_to_vec(row["text2"])
        label = torch.tensor(row["label"], dtype=torch.float32)

        emb1 = model(t1)
        emb2 = model(t2)

        loss = cosine_loss(emb1, emb2, label)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_df):.4f}")

# --- 9. Evaluate on validation set ---
model.eval()
sims, labels = [], []
for _, row in val_df.iterrows():
    t1 = text_to_vec(row["text1"])
    t2 = text_to_vec(row["text2"])
    label = row["label"]

    emb1 = model(t1)
    emb2 = model(t2)

    sim = nn.functional.cosine_similarity(emb1, emb2).item()
    sims.append(sim)
    labels.append(label)

# Compute basic accuracy (using threshold)
preds = [1 if s > 0.5 else 0 for s in sims]
accuracy = np.mean(np.array(preds) == np.array(labels))
print(f"\n✅ Validation Accuracy: {accuracy*100:.2f}%")


Total pairs: 210562
Epoch 1/10, Loss: 0.0094
Epoch 2/10, Loss: 0.0085
Epoch 3/10, Loss: 0.0082
Epoch 4/10, Loss: 0.0076
Epoch 5/10, Loss: 0.0076
Epoch 6/10, Loss: 0.0078
Epoch 7/10, Loss: 0.0082
Epoch 8/10, Loss: 0.0074
Epoch 9/10, Loss: 0.0075
Epoch 10/10, Loss: 0.0093

✅ Validation Accuracy: 56.44%


In [4]:
import torch
import joblib

# Save your trained PyTorch model
torch.save(model, "leetcode_model.pt")

# Save your TF-IDF vectorizer (if you used one)
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")

print("✅ Model and vectorizer saved successfully!")


✅ Model and vectorizer saved successfully!


In [2]:
import torch
import torch.nn as nn

class TextEmbedder(nn.Module):
    def __init__(self, input_dim, embed_dim=128):
        super(TextEmbedder, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.ReLU(),
            nn.Linear(512, embed_dim)
        )
    def forward(self, x):
        return self.model(x)

# Load the vectorizer, get input_dim from it
import joblib
vectorizer = joblib.load("tfidf_vectorizer.pkl")
input_dim = vectorizer.transform(['example']).shape[1]

# Then create the model instance and LOAD STATE DICT!
model = TextEmbedder(input_dim=input_dim, embed_dim=128)
model.load_state_dict(torch.load("leetcode_model.pt", map_location=torch.device('cpu')))
model.eval()


UnpicklingError: Weights only load failed. This file can still be loaded, to do so you have two options, [1mdo those steps only if you trust the source of the checkpoint[0m. 
	(1) In PyTorch 2.6, we changed the default value of the `weights_only` argument in `torch.load` from `False` to `True`. Re-running `torch.load` with `weights_only` set to `False` will likely succeed, but it can result in arbitrary code execution. Do it only if you got the file from a trusted source.
	(2) Alternatively, to load with `weights_only=True` please check the recommended steps in the following error message.
	WeightsUnpickler error: Unsupported global: GLOBAL __main__.TextEmbedder was not an allowed global by default. Please use `torch.serialization.add_safe_globals([__main__.TextEmbedder])` or the `torch.serialization.safe_globals([__main__.TextEmbedder])` context manager to allowlist this global if you trust this class/function.

Check the documentation of torch.load to learn more about types accepted by default with weights_only https://pytorch.org/docs/stable/generated/torch.load.html.