### Load the required data

In [5]:
%pip install gdown
import gdown
from google.colab import files

# Fetch the shared Google Drive folder into the local `TalentSync` directory.
# Left as the cell's last expression so the notebook displays the returned value.
gdown.download_folder(
    url='https://drive.google.com/drive/folders/1fgAdvNcOsbHI82HF-PHUc1ya3AxHa1U-?usp=drive_link',
    output='TalentSync',
    quiet=True,
)



['TalentSync/label_encoder.pkl',
 'TalentSync/processed_data.parquet',
 'TalentSync/processed_data.pkl',
 'TalentSync/section_weighted_grn_weights.pth',
 'TalentSync/test_indices (1).csv',
 'TalentSync/test_resume_embeddings.npy',
 'TalentSync/train_indices (1).csv']

### Read the processed data

In [3]:
import pandas as pd
# Read the preprocessed dataset from the TalentSync folder into `df`,
# which the later cells use as the source of the test rows.
df = pd.read_parquet(
    path="/content/TalentSync/processed_data.parquet",
)

### Load the precomputed test embeddings and run retrieval


In [4]:
%pip install pandas numpy torch sentence-transformers scikit-learn faiss-cpu
import pandas as pd
import numpy as np
import torch
import pickle
import faiss
from torch import nn
from sentence_transformers import SentenceTransformer
from functools import wraps

# ------------------------------------------------------------------------------
# 1) Device setup
# ------------------------------------------------------------------------------
# Run on the GPU when PyTorch can see one, otherwise on the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ------------------------------------------------------------------------------
# 2) Load DataFrame and test subset
# ------------------------------------------------------------------------------
def _first_existing(*candidates):
    """Return the first path in ``candidates`` that exists on disk.

    When none of them exists the first candidate is returned unchanged, so the
    following load call fails with its usual "file not found" error for the
    original path.
    """
    import os  # local import: only needed for this existence probe

    for candidate in candidates:
        if os.path.exists(candidate):
            return candidate
    return candidates[0]

# Assume `df` is already in memory; test indices saved during preprocessing.
# The download cell stores its files under TalentSync/ (the indices file is
# listed there as "test_indices (1).csv"), so those locations are tried when
# the artifacts/ layout is not present.
test_idx = pd.read_csv(
    _first_existing(
        "artifacts/embeddings/test_indices.csv",
        "TalentSync/test_indices.csv",
        "TalentSync/test_indices (1).csv",
    )
)["index"].tolist()
test_df  = df.loc[test_idx].reset_index(drop=True)

# ------------------------------------------------------------------------------
# 3) Load precomputed resume embeddings and build FAISS index
# ------------------------------------------------------------------------------
# FAISS works on C-contiguous float32 matrices; ascontiguousarray guarantees both.
resume_embeddings = np.ascontiguousarray(
    np.load(
        _first_existing(
            "artifacts/embeddings/test_resume_embeddings.npy",
            "TalentSync/test_resume_embeddings.npy",
        )
    ),
    dtype=np.float32,
)
if resume_embeddings.ndim != 2:
    raise ValueError(
        f"expected a 2-D embedding matrix, got shape {resume_embeddings.shape}"
    )
# FAISS row ids are later used as positions into `test_df`, so both must have
# the same number of rows or retrieval would report the wrong resumes.
if resume_embeddings.shape[0] != len(test_df):
    raise ValueError(
        f"{resume_embeddings.shape[0]} embeddings do not match {len(test_df)} test rows"
    )
# Normalize all embeddings to unit length (so IP == cosine similarity)
faiss.normalize_L2(resume_embeddings)
dim = resume_embeddings.shape[1]
index = faiss.IndexFlatIP(dim)
index.add(resume_embeddings)

# ------------------------------------------------------------------------------
# 4) Load SentenceTransformer and LabelEncoder
# ------------------------------------------------------------------------------
embedder = SentenceTransformer("all-MiniLM-L6-v2").to(DEVICE)
# NOTE(security): pickle.load can execute arbitrary code contained in the file;
# only load label_encoder.pkl from a source you trust.
with open(
    _first_existing(
        "artifacts/models/label_encoder.pkl",
        "TalentSync/label_encoder.pkl",
    ),
    "rb",
) as f:
    le = pickle.load(f)
# One output logit per class known to the label encoder.
num_classes = len(le.classes_)

# ------------------------------------------------------------------------------
# 5) Define the Section‐Wise Weighted GRN model and load weights
# ------------------------------------------------------------------------------
class SectionWiseWeightedGRN(nn.Module):
    """Fuse per-section embeddings with a gated residual block and classify the result.

    Each section embedding is blended with a transformed copy of itself using a
    learned sigmoid gate. The blended sections are then pooled with softmax
    weights computed across the section axis and passed to a linear classifier.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()

        # Gate branch: sigmoid output in (0, 1), same width as the input, used
        # to mix the transformed embedding with the untouched one.
        gate_layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, input_dim),
            nn.Sigmoid(),
        ]
        self.gate = nn.Sequential(*gate_layers)

        # Transform branch: candidate replacement for each section embedding.
        transform_layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, input_dim),
        ]
        self.transform = nn.Sequential(*transform_layers)

        # One scalar score per section (softmaxed in forward) and the classifier head.
        self.weight_fc = nn.Linear(input_dim, 1)
        self.output_layer = nn.Linear(input_dim, output_dim)

    def forward(self, x: torch.Tensor):
        """
        Args:
            x: Tensor of shape (batch_size, num_sections, embed_dim).

        Returns:
            Tuple ``(logits, fused_embedding)`` where ``fused_embedding`` is the
            weighted sum over the section axis and ``logits`` is the classifier
            output computed from it.
        """
        mix = self.gate(x)
        candidate = self.transform(x)
        # Gate picks, per feature, between the candidate and the raw input.
        blended = mix * candidate + (1 - mix) * x

        # Softmax over dim=1 (sections) turns the scores into pooling weights.
        section_scores = self.weight_fc(blended).squeeze(-1)
        section_weights = torch.softmax(section_scores, dim=1).unsqueeze(-1)

        pooled = (section_weights * blended).sum(dim=1)
        logits = self.output_layer(pooled)
        return logits, pooled

# instantiate and load trained weights
model = SectionWiseWeightedGRN(input_dim=dim, hidden_dim=256, output_dim=num_classes)
# Prefer the original artifacts/ location; when it is missing, fall back to the
# TalentSync/ folder that the download cell populates.
# weights_only=True restricts unpickling to tensors and plain containers, which
# is all a state dict needs and avoids executing arbitrary pickled code.
try:
    state = torch.load(
        "artifacts/models/section_weighted_grn_weights.pth",
        map_location=DEVICE,
        weights_only=True,
    )
except FileNotFoundError:
    state = torch.load(
        "TalentSync/section_weighted_grn_weights.pth",
        map_location=DEVICE,
        weights_only=True,
    )
model.load_state_dict(state)
model.to(DEVICE)

# ------------------------------------------------------------------------------
# 6) Decorators for inference context
# ------------------------------------------------------------------------------
def inference_mode(func):
    """Wrap ``func`` so each call runs with the global model in eval mode and autograd disabled."""
    @wraps(func)
    def _evaluating_call(*args, **kwargs):
        # Put the module-level `model` into eval mode before every call.
        model.eval()
        grad_off = torch.no_grad()
        with grad_off:
            outcome = func(*args, **kwargs)
        return outcome
    return _evaluating_call

def l2_normalize(vec: np.ndarray) -> np.ndarray:
    """Scale ``vec`` to (approximately) unit Euclidean length.

    A small constant is added to the norm before dividing, so an all-zero
    vector comes back as zeros instead of causing a division by zero.
    """
    length = np.linalg.norm(vec)
    safe_length = length + 1e-8
    return vec / safe_length

# ------------------------------------------------------------------------------
# 7) Retrieval + re-ranking function
# ------------------------------------------------------------------------------
@inference_mode
def retrieve_top_k(
    query_sections: list[str],
    k: int = 10
) -> pd.DataFrame:
    """
    Given a list of section‐level strings for the query, returns top‐k resumes
    ranked by FAISS inner‐product (equivalent to cosine) plus raw cosine scores.

    Relies on the module-level `embedder`, `model`, `index`, `resume_embeddings`,
    `test_df` and `DEVICE`.

    Args:
        query_sections: Section texts describing the query. A single string is
            treated as a one-section query.
        k: Maximum number of resumes to return. Values larger than the number
            of indexed resumes are clamped; values <= 0 give an empty frame.

    Returns:
        DataFrame with columns rank, cosine_score, faiss_score, category and
        resume_text, ordered by FAISS score (best first).

    Raises:
        ValueError: if `query_sections` is empty.
    """
    columns = ["rank", "cosine_score", "faiss_score", "category", "resume_text"]

    # A bare string would be encoded as a single 1-D vector and break the
    # (batch, sections, dim) layout the model expects.
    if isinstance(query_sections, str):
        query_sections = [query_sections]
    if len(query_sections) == 0:
        raise ValueError("query_sections must contain at least one section string")

    # FAISS pads missing neighbours with id -1, which `iloc[-1]` would silently
    # map to the last resume; never ask for more hits than are indexed.
    k = min(int(k), index.ntotal)
    if k <= 0:
        return pd.DataFrame(columns=columns)

    # a) embed each section then fuse via GRN
    q_embs = embedder.encode(query_sections, convert_to_tensor=True).to(DEVICE)
    _, q_fused = model(q_embs.unsqueeze(0))               # (1, embed_dim)
    q_vec = q_fused.squeeze(0).cpu().numpy().astype("float32")

    # b) FAISS search (inner-product on unit-normalized = cosine)
    q_vec_norm = np.ascontiguousarray(
        l2_normalize(q_vec).reshape(1, -1), dtype=np.float32
    )
    D, I = index.search(q_vec_norm, k)
    # Defensive: drop any -1 padding that may still be present.
    valid = I[0] >= 0
    hit_ids = I[0][valid]
    hit_scores = D[0][valid]

    # c) raw NumPy cosine similarity, computed only for the returned hits;
    #    the floor on the denominator avoids NaN for zero-length vectors.
    hit_vecs = resume_embeddings[hit_ids]
    denom = np.maximum(
        np.linalg.norm(hit_vecs, axis=1) * np.linalg.norm(q_vec), 1e-8
    )
    cos_scores = (hit_vecs @ q_vec) / denom

    # d) assemble results
    results = []
    for rank, (idx, faiss_score, cos_score) in enumerate(
        zip(hit_ids, hit_scores, cos_scores), start=1
    ):
        row = test_df.iloc[int(idx)]
        results.append({
            "rank":         rank,
            "cosine_score": float(cos_score),
            "faiss_score":  float(faiss_score),
            "category":     row["category"],
            "resume_text":  row["text"]
        })
    return pd.DataFrame(results, columns=columns)

# ------------------------------------------------------------------------------
# 8) Example usage
# ------------------------------------------------------------------------------
if __name__ == "__main__":
    # Each entry is one section of the query; the sections are embedded
    # separately and fused by the model inside retrieve_top_k.
    query = [
        "Proficiency in Python, SQL, and data visualization",
        "Hands-on experience with machine learning, statistical analysis",
        "Familiarity with cloud services such as AWS or GCP",
        "Projects involving recommendation engines, A/B testing",
        "Strong communication and teamwork skills"
    ]
    top10 = retrieve_top_k(query, k=10)
    # Printing the full resume text makes every table row extremely wide, so
    # only the first 200 characters are shown here. `top10` itself is left
    # untouched and still holds the complete text.
    print(
        top10.assign(
            resume_text=top10["resume_text"].astype(str).str.slice(0, 200)
        ).to_string(index=False)
    )




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


 rank  cosine_score  faiss_score         category                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       