In [1]:
# Install necessary libraries
!pip install rapidfuzz scikit-learn pandas jellyfish

import pandas as pd
from rapidfuzz import fuzz, process
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import jellyfish

Collecting rapidfuzz
  Downloading rapidfuzz-3.14.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Collecting jellyfish
  Downloading jellyfish-1.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.6 kB)
Downloading rapidfuzz-3.14.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading jellyfish-1.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (355 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m355.9/355.9 kB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, jellyfish
Successfully installed jellyfish-1.2.0 rapidfuzz-3.14.0


In [3]:
# ===============================
# Task 1 - Match Queries
# ===============================

# Install dependencies
!pip install rapidfuzz scikit-learn pandas

import pandas as pd
from rapidfuzz import fuzz, process
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# --- Load CSVs ---
resolved = pd.read_csv("resolved_queries.csv")
new = pd.read_csv("new_queries.csv")

# --- Rename columns for consistency ---
resolved = resolved.rename(columns={"Pre_Resolved_Query": "query", "Query_ID": "id"})
new = new.rename(columns={"Variation_Query": "query", "Matches_With_Query_ID": "id"})

print("Resolved queries shape:", resolved.shape)
print("New queries shape:", new.shape)
print("Resolved columns:", resolved.columns.tolist())
print("New columns:", new.columns.tolist())

# --- Preprocessing helper ---
def clean_text(text):
    if pd.isna(text):
        return ""
    text = str(text).lower().strip()
    return text

resolved["query_clean"] = resolved["query"].apply(clean_text)
new["query_clean"] = new["query"].apply(clean_text)

# --- Fuzzy Matching (RapidFuzz) ---
def match_fuzzy(query, choices, scorer=fuzz.token_set_ratio, threshold=85):
    match = process.extractOne(query, choices, scorer=scorer, score_cutoff=threshold)
    return match

fuzzy_results = []
for q in new["query_clean"]:
    match = match_fuzzy(q, resolved["query_clean"].tolist())
    fuzzy_results.append(match)

new["fuzzy_match"] = [m[0] if m else None for m in fuzzy_results]
new["fuzzy_score"] = [m[1] if m else None for m in fuzzy_results]

print("\n=== Fuzzy Matching Results (Top 5) ===")
print(new[["query", "fuzzy_match", "fuzzy_score"]].head())

# --- TF-IDF + Cosine Similarity ---
tfidf = TfidfVectorizer().fit(resolved["query_clean"].tolist() + new["query_clean"].tolist())
resolved_vecs = tfidf.transform(resolved["query_clean"])
new_vecs = tfidf.transform(new["query_clean"])

cosine_results = []
for i in range(new_vecs.shape[0]):
    sims = cosine_similarity(new_vecs[i], resolved_vecs)[0]
    best_idx = sims.argmax()
    best_score = sims[best_idx]
    cosine_results.append((resolved.loc[best_idx, "query"], best_score))

new["cosine_match"] = [m[0] for m in cosine_results]
new["cosine_score"] = [m[1] for m in cosine_results]

print("\n=== Cosine Similarity Results (Top 5) ===")
print(new[["query", "cosine_match", "cosine_score"]].head())

# --- Save results ---
new.to_csv("task1_query_matches.csv", index=False)
print("\n✅ Task 1 results saved as task1_query_matches.csv")

Resolved queries shape: (5, 2)
New queries shape: (20, 2)
Resolved columns: ['id', 'query']
New columns: ['query', 'id']

=== Fuzzy Matching Results (Top 5) ===
                                       query  \
0           Unabel to conect to the internet   
1                  Can’t connect to internet   
2                        Intenet not working   
3               Payment failed while chekout   
4  Payment did not go through during chckout   

                         fuzzy_match  fuzzy_score  
0  unable to connect to the internet    94.915254  
1  unable to connect to the internet    86.363636  
2                               None          NaN  
3                               None          NaN  
4                               None          NaN  

=== Cosine Similarity Results (Top 5) ===
                                       query  \
0           Unabel to conect to the internet   
1                  Can’t connect to internet   
2                        Intenet not working   
3  

In [4]:
# ===============================
# Task 2 - Match Names
# ===============================

# Install dependencies (if not already done)
!pip install rapidfuzz pandas jellyfish

import pandas as pd
from rapidfuzz import fuzz, process
import jellyfish

# --- Load CSVs ---
base = pd.read_csv("base_names.csv")
variations = pd.read_csv("name_variations.csv")

print("Base names columns:", base.columns.tolist())
print("Name variations columns:", variations.columns.tolist())

# --- Rename columns for consistency ---
# Assuming first col is ID and second is name
base = base.rename(columns={base.columns[0]: "id", base.columns[1]: "name"})
variations = variations.rename(columns={variations.columns[0]: "id", variations.columns[1]: "name"})

print("Base names shape:", base.shape)
print("Variations shape:", variations.shape)

# --- Preprocess names ---
def clean_name(name):
    if pd.isna(name):
        return ""
    name = str(name).lower().strip()
    name = name.replace(",", " ")        # handle "Smith, John"
    name = " ".join(name.split())        # remove extra spaces
    return name

base["name_clean"] = base["name"].apply(clean_name)
variations["name_clean"] = variations["name"].apply(clean_name)

# --- Fuzzy match names ---
def match_name(name, choices, threshold=90):
    match = process.extractOne(name, choices, scorer=fuzz.token_set_ratio, score_cutoff=threshold)
    return match

name_matches = []
for n in variations["name_clean"]:
    match = match_name(n, base["name_clean"].tolist())
    name_matches.append(match)

variations["best_match"] = [m[0] if m else None for m in name_matches]
variations["match_score"] = [m[1] if m else None for m in name_matches]

print("\n=== Fuzzy Name Matching Results (Top 5) ===")
print(variations[["name", "best_match", "match_score"]].head())

# --- Optional: add Jaro-Winkler score for better spelling variations ---
jw_scores = []
for i, row in variations.iterrows():
    if row["best_match"]:
        jw = jellyfish.jaro_winkler_similarity(row["name_clean"], row["best_match"])
        jw_scores.append(jw)
    else:
        jw_scores.append(None)

variations["jaro_winkler_score"] = jw_scores

# --- Save results ---
variations.to_csv("task2_name_matches.csv", index=False)
print("\n✅ Task 2 results saved as task2_name_matches.csv")

Base names columns: ['Base_Name_ID', 'Base_Name']
Name variations columns: ['Variation', 'Matches_With_Base_Name']
Base names shape: (20, 2)
Variations shape: (100, 2)

=== Fuzzy Name Matching Results (Top 5) ===
           name    best_match  match_score
0   Thomas King   thomas king        100.0
1   Thomas King   thomas king        100.0
2  Maria Garcia  maria garcia        100.0
3    Mary Lewis    mary lewis        100.0
4  Nancy Wright  nancy wright        100.0

✅ Task 2 results saved as task2_name_matches.csv
