In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

****Task 1 - Match user queries with resolved queries ****

In [2]:
import pandas as pd
import re
from fuzzywuzzy import fuzz, process
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the datasets
resolved_queries = pd.read_csv('/kaggle/input/resolved-queries/resolved_queries.csv')
new_queries = pd.read_csv('/kaggle/input/new-queries/new_queries.csv')

# Text Preprocessing Function
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\s+', ' ', text)  # Normalize whitespace
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text

# Apply preprocessing
resolved_queries['Pre_Resolved_Query_Clean'] = resolved_queries['Pre_Resolved_Query'].apply(preprocess_text)
new_queries['Variation_Query_Clean'] = new_queries['Variation_Query'].apply(preprocess_text)


In [3]:
# Function to perform fuzzy matching
def fuzzy_matching(resolved_queries, new_queries, threshold=80, method='ratio'):
    matches = []
    for new_query in new_queries['Variation_Query_Clean']:
        if method == 'ratio':
            match = process.extractOne(new_query, resolved_queries['Pre_Resolved_Query_Clean'], scorer=fuzz.ratio)
        elif method == 'partial_ratio':
            match = process.extractOne(new_query, resolved_queries['Pre_Resolved_Query_Clean'], scorer=fuzz.partial_ratio)
        elif method == 'token_sort_ratio':
            match = process.extractOne(new_query, resolved_queries['Pre_Resolved_Query_Clean'], scorer=fuzz.token_sort_ratio)
        else:
            raise ValueError("Invalid method selected.")
        
        if match[1] >= threshold:
            matches.append((new_query, match[0], match[1]))
        else:
            matches.append((new_query, None, match[1]))
    
    return matches

In [4]:
# Perform Fuzzy Matching using different methods and thresholds
fuzzy_matches_ratio = fuzzy_matching(resolved_queries, new_queries, threshold=80, method='ratio')
fuzzy_matches_partial = fuzzy_matching(resolved_queries, new_queries, threshold=80, method='partial_ratio')
fuzzy_matches_token_sort = fuzzy_matching(resolved_queries, new_queries, threshold=80, method='token_sort_ratio')

# Vectorization and Cosine Similarity Calculation
vectorizer = TfidfVectorizer()

In [5]:
# Fit the vectorizer on both resolved and new queries
tfidf_matrix = vectorizer.fit_transform(
    resolved_queries['Pre_Resolved_Query_Clean'].tolist() + new_queries['Variation_Query_Clean'].tolist()
)

# Split the matrix back into resolved and new queries
resolved_tfidf = tfidf_matrix[:len(resolved_queries)]
new_tfidf = tfidf_matrix[len(resolved_queries):]

In [6]:
# Calculate cosine similarity between each new query and all resolved queries
cosine_similarities = cosine_similarity(new_tfidf, resolved_tfidf)

In [7]:
# Find the best match for each new query
cosine_matches = []
for i in range(len(new_queries)):
    best_match_index = cosine_similarities[i].argmax()
    best_match_score = cosine_similarities[i][best_match_index]
    
    if best_match_score > 0.5:  # Using 0.5 as an arbitrary threshold
        cosine_matches.append((new_queries['Variation_Query'][i], 
                               resolved_queries['Pre_Resolved_Query'][best_match_index], 
                               best_match_score))
    else:
        cosine_matches.append((new_queries['Variation_Query'][i], None, best_match_score))


In [8]:
# Print the results
print("Fuzzy Matches using Ratio:")
print(fuzzy_matches_ratio[:5])

Fuzzy Matches using Ratio:
[('unabel to conect to the internet', 'unable to connect to the internet', 95), ('cant connect to internet', None, 77), ('intenet not working', None, 33), ('payment failed while chekout', 'payment failed during checkout', 83), ('payment did not go through during chckout', None, 68)]


In [9]:
print("\nFuzzy Matches using Partial Ratio:")
print(fuzzy_matches_partial[:5])



Fuzzy Matches using Partial Ratio:
[('unabel to conect to the internet', 'unable to connect to the internet', 94), ('cant connect to internet', 'unable to connect to the internet', 83), ('intenet not working', None, 52), ('payment failed while chekout', None, 79), ('payment did not go through during chckout', None, 61)]


In [10]:
print("\nFuzzy Matches using Token Sort Ratio:")
print(fuzzy_matches_token_sort[:5])


Fuzzy Matches using Token Sort Ratio:
[('unabel to conect to the internet', 'unable to connect to the internet', 95), ('cant connect to internet', None, 67), ('intenet not working', None, 35), ('payment failed while chekout', None, 76), ('payment did not go through during chckout', None, 65)]


In [11]:
print("\nCosine Similarity Matches:")
print(cosine_matches[:5])


Cosine Similarity Matches:
[('Unabel to conect to the internet', 'Unable to connect to the internet', 0.5559733949526011), ('Can’t connect to internet', 'Unable to connect to the internet', 0.6916347237387183), ('Intenet not working', None, 0.0), ('Payment failed while chekout', None, 0.38813059411225814), ('Payment did not go through during chckout', None, 0.3190817598317858)]


****Task 2 -  Match name****

In [2]:
!pip install fuzzywuzzy python-Levenshtein


Collecting python-Levenshtein
  Downloading python_Levenshtein-0.25.1-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.25.1 (from python-Levenshtein)
  Downloading Levenshtein-0.25.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.3 kB)
Collecting rapidfuzz<4.0.0,>=3.8.0 (from Levenshtein==0.25.1->python-Levenshtein)
  Downloading rapidfuzz-3.9.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading python_Levenshtein-0.25.1-py3-none-any.whl (9.4 kB)
Downloading Levenshtein-0.25.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (177 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m177.4/177.4 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rapidfuzz-3.9.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m60.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hInstalling collected p

In [13]:
pip install pandas fuzzywuzzy[speedup] python-Levenshtein


Note: you may need to restart the kernel to use updated packages.


In [3]:
!pip install thefuzz


Collecting thefuzz
  Downloading thefuzz-0.22.1-py3-none-any.whl.metadata (3.9 kB)
Downloading thefuzz-0.22.1-py3-none-any.whl (8.2 kB)
Installing collected packages: thefuzz
Successfully installed thefuzz-0.22.1


In [4]:
import pandas as pd
import numpy as np
import re
from thefuzz import fuzz
base_names = pd.read_csv('/kaggle/input/base-names/base_names.csv')
base_names.head()


Unnamed: 0,Base_Name_ID,Base_Name
0,1,John Smith
1,2,Jennifer Brown
2,3,Michael O'Connor
3,4,Maria Garcia
4,5,Robert Lee


In [5]:
name_var = pd.read_csv('/kaggle/input/name-var/name_variations.csv')
name_var.head()

Unnamed: 0,Variation,Matches_With_Base_Name
0,Thomas King,Thomas King
1,ThomasKing,Thomas King
2,Maria Garcia,Maria Garcia
3,MaryLewis,Mary Lewis
4,Nancy W.,Nancy Wright


In [6]:
def find_best_match(variation, base_names):
    best_match = None
    highest_score = 0
    for base_name in base_names:
        score = fuzz.ratio(variation, base_name)
        if score > highest_score:
            highest_score = score
            best_match = base_name
    return best_match, highest_score
name_var['Best_Match'] = name_var['Variation'].apply(lambda x: find_best_match(x, base_names['Base_Name'])[0])
name_var['Match_Score'] = name_var['Variation'].apply(lambda x: find_best_match(x, base_names['Base_Name'])[1])
name_var.head() 

Unnamed: 0,Variation,Matches_With_Base_Name,Best_Match,Match_Score
0,Thomas King,Thomas King,Thomas King,96
1,ThomasKing,Thomas King,Thomas King,95
2,Maria Garcia,Maria Garcia,Maria Garcia,100
3,MaryLewis,Mary Lewis,Mary Lewis,95
4,Nancy W.,Nancy Wright,Nancy Wright,70


In [8]:
correct_matches = name_var[name_var['Variation'].str.replace(" ", "").str.lower() == name_var['Best_Match'].str.replace(" ", "").str.lower()]
accuracy = len(correct_matches) / len(name_var) * 100
print(f"Accuracy: {accuracy:.2f}%")

Accuracy: 56.00%
