# Match user queries with resolved queries
Rishi Gandhi J022

# 1. Load and Inspect Data
First, load the CSV files to inspect the data and understand their structure.

In [1]:
import pandas as pd

resolved_queries = pd.read_csv('resolved_queries.csv')
new_queries = pd.read_csv('new_queries.csv')

print("Resolved Queries:")
print(resolved_queries.head())

print("New Queries:")
print(new_queries.head())


Resolved Queries:
   Query_ID                    Pre_Resolved_Query
0         1     Unable to connect to the internet
1         2        Payment failed during checkout
2         3     App crashes when opening settings
3         4   Forgot password and unable to reset
4         5  Unable to upload files to the server
New Queries:
                             Variation_Query  Matches_With_Query_ID
0           Unabel to conect to the internet                      1
1                  Can’t connect to internet                      1
2                        Intenet not working                      1
3               Payment failed while chekout                      2
4  Payment did not go through during chckout                      2


# 2. Preprocessing
Clean the text data to ensure consistency in comparison. Typical preprocessing steps include:

Lowercasing
Removing punctuation
Removing stop words
Tokenization

In [5]:
import pandas as pd
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

resolved_queries = pd.read_csv('resolved_queries.csv')
new_queries = pd.read_csv('new_queries.csv')

nltk.download('stopwords')
nltk.download('punkt')

stop_words = set(stopwords.words('english'))

def preprocess(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

resolved_queries['processed'] = resolved_queries['Pre_Resolved_Query'].apply(preprocess)
new_queries['processed'] = new_queries['Variation_Query'].apply(preprocess)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rishi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rishi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# 3. Fuzzy Search
You can use the fuzzywuzzy library or rapidfuzz for fuzzy matching. For fuzzywuzzy, the process module can be used to find the best matches.

In [7]:
pip install fuzzywuzzy

Defaulting to user installation because normal site-packages is not writeableNote: you may need to restart the kernel to use updated packages.

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0


In [13]:
from fuzzywuzzy import process

def fuzzy_match(query, choices, threshold=80):
    results = process.extract(query, choices, limit=1)
    return results[0] if results[0][1] >= threshold else ("No match", 0)

resolved_list = resolved_queries['processed'].tolist()
new_queries['fuzzy_match'] = new_queries['processed'].apply(lambda x: fuzzy_match(x, resolved_list))


# 4. BoW / TF-IDF with Cosine Similarity
For this, use scikit-learn to vectorize the text and compute similarities.

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the data
tfidf_resolved = vectorizer.fit_transform(resolved_queries['processed'])
tfidf_new = vectorizer.transform(new_queries['processed'])

# Compute cosine similarity
similarity_matrix = cosine_similarity(tfidf_new, tfidf_resolved)

def find_best_match(similarities, threshold=0.5):
    results = []
    for i, row in enumerate(similarities):
        best_match_index = row.argmax()
        best_match_score = row.max()
        if best_match_score >= threshold:
            results.append((i, best_match_index, best_match_score))
        else:
            results.append((i, None, 0))  # Append None if no match is above threshold
    return results

# Apply function
boW_matches = find_best_match(similarity_matrix)


In [21]:
assert len(boW_matches) == len(new_queries), "Length mismatch between new_queries and boW_matches"


# 5. Results and Evaluation
Fuzzy Search: Review the results based on the fuzzy matching threshold. Adjust the threshold if needed.
BoW/TF-IDF: Evaluate the matches based on cosine similarity scores. Adjust the similarity threshold if needed.

In [23]:
# Convert the list of tuples to a DataFrame
boW_matches_df = pd.DataFrame(boW_matches, columns=['Index', 'Resolved_Query_ID', 'Similarity_Score'])

# Add the results to the new_queries DataFrame
new_queries = new_queries.join(boW_matches_df[['Resolved_Query_ID', 'Similarity_Score']])


In [27]:
new_queries[['Variation_Query', 'Resolved_Query_ID', 'Similarity_Score']]

Unnamed: 0,Variation_Query,Resolved_Query_ID,Similarity_Score
0,Unabel to conect to the internet,0.0,0.63907
1,Can’t connect to internet,0.0,0.903782
2,Intenet not working,,0.0
3,Payment failed while chekout,1.0,0.816497
4,Payment did not go through during chckout,1.0,0.57735
5,Payment issue at check out,1.0,0.57735
6,Application crashes when opening setings,2.0,0.707107
7,App crash when going to settings,2.0,0.707107
8,Settings cause the app to chrash,2.0,0.707107
9,Forgot passwrd and cant reset,3.0,0.761551


In [32]:
# Display Fuzzy Matching Results
print("Fuzzy Matching Results:")
new_queries[['Variation_Query', 'fuzzy_match']]

Fuzzy Matching Results:


Unnamed: 0,Variation_Query,fuzzy_match
0,Unabel to conect to the internet,"(unable connect internet, 93)"
1,Can’t connect to internet,"(unable connect internet, 95)"
2,Intenet not working,"(No match, 0)"
3,Payment failed while chekout,"(payment failed checkout, 98)"
4,Payment did not go through during chckout,"(No match, 0)"
5,Payment issue at check out,"(No match, 0)"
6,Application crashes when opening setings,"(app crashes opening settings, 86)"
7,App crash when going to settings,"(app crashes opening settings, 88)"
8,Settings cause the app to chrash,"(No match, 0)"
9,Forgot passwrd and cant reset,"(forgot password unable reset, 83)"
