In [19]:
import numpy as np
import pandas as pd
from thefuzz import fuzz
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
pip install thefuzz

Collecting thefuzz
  Downloading thefuzz-0.22.1-py3-none-any.whl.metadata (3.9 kB)
Collecting rapidfuzz<4.0.0,>=3.0.0 (from thefuzz)
  Downloading rapidfuzz-3.9.7-cp312-cp312-win_amd64.whl.metadata (12 kB)
Downloading thefuzz-0.22.1-py3-none-any.whl (8.2 kB)
Downloading rapidfuzz-3.9.7-cp312-cp312-win_amd64.whl (1.7 MB)
   ---------------------------------------- 0.0/1.7 MB ? eta -:--:--
   ---------------------------------------- 1.7/1.7 MB 14.8 MB/s eta 0:00:00
Installing collected packages: rapidfuzz, thefuzz
Successfully installed rapidfuzz-3.9.7 thefuzz-0.22.1
Note: you may need to restart the kernel to use updated packages.


In [5]:
df=pd.read_csv('new_queries.csv')
data=pd.read_csv('resolved_queries.csv')

In [6]:
data

Unnamed: 0,Query_ID,Pre_Resolved_Query
0,1,Unable to connect to the internet
1,2,Payment failed during checkout
2,3,App crashes when opening settings
3,4,Forgot password and unable to reset
4,5,Unable to upload files to the server


In [13]:
#Partial Ratio
df['Highest_Ratio'] = 0

for i in range(len(df)):
    for j in range(len(data)):
        current_ratio = fuzz.partial_ratio(df['Variation_Query'][i], data['Pre_Resolved_Query'][j])
        if current_ratio > 60:
            if current_ratio > df.loc[i, 'Highest_Ratio']:
                # Update the highest ratio and resolved index
                df.loc[i, 'Highest_Ratio'] = current_ratio
                df.loc[i, 'Resolved_Index_pr'] = j + 1

# Drop the helper column if no longer needed
df.drop(columns=['Highest_Ratio'], inplace=True)


In [14]:
#Token Set
df['Highest_Ratio'] = 0

for i in range(len(df)):
    for j in range(len(data)):
        current_ratio = fuzz.token_set_ratio(df['Variation_Query'][i], data['Pre_Resolved_Query'][j])
        if current_ratio > 60:
            # Check if the current ratio is greater than the previously stored highest ratio
            if current_ratio > df.loc[i, 'Highest_Ratio']:
                # Update the highest ratio and resolved index
                df.loc[i, 'Highest_Ratio'] = current_ratio
                df.loc[i, 'Resolved_Index_tsr1'] = j + 1

# Drop the helper column if no longer needed
df.drop(columns=['Highest_Ratio'], inplace=True)


In [15]:
#Token Sort
df['Highest_Ratio'] = 0

for i in range(len(df)):
    for j in range(len(data)):
        current_ratio = fuzz.token_sort_ratio(df['Variation_Query'][i], data['Pre_Resolved_Query'][j])
        if current_ratio > 60:
            # Check if the current ratio is greater than the previously stored highest ratio
            if current_ratio > df.loc[i, 'Highest_Ratio']:
                # Update the highest ratio and resolved index
                df.loc[i, 'Highest_Ratio'] = current_ratio
                df.loc[i, 'Resolved_Index_tsr'] = j + 1

# Drop the helper column if no longer needed
df.drop(columns=['Highest_Ratio'], inplace=True)


In [17]:
#Partial Sort
df['Highest_Ratio'] = 0

for i in range(len(df)):
    for j in range(len(data)):
        current_ratio = fuzz.partial_token_sort_ratio(df['Variation_Query'][i], data['Pre_Resolved_Query'][j])
        if current_ratio > 60:
            if current_ratio > df.loc[i, 'Highest_Ratio']:
                # Update the highest ratio and resolved index
                df.loc[i, 'Highest_Ratio'] = current_ratio
                df.loc[i, 'Resolved_Index_ptsr'] = j + 1


df.drop(columns=['Highest_Ratio'], inplace=True)




In [8]:
df

Unnamed: 0,Variation_Query,Matches_With_Query_ID,Resolved_Index_ptsr,Resolved_Index_pr,Resolved_Index_tsr1,Resolved_Index_tsr
0,Unabel to conect to the internet,1,1.0,1.0,1.0,1.0
1,Can’t connect to internet,1,1.0,1.0,1.0,1.0
2,Intenet not working,1,,,,
3,Payment failed while chekout,2,2.0,2.0,2.0,2.0
4,Payment did not go through during chckout,2,2.0,2.0,2.0,2.0
5,Payment issue at check out,2,,2.0,,
6,Application crashes when opening setings,3,3.0,3.0,3.0,3.0
7,App crash when going to settings,3,3.0,3.0,3.0,3.0
8,Settings cause the app to chrash,3,3.0,,3.0,3.0
9,Forgot passwrd and cant reset,4,4.0,4.0,4.0,4.0


In [18]:
#accuracy
for i in df.columns[2:]:
    count=0
    for j in range(len(df)):
        if df[i][j]==df['Matches_With_Query_ID'][j]:
            count+=1
    accuracy=(count/len(df))*100
    print('Accuracy:',accuracy)

Accuracy: 85.0
Accuracy: 60.0
Accuracy: 85.0
Accuracy: 80.0
Accuracy: 95.0


In [10]:
df['Highest_Ratio'] = 0

all_queries = list(df['Variation_Query']) + list(data['Pre_Resolved_Query'])

tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(all_queries)

tfidf_variation = tfidf_matrix[:len(df)]  
tfidf_resolved = tfidf_matrix[len(df):]   

for i in range(len(df)):
    for j in range(len(data)):
        
        cosine_sim = cosine_similarity(tfidf_variation[i], tfidf_resolved[j])[0][0]
        print(f"Cosine Similarity between '{df['Variation_Query'][i]}' and '{data['Pre_Resolved_Query'][j]}': {cosine_sim}")

        if cosine_sim > df.loc[i, 'Highest_Ratio']:
            df.loc[i, 'Highest_Ratio'] = cosine_sim
            df.loc[i, 'Resolved_Index_tfidf'] = j + 1  

print(df[['Variation_Query', 'Resolved_Index_tfidf', 'Highest_Ratio']])

    

Cosine Similarity between 'Unabel to conect to the internet' and 'Unable to connect to the internet': 0.5559733949526011
Cosine Similarity between 'Unabel to conect to the internet' and 'Payment failed during checkout': 0.0
Cosine Similarity between 'Unabel to conect to the internet' and 'App crashes when opening settings': 0.0
Cosine Similarity between 'Unabel to conect to the internet' and 'Forgot password and unable to reset': 0.13528276067068729
Cosine Similarity between 'Unabel to conect to the internet' and 'Unable to upload files to the server': 0.37271970880735006
Cosine Similarity between 'Can’t connect to internet' and 'Unable to connect to the internet': 0.6764627066857578
Cosine Similarity between 'Can’t connect to internet' and 'Payment failed during checkout': 0.0
Cosine Similarity between 'Can’t connect to internet' and 'App crashes when opening settings': 0.0
Cosine Similarity between 'Can’t connect to internet' and 'Forgot password and unable to reset': 0.0907435246713

  df.loc[i, 'Highest_Ratio'] = cosine_sim


In [11]:
df

Unnamed: 0,Variation_Query,Matches_With_Query_ID,Resolved_Index_ptsr,Resolved_Index_pr,Resolved_Index_tsr1,Resolved_Index_tsr,Highest_Ratio,Resolved_Index_tfidf
0,Unabel to conect to the internet,1,1.0,1.0,1.0,1.0,0.555973,1.0
1,Can’t connect to internet,1,1.0,1.0,1.0,1.0,0.676463,1.0
2,Intenet not working,1,,,,,0.0,
3,Payment failed while chekout,2,2.0,2.0,2.0,2.0,0.388131,2.0
4,Payment did not go through during chckout,2,2.0,2.0,2.0,2.0,0.319082,2.0
5,Payment issue at check out,2,,2.0,,,0.137678,2.0
6,Application crashes when opening setings,3,3.0,3.0,3.0,3.0,0.57748,3.0
7,App crash when going to settings,3,3.0,3.0,3.0,3.0,0.490914,3.0
8,Settings cause the app to chrash,3,3.0,,3.0,3.0,0.315979,3.0
9,Forgot passwrd and cant reset,4,4.0,4.0,4.0,4.0,0.55833,4.0


In [12]:
#accuracy
count=0
for j in range(len(df)):
    if df['Resolved_Index_tfidf'][j]==df['Matches_With_Query_ID'][j]:
        count+=1

accuracy=(count/len(df))*100
print('Accuracy:',accuracy)

Accuracy: 95.0
