In [11]:
pip install fuzzywuzzy



In [12]:
pip install Levenshtein



In [13]:
import numpy as np
import pandas as pd
import concurrent.futures
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import xgboost as xgb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import threading
import scipy


In [14]:
df_train = pd.read_csv("/content/drive/MyDrive/train.csv")

In [15]:
df_train

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0
...,...,...,...,...,...,...
404285,404285,433578,379845,How many keywords are there in the Racket prog...,How many keywords are there in PERL Programmin...,0
404286,404286,18840,155606,Do you believe there is life after death?,Is it true that there is life after death?,1
404287,404287,537928,537929,What is one coin?,What's this coin?,0
404288,404288,537930,537931,What is the approx annual cost of living while...,I am having little hairfall problem but I want...,0


In [16]:

def retrieve_pair_levenshtein(input_string, dataset):
    dataset['question1'] = dataset['question1'].astype(str)  # Convert to string
    matches = process.extract(input_string, dataset['question1'], scorer=fuzz.ratio)
    pairs = []
    scores = []

    for match in matches:
        if match is not None:
            indices = dataset[dataset['question1'] == match[0]].index
            for index in indices:
                pair = (dataset.loc[index, 'question1'], dataset.loc[index, 'question2'])
                score = match[1]  # Get the similarity score
                pairs.append(pair)
                scores.append(score)

    if pairs:
        return pairs, scores
    else:
        return None, None

# Step 3: Retrieve the pair with the highest Levenshtein score using thread parallelism
def retrieve_pairs_parallel(input_string, dataset):
    levenshtein_thread = threading.Thread(target=retrieve_pair_levenshtein, args=(input_string, dataset))
    levenshtein_thread.start()
    levenshtein_thread.join()
    pairs_levenshtein, scores_levenshtein = retrieve_pair_levenshtein(input_string, dataset)
    return pairs_levenshtein, scores_levenshtein


In [17]:

input_string = "What is the step by step guide to invest in shares?"
pairs_levenshtein, scores_levenshtein = retrieve_pairs_parallel(input_string, df_train)

#print(pairs_levenshtein,scores_levenshtein)

if pairs_levenshtein:
    highest_score_index = scores_levenshtein.index(max(scores_levenshtein))
    pair_levenshtein = pairs_levenshtein[highest_score_index]
    print("Pair based on Levenshtein Distance:")
    print(pair_levenshtein)
    levenshtein_score = max(scores_levenshtein)
else:
    print("No matches found.")
    pair_levenshtein = None
    levenshtein_score = 0


Pair based on Levenshtein Distance:
('What is the step by step guide to invest in share market in india?', 'What is the step by step guide to invest in share market?')


In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
import xgboost as xgb

# Handle missing values by replacing them with empty strings
df_train['question1'].fillna('', inplace=True)
df_train['question2'].fillna('', inplace=True)

# Preprocess the data (TF-IDF vectorization)
corpus = df_train['question1'] + " " + df_train['question2']
vectorizer = TfidfVectorizer()
tfidf_features = vectorizer.fit_transform(corpus)

# Split data into train and test sets
# (You can decide how to split based on your needs)
X_train, X_test, y_train, y_test = train_test_split(tfidf_features, df_train['is_duplicate'], test_size=0.2, random_state=42)

# Train an XGBoost model
model = xgb.XGBClassifier()
model.fit(X_train, y_train)

# Make predictions on the testing set
predictions = model.predict(X_test)



In [23]:
best_match_index = predictions.argmax()
pair_xgboost = (df_train.loc[best_match_index, 'question1'], df_train.loc[best_match_index, 'question2'])
print("Pair based on XGBoost:")
print(pair_xgboost)


Pair based on XGBoost:
('Why am I mentally very lonely? How can I solve it?', 'Find the remainder when [math]23^{24}[/math] is divided by 24,23?')


In [27]:
levenshtein_weight = 0.7
xgboost_weight = 0.3

# Calculate combined scores
levenshtein_score = fuzz.ratio(input_string, pair_levenshtein[0])
xgboost_score = model.predict_proba(vectorizer.transform([input_string]))[0][1]
combined_score = levenshtein_weight * levenshtein_score + xgboost_weight * xgboost_score

print("Combined Score:")
print(combined_score)

Combined Score:
11.350678795576094
