In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from scipy.sparse import hstack

In [2]:
# Load the question-pair CSV and preview its first rows.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run on another machine without editing it.
path = "C:\\Users\\ATTELLI SANJAY KUMAR\\Downloads\\dla\\questions.csv"
df = pd.read_csv(filepath_or_buffer=path)
df.head(5)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [3]:
# Read the CSV again so `df` holds the file's contents when the cleaning
# below runs, regardless of what earlier cells did to it.
# NOTE(review): same absolute, machine-specific path as the previous cell.
path = "C:\\Users\\ATTELLI SANJAY KUMAR\\Downloads\\dla\\questions.csv"
df = pd.read_csv(filepath_or_buffer=path)

def clean_text(text):
    """Normalize one question for TF-IDF vectorization.

    The value is lowercased, every character that is not an ASCII letter,
    a digit, or whitespace is removed (not replaced by a space), and runs of
    whitespace are collapsed to a single space with the ends trimmed.

    Args:
        text: The raw value to clean. Non-string values are converted with
            ``str()``. Missing values (``None`` or a float NaN) are treated
            as empty text.

    Returns:
        str: The cleaned text; ``""`` when the input is missing.
    """
    # A missing value must not go through str(): NaN would become the literal
    # word "nan" (and None the word "none"), which the vectorizer would then
    # treat as a real token. `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text).lower()  # Ensure text is string before lowercasing
    # Drop punctuation/symbols outright; note this joins e.g. "step-by-step".
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    # Collapse any whitespace run (tabs, newlines, multiple spaces) to one space.
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Clean both question columns in place with the same normalizer.
for column in ('question1', 'question2'):
    df[column] = df[column].apply(clean_text)

In [4]:
# Fit a single vectorizer on the questions from both columns so that
# question1 and question2 share one vocabulary (capped at 5000 terms).
# NOTE(review): the fit uses every row, before the train/test split in a
# later cell, so the vocabulary/IDF weights are learned from test rows too.
all_questions = pd.concat([df['question1'], df['question2']])
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
tfidf.fit(all_questions)

# Vectorize each side of the pair with the shared vectorizer.
q1_tfidf, q2_tfidf = (
    tfidf.transform(df[col]) for col in ('question1', 'question2')
)

# Feature row = question1 vector followed by question2 vector.
X = hstack([q1_tfidf, q2_tfidf])
y = df['is_duplicate']

print("TF-IDF feature matrix shape:", X.shape)

TF-IDF feature matrix shape: (404351, 10000)


In [5]:
# Hold out 20% of the pairs for evaluation; the fixed seed keeps the split
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Logistic-regression classifier, allowed up to 1000 solver iterations,
# trained on the training split.
model = LogisticRegression(max_iter=1000)
model.fit(X=X_train, y=y_train)

In [7]:
# Score the held-out split with accuracy and F1.
y_pred = model.predict(X_test)
acc, f1 = (metric(y_test, y_pred) for metric in (accuracy_score, f1_score))

for label, value in (("Accuracy:", acc), ("F1-Score:", f1)):
    print(label, value)

Accuracy: 0.7372729408564257
F1-Score: 0.5872846292807055


In [8]:
# Show the model's prediction for a few question pairs.
# NOTE(review): the rows are sampled from the full `df`, not from the
# held-out split, so some of them may have been seen during training.
sample = df.sample(n=5, random_state=42)
q1_sample, q2_sample = (
    tfidf.transform(sample[col]) for col in ('question1', 'question2')
)
X_sample = hstack([q1_sample, q2_sample])

preds = model.predict(X_sample)

# Walk the sampled rows and their predictions together, so the loop length
# always follows the sample size.
for q1_text, q2_text, pred in zip(sample['question1'], sample['question2'], preds):
    print(f"\nQ1: {q1_text}")
    print(f"Q2: {q2_text}")
    print(f"Predicted Duplicate: {bool(pred)}")


Q1: how does the boggart work
Q2: what would the boggart of a boggart be
Predicted Duplicate: False

Q1: what is difference between project manager and product manager
Q2: what are the differences between project management and business management
Predicted Duplicate: False

Q1: what hotel in jabalpur would be safe for unmarried couples without the harassment of police hotel staff and moral police
Q2: what hotel in allahabad would be safe for unmarried couples without the harassment of police hotel staff and moral police
Predicted Duplicate: False

Q1: what is stronger super saiyan 4 or super saiyan god
Q2: how does gohan turn into super saiyan 2
Predicted Duplicate: False

Q1: how do i fill in address line 1 and address line 2
Q2: how do i register desired web address
Predicted Duplicate: False
