In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv(r'questions.csv')

In [3]:
data.shape

(404351, 6)

In [4]:
new_data = data.sample(30000, random_state=42)

In [5]:
new_data.shape

(30000, 6)

In [6]:
new_data.isna().sum()

id              0
qid1            0
qid2            0
question1       0
question2       0
is_duplicate    0
dtype: int64

In [7]:
new_data.duplicated().sum()

0

## Custom Features

In [8]:
# length of questions

new_data['q1_len'] = new_data['question1'].str.len()
new_data['q2_len'] = new_data['question2'].str.len()

In [9]:
# number of words in questions

new_data['q1_word_count'] =  new_data['question1'].apply(lambda x: len(x.split()))
new_data['q2_word_count'] =  new_data['question2'].apply(lambda x: len(x.split()))

In [10]:
# number of common words between question pairs

def common_words(row):
    w1 = set(row['question1'].lower().strip().split())
    w2 = set(row['question2'].lower().split())

    return len(w1.intersection(w2))

new_data['common_word_count'] = new_data.apply(common_words, axis=1)

In [11]:
# total number of words in question pairs

def length_words(row):
    w1 = row['question1'].lower().strip().split()
    w2 = row['question2'].lower().strip().split()

    return (len(w1) + len(w2))

new_data['total_word_count'] = new_data.apply(length_words, axis=1)

In [12]:
# word share in question pairs

new_data['word_share'] = round(new_data['common_word_count']/new_data['total_word_count'], 2)

In [13]:
new_data.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_len,q2_len,q1_word_count,q2_word_count,common_word_count,total_word_count,word_share
120567,120567,238932,238933,How does the Boggart work?,What would the boggart of a boggart be?,0,26,39,5,8,2,13,0.15
324466,324466,636476,636477,What is difference between project manager and...,What are the differences between project manag...,0,63,76,9,10,4,19,0.21
398558,398558,778728,778729,What hotel in Jabalpur would be safe for unmar...,What hotel in Allahabad would be safe for unma...,0,124,125,20,20,18,40,0.45
339914,339914,666314,666315,What is stronger - Super Saiyan 4 or Super Sai...,How does Gohan turn into Super Saiyan 2?,0,54,40,11,8,2,19,0.11
185732,185732,366764,366765,How do I fill in Address Line 1 and Address Li...,How do I register desired web address?,0,51,38,12,7,3,19,0.16


In [14]:
from scipy.sparse import csr_matrix

custom_features = new_data[['q1_len','q2_len',
                            'q1_word_count','q2_word_count',
                            'common_word_count','total_word_count',
                            'word_share']].values

# Convert to sparse for compatibility with hstack
custom_sparse = csr_matrix(custom_features)
print("custom features shape:", custom_sparse.shape)


custom features shape: (30000, 7)


## Sentence Transformer

In [15]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')

q1_embeddings = model.encode(new_data['question1'].astype(str).tolist(), show_progress_bar=True)
q2_embeddings = model.encode(new_data['question2'].astype(str).tolist(), show_progress_bar=True)

semantic_features = np.hstack([q1_embeddings, q2_embeddings])

# Convert semantic features to sparse for compatibility
semantic_sparse = csr_matrix(semantic_features)

print("semantic features shape:", semantic_sparse.shape)

  from tqdm.autonotebook import tqdm, trange
Batches: 100%|██████████| 938/938 [04:32<00:00,  3.44it/s]
Batches: 100%|██████████| 938/938 [01:50<00:00,  8.50it/s]


semantic features shape: (30000, 768)


* sentence transformer with all-MiniLM-L6-v2 gives 384 dimension embeddings
* so each q1 and q2 will have 384 dimensions
* Therefore, semantic_features will have 768 dimensions for each question pair

## Tfidf Vectorization

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, csr_matrix

tfidf = TfidfVectorizer(max_features=5000)
tfidf.fit(new_data['question1'].astype(str).tolist() + new_data['question2'].astype(str).tolist())

q1_tfidf = tfidf.transform(new_data['question1'].astype(str).tolist())
q2_tfidf = tfidf.transform(new_data['question2'].astype(str).tolist())

# Concatenate sparse matrices
lexical_features = hstack([q1_tfidf, q2_tfidf])
print("lexical features shape:", lexical_features.shape)

lexical features shape: (30000, 10000)


* tfidf is created with max_features=5000 for each question, so total 10000 features

In [17]:
# Final hybrid feature matrix
X = hstack([semantic_sparse, lexical_features, custom_sparse])
y = new_data['is_duplicate'].values


In [18]:
print(X.shape)
print(y.shape)

(30000, 10775)
(30000,)


* no. of features are: 10000 + (384+384) + 7 = 10775

## Splitting the data

In [19]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [20]:
print(X_train.shape, y_train.shape)  
print(X_test.shape, y_test.shape) 

(24000, 10775) (24000,)
(6000, 10775) (6000,)


## Model Building

In [21]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

xgb = XGBClassifier(n_estimators=300,max_depth=6, learning_rate=0.1)
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')

Accuracy: 0.8227


In [23]:
from sklearn.ensemble import RandomForestClassifier


rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
y_pred_rfc = rfc.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred_rfc))

Accuracy: 0.7481666666666666


## so the best model is XGB classifier with accuracy 82%