In [1]:
import pandas as pd

df = pd.read_csv("train.csv")
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [2]:
print(df.shape)
print(df.info())

(404290, 6)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 404290 entries, 0 to 404289
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   id            404290 non-null  int64 
 1   qid1          404290 non-null  int64 
 2   qid2          404290 non-null  int64 
 3   question1     404289 non-null  object
 4   question2     404288 non-null  object
 5   is_duplicate  404290 non-null  int64 
dtypes: int64(4), object(2)
memory usage: 18.5+ MB
None


In [3]:
df["is_duplicate"].value_counts(normalize=True)

is_duplicate
0    0.630802
1    0.369198
Name: proportion, dtype: float64

In [4]:
df["q1_len"] = df["question1"].apply(lambda x : len(str(x).split()))
df["q2_len"] = df["question2"].apply(lambda x : len(str(x).split()))

df[["q1_len","q2_len"]].describe()

Unnamed: 0,q1_len,q2_len
count,404290.0,404290.0
mean,10.94221,11.181991
std,5.428824,6.305246
min,1.0,1.0
25%,7.0,7.0
50%,10.0,10.0
75%,13.0,13.0
max,125.0,237.0


In [5]:
df["len_diff"] = abs(df["q1_len"] - df["q2_len"])
df[["len_diff","is_duplicate"]].groupby("is_duplicate").mean()

Unnamed: 0_level_0,len_diff
is_duplicate,Unnamed: 1_level_1
0,4.439436
1,2.420982


In [6]:
df["q1_word"] = df["question1"].apply(lambda x : str(x).split())
df["q2_word"] = df["question2"].apply(lambda x : str(x).split())
df["common_word"] = df.apply(lambda x: len(set(x["q1_word"]) & set(x["q2_word"])) , axis=1)

In [7]:
df["common_word"].describe()
df[["common_word" , "is_duplicate"]].groupby("is_duplicate").mean()

Unnamed: 0_level_0,common_word
is_duplicate,Unnamed: 1_level_1
0,3.761641
1,5.223572


DATA PREPROSSING


In [8]:
df = df.fillna("")
df.isnull().sum()

id              0
qid1            0
qid2            0
question1       0
question2       0
is_duplicate    0
q1_len          0
q2_len          0
len_diff        0
q1_word         0
q2_word         0
common_word     0
dtype: int64

In [None]:
from bs4 import BeautifulSoup

def html_remove(text):
    return BeautifulSoup(text , "html.parser").get_text()
df["question1"] = df["question1"].apply(html_remove)
df["question2"] = df["question2"].apply(html_remove)

In [None]:
import contractions

def expand(text):
    return contractions.fix(text)

df["question1"] = df["question1"].apply(expand)
df["question2"] = df["question2"].apply(expand)

In [None]:
df["question1"] = df["question1"].str.lower()
df["question2"] = df["question2"].str.lower()

In [None]:
df["question1"] = df["question1"].str.replace(r"[^a-zA-Z0-9\s]" ,"", regex=True).str.replace(r"\s+"," ", regex = True ).str.strip()
df["question2"] = df["question2"].str.replace(r"[^a-zA-Z0-9\s]" , " ", regex=True).str.replace(r"\s+"," ", regex = True ).str.strip()

In [None]:
import spacy
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])  # faster
texts1 = df["question1"].astype(str).tolist()
texts2 = df["question2"].astype(str).tolist()

def stop_and_lemma_pipe(docs):
    for doc in docs:
        yield " ".join([token.lemma_ for token in doc if not token.is_stop and not token.is_punct])

df["question1"] = list(stop_and_lemma_pipe(nlp.pipe(texts1, batch_size=500)))
df["question2"] = list(stop_and_lemma_pipe(nlp.pipe(texts2, batch_size=500)))




In [None]:
df["q1_len"] = df["question1"].apply(lambda x : len(str(x).split()))
df["q2_len"] = df["question2"].apply(lambda x : len(str(x).split()))

df["q1_word"] = df["question1"].apply(lambda x : str(x).split())
df["q2_word"] = df["question2"].apply(lambda x : str(x).split())

df["total_word"] = df.apply(lambda x: len(set(x["q1_word"]) | set(x["q2_word"])) , axis=1)

df["common_word"] = df.apply(lambda x: len(set(x["q1_word"]) & set(x["q2_word"]))/x["total_word"] if x["total_word"] != 0 else 0,  axis=1)

df["len_diff"] = df.apply(lambda x: abs(x["q1_len"] - x["q2_len"]) / x["total_word"] if x["total_word"] != 0 else 0, axis=1)

In [None]:
def jaccard(row):
    q1 = set(row['question1'].split())
    q2 = set(row['question2'].split())
    return len(q1 & q2) / len(q1 | q2) if (len(q1 | q2)) > 0 else 0

df['jaccard'] = df.apply(jaccard, axis=1)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

TfidV = TfidfVectorizer(max_features = 5000)

TfidV_metrix = TfidV.fit_transform(pd.concat([df["question1"] , df["question2"]]))

q1_TfidV = TfidV_metrix[:len(df)]
q2_TfidV = TfidV_metrix[len(df):]

df["cosine_sim"] = [cosine_similarity(q1 ,q2)[0][0] for q1 ,q2 in zip(q1_TfidV , q2_TfidV)]

In [None]:
from fuzzywuzzy import fuzz

df['fuzz_ratio'] = df.apply(lambda x: fuzz.ratio(x['question1'], x['question2']), axis=1)
df['fuzz_partial_ratio'] = df.apply(lambda x: fuzz.partial_ratio(x['question1'], x['question2']), axis=1)
df['fuzz_token_sort_ratio'] = df.apply(lambda x: fuzz.token_sort_ratio(x['question1'], x['question2']), axis=1)




In [None]:
import torch
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')

q1_list = df['question1'].astype(str).tolist()
q2_list = df['question2'].astype(str).tolist()

# Encode in batches, on GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"

q1_embeddings = model.encode(q1_list, convert_to_tensor=True, batch_size=64, device=device, show_progress_bar=True)
q2_embeddings = model.encode(q2_list, convert_to_tensor=True, batch_size=64, device=device, show_progress_bar=True)

# Compute diagonal cosine similarity batch-wise
batch_size = 4096  # adjust depending on GPU memory
cos_sim_list = []

for start_idx in range(0, len(q1_embeddings), batch_size):
    end_idx = min(start_idx + batch_size, len(q1_embeddings))
    q1_batch = q1_embeddings[start_idx:end_idx]
    q2_batch = q2_embeddings[start_idx:end_idx]
    
    # element-wise cosine similarity
    sim = torch.sum(q1_batch * q2_batch, dim=1) / (
        torch.norm(q1_batch, dim=1) * torch.norm(q2_batch, dim=1)
    )
    cos_sim_list.append(sim)

cos_sim = torch.cat(cos_sim_list)

# Move to CPU and save to dataframe
df['cosine_sim_BERT'] = cos_sim.cpu().numpy()


  from .autonotebook import tqdm as notebook_tqdm
Batches: 100%|██████████| 6318/6318 [01:35<00:00, 65.95it/s]
Batches: 100%|██████████| 6318/6318 [01:33<00:00, 67.34it/s]


In [None]:
df.head(1)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_len,q2_len,len_diff,q1_word,q2_word,common_word,total_word,jaccard,cosine_sim,fuzz_ratio,fuzz_partial_ratio,fuzz_token_sort_ratio,cosine_sim_BERT
0,0,1,2,step step guide invest share market india,step step guide invest share market,0,7,6,0.166667,"[step, step, guide, invest, share, market, india]","[step, step, guide, invest, share, market]",0.833333,6,0.833333,0.978943,92,100,92,0.915797


HYPERPARAMETER TUNING


In [None]:
feature_col = ['q1_len', 'q2_len', 'len_diff',  'common_word', 'jaccard', 'cosine_sim', 'fuzz_ratio','fuzz_partial_ratio', 'fuzz_token_sort_ratio', 'cosine_sim_BERT']
X = df[feature_col]
y = df["is_duplicate"]

In [None]:
from sklearn.model_selection import train_test_split
# x_tune,x_test,y_tune,y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
# #logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# param_grid_lr = {
#     "C" : [0.01 , 0.1 , 1 , 10],
#     "penalty" : ["l2"],
#     "solver" : ["lbfgs" , "saga"]
# }

# lr = LogisticRegression(max_iter=1000)
# Grid_lr = GridSearchCV(lr , param_grid_lr , verbose= 2 , cv=3 , scoring="f1" , n_jobs= -1)
# Grid_lr.fit(x_tune,y_tune)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV

# param_dist_rf = {
#     "n_estimators" : randint(100 , 500),
#     "max_depth" : randint(5,20),
#     "min_samples_split" : randint(2,10),
#     "min_samples_leaf" : randint(1,5)
# }

# rf = RandomForestClassifier(random_state=42)
# rand_rf = RandomizedSearchCV(rf , param_dist_rf , n_iter=20 , cv=3 , scoring="f1" , n_jobs= -1 , verbose= 2)
# rand_rf.fit(x_tune,y_tune)

In [None]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

# param_grid_xgb = {
#     "C" : [100 , 200 , 300],
#     "max_depth" : [3 , 5 ,7],
#     "learning_rate" : [0.01 , 0.1 , 0.2],
#     "subsamples" : [0.7 , 0.8 , 1.0]
# }

# xgb_m = xgb.XGBClassifier(use_label_encoder = False , eval_metric = "logloss")
# grid_xgb = GridSearchCV(xgb_m , param_grid_xgb , cv= 3 , verbose= 2, n_jobs= - 1)
# grid_xgb.fit(x_tune,y_tune)


In [None]:
from sklearn.svm import SVC

# param_grid_svm = {
#     'C': [0.1, 1, 10],
#     'kernel': ['linear', 'rbf'],
#     'gamma': ['scale', 'auto']
# }
# svm = SVC(probability=True)
# grid_svm = GridSearchCV(svm, param_grid_svm, cv=3, scoring='f1', n_jobs=-1, verbose=2)
# grid_svm.fit(x_tune, y_tune)


In [None]:
from sklearn.neural_network import MLPClassifier

# param_grid_mlp = {
#     'hidden_layer_sizes': [(64,32), (128,64), (128,64,32)],
#     'activation': ['relu', 'tanh'],
#     'solver': ['adam', 'sgd'],
#     'alpha': [0.0001, 0.001],
#     'learning_rate': ['constant', 'adaptive']
# }

# mlp = MLPClassifier(max_iter=500, random_state=42)
# grid_mlp = GridSearchCV(mlp, param_grid_mlp, cv=3, scoring='f1', n_jobs=-1, verbose=2)
# grid_mlp.fit(x_tune, y_tune)


In [None]:
x_train,x_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
final_lr = LogisticRegression( max_iter=1000)
final_lr.fit(x_train, y_train)

In [None]:
final_rf = RandomForestClassifier( random_state=42)
final_rf.fit(x_train, y_train)

In [None]:
final_mlp = MLPClassifier( max_iter=500, random_state=42)
final_mlp.fit(x_train, y_train)

In [None]:
# final_lr = LogisticRegression( max_iter=1000)
# final_lr.fit(x_train, y_train)

# final_rf = RandomForestClassifier( random_state=42)
# final_rf.fit(x_train, y_train)

# final_xgb = xgb.XGBClassifier( use_label_encoder=False, eval_metric='logloss')
# final_xgb.fit(x_train, y_train)

# final_svm = SVC( probability=True)
# final_svm.fit(x_train, y_train)

# final_mlp = MLPClassifier( max_iter=500, random_state=42)
# final_mlp.fit(x_train, y_train)



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [None]:
from sklearn.metrics import classification_report
models = {
    "Logistic Regression": final_lr,
    "Random Forest": final_rf,
    "MLP": final_mlp
}

# Generate reports
for name, model in models.items():
    y_pred = model.predict(x_test)
    print(f"\n--- {name} ---")
    print(classification_report(y_test, y_pred))

In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

# Define base learners
estimators = [
    ('lr', final_lr),
    ('rf', final_rf),
    ('xgb', final_mlp)
]

# Meta learner
stack_model = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(max_iter=1000),
    cv=5,              # 5-fold cross-validation for stacking
    n_jobs=-1,
    passthrough=False  # if True, meta-learner also gets original features
)

# Train on full dataset
stack_model.fit(x_test ,y_test)

In [None]:
import joblib

joblib.dump(stack_model, "stacked_final_model.pkl")