In [124]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\COMPUMARTS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\COMPUMARTS\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
!pip install xgboost

In [225]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

data = pd.read_csv('dataset.csv')

def preprocess_text(text):
    return ' '.join([WordNetLemmatizer().lemmatize(word) for word in text.lower().split() if word not in stopwords.words('english')]) if isinstance(text, str) else ""
    
data['question1'] = data['question1'].apply(preprocess_text)
data['question2'] = data['question2'].apply(preprocess_text)

X = data[['question1', 'question2']]
y = data['is_duplicate']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

train_pairs_q1 = X_train['question1']
train_pairs_q2 = X_train['question2']
test_pairs_q1 = X_test['question1']
test_pairs_q2 = X_test['question2']

vectorizer = TfidfVectorizer(ngram_range=(1, 2))
tfidf_train_q1 = vectorizer.fit_transform(train_pairs_q1)
tfidf_train_q2 = vectorizer.transform(train_pairs_q2)
tfidf_test_q1 = vectorizer.transform(test_pairs_q1)
tfidf_test_q2 = vectorizer.transform(test_pairs_q2)

cosine_sim_train = cosine_similarity(tfidf_train_q1, tfidf_train_q2).diagonal()
cosine_sim_test = cosine_similarity(tfidf_test_q1, tfidf_test_q2).diagonal()

X_train_cosine = cosine_sim_train.reshape(-1, 1)
X_test_cosine = cosine_sim_test.reshape(-1, 1)

models = {
    'Support Vector Classifier': SVC(C=1),
    'Random Forest Classifier': RandomForestClassifier(n_estimators=89, min_samples_split=2, min_samples_leaf=1),
    'Gradient Boosting Classifier': GradientBoostingClassifier(n_estimators=45, learning_rate=0.3),
    'XGBoost Classifier': XGBClassifier(n_estimators=12, learning_rate=0.3)
}

results = {}
for name, model in models.items():
    model.fit(X_train_cosine, y_train)
    y_pred = model.predict(X_test_cosine)
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = accuracy
print("Accuracy:")
for name, accuracy in results.items():
    print(f"{name}: {accuracy:.2f}")


Accuracy:
Support Vector Classifier: 0.72
Random Forest Classifier: 0.66
Gradient Boosting Classifier: 0.70
XGBoost Classifier: 0.64
