In [1]:
# If you're in a fresh environment, uncomment installs:
# !pip install numpy pandas scikit-learn xgboost nltk bs4 fuzzywuzzy[speedup] python-Levenshtein gensim tensorflow matplotlib seaborn

import numpy as np
import pandas as pd
import re
from bs4 import BeautifulSoup
import warnings
warnings.filterwarnings('ignore')

# Viz
import matplotlib.pyplot as plt
import seaborn as sns

# Sklearn
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Extra models
from xgboost import XGBClassifier

# Text utils
from fuzzywuzzy import fuzz
import distance

# NLTK & stemming
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

STOP_WORDS = set(stopwords.words('english'))
STEMMER = PorterStemmer()

# Gensim for Word2Vec
import gensim
from gensim.models import Word2Vec

# Deep learning
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers, models, callbacks

import pickle


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Load your data (same schema as Quora Kaggle)
df = pd.read_csv('train_quora.csv')

# Optional: speed up with a sample
new_df = df.sample(400000, random_state=2).reset_index(drop=True)

new_df.head()


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,398782,496695,532029,What is the best marketing automation tool for...,What is the best marketing automation tool for...,1
1,115086,187729,187730,I am poor but I want to invest. What should I do?,I am quite poor and I want to be very rich. Wh...,0
2,327711,454161,454162,I am from India and live abroad. I met a guy f...,T.I.E.T to Thapar University to Thapar Univers...,0
3,367788,498109,491396,Why do so many people in the U.S. hate the sou...,My boyfriend doesnt feel guilty when he hurts ...,0
4,151235,237843,50930,Consequences of Bhopal gas tragedy?,What was the reason behind the Bhopal gas trag...,0


In [3]:
new_df.shape

(400000, 6)

In [4]:
def decontract_and_clean(q: str) -> str:
    q = str(q).lower().strip()
    q = q.replace('%', ' percent').replace('$', ' dollar ').replace('₹', ' rupee ').replace('€', ' euro ').replace('@', ' at ')
    q = q.replace('[math]', '')
    q = q.replace(',000,000,000 ', 'b ').replace(',000,000 ', 'm ').replace(',000 ', 'k ')
    q = re.sub(r'([0-9]+)000000000', r'\1b', q)
    q = re.sub(r'([0-9]+)000000', r'\1m', q)
    q = re.sub(r'([0-9]+)000', r'\1k', q)

    contractions = {
        "ain't":"am not","aren't":"are not","can't":"can not","can't've":"can not have","'cause":"because",
        "could've":"could have","couldn't":"could not","couldn't've":"could not have","didn't":"did not",
        "doesn't":"does not","don't":"do not","hadn't":"had not","hadn't've":"had not have","hasn't":"has not",
        "haven't":"have not","he'd":"he would","he'd've":"he would have","he'll":"he will","he'll've":"he will have",
        "he's":"he is","how'd":"how did","how'd'y":"how do you","how'll":"how will","how's":"how is",
        "i'd":"i would","i'd've":"i would have","i'll":"i will","i'll've":"i will have","i'm":"i am","i've":"i have",
        "isn't":"is not","it'd":"it would","it'd've":"it would have","it'll":"it will","it'll've":"it will have",
        "it's":"it is","let's":"let us","ma'am":"madam","mayn't":"may not","might've":"might have","mightn't":"might not",
        "mightn't've":"might not have","must've":"must have","mustn't":"must not","mustn't've":"must not have",
        "needn't":"need not","needn't've":"need not have","o'clock":"of the clock","oughtn't":"ought not",
        "oughtn't've":"ought not have","shan't":"shall not","sha'n't":"shall not","shan't've":"shall not have",
        "she'd":"she would","she'd've":"she would have","she'll":"she will","she'll've":"she will have","she's":"she is",
        "should've":"should have","shouldn't":"should not","shouldn't've":"should not have","so've":"so have","so's":"so as",
        "that'd":"that would","that'd've":"that would have","that's":"that is","there'd":"there would","there'd've":"there would have",
        "there's":"there is","they'd":"they would","they'd've":"they would have","they'll":"they will","they'll've":"they will have",
        "they're":"they are","they've":"they have","to've":"to have","wasn't":"was not","we'd":"we would","we'd've":"we would have",
        "we'll":"we will","we'll've":"we will have","we're":"we are","we've":"we have","weren't":"were not",
        "what'll":"what will","what'll've":"what will have","what're":"what are","what's":"what is","what've":"what have",
        "when's":"when is","when've":"when have","where'd":"where did","where's":"where is","where've":"where have",
        "who'll":"who will","who'll've":"who will have","who's":"who is","who've":"who have","why's":"why is","why've":"why have",
        "will've":"will have","won't":"will not","won't've":"will not have","would've":"would have","wouldn't":"would not",
        "wouldn't've":"would not have","y'all":"you all","y'all'd":"you all would","y'all'd've":"you all would have",
        "y'all're":"you all are","y'all've":"you all have","you'd":"you would","you'd've":"you would have",
        "you'll":"you will","you'll've":"you will have","you're":"you are","you've":"you have"
    }
    q = ' '.join([contractions.get(w, w) for w in q.split()])
    q = q.replace("'ve"," have").replace("n't"," not").replace("'re"," are").replace("'ll"," will")

    q = BeautifulSoup(q, "html.parser").get_text()
    q = re.sub(r'\W', ' ', q).strip()
    return q

def stem_text(q: str) -> str:
    return ' '.join(STEMMER.stem(w) for w in q.split() if w and w not in STOP_WORDS)

# Apply preprocess + stemming
new_df['question1'] = new_df['question1'].apply(lambda x: stem_text(decontract_and_clean(x)))
new_df['question2'] = new_df['question2'].apply(lambda x: stem_text(decontract_and_clean(x)))

new_df[['question1','question2','is_duplicate']].head()


Unnamed: 0,question1,question2,is_duplicate
0,best market autom tool small mid size compani,best market autom tool small mid size compani,1
1,poor want invest,quit poor want rich,0
2,india live abroad met guy franc parti want date,e thapar univers thapar univers institut engin...,0
3,mani peopl u hate southern state,boyfriend doesnt feel guilti hurt cri tell kil...,0
4,consequ bhopal ga tragedi,reason behind bhopal ga tragedi,0


In [5]:
def common_words(q1, q2):
    w1 = set(q1.split()); w2 = set(q2.split())
    return len(w1 & w2)

def total_words(q1, q2):
    w1 = set(q1.split()); w2 = set(q2.split())
    return len(w1) + len(w2)

def token_features(q1, q2):
    SAFE_DIV = 1e-4
    q1_t, q2_t = q1.split(), q2.split()
    if not q1_t or not q2_t: return [0.]*8

    q1_words = set([w for w in q1_t if w not in STOP_WORDS])
    q2_words = set([w for w in q2_t if w not in STOP_WORDS])
    q1_stops = set([w for w in q1_t if w in STOP_WORDS])
    q2_stops = set([w for w in q2_t if w in STOP_WORDS])

    c_words = len(q1_words & q2_words)
    c_stops = len(q1_stops & q2_stops)
    c_tokens = len(set(q1_t) & set(q2_t))

    feats = [0.]*8
    feats[0] = c_words / (min(len(q1_words), len(q2_words)) + SAFE_DIV)
    feats[1] = c_words / (max(len(q1_words), len(q2_words)) + SAFE_DIV)
    feats[2] = c_stops / (min(len(q1_stops), len(q2_stops)) + SAFE_DIV)
    feats[3] = c_stops / (max(len(q1_stops), len(q2_stops)) + SAFE_DIV)
    feats[4] = c_tokens / (min(len(q1_t), len(q2_t)) + SAFE_DIV)
    feats[5] = c_tokens / (max(len(q1_t), len(q2_t)) + SAFE_DIV)
    feats[6] = int(q1_t[-1] == q2_t[-1])
    feats[7] = int(q1_t[0] == q2_t[0])
    return feats

def length_features(q1, q2):
    q1_t, q2_t = q1.split(), q2.split()
    if not q1_t or not q2_t: return [0.,0.,0.]
    f0 = abs(len(q1_t) - len(q2_t))
    f1 = (len(q1_t) + len(q2_t)) / 2
    lcs = list(distance.lcsubstrings(q1, q2))
    f2 = (len(lcs[0]) / (min(len(q1), len(q2)) + 1.0)) if lcs else 0.0
    return [f0, f1, f2]

def fuzzy_features(q1, q2):
    return [
        fuzz.QRatio(q1, q2),
        fuzz.partial_ratio(q1, q2),
        fuzz.token_sort_ratio(q1, q2),
        fuzz.token_set_ratio(q1, q2),
    ]

# Build engineered feature matrix
feats = []
for q1, q2 in zip(new_df['question1'], new_df['question2']):
    row = []
    row += [len(q1), len(q2), len(q1.split()), len(q2.split()),
            common_words(q1, q2), total_words(q1, q2)]
    row += token_features(q1, q2)
    row += length_features(q1, q2)
    row += fuzzy_features(q1, q2)
    feats.append(row)

feat_cols = [
    'q1_len','q2_len','q1_num_words','q2_num_words','word_common','word_total',
    'cwc_min','cwc_max','csc_min','csc_max','ctc_min','ctc_max','last_word_eq','first_word_eq',
    'abs_len_diff','mean_len','longest_substr_ratio',
    'fuzz_ratio','fuzz_partial_ratio','token_sort_ratio','token_set_ratio'
]
engineered_X = pd.DataFrame(feats, columns=feat_cols, index=new_df.index)
y = new_df['is_duplicate'].values

engineered_X.shape, y.shape


((400000, 21), (400000,))

In [6]:
from scipy.sparse import hstack

# Prepare raw texts
q1_texts = new_df['question1'].tolist()
q2_texts = new_df['question2'].tolist()

# -------------------------------
# CountVectorizer
# -------------------------------
cv = CountVectorizer(max_features=4000, ngram_range=(1,2))
q1_bow = cv.fit_transform(q1_texts)
q2_bow = cv.transform(q2_texts)

# Keep sparse, don't use .toarray()
bow_X = hstack([q1_bow, q2_bow])

# -------------------------------
# TF-IDF
# -------------------------------
tfidf = TfidfVectorizer(max_features=4000, ngram_range=(1,2))
q1_tfidf = tfidf.fit_transform(q1_texts)
q2_tfidf = tfidf.transform(q2_texts)

# Keep sparse
tfidf_X = hstack([q1_tfidf, q2_tfidf])

# -------------------------------
# Combine engineered features + TF-IDF
# Convert engineered features (pandas DataFrame) to sparse first
# -------------------------------
from scipy.sparse import csr_matrix

engineered_sparse = csr_matrix(engineered_X.values)

# Combine (engineered + tfidf)
combined_X = hstack([engineered_sparse, tfidf_X])

# Check shapes
print("Engineered:", engineered_X.shape)
print("TF-IDF:", tfidf_X.shape)
print("Combined:", combined_X.shape)


Engineered: (400000, 21)
TF-IDF: (400000, 8000)
Combined: (400000, 8021)


In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    combined_X, y, test_size=0.2, random_state=42, stratify=y
)
X_train.shape, X_test.shape


((320000, 8021), (80000, 8021))

In [8]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

# Faster SVM variant for text data
svc = LinearSVC(max_iter=2000, random_state=42)

# Only C is tuned (kernel & gamma not needed for LinearSVC)
param_grid = {
    'C': [0.5, 1, 2, 5]
}

cv5 = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid = GridSearchCV(svc, param_grid, scoring='f1', cv=cv5, n_jobs=-1, verbose=1)

# Train
grid.fit(X_train, y_train)

print("Best params:", grid.best_params_)
print("Best CV F1:", grid.best_score_)

# Best model
svm_best = grid.best_estimator_
pred = svm_best.predict(X_test)

print("LinearSVC Accuracy:", accuracy_score(y_test, pred))
print("LinearSVC F1:", f1_score(y_test, pred))
print(confusion_matrix(y_test, pred))


Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best params: {'C': 0.5}
Best CV F1: 0.7073613273500656
LinearSVC Accuracy: 0.79325
LinearSVC F1: 0.7115854083839018
[[43056  7404]
 [ 9136 20404]]


In [9]:
# Logistic Regression on combined features
logit = LogisticRegression(max_iter=2000, n_jobs=-1 if hasattr(LogisticRegression,'n_jobs') else None)
scores = cross_val_score(logit, X_train, y_train, cv=5, scoring='f1', n_jobs=-1)
print("LogReg CV F1:", scores.mean())
logit.fit(X_train, y_train)
print("LogReg Test F1:", f1_score(y_test, logit.predict(X_test)))

# XGBoost quick baseline
xgb = XGBClassifier(
    n_estimators=300, max_depth=6, learning_rate=0.1,
    subsample=0.8, colsample_bytree=0.8, eval_metric='logloss', n_jobs=-1
)
xgb.fit(X_train, y_train)
pred_xgb = xgb.predict(X_test)
print("XGB Acc:", accuracy_score(y_test, pred_xgb))
print("XGB F1 :", f1_score(y_test, pred_xgb))


LogReg CV F1: 0.6957645936184196
LogReg Test F1: 0.6973252652810664
XGB Acc: 0.798775
XGB F1 : 0.718841693446975


In [10]:
# Train a Word2Vec model on both q1 & q2 tokens
sentences = [q.split() for q in (q1_texts + q2_texts)]
w2v_model = Word2Vec(sentences, vector_size=200, window=5, min_count=2, workers=4, sg=1, epochs=10)

# Helper: average Word2Vec for a sentence
def sentence_avg_w2v(sent, model, size=200):
    words = [w for w in sent.split() if w in model.wv]
    if not words: return np.zeros(size)
    return np.mean(model.wv[words], axis=0)

# Build Avg W2V features
q1_w2v = np.vstack([sentence_avg_w2v(s, w2v_model, 200) for s in q1_texts])
q2_w2v = np.vstack([sentence_avg_w2v(s, w2v_model, 200) for s in q2_texts])
avgw2v_X = np.hstack([q1_w2v, q2_w2v])

# TF-IDF weights (from the TF-IDF vectorizer built earlier)
idf = dict(zip(tfidf.get_feature_names_out(), tfidf.idf_))

def sentence_tfidf_w2v(sent, model, idf_dict, size=200):
    words = [w for w in sent.split() if w in model.wv]
    if not words:
        return np.zeros(size)
    weights, vecs = [], []
    for w in words:
        weights.append(idf_dict.get(w, 1.0))
        vecs.append(model.wv[w])
    weights = np.array(weights)
    vecs = np.array(vecs)
    return np.average(vecs, axis=0, weights=weights)

q1_tw2v = np.vstack([sentence_tfidf_w2v(s, w2v_model, idf, 200) for s in q1_texts])
q2_tw2v = np.vstack([sentence_tfidf_w2v(s, w2v_model, idf, 200) for s in q2_texts])
tfidf_w2v_X = np.hstack([q1_tw2v, q2_tw2v])

# Combine engineered + tfidf_w2v (often strong)
X_mix_w2v = np.hstack([engineered_X.values, tfidf_w2v_X])

X_train_w2v, X_test_w2v, y_train_w2v, y_test_w2v = train_test_split(
    X_mix_w2v, y, test_size=0.2, random_state=42, stratify=y
)

logit2 = LogisticRegression(max_iter=2000)
logit2.fit(X_train_w2v, y_train_w2v)
pred2 = logit2.predict(X_test_w2v)
print("TFIDF-W2V + Engineered — LogReg Acc:", accuracy_score(y_test_w2v, pred2))
print("TFIDF-W2V + Engineered — LogReg F1 :", f1_score(y_test_w2v, pred2))


TFIDF-W2V + Engineered — LogReg Acc: 0.7574125
TFIDF-W2V + Engineered — LogReg F1 : 0.6563368808769102


In [11]:
# Build a shared tokenizer over both questions
all_texts = q1_texts + q2_texts
max_words = 40000
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(all_texts)

seq_q1 = tokenizer.texts_to_sequences(q1_texts)
seq_q2 = tokenizer.texts_to_sequences(q2_texts)

max_len = 30  # adjust if you want
X1 = pad_sequences(seq_q1, maxlen=max_len, padding='post', truncating='post')
X2 = pad_sequences(seq_q2, maxlen=max_len, padding='post', truncating='post')

X_train_nn_q1, X_test_nn_q1, y_train_nn, y_test_nn = train_test_split(X1, y, test_size=0.2, random_state=42, stratify=y)
X_train_nn_q2, X_test_nn_q2, _, _ = train_test_split(X2, y, test_size=0.2, random_state=42, stratify=y)

vocab_size = min(max_words, len(tokenizer.word_index) + 1)
vocab_size, X1.shape, X2.shape


(40000, (400000, 30), (400000, 30))

In [12]:
# Build Embedding + Average + Dense
embed_dim = 100

inp1 = layers.Input(shape=(max_len,))
inp2 = layers.Input(shape=(max_len,))

emb = layers.Embedding(vocab_size, embed_dim, input_length=max_len)

x1 = layers.GlobalAveragePooling1D()(emb(inp1))
x2 = layers.GlobalAveragePooling1D()(emb(inp2))

x = layers.Concatenate()([x1, x2])
x = layers.Dense(128, activation='relu')(x)
x = layers.Dropout(0.3)(x)
out = layers.Dense(1, activation='sigmoid')(x)

ann_model = models.Model([inp1, inp2], out)
ann_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

es = callbacks.EarlyStopping(patience=2, restore_best_weights=True, monitor='val_accuracy')
hist = ann_model.fit(
    [X_train_nn_q1, X_train_nn_q2], y_train_nn,
    validation_split=0.1, epochs=6, batch_size=512, callbacks=[es], verbose=1
)

ann_eval = ann_model.evaluate([X_test_nn_q1, X_test_nn_q2], y_test_nn, verbose=0)
print("ANN Test — Acc:", ann_eval[1])


Epoch 1/6
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 80ms/step - accuracy: 0.7081 - loss: 0.5620 - val_accuracy: 0.7447 - val_loss: 0.5182
Epoch 2/6
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 79ms/step - accuracy: 0.7490 - loss: 0.5060 - val_accuracy: 0.7529 - val_loss: 0.5063
Epoch 3/6
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 79ms/step - accuracy: 0.7636 - loss: 0.4809 - val_accuracy: 0.7608 - val_loss: 0.4897
Epoch 4/6
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 81ms/step - accuracy: 0.7801 - loss: 0.4501 - val_accuracy: 0.7594 - val_loss: 0.4881
Epoch 5/6
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 81ms/step - accuracy: 0.7901 - loss: 0.4300 - val_accuracy: 0.7633 - val_loss: 0.4908
Epoch 6/6
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 83ms/step - accuracy: 0.8021 - loss: 0.4081 - val_accuracy: 0.7574 - val_loss: 0.4950
ANN Test — Acc: 0.7654

In [13]:
embed_dim = 100

inp1 = layers.Input(shape=(max_len,))
inp2 = layers.Input(shape=(max_len,))
emb = layers.Embedding(vocab_size, embed_dim, input_length=max_len)

def cnn_branch(x):
    x = emb(x)
    x1 = layers.Conv1D(128, 3, activation='relu')(x)
    x1 = layers.GlobalMaxPooling1D()(x1)
    x2 = layers.Conv1D(128, 4, activation='relu')(x)
    x2 = layers.GlobalMaxPooling1D()(x2)
    x3 = layers.Conv1D(128, 5, activation='relu')(x)
    x3 = layers.GlobalMaxPooling1D()(x3)
    x = layers.Concatenate()([x1, x2, x3])
    return x

b1 = cnn_branch(inp1)
b2 = cnn_branch(inp2)
x = layers.Concatenate()([b1, b2])
x = layers.Dense(256, activation='relu')(x)
x = layers.Dropout(0.4)(x)
out = layers.Dense(1, activation='sigmoid')(x)

cnn_model = models.Model([inp1, inp2], out)
cnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

es = callbacks.EarlyStopping(patience=2, restore_best_weights=True, monitor='val_accuracy')
cnn_model.fit([X_train_nn_q1, X_train_nn_q2], y_train_nn,
              validation_split=0.1, epochs=6, batch_size=256, callbacks=[es], verbose=1)

cnn_eval = cnn_model.evaluate([X_test_nn_q1, X_test_nn_q2], y_test_nn, verbose=0)
print("CNN Test — Acc:", cnn_eval[1])


Epoch 1/6
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m170s[0m 149ms/step - accuracy: 0.7567 - loss: 0.4936 - val_accuracy: 0.7893 - val_loss: 0.4463
Epoch 2/6
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m167s[0m 148ms/step - accuracy: 0.8284 - loss: 0.3734 - val_accuracy: 0.8007 - val_loss: 0.4346
Epoch 3/6
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m171s[0m 152ms/step - accuracy: 0.8820 - loss: 0.2685 - val_accuracy: 0.7999 - val_loss: 0.4861
Epoch 4/6
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m174s[0m 154ms/step - accuracy: 0.9185 - loss: 0.1904 - val_accuracy: 0.7941 - val_loss: 0.5702
CNN Test — Acc: 0.8020125031471252


In [14]:
embed_dim = 100

inp1 = layers.Input(shape=(max_len,))
inp2 = layers.Input(shape=(max_len,))
emb = layers.Embedding(vocab_size, embed_dim, input_length=max_len, mask_zero=False)

def lstm_branch(x):
    x = emb(x)
    x = layers.Bidirectional(layers.LSTM(128, return_sequences=False))(x)
    return x

b1 = lstm_branch(inp1)
b2 = lstm_branch(inp2)
x = layers.Concatenate()([b1, b2])
x = layers.Dense(128, activation='relu')(x)
x = layers.Dropout(0.4)(x)
out = layers.Dense(1, activation='sigmoid')(x)

lstm_model = models.Model([inp1, inp2], out)
lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

es = callbacks.EarlyStopping(patience=2, restore_best_weights=True, monitor='val_accuracy')
lstm_model.fit([X_train_nn_q1, X_train_nn_q2], y_train_nn,
               validation_split=0.1, epochs=6, batch_size=256, callbacks=[es], verbose=1)

lstm_eval = lstm_model.evaluate([X_test_nn_q1, X_test_nn_q2], y_test_nn, verbose=0)
print("BiLSTM Test — Acc:", lstm_eval[1])


Epoch 1/6
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m373s[0m 324ms/step - accuracy: 0.7497 - loss: 0.5044 - val_accuracy: 0.7745 - val_loss: 0.4678
Epoch 2/6
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m357s[0m 317ms/step - accuracy: 0.8010 - loss: 0.4207 - val_accuracy: 0.7850 - val_loss: 0.4480
Epoch 3/6
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m315s[0m 280ms/step - accuracy: 0.8304 - loss: 0.3647 - val_accuracy: 0.7835 - val_loss: 0.4598
Epoch 4/6
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m288s[0m 256ms/step - accuracy: 0.8529 - loss: 0.3201 - val_accuracy: 0.7916 - val_loss: 0.4753
Epoch 5/6
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m292s[0m 260ms/step - accuracy: 0.8698 - loss: 0.2846 - val_accuracy: 0.7883 - val_loss: 0.5110
Epoch 6/6
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m284s[0m 252ms/step - accuracy: 0.8845 - loss: 0.2543 - val_accuracy: 0.7902 - val_loss: 0.547

In [15]:
# Best classical model (example: SVM from grid search)
with open('model_quora.pkl', 'wb') as f:
    pickle.dump(svm_best, f)

with open('tfidf.pkl', 'wb') as f:
    pickle.dump(tfidf, f)

with open('cv.pkl', 'wb') as f:
    pickle.dump(cv, f)

# Save tokenizer & DL models if you want Streamlit to use them later
tokenizer_json = tokenizer.to_json()
with open('tokenizer.json', 'w', encoding='utf-8') as f:
    f.write(tokenizer_json)

ann_model.save('ann_model.h5')
cnn_model.save('cnn_model.h5')
lstm_model.save('bilstm_model.h5')

print("Saved: model_quora.pkl, tfidf.pkl, cv.pkl, tokenizer.json, ann_model.h5, cnn_model.h5, bilstm_model.h5")




Saved: model_quora.pkl, tfidf.pkl, cv.pkl, tokenizer.json, ann_model.h5, cnn_model.h5, bilstm_model.h5
