In [1]:
import pandas as pd

# --- Load Data ---
questions = pd.read_csv("/kaggle/input/stackoverflow-dataset/Questions.csv", encoding='ISO-8859-1')
tags = pd.read_csv("/kaggle/input/stackoverflow-dataset/Tags.csv")
answers = pd.read_csv("/kaggle/input/stackoverflow-2/Answers.csv", encoding='ISO-8859-1')

# --- Clean Answers ---
answers = answers.dropna(subset=["Body"])
answers = answers.sort_values("CreationDate")

# Get only the earliest answer per question
earliest_answers = answers.groupby("ParentId").first().reset_index()
earliest_answers = earliest_answers[["ParentId", "Body"]].rename(columns={"ParentId": "Id", "Body": "Body_answer"})

# --- Rename question body ---
questions.rename(columns={"Body": "Body_question"}, inplace=True)

# --- Merge Answers with Questions ---
questions = pd.merge(questions, earliest_answers, on="Id", how="left")
questions["Body_answer"] = questions["Body_answer"].fillna("")

# --- Combine Text ---
questions["text"] = questions["Title"] + " " + questions["Body_question"] + " " + questions["Body_answer"]


# --- Clean text ---
import re

def clean(text):
    text = re.sub(r"<[^>]+>", " ", text)  # Remove HTML
    text = re.sub(r"\s+", " ", text)      # Collapse whitespace
    text = text.strip()
    return text.lower()

questions["text"] = questions["text"].apply(clean)

# --- Filter to Top 10 Tags ---
top_tags = tags["Tag"].value_counts().nlargest(10).index
filtered_tags = tags[tags["Tag"].isin(top_tags)]

In [2]:
# --- MultiLabel Binarization ---
from sklearn.preprocessing import MultiLabelBinarizer

tag_df = filtered_tags.groupby("Id")["Tag"].apply(list).reset_index()
data = pd.merge(questions, tag_df, on="Id")
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(data["Tag"])

# --- Tokenize ---
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

X_texts = data["text"].tolist()
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X_texts)
X = tokenizer.texts_to_sequences(X_texts)
X = pad_sequences(X, maxlen=150)

# --- Train/Validation Split ---
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Model ---
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dropout, Dense
from tensorflow.keras.metrics import AUC
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

model = Sequential([
    Embedding(input_dim=10000, output_dim=128, input_length=150),
    Bidirectional(LSTM(64, return_sequences=False)),
    Dropout(0.4),
    Dense(y_train.shape[1], activation='sigmoid')
])

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=[AUC(name="auc")]
)

# --- Callbacks ---
early_stop = EarlyStopping(monitor="val_auc", patience=3, mode="max", restore_best_weights=True)

# --- Train ---
history = model.fit(
    X_train, y_train,
    epochs=15,
    batch_size=32,
    validation_data=(X_val, y_val),
    callbacks=[early_stop]
)

2025-06-05 12:57:51.604621: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749128271.798087      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749128271.857312      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
I0000 00:00:1749128481.346528      35 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15513 MB memory:  -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0


Epoch 1/15


I0000 00:00:1749128487.271149     102 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m17659/17659[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m333s[0m 18ms/step - auc: 0.9184 - loss: 0.1806 - val_auc: 0.9828 - val_loss: 0.0918
Epoch 2/15
[1m17659/17659[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m326s[0m 18ms/step - auc: 0.9826 - loss: 0.0915 - val_auc: 0.9844 - val_loss: 0.0876
Epoch 3/15
[1m17659/17659[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m326s[0m 18ms/step - auc: 0.9856 - loss: 0.0839 - val_auc: 0.9846 - val_loss: 0.0870
Epoch 4/15
[1m17659/17659[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m325s[0m 18ms/step - auc: 0.9872 - loss: 0.0791 - val_auc: 0.9837 - val_loss: 0.0878
Epoch 5/15
[1m17659/17659[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m327s[0m 19ms/step - auc: 0.9886 - loss: 0.0745 - val_auc: 0.9835 - val_loss: 0.0886
Epoch 6/15
[1m17659/17659[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m325s[0m 18ms/step - auc: 0.9898 - loss: 0.0707 - val_auc: 0.9825 - val_loss: 0.0905


In [3]:
# --- Save model ---
model.save("bilstm_stackoverflow_with_answers.h5") 

In [4]:
from sklearn.metrics import f1_score, precision_score, recall_score
import numpy as np

# Predict probabilities
y_pred_proba = model.predict(X_val)

# Binarize predictions with threshold (commonly 0.5)
y_pred_binary = (y_pred_proba >= 0.5).astype(int)

# Compute metrics
micro_f1 = f1_score(y_val, y_pred_binary, average='micro')
macro_f1 = f1_score(y_val, y_pred_binary, average='macro')
precision = precision_score(y_val, y_pred_binary, average='micro')
recall = recall_score(y_val, y_pred_binary, average='micro')

print("Micro F1:", micro_f1)
print("Macro F1:", macro_f1)
print("Precision:", precision)
print("Recall:", recall)

[1m4415/4415[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 6ms/step
Micro F1: 0.8501218234299563
Macro F1: 0.844516582422164
Precision: 0.9083035119411494
Recall: 0.7989451234616384


In [5]:
# Tokenize all data
X_all_seq = tokenizer.texts_to_sequences(questions["text"])
X_all_pad = pad_sequences(X_all_seq, maxlen=150)

# Predict
y_all_pred_probs = model.predict(X_all_pad, batch_size=32)
y_all_pred = (y_all_pred_probs >= 0.5).astype(int)

# Convert to tag labels
predicted_tags_all = mlb.inverse_transform(y_all_pred)

# Add to DataFrame
questions["Predicted Tags"] = [" ".join(tags) for tags in predicted_tags_all]

# Save to CSV
questions[["Id", "text", "Predicted Tags"]].to_csv("top10_stackoverflow_predictions.csv", index=False)

[1m39507/39507[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m255s[0m 6ms/step
