In [102]:
import re
import ast
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score
from sklearn.preprocessing import MultiLabelBinarizer
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings("ignore")


In [104]:
# Load the dataset from stackoverflow_30k.csv and preview its first rows.
df = pd.read_csv("stackoverflow_30k.csv")
df.head()

Unnamed: 0,Title,Body,Tags
0,SQL/JS How to re-arrange duplicate rows into one,How can I regroup all cells that belongs to Us...,"['php', 'jquery', 'sql']"
1,Scraping Ajax based Review Page with Scrapy,There. I am trying to scrape a website. Everyt...,"['python', 'ajax', 'scrapy']"
2,Python - mock imported dictionary,At the top of the code I want to test I have a...,"['python', 'unit-testing', 'dictionary', 'mock..."
3,angular ui.select module is not available?,Trying to setup the ui-select angular directiv...,"['javascript', 'html', 'css', 'angularjs', 'ui..."
4,Java - error in prompts,This is what I'm trying to do: Write a program...,"['java', 'output']"


In [5]:
# Only the Body column feeds the text pipeline below, so drop Title.
# Rebinding with errors="ignore" (instead of inplace=True) keeps this cell
# re-runnable: a second execution no longer raises KeyError for 'Title'.
df = df.drop(columns="Title", errors="ignore")

In [7]:
# Preview the first seven rows of the frame after the column drop.
df.head(7)

Unnamed: 0,Body,Tags
0,How can I regroup all cells that belongs to Us...,"['php', 'jquery', 'sql']"
1,There. I am trying to scrape a website. Everyt...,"['python', 'ajax', 'scrapy']"
2,At the top of the code I want to test I have a...,"['python', 'unit-testing', 'dictionary', 'mock..."
3,Trying to setup the ui-select angular directiv...,"['javascript', 'html', 'css', 'angularjs', 'ui..."
4,This is what I'm trying to do: Write a program...,"['java', 'output']"
5,I am using Django and pretty new to it. My pur...,"['html', 'django', 'bootstrap-modal']"
6,This project (example) depends on a project de...,"['gradle', 'libgdx']"


### Data Cleaning

In [9]:
# Fetch the NLTK resources used by the cleaning step: the stop-word lists
# and the WordNet corpus (used by WordNetLemmatizer).
nltk.download("stopwords")
nltk.download("wordnet")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [11]:
# Shared text-cleaning resources, built once and reused by clean_text().
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))  # a set keeps membership tests cheap

In [13]:
def clean_text(text):
    """Normalise one raw post body into a space-separated string of lemmas.

    Steps: strip HTML markup, replace every non-letter with a space,
    lower-case, drop English stop words and lemmatize what remains.
    Relies on the module-level ``stop_words`` and ``lemmatizer`` objects.

    Parameters
    ----------
    text : Any
        Raw body. It is passed through ``str()`` first, so non-string
        values are cleaned via their string representation.

    Returns
    -------
    str
        The surviving tokens joined by single spaces ("" if none survive).
    """
    # separator=" " keeps text from adjacent elements apart; with the default
    # empty separator "<p>foo</p><p>bar</p>" collapses into the token "foobar".
    text = BeautifulSoup(str(text), "html.parser").get_text(separator=" ")
    # Every non-letter (digits, punctuation, any whitespace) becomes a space;
    # split() below collapses the runs, so no separate whitespace pass is needed.
    text = re.sub(r"[^a-zA-Z]", " ", text).lower()
    return " ".join(
        lemmatizer.lemmatize(token)
        for token in text.split()
        if token not in stop_words
    )


In [15]:
# Keep a plain-string copy of the body, then derive the cleaned version from it.
df["text"] = df["Body"].astype(str)
df["cleaned_text"] = df["text"].map(clean_text)

In [17]:
# Down-sample to a fixed 10,000-row subset (seeded, so repeatable) and persist it.
df = df.sample(n=10_000, random_state=42)
df.to_csv(path_or_buf="stackoverflow_10k_sample.csv", index=False)

### Multilabel encoding

In [19]:
def _parse_tags(raw):
    """Turn a stringified list such as "['php', 'sql']" into a real list."""
    # Values that are not strings are passed through untouched.
    return ast.literal_eval(raw) if isinstance(raw, str) else raw


# Ensure tags are list objects, then encode them as a binary indicator matrix.
df["Tags"] = df["Tags"].apply(_parse_tags)
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df["Tags"])


In [21]:
# Number of sampled rows carrying each tag (column sums of the indicator matrix).
tag_counts = y.sum(axis=0)
# Keep only tags that occur at least 5 times.
mask = tag_counts >= 5
y = y[:, mask]
# Keep the binarizer's class list aligned with the remaining columns of y.
mlb.classes_ = mlb.classes_[mask]
print(f"Tags after filtering: {y.shape[1]}")

Tags after filtering: 875


### Text vectorization

In [23]:
# Rebuild cleaned_text for the sampled rows. This applies the same
# Body -> str -> clean_text steps as the earlier cleaning cell.
df["cleaned_text"] = df["Body"].astype(str).map(clean_text)

In [82]:
# TF-IDF features with the vocabulary capped at 5,000 terms.
# NOTE(review): the vectorizer is fitted on every row, including the rows
# that the next section holds out as the test split.
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(raw_documents=df["cleaned_text"])


### Train & test split

In [84]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [86]:
def train_model(X_train, y_train, classifier):
    """Fit a one-vs-rest wrapper around ``classifier`` and return it.

    Parameters
    ----------
    X_train : training features, forwarded to ``fit``.
    y_train : training labels, forwarded to ``fit``.
    classifier : base estimator handed to ``OneVsRestClassifier``.

    Returns
    -------
    The ``OneVsRestClassifier`` instance after ``fit`` has been called on it.
    """
    ovr = OneVsRestClassifier(estimator=classifier)
    ovr.fit(X_train, y_train)
    return ovr

In [88]:
def evaluate_model(model, X_test, y_test):
    """Return the micro-averaged F1 score of ``model`` on the given test data.

    Parameters
    ----------
    model : object exposing ``predict(X)``.
    X_test : features passed to ``model.predict``.
    y_test : ground-truth labels compared against the predictions.

    Returns
    -------
    The value returned by ``f1_score(..., average="micro")``.
    """
    return f1_score(y_test, model.predict(X_test), average="micro")

In [90]:
# Baseline: one-vs-rest logistic regression, trained and scored with the helpers above.
lr_clf = LogisticRegression(max_iter=1000)
lr_model = train_model(X_train=X_train, y_train=y_train, classifier=lr_clf)
lr_f1 = evaluate_model(model=lr_model, X_test=X_test, y_test=y_test)
print(f"Logistic Regression - F1 Score: {lr_f1:.4f}")

Logistic Regression - F1 Score: 0.1803


In [38]:
# Baseline: one-vs-rest multinomial Naive Bayes.
nb_clf = MultinomialNB()
nb_model = train_model(X_train=X_train, y_train=y_train, classifier=nb_clf)
nb_f1 = evaluate_model(model=nb_model, X_test=X_test, y_test=y_test)
print(f"Naive Bayes - F1 Score(micro): {nb_f1:.4f}")

Naive Bayes - F1 Score(micro): 0.0985


In [40]:
# Baseline: one-vs-rest linear support-vector classifier.
svc_clf = LinearSVC()
svc_model = train_model(X_train=X_train, y_train=y_train, classifier=svc_clf)
svc_f1 = evaluate_model(model=svc_model, X_test=X_test, y_test=y_test)
print(f"Linear SVC - F1 Score (micro): {svc_f1:.4f}")

Linear SVC - F1 Score (micro): 0.3778


### SVC gives the best micro-F1 result, so I am choosing SVC

In [None]:
## Hyperparameter testing

In [94]:
from sklearn.model_selection import GridSearchCV

# LogisticRegression and OneVsRestClassifier are already provided by the
# notebook's import cell, so only GridSearchCV is imported here.

# Search space: regularisation strength and penalty type of the base estimator
# (the "estimator__" prefix routes each parameter through OneVsRestClassifier).
param_grid = {
    "estimator__C": [0.01, 0.1, 1.0, 5.0],
    "estimator__penalty": ["l1", "l2"],
}

logreg = OneVsRestClassifier(estimator=LogisticRegression(solver="liblinear"))

# 3-fold cross-validated search on the training split, scored by micro-F1.
grid = GridSearchCV(
    estimator=logreg,
    param_grid=param_grid,
    scoring="f1_micro",
    cv=3,
)
grid.fit(X_train, y_train)

print("✅ Best Params:", grid.best_params_)
print("✅ Best F1 Score:", grid.best_score_)


✅ Best Params: {'estimator__C': 5.0, 'estimator__penalty': 'l1'}
✅ Best F1 Score: 0.3732765561812215


In [96]:
# Final model: liblinear logistic regression with penalty='l1' and C=5.0.
# NOTE(review): class_weight='balanced' was not among the parameters searched
# in the grid-search cell, so its reported best score was obtained without
# it. Confirm the combination is intended.
lr_final = OneVsRestClassifier(
    estimator=LogisticRegression(
        penalty='l1',
        C=5.0,
        solver='liblinear',
        class_weight='balanced',
        max_iter=1000,
    )
)

lr_final.fit(X_train, y_train)


In [100]:


# 2. Run threshold tuning
# A tag is predicted whenever its estimated probability is at least 0.3.
# NOTE(review): this scores `lr_model` (the baseline), not `lr_final`, and the
# 0.3 cut-off is applied directly on the test split. Confirm both are intended.
y_proba = lr_model.predict_proba(X_test)
y_pred_thresh = (y_proba >= 0.3).astype(int)

# 3. Evaluate improved F1
threshold_f1 = f1_score(y_test, y_pred_thresh, average="micro")
print("F1 Score (micro):", threshold_f1)


F1 Score (micro): 0.2795347146492185


In [92]:
from sklearn.metrics import classification_report

# Per-tag precision, recall, F1 and support for `lr_model` on the test split.
y_pred = lr_model.predict(X_test)
per_tag_report = classification_report(y_test, y_pred, target_names=mlb.classes_)
print(per_tag_report)

                               precision    recall  f1-score   support

                    .htaccess       0.00      0.00      0.00         3
                         .net       0.00      0.00      0.00        26
                     .net-8.0       0.00      0.00      0.00         3
                    .net-core       0.00      0.00      0.00         4
                           3d       0.00      0.00      0.00         3
                accessibility       0.00      0.00      0.00         1
             active-directory       0.00      0.00      0.00         1
                 activerecord       0.00      0.00      0.00         0
                          adb       0.00      0.00      0.00         0
             addeventlistener       0.00      0.00      0.00         1
                    aggregate       0.00      0.00      0.00         3
        aggregation-framework       0.00      0.00      0.00         1
                      airflow       0.00      0.00      0.00         2
     

In [80]:
from sklearn.metrics import hamming_loss

# Hamming loss is the fraction of individual (sample, tag) predictions that
# are wrong, so 0 is perfect; it is not a count of wrong labels per sample.
lr_test_pred = lr_model.predict(X_test)
print("Hamming Loss:", hamming_loss(y_test, lr_test_pred))


Hamming Loss: 0.002344


In [106]:
# joblib is not imported by the notebook's import cell, so import it here;
# without this the cell raises NameError on a freshly started kernel.
import joblib

# Persist the fitted final one-vs-rest logistic-regression model.
joblib.dump(lr_final, "lr_full_model.pkl")


['lr_full_model.pkl']

In [66]:
# joblib is not imported by the notebook's import cell, so import it here;
# without this the cell raises NameError on a freshly started kernel.
import joblib

# Persist the fitted TF-IDF vectorizer and the tag binarizer, which are
# needed to turn new text into features and predictions back into tag names.
joblib.dump(tfidf, "tfidf.pkl")
joblib.dump(mlb, "mlb.pkl")



['mlb.pkl']

In [68]:
# List the tag vocabulary held by the binarizer, and its size.
trained_tags = mlb.classes_
print("✅ Tags used in training:", trained_tags)
print("🔢 Total unique tags trained on:", len(trained_tags))


✅ Tags used in training: ['.htaccess' '.net' '.net-8.0' '.net-core' '3d' 'accessibility'
 'active-directory' 'activerecord' 'adb' 'addeventlistener' 'aggregate'
 'aggregation-framework' 'airflow' 'ajax' 'alexa-skills-kit' 'algorithm'
 'amazon-cognito' 'amazon-dynamodb' 'amazon-ec2'
 'amazon-elastic-beanstalk' 'amazon-iam' 'amazon-rds' 'amazon-redshift'
 'amazon-s3' 'amazon-sns' 'amazon-sqs' 'amazon-web-services' 'anaconda'
 'android' 'android-fragments' 'android-intent' 'android-jetpack'
 'android-jetpack-compose' 'android-layout' 'android-ndk'
 'android-recyclerview' 'android-room' 'android-studio' 'angular'
 'angular-cli' 'angular-material' 'angular8' 'angularjs' 'animation'
 'ansible' 'apache' 'apache-camel' 'apache-kafka' 'apache-kafka-connect'
 'apache-spark' 'apache-spark-sql' 'apollo' 'appium' 'architecture'
 'arduino' 'arguments' 'arkit' 'arm' 'arraylist' 'arrays'
 'artificial-intelligence' 'ascii' 'asp.net' 'asp.net-core'
 'asp.net-core-mvc' 'asp.net-core-webapi' 'asp.net-mvc'