In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split


In [2]:
# Load the preprocessed comments dataset from the working directory.
df = pd.read_csv(filepath_or_buffer="data_preprocessed.csv")

In [3]:
# Remove rows containing any missing value, then remove exact duplicate rows.
# Written as a chained, non-mutating expression (no inplace=True) so the cell
# reads as a single transformation of the loaded frame.
df = df.dropna().drop_duplicates()

In [4]:
# Model input is the comment text column; the target is the category column.
X, y = df['clean_comment'], df['category']

In [5]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# repeatable across runs.
# NOTE(review): the split is not stratified on y — consider stratify=y if the
# categories turn out to be imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=13,
)

In [6]:
# Sanity-check the sizes of the cleaned frame and of each split.
shape_report = [
    ("df shape:", df),
    ("X_train shape:", X_train),
    ("X_test shape:", X_test),
    ("y_train shape:", y_train),
    ("y_test shape:", y_test),
]
for caption, obj in shape_report:
    print(caption, obj.shape)

df shape: (36243, 2)
X_train shape: (28994,)
X_test shape: (7249,)
y_train shape: (28994,)
y_test shape: (7249,)


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams, bigrams and trigrams, keeping at most 10,000 terms.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 3),
    max_features=10000,
)

# Fit the vectorizer on the training text only, then reuse the fitted
# vectorizer to transform the held-out text.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)




In [8]:
# Report the dimensions of the vectorized training matrix produced by
# vectorizer.fit_transform(X_train).
print(f"X_train_vec_shape : {X_train_vec.shape}")

X_train_vec_shape : (28994, 10000)


In [9]:
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import accuracy_score
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold

# Single seed shared by the CV splitter and every base learner so that
# re-running this cell reproduces the same stacked model.
RANDOM_STATE = 42

# Stratified, shuffled 5-fold splitter handed to StackingClassifier (cv=...).
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)


# Base learners whose predictions feed the meta-model.
base_learners = [
    ('lgb', lgb.LGBMClassifier(class_weight='balanced', objective='multiclass', num_class=3,
                               verbose=-1, random_state=RANDOM_STATE)),
    ('xgb', xgb.XGBClassifier(objective='multi:softprob', num_class=3,
                              eval_metric='mlogloss', random_state=RANDOM_STATE)),
    # probability=True makes SVC shuffle the data for its probability
    # estimates; without random_state that shuffle differs between runs, so
    # the seed is pinned here.
    ('svc', SVC(probability=True, class_weight='balanced', random_state=RANDOM_STATE)),
]

# Meta-model
meta_model = KNeighborsClassifier(n_neighbors=5)

# Stacking classifier
# passthrough=False: the meta-model is trained on the base learners'
# predictions only, not on the original TF-IDF features.
stacking_clf = StackingClassifier(
    estimators=base_learners,
    final_estimator=meta_model,
    cv=stratified_kfold,
    passthrough=False,
    n_jobs=1
)


# Fit on the vectorized training text and score on the held-out split.
stacking_clf.fit(X_train_vec, y_train)
y_pred = stacking_clf.predict(X_test_vec)
print(f"Multiclass Stacking Accuracy: {accuracy_score(y_test, y_pred):.4f}")



Multiclass Stacking Accuracy: 0.8616
