In [1]:
# importing required libraries and downloading required resources
import pandas as pd
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score
nltk.download('punkt')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# loading the data sample and generating input and output columns
df = pd.read_csv("/kaggle/input/method-singlelbl-subclass-csv/method_singlelbl_subclass.csv")
# generate input by combining prompt and reply
df['Combined_text'] = df.apply(lambda row: ' '.join(nltk.word_tokenize(str(row['Prompt']) + " " + str(row['Reply']))), axis=1)
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['Subclass'])

In [4]:
# train and test split
X = df['Combined_text']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [5]:
# vectorizing the text using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [6]:
# training the XGBoost model
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_estimators=100, learning_rate=0.1, max_depth=6)
xgb_model.fit(X_train_vectorized, y_train)

In [8]:
# evaluating the model
y_pred = xgb_model.predict(X_test_vectorized)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.9425427872860636
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.78      0.88         9
           1       1.00      0.93      0.97        46
           2       0.91      0.99      0.95       354
           3       1.00      0.81      0.90        16
           4       0.95      0.89      0.92       137
           5       0.98      0.92      0.95       256

    accuracy                           0.94       818
   macro avg       0.97      0.89      0.93       818
weighted avg       0.95      0.94      0.94       818

