In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
from xgboost import XGBClassifier
from xgboost.callback import TrainingCallback

# Step 1: Load the Dataset
data = pd.read_csv('cleaned_train.csv')

# Step 2: Preprocess the Data
# Fill missing values BEFORE casting to str. Casting first can turn NaN into
# the literal string "nan", after which fillna("") has nothing left to replace
# and "nan" would be handed to the vectorizer as if it were a real word.
data['text'] = data['text'].fillna("").astype(str)

# Encode labels: learn the category -> integer mapping, then apply it.
# The fitted encoder is kept so predictions can be mapped back to names later.
label_encoder = LabelEncoder().fit(data['category'])
data['category_encoded'] = label_encoder.transform(data['category'])

# Step 3: Split the Data
# X is the raw input text, y the encoded labels; 20% is held out for testing
# and the seed is fixed so the split is reproducible.
X, y = data['text'], data['category_encoded']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 4: Convert Text to Features Using TF-IDF
# Unigrams + bigrams, English stop words removed, vocabulary capped at 10k terms.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    max_features=10000,
)

# The vectorizer is fitted on the training split only; the test split is
# merely transformed with the already-fitted vectorizer.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Step 5: Train the XGBoost Model with Progress Bar
# XGBoost's default number of boosting rounds, spelled out so that the
# progress bar total matches the number of rounds actually trained.
N_BOOST_ROUNDS = 100


class _TqdmRoundProgress(TrainingCallback):
    """Advance a tqdm bar by one tick after every boosting round.

    Parameters
    ----------
    pbar : tqdm
        An open progress bar whose ``total`` equals the number of boosting
        rounds the model will train.
    """

    def __init__(self, pbar):
        super().__init__()
        self._pbar = pbar

    def after_iteration(self, model, epoch, evals_log):
        """Tick the bar once; return False so training continues."""
        self._pbar.update(1)
        return False  # returning True would ask XGBoost to stop training


xgb_model = XGBClassifier(
    n_estimators=N_BOOST_ROUNDS,
    random_state=42,
    eval_metric='mlogloss',
    objective='multi:softmax',
    num_class=len(label_encoder.classes_),
)

# The bar now advances once per boosting round. Previously it was updated a
# single time after fit() had already returned, so it sat at 0% for the whole
# run and then jumped straight to 100%.
with tqdm(total=N_BOOST_ROUNDS, desc="Training Progress", unit="step") as pbar:
    xgb_model.set_params(callbacks=[_TqdmRoundProgress(pbar)])
    try:
        xgb_model.fit(
            X_train_tfidf,
            y_train,
            # The held-out split is passed for mlogloss monitoring only; no
            # early stopping is configured, so it does not influence training.
            eval_set=[(X_test_tfidf, y_test)],
            verbose=False,
        )
    finally:
        # Detach the callback so the fitted model does not keep a reference
        # to a closed progress bar (matters for re-fitting or cloning).
        xgb_model.set_params(callbacks=None)

# Step 6: Predict on the Test Data
y_pred = xgb_model.predict(X_test_tfidf)

# Only the labels that actually occur in the held-out split are reported, so
# the label ids and their display names always line up one-to-one.
unique_classes = sorted(set(y_test.tolist()))
class_names = label_encoder.inverse_transform(unique_classes)

# Step 7: Generate the Classification Report
# zero_division=0 scores classes that were never predicted as 0 instead of
# emitting a warning.
report_options = dict(
    labels=unique_classes,
    target_names=class_names,
    zero_division=0,
)
report = classification_report(y_test, y_pred, **report_options)
print(report)


Training Progress: 100%|██████████| 100/100 [09:37<00:00,  5.78s/step]


                                                      precision    recall  f1-score   support

                               Any Other Cyber Crime       0.48      0.23      0.31      2064
Child Pornography CPChild Sexual Abuse Material CSAM       0.71      0.27      0.39        63
                                Cryptocurrency Crime       0.67      0.39      0.49       102
                      Cyber Attack/ Dependent Crimes       1.00      1.00      1.00       715
                                     Cyber Terrorism       0.00      0.00      0.00        33
      Hacking  Damage to computercomputer system etc       0.49      0.33      0.40       349
                            Online Cyber Trafficking       0.00      0.00      0.00        29
                              Online Financial Fraud       0.80      0.95      0.87     10497
                            Online Gambling  Betting       0.50      0.05      0.09        85
               Online and Social Media Related Crime       