In [1]:
!pip install transformers torch pandas scikit-learn matplotlib nltk seaborn imbalanced-learn
!pip install nltk pyspellchecker
!pip install ipywidgets==7.5.0

Collecting pyspellchecker
  Downloading pyspellchecker-0.8.1-py3-none-any.whl.metadata (9.4 kB)
Downloading pyspellchecker-0.8.1-py3-none-any.whl (6.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m38.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyspellchecker
Successfully installed pyspellchecker-0.8.1
Collecting ipywidgets==7.5.0
  Downloading ipywidgets-7.5.0-py2.py3-none-any.whl.metadata (1.8 kB)
Collecting widgetsnbextension~=3.5.0 (from ipywidgets==7.5.0)
  Downloading widgetsnbextension-3.5.2-py2.py3-none-any.whl.metadata (1.3 kB)
Collecting jedi>=0.16 (from ipython>=4.0.0->ipywidgets==7.5.0)
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Downloading ipywidgets-7.5.0-py2.py3-none-any.whl (121 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m121.5/121.5 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading widgetsnbextension-3.5.2-py2.py3-none-any.whl (1.6 MB)
[2K   [90m━━

In [None]:
import pandas as pd
import re
import torch
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, f1_score
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments, AdamW
from torch.utils.data import Dataset
from nltk.corpus import stopwords
import nltk
import os
from imblearn.over_sampling import SMOTE
import numpy as np
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier

# Disable Weights and Biases
os.environ["WANDB_DISABLED"] = "true"

# Download necessary NLTK data
nltk.download('stopwords')

# Define stop words
nltk_stop_words = set(stopwords.words('english'))
generic_stop_words = [
    "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are",
    "aren't", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both",
    "but", "by", "could", "couldn't", "did", "didn't", "do", "does", "doesn't", "doing", "don't",
    "down", "during", "each", "few", "for", "from", "further", "had", "hadn't", "has", "hasn't",
    "have", "haven't", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers",
    "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if",
    "in", "into", "is", "isn't", "it", "it's", "its", "itself", "let's", "me", "more", "most",
    "mustn't", "my", "myself", "no", "nor", "not", "of", "off", "on", "once", "only", "or", "other",
    "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "shan't", "she", "she'd",
    "she'll", "she's", "should", "shouldn't", "so", "some", "such", "than", "that", "that's", "the",
    "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd",
    "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up",
    "very", "was", "wasn't", "we", "we'd", "we'll", "we're", "we've", "were", "weren't", "what",
    "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom",
    "why", "why's", "with", "won't", "would", "wouldn't", "you", "you'd", "you'll", "you're",
    "you've", "your", "yours", "yourself", "yourselves"
]
project_specific_stop_words = [
    "complaint", "victim", "fraud", "type", "based", "model", "data", "preprocessing",
    "categorizes", "classification", "used", "develop", "development", "parameters",
    "accuracy", "precision", "recall", "f1", "score", "evaluate", "evaluation", "prepare", "final",
    "text", "tokenization", "cleaning", "stemming", "removal", "output", "outputs"
]

# Combine stop words
stop_words = nltk_stop_words.union(generic_stop_words, project_specific_stop_words)

# Preprocess function for text data without lemmatization and spell checking
def preprocess_text(text):
    if not isinstance(text, str):
        text = ""
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)  # Remove non-alphabetical characters
    tokens = text.split()
    tokens = [word for word in tokens if word and word not in stop_words]
    return ' '.join(tokens)

# Load dataset
data = pd.read_csv('/content/60__Training_Data.csv')

# Preprocess text column
data['processed_text'] = data['crimeaditionalinfo'].fillna('').astype(str).apply(preprocess_text)  # Adjust column name if needed
data['label'] = data['category'].factorize()[0]  # Convert target labels to numerical values

# Check for class imbalance
class_counts = data['label'].value_counts()
print("Class distribution:\n", class_counts)

# Split the data before oversampling
train_texts, val_texts, train_labels, val_labels = train_test_split(
    data['processed_text'], data['label'], test_size=0.2, random_state=42, stratify=data['label']
)

# Convert train_texts to TF-IDF features for balancing
tfidf = TfidfVectorizer(max_features=15000, ngram_range=(1, 3), min_df=3)
X_train_tfidf = tfidf.fit_transform(train_texts)
y_train = np.array(train_labels)

# Apply SMOTE to balance classes in training data with reduced k_neighbors to avoid errors
smote = SMOTE(random_state=42, k_neighbors=1)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_tfidf, y_train)

# Update train_texts and train_labels after SMOTE
train_texts = tfidf.inverse_transform(X_train_balanced)
train_labels = y_train_balanced
train_texts = [' '.join(text) for text in train_texts]

### DistilBERT Model Training and Evaluation ###
# Define tokenizer and tokenize data
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
train_encodings = tokenizer(train_texts, padding=True, truncation=True, max_length=256, return_tensors='pt')
val_encodings = tokenizer(val_texts.tolist(), padding=True, truncation=True, max_length=256, return_tensors='pt')

# Create Dataset Class
class TextDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = TextDataset(train_encodings, train_labels)
val_dataset = TextDataset(val_encodings, val_labels.tolist())

# Load DistilBERT model for sequence classification
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased', num_labels=len(set(data['label']))
)

# Define Training Arguments with early stopping and gradient clipping
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    per_device_train_batch_size=8,  # Smaller batch size for better generalization
    per_device_eval_batch_size=16,
    num_train_epochs=4,  # Increased number of epochs for better learning
    weight_decay=0.1,
    learning_rate=1e-5,  # Lower learning rate for more stable training
    load_best_model_at_end=True,
    save_total_limit=3,
    lr_scheduler_type="cosine",
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    gradient_accumulation_steps=2,
    gradient_checkpointing=True,
    fp16=True  # Mixed precision for faster training
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    optimizers=(AdamW(model.parameters(), lr=1e-5, weight_decay=0.01), None),
    compute_metrics=lambda p: {
        'accuracy': accuracy_score(p.label_ids, np.argmax(p.predictions, axis=1)),
        'f1_weighted': f1_score(p.label_ids, np.argmax(p.predictions, axis=1), average='weighted')
    }
)

# Train the model
trainer.train()

# Evaluate the DistilBERT model
predictions = trainer.predict(val_dataset)
distilbert_preds = np.argmax(predictions.predictions, axis=1)

### XGBoost Model Training and Evaluation ###
# Convert text to TF-IDF features with more n-grams
X_test_tfidf = tfidf.transform(val_texts)

# XGBoost model training and evaluation
param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [300, 500, 700],
    'max_depth': [6, 8, 10],
    'colsample_bytree': [0.8, 1.0],
    'subsample': [0.8, 1.0]
}

xgb_model = xgb.XGBClassifier(objective='multi:softmax', num_class=len(set(data['label'])), eval_metric='mlogloss')
xgb_cv = GridSearchCV(estimator=xgb_model, param_grid=param_grid, scoring='f1_weighted', cv=3, n_jobs=-1, verbose=3)
xgb_cv.fit(X_train_balanced, y_train_balanced)
xgb_best = xgb_cv.best_estimator_

# Predict on test set using XGBoost
xgb_preds = xgb_best.predict(X_test_tfidf)

### Ensemble using StackingClassifier ###
stacking_model = StackingClassifier(estimators=[
    ('xgb', xgb_best)
], final_estimator=RandomForestClassifier(n_estimators=300, max_depth=15, random_state=42), cv=5)
stacking_model.fit(X_train_balanced, y_train_balanced)

# Predict using ensemble model
ensemble_preds = stacking_model.predict(X_test_tfidf)

# Print Classification Report, F1 Score, and Accuracy for the Ensemble Model
print("Ensemble Model Classification Report:\n", classification_report(val_labels, ensemble_preds))
print("Ensemble Model F1 Score:", f1_score(val_labels, ensemble_preds, average='weighted'))
print("Ensemble Model Accuracy Score:", accuracy_score(val_labels, ensemble_preds))

# Confusion Matrix for Ensemble Model
ensemble_conf_matrix = confusion_matrix(val_labels, ensemble_preds)
plt.figure(figsize=(12, 10))
sns.heatmap(ensemble_conf_matrix, annot=True, fmt='d', cmap='Oranges',
            xticklabels=set(data['category']), yticklabels=set(data['category']))
plt.xlabel('Predicted Labels (Ensemble)')
plt.ylabel('True Labels')
plt.title('Ensemble Model Confusion Matrix')
plt.show()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Class distribution:
 label
1     45720
0      9680
4      8884
3      2916
2      2306
9      1503
5      1383
7      1217
6       390
10      339
8       297
13      149
11      118
12       45
14        2
Name: count, dtype: int64


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Accuracy,F1 Weighted
1,0.1748,2.677071,0.467645,0.512215
