In [1]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
import numpy as np

# Step 1: Load the dataset
import pandas as pd
data = pd.read_csv('balanced_crime_dataset.csv')  # Adjust the file path if needed

# Step 2: Preprocess text data
data['cleaned_message'] = data['Message'].apply(lambda x: x.lower())  # Example preprocessing

# Step 3: Encode labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data['Category'])
X = data['cleaned_message']

# Step 4: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 5: Tokenize text using BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_data(text_data):
    return tokenizer(
        text_data.tolist(),
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors='pt'
    )

train_encodings = tokenize_data(X_train)
test_encodings = tokenize_data(X_test)

# Create Torch Datasets and DataLoaders
train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], torch.tensor(y_train, dtype=torch.long))
test_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], torch.tensor(y_test, dtype=torch.long))

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

# Step 7: Load the BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))

# Step 8: Define optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
loss_fn = torch.nn.CrossEntropyLoss()

# Step 9: Train the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(10):  # Train for 3 epochs (you can increase this)
    model.train()
    epoch_loss = 0
    correct_preds = 0
    total_preds = 0
    
    # Training Loop
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits
        
        # Backward pass
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        
        # Calculate accuracy
        _, preds = torch.max(logits, dim=1)
        correct_preds += torch.sum(preds == labels).item()
        total_preds += labels.size(0)
    
    print(f"Epoch {epoch+1}: Loss = {epoch_loss/len(train_loader)}, Accuracy = {correct_preds/total_preds}")

    # Step 10: Evaluate the model on the testing data (Validation Accuracy)
    model.eval()
    val_correct_preds = 0
    val_total_preds = 0
    
    with torch.no_grad():
        for batch in test_loader:
            input_ids, attention_mask, labels = [b.to(device) for b in batch]
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            
            # Get predictions
            _, preds = torch.max(logits, dim=1)
            val_correct_preds += torch.sum(preds == labels).item()
            val_total_preds += labels.size(0)
    
    val_accuracy = val_correct_preds / val_total_preds
    print(f"Validation Accuracy: {val_accuracy}")

# Step 11: Evaluate the model
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        _, preds = torch.max(logits, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Convert predictions and labels to NumPy arrays for evaluation
all_preds = np.array(all_preds)
all_labels = np.array(all_labels)

# Step 12: Output performance metrics
print("Accuracy:", accuracy_score(all_labels, all_preds))
print("Classification Report:\n", classification_report(all_labels, all_preds))




  from .autonotebook import tqdm as notebook_tqdm
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 43/43 [01:16<00:00,  1.78s/it]


Epoch 1: Loss = 1.5708793734395228, Accuracy = 0.6179941002949852
Validation Accuracy: 1.0


Epoch 2: 100%|██████████| 43/43 [01:13<00:00,  1.72s/it]


Epoch 2: Loss = 0.6766346672246623, Accuracy = 0.995575221238938
Validation Accuracy: 1.0


Epoch 3: 100%|██████████| 43/43 [01:13<00:00,  1.72s/it]


Epoch 3: Loss = 0.2161720235680425, Accuracy = 1.0
Validation Accuracy: 1.0


Epoch 4: 100%|██████████| 43/43 [01:16<00:00,  1.79s/it]


Epoch 4: Loss = 0.09072296884517338, Accuracy = 1.0
Validation Accuracy: 1.0


Epoch 5: 100%|██████████| 43/43 [01:13<00:00,  1.72s/it]


Epoch 5: Loss = 0.05448558978563131, Accuracy = 1.0
Validation Accuracy: 1.0


Epoch 6: 100%|██████████| 43/43 [01:17<00:00,  1.80s/it]


Epoch 6: Loss = 0.03731808759445368, Accuracy = 1.0
Validation Accuracy: 1.0


Epoch 7: 100%|██████████| 43/43 [01:12<00:00,  1.69s/it]


Epoch 7: Loss = 0.028754531210938166, Accuracy = 1.0
Validation Accuracy: 1.0


Epoch 8: 100%|██████████| 43/43 [01:18<00:00,  1.82s/it]


Epoch 8: Loss = 0.022468822076916695, Accuracy = 1.0
Validation Accuracy: 1.0


Epoch 9: 100%|██████████| 43/43 [01:15<00:00,  1.77s/it]


Epoch 9: Loss = 0.018338827329666116, Accuracy = 1.0
Validation Accuracy: 1.0


Epoch 10: 100%|██████████| 43/43 [01:18<00:00,  1.83s/it]


Epoch 10: Loss = 0.015591048895446368, Accuracy = 1.0
Validation Accuracy: 1.0
Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        11
           1       1.00      1.00      1.00        18
           2       1.00      1.00      1.00        23
           3       1.00      1.00      1.00        20
           4       1.00      1.00      1.00        25
           5       1.00      1.00      1.00        29
           6       1.00      1.00      1.00        21
           7       1.00      1.00      1.00        23

    accuracy                           1.00       170
   macro avg       1.00      1.00      1.00       170
weighted avg       1.00      1.00      1.00       170



In [2]:
# Step 13: Predict on new messages
new_messages = ["lets steal  credit cards ", " weapons  ", "happy birthday "]
new_messages_cleaned = [msg.lower() for msg in new_messages]
new_messages_encodings = tokenizer(new_messages_cleaned, padding=True, truncation=True, max_length=128, return_tensors='pt')
new_messages_input_ids = new_messages_encodings['input_ids'].to(device)
new_messages_attention_mask = new_messages_encodings['attention_mask'].to(device)

model.eval()
with torch.no_grad():
    outputs = model(new_messages_input_ids, attention_mask=new_messages_attention_mask)
    logits = outputs.logits
    _, predictions = torch.max(logits, dim=1)

predicted_labels = label_encoder.inverse_transform(predictions.cpu().numpy())

for msg, label in zip(new_messages, predicted_labels):
    print(f"Message: '{msg}' => Predicted Label: {label}")

Message: 'lets steal  credit cards ' => Predicted Label: Money Laundering
Message: ' weapons  ' => Predicted Label: Human Trafficking
Message: 'happy birthday ' => Predicted Label: Normal


In [3]:
import numpy as np
from sklearn.preprocessing import LabelEncoder
import torch
from transformers import BertTokenizer, BertForSequenceClassification

# Load the dataset
import pandas as pd
data = pd.read_csv('balanced_crime_dataset.csv')  # Adjust the file path if needed

# Preprocess text data (cleaning)
data['cleaned_message'] = data['Message'].apply(lambda x: x.lower())  # Example preprocessing

# Encode labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data['Category'])  # Fit the encoder again on the original labels

# Save the classes for later use (to map predictions back to the original labels)
np.save('classes.npy', label_encoder.classes_)

In [4]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load the trained model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=8)  # Adjust num_labels based on your dataset

# Load the LabelEncoder to map back predictions to the original categories
label_encoder = LabelEncoder()
# Load the classes array with allow_pickle=True
label_encoder.classes_ = np.load('classes.npy', allow_pickle=True)  # Assuming you saved this previously, if not adjust accordingly

# Create unseen data for testing
new_messages = [
    "Let's trade illegal weapons for money",  # Expected to be classified as Illegal Weapons Trade
    "You are a stupid idiot",  # Expected to be classified as Insult
    "I'll report this incident to the authorities",  # Expected to be classified as Legal/Normal
    "Making threats to harm others over social media",  # Expected to be classified as Harassment/Threats
    "I am selling drugs, contact me at xxx-xxx-xxxx",  # Expected to be classified as Drug Trade
]

# Step 1: Preprocess and Tokenize the unseen messages
new_messages_cleaned = [msg.lower() for msg in new_messages]  # Example preprocessing
new_messages_encodings = tokenizer(new_messages_cleaned, padding=True, truncation=True, max_length=128, return_tensors='pt')

# Step 2: Prepare the input for the model (move tensors to device)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
new_messages_input_ids = new_messages_encodings['input_ids'].to(device)
new_messages_attention_mask = new_messages_encodings['attention_mask'].to(device)

# Step 3: Predict labels for the new data
model.eval()  # Set the model to evaluation mode
with torch.no_grad():
    outputs = model(new_messages_input_ids, attention_mask=new_messages_attention_mask)
    logits = outputs.logits
    _, predictions = torch.max(logits, dim=1)

# Step 4: Convert predictions to original labels
predicted_labels = label_encoder.inverse_transform(predictions.cpu().numpy())

# Print out the messages with their predicted labels
for msg, label in zip(new_messages, predicted_labels):
    print(f"Message: '{msg}' => Predicted Label: {label}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Message: 'Let's trade illegal weapons for money' => Predicted Label: Illegal Weapons Trade
Message: 'You are a stupid idiot' => Predicted Label: Illegal Weapons Trade
Message: 'I'll report this incident to the authorities' => Predicted Label: Illegal Weapons Trade
Message: 'Making threats to harm others over social media' => Predicted Label: Illegal Weapons Trade
Message: 'I am selling drugs, contact me at xxx-xxx-xxxx' => Predicted Label: Illegal Weapons Trade
