In [5]:
import os, json, wandb
import torch
import spacy
import numpy as np 
import pandas as pd
import polars as pl
import matplotlib.pyplot as plt
import plotly.express as px
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sentence_transformers import SentenceTransformer
from lightgbm import LGBMClassifier

In [2]:
train = pl.read_csv("train.csv")
test = pl.read_csv("test.csv")

In [3]:
train = train.filter(pl.col("crimeaditionalinfo").is_not_null())
test = test.filter(pl.col("crimeaditionalinfo").is_not_null())

In [4]:
spacy.prefer_gpu()
spacy_nlp = spacy.load("en_core_web_md")
model = SentenceTransformer("all-MiniLM-L6-v2")



In [5]:
train_text = []
for text in tqdm(train["crimeaditionalinfo"]):
    train_text.append(" ".join([d.lemma_ for d in spacy_nlp(text) if not d.is_punct]))
test_text = []
for text in tqdm(test["crimeaditionalinfo"]):
    test_text.append(" ".join([d.lemma_ for d in spacy_nlp(text) if not d.is_punct]))

  0%|          | 0/93665 [00:00<?, ?it/s]

  0%|          | 0/31222 [00:00<?, ?it/s]

In [9]:
train = train.with_columns(pl.col("category").str.to_lowercase(), 
                    pl.col("sub_category").str.to_lowercase().fill_null("NULL"), 
                    pl.col("crimeaditionalinfo").str.to_lowercase().str.strip_chars())
test = test.with_columns(pl.col("category").str.to_lowercase(), 
                    pl.col("sub_category").str.to_lowercase().fill_null("NULL"), 
                    pl.col("crimeaditionalinfo").str.to_lowercase().str.strip_chars())

train = train.with_columns(pl.Series("text", train_text).str.to_lowercase().str.strip_chars())
test = test.with_columns(pl.Series("text", test_text).str.to_lowercase().str.strip_chars())



### For Null text use category = "online financial fraud",  sub_category = "upi related frauds"

In [14]:
train_class = train["category"] + " - " + train["sub_category"]
test_class = test["category"] + " - " + test["sub_category"]

le = LabelEncoder()
x = le.fit_transform(train_class)
train = train.with_columns(pl.Series("label", x))
le.classes_ = np.append(le.classes_, "NULL")

x = le.transform(["NULL" if x not in le.classes_ else x for x in test_class])
test = test.with_columns(pl.Series("label", x))

In [17]:
train.write_csv("train_cleaned.csv")
test.write_csv("test_cleaned.csv")

In [2]:
train = pl.read_csv("train_cleaned.csv")
test = pl.read_csv("test_cleaned.csv")

In [3]:

tf_idf = TfidfVectorizer()
train_tf = tf_idf.fit_transform(train["text"])
test_tf = tf_idf.transform(test["text"])

In [4]:
model = SentenceTransformer("all-MiniLM-L6-v2")
train_emb = model.encode(train["text"].to_list(), batch_size=128, show_progress_bar=True, device="cuda")
test_emb = model.encode(test["text"].to_list(), batch_size=128, show_progress_bar=True, device="cuda")

Batches:   0%|          | 0/732 [00:00<?, ?it/s]

Batches:   0%|          | 0/244 [00:00<?, ?it/s]

In [3]:
train_emb = np.load("train_emb.npy")
test_emb = np.load("test_emb.npy")

In [11]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

# Convert the model architecture
class TextClassifier(nn.Module):
    def __init__(self, input_dim, num_classes):
        super(TextClassifier, self).__init__()
        self.layer1 = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.ReLU(),
            nn.Dropout(0.2)
        )
        self.layer2 = nn.Sequential(
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.2)
        )
        self.layer3 = nn.Sequential(
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.2)
        )
        self.output = nn.Linear(128, num_classes)
        
    def forward(self, x):
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        return self.output(x)

# Convert data to PyTorch tensors
X_train = torch.FloatTensor(train_emb)
y_train = torch.LongTensor(train["label"].to_numpy())
X_test = torch.FloatTensor(test_emb)
y_test = torch.LongTensor(test["label"].to_numpy())

# Create data loaders
train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_dataset = TensorDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=64)

# Initialize model, loss function and optimizer
model = TextClassifier(train_emb.shape[1], train["label"].n_unique())
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
    
    # Evaluation
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_X, batch_y in test_loader:
            outputs = model(batch_X)
            _, predicted = torch.max(outputs.data, 1)
            total += batch_y.size(0)
            correct += (predicted == batch_y).sum().item()
    
    print(f'Epoch [{epoch+1}/{num_epochs}], Test Accuracy: {100 * correct / total:.2f}%')

Epoch [1/10], Test Accuracy: 52.15%
Epoch [2/10], Test Accuracy: 53.43%
Epoch [3/10], Test Accuracy: 53.85%
Epoch [4/10], Test Accuracy: 54.21%
Epoch [5/10], Test Accuracy: 54.30%
Epoch [6/10], Test Accuracy: 54.62%
Epoch [7/10], Test Accuracy: 54.32%
Epoch [8/10], Test Accuracy: 54.28%
Epoch [9/10], Test Accuracy: 54.85%
Epoch [10/10], Test Accuracy: 54.71%


In [12]:
def evaluate_model(model, test_loader, device='cpu'):
    model.eval()  # Set model to evaluation mode
    correct = 0
    total = 0
    all_predictions = []
    all_labels = []
    
    with torch.no_grad():  # Disable gradient calculation
        for inputs, labels in test_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            
            # Forward pass
            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            
            # Calculate accuracy
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            
            # Store predictions and labels
            all_predictions.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    
    accuracy = 100 * correct / total
    print(f'Test Accuracy: {accuracy:.2f}%')
    
    return accuracy, all_predictions, all_labels

# Run evaluation
model.eval()
accuracy, predictions, true_labels = evaluate_model(model, test_loader)

Test Accuracy: 54.71%


In [None]:
naive_bayes_classifier = LGBMClassifier()
naive_bayes_classifier.fit(train_emb, train["label"])

#predicted y
y_pred = naive_bayes_classifier.predict(test_emb)
accuracy_score(test["label"], y_pred), confusion_matrix(test["label"], y_pred)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.159707 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 97920
[LightGBM] [Info] Number of data points in the train set: 93665, number of used features: 384
[LightGBM] [Info] Start training from score -2.153074
[LightGBM] [Info] Start training from score -5.509944
[LightGBM] [Info] Start training from score -5.273694
[LightGBM] [Info] Start training from score -5.265395
[LightGBM] [Info] Start training from score -5.224904
[LightGBM] [Info] Start training from score -5.155911
[LightGBM] [Info] Start training from score -5.191730
[LightGBM] [Info] Start training from score -5.167084
[LightGBM] [Info] Start training from score -5.216998
[LightGBM] [Info] Start training from score -5.199437
[LightGBM] [Info] Start training from score -6.366076
[LightGBM] [Info] Start training from score -6.765349
[LightGBM] [Info] Start training from score -5.592408
[LightG

(0.3208955223880597,
 array([[687,  31,  38, ...,  35,  29,   0],
        [  9,   9,   1, ...,   1,   6,   0],
        [ 16,   2,  30, ...,   2,   0,   0],
        ...,
        [ 45,  18,   3, ...,  14,  19,   0],
        [ 54,  11,   2, ...,   8,  29,   0],
        [  0,   0,   0, ...,   0,   0,   0]]))