In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report
import spacy
import warnings
warnings.filterwarnings('ignore')


In [2]:
df = pd.read_excel('test_transactions0.1.xls')

# Preprocessing: Encode the target variable
le = LabelEncoder()
df['Category_Encoded'] = le.fit_transform(df['Category'])

X = df['Description']
y = df['Category_Encoded']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Data loaded and split successfully.")
print(f"Number of training samples: {len(X_train)}")
print(f"Number of testing samples: {len(X_test)}")

Data loaded and split successfully.
Number of training samples: 4000
Number of testing samples: 1000


## 1.1. TF-IDF Vectorization

In [3]:
def preprocess_text_basic(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text) # Remove punctuation
    return text

# Apply preprocessing
X_train_preprocessed_1 = X_train.apply(preprocess_text_basic)
X_test_preprocessed_1 = X_test.apply(preprocess_text_basic)

In [4]:
tfidf_vectorizer = TfidfVectorizer(max_features=1000) # Limit features to avoid sparsity issues

X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_preprocessed_1)
X_test_tfidf = tfidf_vectorizer.transform(X_test_preprocessed_1)

print(f"Shape of TF-IDF training features: {X_train_tfidf.shape}")
print(f"Shape of TF-IDF testing features: {X_test_tfidf.shape}")

Shape of TF-IDF training features: (4000, 1000)
Shape of TF-IDF testing features: (1000, 1000)


In [5]:
# Train an XGBoost model
xgb_model_1 = xgb.XGBClassifier(objective='multi:softmax', num_class=len(le.classes_),
                                 use_label_encoder=False, eval_metric='mlogloss', random_state=42)
xgb_model_1.fit(X_train_tfidf, y_train)

# Evaluate
y_pred_1 = xgb_model_1.predict(X_test_tfidf)
print(f"Accuracy (Pipeline 1): {accuracy_score(y_test, y_pred_1):.4f}")
print(f"Classification Report (Pipeline 1):\n{classification_report(y_test, y_pred_1, target_names=le.classes_)}")

Accuracy (Pipeline 1): 0.9990
Classification Report (Pipeline 1):
                 precision    recall  f1-score   support

Cash Withdrawal       1.00      1.00      1.00        57
        Charity       1.00      1.00      1.00        65
         Dining       0.98      1.00      0.99        60
      Education       1.00      1.00      1.00        60
  Entertainment       1.00      1.00      1.00        67
           Fuel       1.00      1.00      1.00        57
      Groceries       1.00      1.00      1.00        62
     Healthcare       1.00      1.00      1.00        51
      Insurance       1.00      1.00      1.00        65
     Investment       1.00      1.00      1.00        88
  Miscellaneous       1.00      1.00      1.00        53
         Salary       1.00      1.00      1.00        57
       Shopping       1.00      0.99      0.99        72
          Taxes       1.00      1.00      1.00        73
         Travel       1.00      1.00      1.00        59
      Utilities      

# 1.2. Pipeline 2: Bag-of-Words (Count Vectorization) with PCA

In [6]:
count_vectorizer = CountVectorizer(max_features=1000)

X_train_counts = count_vectorizer.fit_transform(X_train_preprocessed_1)
X_test_counts = count_vectorizer.transform(X_test_preprocessed_1)

print(f"Shape of CountVec training features (before PCA): {X_train_counts.shape}")

Shape of CountVec training features (before PCA): (4000, 1000)


In [7]:
# PCA for dimensionality reduction
# Convert sparse matrix to dense for PCA if memory allows, or use TruncatedSVD for sparse
pca = PCA(n_components=50)

X_train_pca = pca.fit_transform(X_train_counts.toarray())
X_test_pca = pca.transform(X_test_counts.toarray())

print(f"Shape of CountVec training features (after PCA): {X_train_pca.shape}")
print(f"Shape of CountVec testing features (after PCA): {X_test_pca.shape}")

Shape of CountVec training features (after PCA): (4000, 50)
Shape of CountVec testing features (after PCA): (1000, 50)


In [8]:
# Train an XGBoost model
xgb_model_2 = xgb.XGBClassifier(objective='multi:softmax', num_class=len(le.classes_),
                                 use_label_encoder=False, eval_metric='mlogloss', random_state=42)
xgb_model_2.fit(X_train_pca, y_train)

# Evaluate
y_pred_2 = xgb_model_2.predict(X_test_pca)
print(f"Accuracy (Pipeline 2): {accuracy_score(y_test, y_pred_2):.4f}")
print(f"Classification Report (Pipeline 2):\n{classification_report(y_test, y_pred_2, target_names=le.classes_)}")

Accuracy (Pipeline 2): 1.0000
Classification Report (Pipeline 2):
                 precision    recall  f1-score   support

Cash Withdrawal       1.00      1.00      1.00        57
        Charity       1.00      1.00      1.00        65
         Dining       1.00      1.00      1.00        60
      Education       1.00      1.00      1.00        60
  Entertainment       1.00      1.00      1.00        67
           Fuel       1.00      1.00      1.00        57
      Groceries       1.00      1.00      1.00        62
     Healthcare       1.00      1.00      1.00        51
      Insurance       1.00      1.00      1.00        65
     Investment       1.00      1.00      1.00        88
  Miscellaneous       1.00      1.00      1.00        53
         Salary       1.00      1.00      1.00        57
       Shopping       1.00      1.00      1.00        72
          Taxes       1.00      1.00      1.00        73
         Travel       1.00      1.00      1.00        59
      Utilities      

## 1.3. Pipeline 3: Word Embeddings (SpaCy) with Averaging

In [9]:
nlp = spacy.load("en_core_web_sm")

def get_word_embedding(text):
    doc = nlp(text)
    # Filter out tokens that are punctuation or whitespace and have vectors
    # Ensure all vectors are of the same dimension, if not, handle appropriately
    # Some tokens might not have vectors, so check for .has_vector
    vectors = [token.vector for token in doc if token.has_vector and not token.is_punct and not token.is_space]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        # Return a zero vector if no valid words with embeddings are found
        return np.zeros(nlp.vocab.vectors.shape[1])

# Apply the function to create document embeddings
X_train_embeddings = np.array([get_word_embedding(text) for text in X_train])
X_test_embeddings = np.array([get_word_embedding(text) for text in X_test])

print(f"Shape of Word Embedding training features: {X_train_embeddings.shape}")
print(f"Shape of Word Embedding testing features: {X_test_embeddings.shape}")

# Train an XGBoost model
xgb_model_3 = xgb.XGBClassifier(objective='multi:softmax', num_class=len(le.classes_),
                                 use_label_encoder=False, eval_metric='mlogloss', random_state=42)
xgb_model_3.fit(X_train_embeddings, y_train)

# Evaluate
y_pred_3 = xgb_model_3.predict(X_test_embeddings)
print(f"Accuracy (Pipeline 3): {accuracy_score(y_test, y_pred_3):.4f}")
print(f"Classification Report (Pipeline 3):\n{classification_report(y_test, y_pred_3, target_names=le.classes_)}")

Shape of Word Embedding training features: (4000, 96)
Shape of Word Embedding testing features: (1000, 96)
Accuracy (Pipeline 3): 0.9990
Classification Report (Pipeline 3):
                 precision    recall  f1-score   support

Cash Withdrawal       1.00      1.00      1.00        57
        Charity       1.00      1.00      1.00        65
         Dining       1.00      1.00      1.00        60
      Education       1.00      1.00      1.00        60
  Entertainment       1.00      1.00      1.00        67
           Fuel       1.00      1.00      1.00        57
      Groceries       1.00      1.00      1.00        62
     Healthcare       1.00      1.00      1.00        51
      Insurance       0.98      1.00      0.99        65
     Investment       1.00      1.00      1.00        88
  Miscellaneous       1.00      1.00      1.00        53
         Salary       1.00      1.00      1.00        57
       Shopping       1.00      1.00      1.00        72
          Taxes       1.00  

In [10]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, BertForSequenceClassification
from torch.optim import AdamW
from transformers import get_scheduler
from tqdm import tqdm

In [11]:
class TransactionDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=max_len, return_tensors='pt')
        self.labels = labels.long()

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

In [12]:
# Encode labels
label_enc = LabelEncoder()
y_encoded = torch.tensor(label_enc.fit_transform(df['Category']))

# Tokenizer and Dataset
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
X_train_text, X_test_text, y_train_encoded, y_test_encoded = train_test_split(df['Description'], y_encoded, test_size=0.2, random_state=42)

train_dataset = TransactionDataset(list(X_train_text), y_train_encoded, tokenizer)
test_dataset = TransactionDataset(list(X_test_text), y_test_encoded, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [13]:
# Encode labels
label_enc = LabelEncoder()
y_encoded = torch.tensor(label_enc.fit_transform(df['Category']))

# Tokenizer and Dataset
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
X_train_text, X_test_text, y_train_encoded, y_test_encoded = train_test_split(df['Description'], y_encoded, test_size=0.2, random_state=42)

train_dataset = TransactionDataset(list(X_train_text), y_train_encoded, tokenizer)
test_dataset = TransactionDataset(list(X_test_text), y_test_encoded, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8)

In [14]:
num_labels = len(label_enc.classes_)
model_bert = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)
optimizer = AdamW(model_bert.parameters(), lr=5e-5)
epochs = 3

lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=epochs * len(train_loader)
)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print (f"The device being used is {device}")
model_bert.to(device)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


The device being used is cuda


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [15]:
model_bert.train()
for epoch in range(epochs):
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model_bert(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        loop.set_description(f"Epoch {epoch}")
        loop.set_postfix(loss=loss.item())

# Evaluate BERT
model_bert.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in test_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model_bert(**batch)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=-1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(batch['labels'].cpu().numpy())

print("BERT Classification Report:\n", classification_report(all_labels, all_preds, target_names=label_enc.classes_))

Epoch 0: 100%|██████████| 500/500 [01:07<00:00,  7.45it/s, loss=0.0199]
Epoch 1: 100%|██████████| 500/500 [01:06<00:00,  7.55it/s, loss=0.00954]
Epoch 2: 100%|██████████| 500/500 [01:06<00:00,  7.54it/s, loss=0.00714]


BERT Classification Report:
                  precision    recall  f1-score   support

Cash Withdrawal       1.00      1.00      1.00        57
        Charity       1.00      1.00      1.00        65
         Dining       1.00      1.00      1.00        60
      Education       1.00      1.00      1.00        60
  Entertainment       1.00      1.00      1.00        67
           Fuel       1.00      1.00      1.00        57
      Groceries       1.00      1.00      1.00        62
     Healthcare       1.00      1.00      1.00        51
      Insurance       1.00      1.00      1.00        65
     Investment       1.00      1.00      1.00        88
  Miscellaneous       1.00      1.00      1.00        53
         Salary       1.00      1.00      1.00        57
       Shopping       1.00      1.00      1.00        72
          Taxes       1.00      1.00      1.00        73
         Travel       1.00      1.00      1.00        59
      Utilities       1.00      1.00      1.00        54

