In [69]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch.optim as optim

# All tensors and the model are placed on this device. It is pinned to the CPU;
# to use a GPU whenever one is available, swap in the line below instead:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu")

In [63]:


# Load the exported articles. The columns read below are 'title',
# 'description' and 'category'.
data = pd.read_csv('../data/export.csv')

# The model input is the title followed by the description. A missing value in
# either column would otherwise turn the whole concatenation into NaN (a float),
# which the tokenizer cannot process, so missing fields become empty strings.
data['text'] = (
    data['title'].fillna('').astype(str)
    + " "
    + data['description'].fillna('').astype(str)
)

# Map the string categories to consecutive integer class ids.
label_encoder = LabelEncoder()
data['label'] = label_encoder.fit_transform(data['category'])

print("Label classes:", label_encoder.classes_)

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    data['text'], data['label'], test_size=0.2, random_state=42
)


Label classes: ['business' 'captial' 'debt' 'sports']


# Tokenize

In [64]:
# torchtext's 'basic_english' tokenizer. The same callable is used to build the
# vocabulary, to encode dataset rows, and to encode ad-hoc text at inference time.
tokenizer = get_tokenizer('basic_english')

def yield_tokens(data_iter):
    """Lazily tokenize every text in ``data_iter``.

    Args:
        data_iter: Iterable of raw text strings.

    Yields:
        The result of ``tokenizer(text)`` for each text, in input order.
    """
    yield from (tokenizer(sample) for sample in data_iter)

# Specials are inserted first and in the given order, so "<pad>" gets index 0 and
# "<unk>" gets index 1. Sequences in this notebook are padded with the literal
# value 0, so reserving index 0 for "<pad>" keeps padding from sharing an
# embedding with genuinely unknown words.
vocab = build_vocab_from_iterator(yield_tokens(train_texts), specials=["<pad>", "<unk>"])
# Any token not seen while building the vocabulary maps to "<unk>".
vocab.set_default_index(vocab["<unk>"])


def text_pipeline(x):
    """Convert a raw text string into a list of vocabulary indices."""
    return vocab(tokenizer(x))


def label_pipeline(x):
    """Convert an encoded label to a plain Python int."""
    return int(x)

def pad_to_min_length(tokens, min_length, pad_value=0):
    """Return a copy of ``tokens`` right-padded to at least ``min_length`` items.

    The input sequence is never modified; a new list is always returned, so
    callers can safely reuse the original token list.

    Args:
        tokens: Sequence of token ids.
        min_length: Minimum number of items in the result. Sequences that are
            already at least this long are returned (as a copy) unchanged.
        pad_value: Value appended to reach ``min_length``. Defaults to 0.

    Returns:
        A new list containing ``tokens`` followed by as many ``pad_value``
        entries as needed.
    """
    padded = list(tokens)
    shortfall = min_length - len(padded)
    if shortfall > 0:
        padded.extend([pad_value] * shortfall)
    return padded


# Create dataset and data loader

In [65]:


class TextClassificationDataset(Dataset):
    """Map-style dataset that encodes (text, label) pairs on access.

    Args:
        texts: Raw texts; accessed positionally via ``.iloc`` (e.g. a pandas Series).
        labels: Encoded labels aligned with ``texts``; also accessed via ``.iloc``.
        text_pipeline: Callable turning one raw text into a list of token ids.
        label_pipeline: Callable turning one raw label into an int.
        min_length: Every encoded text is right-padded to at least this many
            tokens. Defaults to 10, the value previously hard-coded here.
    """

    def __init__(self, texts, labels, text_pipeline, label_pipeline, min_length=10):
        self.texts = texts
        self.labels = labels
        self.text_pipeline = text_pipeline
        self.label_pipeline = label_pipeline
        self.min_length = min_length

    def __len__(self):
        """Number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(token_ids, label)`` as int64 tensors for the sample at position ``idx``."""
        # .iloc is positional, so the shuffled index left over from the
        # train/test split does not matter here.
        token_ids = self.text_pipeline(self.texts.iloc[idx])
        token_ids = pad_to_min_length(token_ids, self.min_length)
        label = self.label_pipeline(self.labels.iloc[idx])
        return (
            torch.tensor(token_ids, dtype=torch.int64),
            torch.tensor(label, dtype=torch.int64),
        )

def collate_batch(batch):
    """Merge ``(text_tensor, label)`` samples into one padded batch.

    Args:
        batch: Iterable of ``(text, label)`` pairs, where each text is a 1-D
            tensor of token ids (lengths may differ).

    Returns:
        A tuple ``(texts, labels)``: ``texts`` is batch-first and right-padded
        with 0 to the longest sequence in the batch; ``labels`` is an int64 tensor.
    """
    sequences = [sequence for sequence, _ in batch]
    targets = [target for _, target in batch]
    padded = pad_sequence(sequences, batch_first=True, padding_value=0)
    return padded, torch.tensor(targets, dtype=torch.int64)

# Training split: reshuffled on every pass over the loader.
train_dataset = TextClassificationDataset(
    texts=train_texts,
    labels=train_labels,
    text_pipeline=text_pipeline,
    label_pipeline=label_pipeline,
)
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=32,
    shuffle=True,
    collate_fn=collate_batch,
)

# Held-out split: iterated in a fixed order.
test_dataset = TextClassificationDataset(
    texts=test_texts,
    labels=test_labels,
    text_pipeline=text_pipeline,
    label_pipeline=label_pipeline,
)
test_dataloader = DataLoader(
    dataset=test_dataset,
    batch_size=32,
    shuffle=False,
    collate_fn=collate_batch,
)


# Define Model


In [70]:


class TextCNN(nn.Module):
    """Text classifier built from three parallel convolutions over token embeddings.

    The convolutions span 2, 3 and 4 consecutive tokens respectively. Each
    feature map is max-pooled over the sequence axis, the pooled vectors are
    concatenated, passed through dropout, and projected to class logits.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each token embedding.
        num_class: Number of output classes.
        num_filters: Output channels of each convolution. Defaults to 100.
        dropout: Dropout probability applied before the final layer. Defaults to 0.5.
        pad_idx: Token id used to right-pad inputs that are shorter than the
            widest convolution window. Defaults to 0.
    """

    # Height of the tallest convolution kernel; shorter inputs cannot be convolved.
    MAX_KERNEL_HEIGHT = 4

    def __init__(self, vocab_size, embed_dim, num_class, num_filters=100, dropout=0.5, pad_idx=0):
        super().__init__()
        self.pad_idx = pad_idx
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # Attribute names conv1/conv2/conv3 are kept so state_dict keys stay stable.
        self.conv1 = nn.Conv2d(1, num_filters, (2, embed_dim))
        self.conv2 = nn.Conv2d(1, num_filters, (3, embed_dim))
        self.conv3 = nn.Conv2d(1, num_filters, (4, embed_dim))
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(3 * num_filters, num_class)

    def forward(self, text):
        """Compute class logits.

        Args:
            text: Integer tensor of token ids, shape [batch_size, seq_len].

        Returns:
            Tensor of shape [batch_size, num_class] with unnormalized scores.
        """
        # A kernel taller than the sequence makes Conv2d raise, so very short
        # inputs are right-padded up to the tallest kernel first.
        missing = self.MAX_KERNEL_HEIGHT - text.size(1)
        if missing > 0:
            text = F.pad(text, (0, missing), value=self.pad_idx)

        embedded = self.embedding(text)   # [batch_size, seq_len, embed_dim]
        embedded = embedded.unsqueeze(1)  # [batch_size, 1, seq_len, embed_dim]

        pooled = []
        for conv in (self.conv1, self.conv2, self.conv3):
            # The kernel covers the full embedding width, so the last dim is 1.
            features = F.relu(conv(embedded).squeeze(3))  # [batch_size, num_filters, seq_len - k + 1]
            # Max over the whole sequence axis -> [batch_size, num_filters]
            pooled.append(F.max_pool1d(features, features.size(2)).squeeze(2))

        combined = torch.cat(pooled, 1)  # [batch_size, 3 * num_filters]
        return self.fc(self.dropout(combined))

# Model dimensions: the vocabulary and the number of classes come from the
# fitted vocab / label encoder; the embedding width is chosen by hand.
vocab_size, embed_dim, num_class = len(vocab), 64, len(label_encoder.classes_)

model = TextCNN(vocab_size=vocab_size, embed_dim=embed_dim, num_class=num_class)
model = model.to(device)


# Train Model

In [71]:


num_epochs = 10
learning_rate = 0.001

# Define loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)


def run_epoch(model, dataloader, criterion, optimizer=None):
    """Run one full pass over ``dataloader`` and return ``(mean_loss, accuracy)``.

    Args:
        model: Module mapping a batch of token ids to class logits.
        dataloader: Iterable yielding ``(text, label)`` batches.
        criterion: Loss returning the mean loss of a batch.
        optimizer: When given, the model is put in training mode and updated
            after every batch. When ``None``, the model is put in eval mode and
            gradients are disabled.

    Returns:
        Tuple ``(mean_loss, accuracy)``, both averaged per sample. Both are NaN
        if the dataloader yields no samples.
    """
    training = optimizer is not None
    model.train(training)
    loss_sum, correct, seen = 0.0, 0, 0
    with torch.set_grad_enabled(training):
        for text, label in dataloader:
            text, label = text.to(device), label.to(device)
            if training:
                optimizer.zero_grad()
            output = model(text)
            loss = criterion(output, label)
            if training:
                loss.backward()
                optimizer.step()
            batch_size = label.size(0)
            # The criterion returns a per-batch mean; weight it by the batch
            # size so a small final batch does not count as much as a full one.
            loss_sum += loss.item() * batch_size
            correct += (output.argmax(1) == label).sum().item()
            seen += batch_size
    if seen == 0:
        return float('nan'), float('nan')
    return loss_sum / seen, correct / seen


# Training: one line of metrics per epoch.
for epoch in range(num_epochs):
    train_loss, train_acc = run_epoch(model, train_dataloader, criterion, optimizer)
    print(f'Epoch: {epoch+1}, Loss: {train_loss}, Accuracy: {train_acc}')

# Evaluation on the held-out split (no parameter updates).
test_loss, test_acc = run_epoch(model, test_dataloader, criterion)
print(f'Test Loss: {test_loss}, Test Accuracy: {test_acc}')


Epoch: 1, Loss: 0.7738236763339111, Accuracy: 0.6362385321100917
Epoch: 2, Loss: 0.3549399093009423, Accuracy: 0.8637614678899083
Epoch: 3, Loss: 0.22011934015629947, Accuracy: 0.9137614678899083
Epoch: 4, Loss: 0.1770742992884007, Accuracy: 0.9311926605504587
Epoch: 5, Loss: 0.11001681585026823, Accuracy: 0.9605504587155963
Epoch: 6, Loss: 0.09978864275836859, Accuracy: 0.9642201834862385
Epoch: 7, Loss: 0.0927256397444053, Accuracy: 0.9642201834862385
Epoch: 8, Loss: 0.06472936972462828, Accuracy: 0.9775229357798165
Epoch: 9, Loss: 0.053077777435976095, Accuracy: 0.9811926605504587
Epoch: 10, Loss: 0.04461906292050591, Accuracy: 0.9871559633027523
Test Loss: 0.033229712293379836, Test Accuracy: 0.9908424908424909


# Evaluate Model

## Evaluate on additional test sentences

In [72]:
def preprocess_text(text, min_length=10):
    """Encode one raw string as a single-row batch of token ids.

    Args:
        text: Raw input string.
        min_length: The encoded sequence is padded to at least this many tokens.

    Returns:
        int64 tensor of shape [1, seq_len].
    """
    token_ids = pad_to_min_length(vocab(tokenizer(text)), min_length)
    # Wrapping the ids in an outer list produces the leading batch dimension.
    return torch.tensor([token_ids], dtype=torch.int64)

def predict(model, text, min_length=10):
    """Return the predicted class index for a single raw text.

    Puts ``model`` in eval mode and runs the forward pass with gradients disabled.

    Args:
        model: Classifier mapping a [1, seq_len] batch of token ids to logits.
        text: Raw input string.
        min_length: Minimum token count passed on to ``preprocess_text``.

    Returns:
        The index of the highest-scoring class as a Python int.
    """
    model.eval()
    with torch.no_grad():
        batch = preprocess_text(text, min_length)
        logits = model(batch.to(device))
        return torch.argmax(logits, dim=1).item()

def evaluate_texts(model, texts, min_length=10):
    """Classify each text and print whether it is about venture capital.

    Args:
        model: Trained classifier passed through to ``predict``.
        texts: Iterable of raw strings; results are numbered from 1 in order.
        min_length: Minimum token count passed through to ``predict``.
    """
    # "captial" (sic) is the exact spelling of the class name in the label encoder.
    target_category = "captial"
    for i, text in enumerate(texts, start=1):
        class_index = predict(model, text, min_length)
        category = label_encoder.inverse_transform([class_index])[0]
        if category == target_category:
            result = "about venture capital"
        else:
            result = "not about venture capital"
        print(f"Test {i} is {result}")

# Hand-written headlines for a quick sanity check of the trained classifier.
test1 = "New startup raises $20M in Series B funding"
test2 = "New VC fund launches with $100M"
test3 = "Startup raises $10M in Series A funding"
test4 = "Technology conference in San Francisco"
# NOTE(review): test5 is the same string as test2 — confirm whether the duplicate is intended.
test5 = "New VC fund launches with $100M"
test6 = "Kakao Healthcare raised $130M in Debt Capital from Anchor Equity Partners"
test7 = "Kakao Healthcare raised $100M in Series A funding"

# Prints one "about / not about venture capital" line per headline, numbered from 1.
evaluate_texts(model, [test1, test2, test3, test4, test5, test6, test7])


Test 1 is about venture capital
Test 2 is not about venture capital
Test 3 is about venture capital
Test 4 is not about venture capital
Test 5 is not about venture capital
Test 6 is not about venture capital
Test 7 is about venture capital
