In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load the data
data = pd.read_csv('../data/database.csv')

# Combine relevant text columns (e.g., title, description, summary)
data['text'] = data['title'] + " " + data['description']

# Encode the labels
label_encoder = LabelEncoder()
data['label'] = label_encoder.fit_transform(data['category'])

# Split the data into training and testing sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    data['text'], data['label'], test_size=0.2, random_state=42
)

# Tokenize

In [7]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
import torch

tokenizer = get_tokenizer('basic_english')

def yield_tokens(data_iter):
    for text in data_iter:
        yield tokenizer(text)

# Build vocabulary
vocab = build_vocab_from_iterator(yield_tokens(train_texts), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

# Define text pipeline
text_pipeline = lambda x: vocab(tokenizer(x))
label_pipeline = lambda x: int(x)

# Padding function
def pad_to_min_length(tokens, min_length):
    if len(tokens) < min_length:
        tokens.extend([0] * (min_length - len(tokens)))
    return tokens



# create data set and data loader

In [7]:
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

class TextClassificationDataset(Dataset):
    def __init__(self, texts, labels, text_pipeline, label_pipeline):
        self.texts = texts
        self.labels = labels
        self.text_pipeline = text_pipeline
        self.label_pipeline = label_pipeline

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.text_pipeline(self.texts.iloc[idx])
        label = self.label_pipeline(self.labels.iloc[idx])
        return torch.tensor(text, dtype=torch.int64), torch.tensor(label, dtype=torch.int64)

def collate_batch(batch):
    label_list, text_list = [], []
    for _text, _label in batch:
        label_list.append(_label)
        text_list.append(torch.tensor(_text, dtype=torch.int64))
    text_list = [pad_to_min_length(t.tolist(), 10) for t in text_list]  # Ensure minimum length
    return pad_sequence([torch.tensor(t) for t in text_list], batch_first=True, padding_value=0), torch.tensor(label_list, dtype=torch.int64)

# Create datasets
train_dataset = TextClassificationDataset(train_texts, train_labels, text_pipeline, label_pipeline)
test_dataset = TextClassificationDataset(test_texts, test_labels, text_pipeline, label_pipeline)

# Create dataloaders
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_batch)


# Generate Modek


In [9]:
import torch.nn as nn
import torch.nn.functional as F
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class TextCNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super(TextCNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.conv1 = nn.Conv2d(1, 100, (2, embed_dim))
        self.conv2 = nn.Conv2d(1, 100, (3, embed_dim))
        self.conv3 = nn.Conv2d(1, 100, (4, embed_dim))
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(300, num_class)

    def forward(self, text):
        embedded = self.embedding(text)  # [batch_size, seq_len, embed_dim]
        embedded = embedded.unsqueeze(1)  # Add channel dimension: [batch_size, 1, seq_len, embed_dim]
        conv1 = F.relu(self.conv1(embedded).squeeze(3))  # [batch_size, 100, seq_len-1]
        conv2 = F.relu(self.conv2(embedded).squeeze(3))  # [batch_size, 100, seq_len-2]
        conv3 = F.relu(self.conv3(embedded).squeeze(3))  # [batch_size, 100, seq_len-3]
        pooled1 = F.max_pool1d(conv1, conv1.size(2)).squeeze(2)  # [batch_size, 100]
        pooled2 = F.max_pool1d(conv2, conv2.size(2)).squeeze(2)  # [batch_size, 100]
        pooled3 = F.max_pool1d(conv3, conv3.size(2)).squeeze(2)  # [batch_size, 100]
        pooled = torch.cat((pooled1, pooled2, pooled3), 1)  # [batch_size, 300]
        dropped = self.dropout(pooled)
        return self.fc(dropped)

# Define hyperparameters
vocab_size = len(vocab)
embed_dim = 64
num_class = len(label_encoder.classes_)

# Initialize the model
model = TextCNN(vocab_size, embed_dim, num_class).to(device)


# Train Model

In [16]:
import torch.optim as optim

# Training parameters
num_epochs = 30
learning_rate = 0.001

# Define loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_acc, total_loss = 0, 0
    for text, label in train_dataloader:
        text, label = text.to(device), label.to(device)
        optimizer.zero_grad()
        output = model(text)
        loss = criterion(output, label)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        total_acc += (output.argmax(1) == label).sum().item()
    print(f'Epoch: {epoch+1}, Loss: {total_loss / len(train_dataloader)}, Accuracy: {total_acc / len(train_dataset)}')

# Evaluation
model.eval()
total_acc, total_loss = 0, 0
with torch.no_grad():
    for text, label in test_dataloader:
        text, label = text.to(device), label.to(device)
        output = model(text)
        loss = criterion(output, label)
        total_loss += loss.item()
        total_acc += (output.argmax(1) == label).sum().item()
print(f'Test Loss: {total_loss / len(test_dataloader)}, Test Accuracy: {total_acc / len(test_dataset)}')


  text_list.append(torch.tensor(_text, dtype=torch.int64))


Epoch: 1, Loss: 0.1028687134385109, Accuracy: 1.0
Epoch: 2, Loss: 0.1258189082145691, Accuracy: 0.9259259259259259
Epoch: 3, Loss: 0.18206366896629333, Accuracy: 0.9259259259259259
Epoch: 4, Loss: 0.18366214632987976, Accuracy: 0.8888888888888888
Epoch: 5, Loss: 0.0896931141614914, Accuracy: 1.0
Epoch: 6, Loss: 0.12098497897386551, Accuracy: 0.9629629629629629
Epoch: 7, Loss: 0.09514112770557404, Accuracy: 0.9629629629629629
Epoch: 8, Loss: 0.12605875730514526, Accuracy: 0.9629629629629629
Epoch: 9, Loss: 0.041234757751226425, Accuracy: 1.0
Epoch: 10, Loss: 0.06213943287730217, Accuracy: 1.0
Epoch: 11, Loss: 0.04084010794758797, Accuracy: 1.0
Epoch: 12, Loss: 0.030329955741763115, Accuracy: 1.0
Epoch: 13, Loss: 0.02694159373641014, Accuracy: 1.0
Epoch: 14, Loss: 0.06088976934552193, Accuracy: 0.9629629629629629
Epoch: 15, Loss: 0.05064237490296364, Accuracy: 0.9629629629629629
Epoch: 16, Loss: 0.008770997636020184, Accuracy: 1.0
Epoch: 17, Loss: 0.02402966283261776, Accuracy: 1.0
Epoch

# Evaluate Model

In [17]:
# Evaluation
model.eval()
total_acc, total_loss = 0, 0
with torch.no_grad():
    for text, label in test_dataloader:
        text, label = text.to(device), label.to(device)
        output = model(text)
        loss = criterion(output, label)
        total_loss += loss.item()
        total_acc += (output.argmax(1) == label).sum().item()
print(f'Test Loss: {total_loss / len(test_dataloader)}, Test Accuracy: {total_acc / len(test_dataset)}')


Test Loss: 0.5163837671279907, Test Accuracy: 0.7142857142857143


  text_list.append(torch.tensor(_text, dtype=torch.int64))


## Eval another test code

In [18]:
def preprocess_text(text, min_length=10):  # Ensure a minimum length for the text
    # Tokenize and transform the text
    tokens = vocab(tokenizer(text))
    tokens = pad_to_min_length(tokens, min_length)
    return torch.tensor(tokens, dtype=torch.int64).unsqueeze(0)  # Add batch dimension

def predict(model, text, min_length=10):
    model.eval()
    with torch.no_grad():
        processed_text = preprocess_text(text, min_length).to(device)
        output = model(processed_text)
        prediction = output.argmax(1).item()
        return prediction

def evaluate_texts(model, texts, min_length=10):
    for i, text in enumerate(texts, 1):
        prediction = predict(model, text, min_length)
        category = label_encoder.inverse_transform([prediction])[0]
        # result = "about venture capital" if category == "captial" else "not about venture capital"
        print(f"Test {i} is {category}")

# Example test strings
test1 = "debt debt debt"
test2 = "Venture capital debt"
test3 = "Startup raises $10M in Series A funding"
test4 = "Technology conference in San Francisco"
test5 = "New VC fund launches with $100M"

# Evaluate and print results
evaluate_texts(model, [test1, test2, test3, test4, test5])


Test 1 is captial
Test 2 is captial
Test 3 is captial
Test 4 is captial
Test 5 is captial
