# Train

### Preprocessing the Text Data

Tokenize the text.
Convert tokens to indices (e.g., using a vocabulary).
Pad sequences to ensure uniform input size.

### Creating the Dataset

Load the CSV data.
Process the text and labels.
Create a custom PyTorch Dataset and DataLoader.

### Building the CNN Model

Define a CNN architecture for text classification.

### Training and Evaluation

Train the model on the training data.
Evaluate the model on the test data.

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
import nltk
import torch.nn.functional as F


In [5]:
# Ensure nltk resources are available
nltk.download('punkt')
nltk.download('stopwords')

# Load the data
df = pd.read_csv('../data/database.csv')

# Preprocess the text data
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/macbookair13/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/macbookair13/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    return tokens

df['processed_text'] = df['description'].apply(preprocess_text)

# Build vocabulary
all_words = [word for text in df['processed_text'] for word in text]
vocab = Counter(all_words)
vocab = {word: i+1 for i, (word, _) in enumerate(vocab.most_common(10000))}

# Convert text to sequences of indices
def text_to_sequence(text, vocab):
    return [vocab.get(word, 0) for word in text]

df['sequence'] = df['processed_text'].apply(lambda x: text_to_sequence(x, vocab))

# Pad sequences
max_len = 100

def pad_sequence(seq, max_len):
    return seq[:max_len] + [0] * (max_len - len(seq))

df['padded_sequence'] = df['sequence'].apply(lambda x: pad_sequence(x, max_len))

# Encode labels
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['Category'])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(df['padded_sequence'].tolist(), df['label'].tolist(), test_size=0.2, random_state=42)


In [7]:
# Create custom dataset
class VentureCapitalDataset(Dataset):
    def __init__(self, sequences, labels):
        self.sequences = sequences
        self.labels = labels

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        return torch.tensor(self.sequences[idx], dtype=torch.long), torch.tensor(self.labels[idx], dtype=torch.long)

train_dataset = VentureCapitalDataset(X_train, y_train)
test_dataset = VentureCapitalDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


In [8]:
# Define the CNN model
class TextCNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_classes):
        super(TextCNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.conv1 = nn.Conv2d(1, 100, (3, embed_dim))
        self.pool = nn.MaxPool2d((max_len - 3 + 1, 1))
        self.fc1 = nn.Linear(100, num_classes)

    def forward(self, x):
        x = self.embedding(x).unsqueeze(1)  # Add channel dimension
        x = F.relu(self.conv1(x)).squeeze(3)
        x = self.pool(x).squeeze(2)
        x = self.fc1(x)
        return x

In [9]:
# Initialize the model, loss function, and optimizer
vocab_size = len(vocab) + 1
embed_dim = 50
num_classes = len(label_encoder.classes_)
model = TextCNN(vocab_size, embed_dim, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training the model
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}')



RuntimeError: mat1 and mat2 shapes cannot be multiplied (14x98 and 100x1)

In [None]:
# Evaluate the model
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
print(f'Accuracy: {100 * correct / total:.2f}%')