In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim

# Define the CNN model
class CNN(nn.Module):
    """Convolutional text classifier over token embeddings.

    Token indices are embedded, convolved with one bank of filters per
    entry in ``filter_sizes`` (each kernel spans the full embedding width),
    max-pooled over the sequence axis, concatenated, passed through dropout
    and projected to ``output_dim`` by a linear layer.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Width of each embedding vector.
        num_filters: Output channels of every convolution.
        filter_sizes: Kernel heights, i.e. how many consecutive tokens each
            convolution looks at.
        output_dim: Size of the final linear layer's output.
        dropout: Dropout probability applied to the pooled features.
    """

    def __init__(self, vocab_size, embedding_dim, num_filters, filter_sizes, output_dim, dropout):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1, out_channels=num_filters, kernel_size=(fs, embedding_dim))
            for fs in filter_sizes
        ])
        self.fc = nn.Linear(len(filter_sizes) * num_filters, output_dim)
        self.dropout = nn.Dropout(dropout)

        # Tallest kernel height. A convolution cannot slide over an input
        # shorter than its kernel, so forward() pads inputs up to this length.
        self.min_seq_len = max(filter_sizes, default=0)

    def forward(self, text):
        """Return the output of the final linear layer for a batch of token indices.

        Args:
            text: Integer tensor of shape [batch_size, seq_len].

        Returns:
            Tensor of shape [batch_size, output_dim].
        """
        # [batch_size, seq_len, embedding_dim] -> [batch_size, 1, seq_len, embedding_dim]
        embedded = self.embedding(text).unsqueeze(1)

        # Zero-pad the sequence axis when the input is shorter than the
        # tallest kernel; otherwise Conv2d raises a RuntimeError.
        missing = self.min_seq_len - embedded.shape[2]
        if missing > 0:
            embedded = nn.functional.pad(embedded, (0, 0, 0, missing))

        # Each entry: [batch_size, num_filters, seq_len - filter_sizes[n] + 1]
        conved = [nn.functional.relu(conv(embedded)).squeeze(3) for conv in self.convs]

        # Max over the whole sequence axis -> [batch_size, num_filters]
        pooled = [nn.functional.max_pool1d(feat, feat.shape[2]).squeeze(2) for feat in conved]

        cat = self.dropout(torch.cat(pooled, dim=1))

        return self.fc(cat)

# Step 1: Load the labelled sentences and hold out 20% for testing
raw_data = pd.read_csv("../data/adjusted-labels-comms-exclusive.csv")

sentences = raw_data["Sentence"]
labels = raw_data["Label"]

X_train, X_test, y_train, y_test = train_test_split(sentences, labels, test_size=0.2, random_state=42)

# Step 2: Feature extraction
# The vectorizer is fitted on the training split only; its analyzer and
# vocabulary (at most 1000 terms) are reused below to tokenise for the CNN.
tfidf_vectorizer = TfidfVectorizer(max_features=1000)  # Example, you can choose parameters as needed
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Hyperparameters
embedding_dim = 50
num_filters = 100
filter_sizes = [2, 3, 4]
output_dim = 1  # One logit per sentence (binary target for BCEWithLogitsLoss)
dropout = 0.5

# Step 3: Encode sentences as integer token ids.
# The CNN starts with nn.Embedding, which needs integer indices rather than
# TF-IDF weights, so each sentence becomes a fixed-length sequence of ids.
# Ids 0 and 1 are reserved for padding and out-of-vocabulary tokens.
PAD_IDX = 0
UNK_IDX = 1
analyzer = tfidf_vectorizer.build_analyzer()
token_ids = {token: int(index) + 2 for token, index in tfidf_vectorizer.vocabulary_.items()}
vocab_size = len(token_ids) + 2

train_sequences = [
    [token_ids.get(token, UNK_IDX) for token in analyzer(sentence)]
    for sentence in X_train
]
# Pad to the longest training sentence, but never below the tallest
# convolution kernel, which could not slide over a shorter input.
seq_len = max(max(filter_sizes), max((len(seq) for seq in train_sequences), default=0))
padded_sequences = [seq + [PAD_IDX] * (seq_len - len(seq)) for seq in train_sequences]

# Move data to GPU
X = torch.tensor(padded_sequences, dtype=torch.long).to(device)
# Targets come from the training split so they line up row-for-row with X.
# NOTE(review): assumes the "Label" column is numeric 0/1 — confirm.
y = torch.tensor(y_train.to_numpy(), dtype=torch.float32).to(device)

# Initialize the model and move it to GPU
model = CNN(vocab_size, embedding_dim, num_filters, filter_sizes, output_dim, dropout).to(device)

# Define loss and optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
def train(model, X, y, optimizer, criterion, epochs=10):
    """Run full-batch training for a fixed number of epochs.

    Every epoch performs one optimisation step on the whole of ``X`` and
    prints the resulting loss.

    Args:
        model: Module whose output has a size-1 second dimension.
        X: Input batch passed straight to ``model``.
        y: Targets; cast to float before the loss is computed.
        optimizer: Optimizer bound to ``model``'s parameters.
        criterion: Loss callable taking ``(predictions, targets)``.
        epochs: Number of optimisation steps to run.
    """
    # Enable training-mode behaviour (e.g. dropout) once up front
    model.train()

    for epoch in range(epochs):
        # Clear gradients accumulated by the previous step
        optimizer.zero_grad()

        # Forward pass; drop the size-1 output dimension to match the targets
        predictions = model(X)
        logits = predictions.squeeze(1)
        loss = criterion(logits, y.float())

        # Backward pass and parameter update
        loss.backward()
        optimizer.step()

        print(f'Epoch: {epoch+1}, Loss: {loss.item()}')

# Kick off full-batch training with the default number of epochs
train(model, X, y, optimizer=optimizer, criterion=criterion)

2024-03-27 18:22:14.525095: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
