In [None]:
from collections import Counter

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader

# Load preprocessed data
# NOTE(review): absolute, machine-specific path — the script only runs where
# this exact file exists.
df = pd.read_csv('C:/Users/Rudra Thakar/Jupyter/preprocessed_dataset.csv')
# Raw text column; it is later split on whitespace to build the vocabulary
# and the padded id sequences.
texts = df['Combined Text'].values
# Target column; it is later cast to float32 and scored with BCELoss, so it
# presumably holds numeric 0/1 labels — TODO confirm.
labels = df['Classification'].values

# Tokenization and Vocabulary Building
def build_vocab(texts, max_words=5000):
    """Build a token -> integer-id mapping from whitespace-split texts.

    Args:
        texts: Iterable of strings; tokens are obtained by splitting on
            whitespace.
        max_words: Upper bound on the vocabulary size including the
            padding entry, so at most ``max_words - 1`` real tokens are kept.

    Returns:
        dict mapping each kept token to an id starting at 1, assigned in
        descending frequency order, plus ``'<PAD>'`` mapped to 0.
    """
    frequencies = Counter(' '.join(texts).split())
    # Id 0 is reserved for padding, so one slot of the budget is held back.
    ranked = frequencies.most_common(max_words - 1)

    vocab = {}
    for rank, (token, _) in enumerate(ranked, start=1):
        vocab[token] = rank
    vocab['<PAD>'] = 0  # Padding token
    return vocab

# Convert text to sequences
def text_to_sequences(texts, vocab, max_len=100):
    """Encode texts as fixed-length lists of vocabulary ids.

    Each text is split on whitespace and every token is looked up in
    ``vocab``; tokens that are missing get id 0, the same id used for
    padding. Sequences longer than ``max_len`` are cut at the end, shorter
    ones are right-padded with 0.

    Args:
        texts: Iterable of strings.
        vocab: Mapping from token to integer id.
        max_len: Length every encoded sequence is forced to.

    Returns:
        ``np.array`` built from the list of encoded sequences (one row per
        text).
    """
    encoded = []
    for document in texts:
        ids = [vocab.get(token, 0) for token in document.split()][:max_len]
        # Right-pad; the multiplier is <= 0 (no-op) when already full length.
        ids.extend([0] * (max_len - len(ids)))
        encoded.append(ids)
    return np.array(encoded)

# Build vocabulary and sequences
# NOTE(review): the vocabulary is built from all texts before the split
# below, so tokens that only occur in the test rows still get ids.
vocab = build_vocab(texts, max_words=5000)
# max_len here must stay equal to the max_len later handed to
# NewsClassifier, whose first linear layer is sized from it.
X = text_to_sequences(texts, vocab, max_len=100)
y = labels

# Train-test split
# 80/20 split; the fixed random_state makes the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert to PyTorch tensors
# Token ids are cast to long for the embedding lookup; labels are cast to
# float32 to match the float outputs scored by BCELoss.
X_train = torch.tensor(X_train, dtype=torch.long)
y_train = torch.tensor(y_train, dtype=torch.float32)
X_test = torch.tensor(X_test, dtype=torch.long)
y_test = torch.tensor(y_test, dtype=torch.float32)

# Create DataLoader
# Only the training split is batched; the test tensors are used directly
# during evaluation.
train_dataset = torch.utils.data.TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Define the ANN model
class NewsClassifier(nn.Module):
    """Feed-forward binary classifier over fixed-length token-id sequences.

    The token ids are embedded, the per-token embeddings are flattened into
    one vector, and that vector passes through two ReLU + dropout hidden
    layers (128 and 64 units) before a single sigmoid output unit.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        max_len: Sequence length of the inputs; together with
            ``embedding_dim`` it fixes the input width of the first
            linear layer.
    """

    def __init__(self, vocab_size, embedding_dim, max_len):
        super().__init__()
        flat_features = embedding_dim * max_len

        # Id 0 is the padding id, so its embedding row receives no gradient.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.flatten = nn.Flatten()

        self.fc1 = nn.Linear(flat_features, 128)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(0.5)

        self.fc2 = nn.Linear(128, 64)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(0.5)

        self.fc3 = nn.Linear(64, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return one sigmoid probability per input row, shaped (batch, 1)."""
        hidden = self.flatten(self.embedding(x))
        hidden = self.dropout1(self.relu1(self.fc1(hidden)))
        hidden = self.dropout2(self.relu2(self.fc2(hidden)))
        return self.sigmoid(self.fc3(hidden))

# Hyperparameters
# build_vocab already stores '<PAD>' as a key, so len(vocab) counts the
# padding id; the +1 only leaves one spare embedding row above the largest id.
vocab_size = len(vocab) + 1
embedding_dim = 50
# Must equal the max_len used when the input sequences were built, because
# the model's first linear layer expects embedding_dim * max_len features.
max_len = 100

# Initialize model, loss, and optimizer
model = NewsClassifier(vocab_size, embedding_dim, max_len)
# BCELoss expects probabilities; the model's final sigmoid provides them.
criterion = nn.BCELoss()  # Binary Cross-Entropy Loss
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
# Runs num_epochs passes over train_loader and prints the mean batch loss
# after each pass.
num_epochs = 10
for epoch in range(num_epochs):
    model.train()  # enable dropout for the optimisation passes
    running_loss = 0.0
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        # Drop only the trailing feature axis: (batch, 1) -> (batch,).
        # A bare squeeze() would also collapse the batch axis when the final
        # batch holds a single sample, producing a 0-d tensor whose size no
        # longer matches `targets` and making BCELoss raise.
        outputs = model(inputs).squeeze(1)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {running_loss/len(train_loader):.4f}')

# Evaluation
# Scores the whole test split in one forward pass with dropout disabled and
# gradient tracking off, thresholding the probabilities at 0.5.
model.eval()
with torch.no_grad():
    # squeeze(1) removes only the feature axis, so outputs and predictions
    # stay 1-D (one entry per sample) even if the test split has one row.
    outputs = model(X_test).squeeze(1)
    predictions = (outputs >= 0.5).float()
    accuracy = (predictions == y_test).float().mean()
    print(f'Test Accuracy: {accuracy.item():.4f}')

# Save the model (optional)
# Only the parameters (state_dict) are written; loading them requires
# constructing NewsClassifier with the same vocab_size, embedding_dim and
# max_len first.
torch.save(model.state_dict(), 'news_classifier_pytorch.pth')