In [1]:
import os
import re
from collections import defaultdict
import nltk
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset

In [2]:
def load_docs(directory):
    """Read every regular file in ``directory`` and return its cleaned text.

    Cleaning steps, applied in order to each file's contents:
      1. drop every character that is not an ASCII letter, whitespace or an
         apostrophe (digits and punctuation disappear);
      2. drop stand-alone single characters (one letter or a lone apostrophe);
      3. collapse every run of whitespace, newlines included, into one space.

    Stop words are NOT removed and letter case is left untouched.

    Args:
        directory: Path of the folder holding UTF-8 encoded text files.

    Returns:
        list[str]: One cleaned string per regular file, ordered by file name.
    """
    docs = []
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # document order (and anything derived from it) reproducible.
    for filename in sorted(os.listdir(directory)):
        filepath = os.path.join(directory, filename)
        if not os.path.isfile(filepath):
            # Sub-directories cannot be opened as text files; skip them.
            continue
        with open(filepath, 'r', encoding='utf-8') as file:
            text = file.read()
        cleaned_text = re.sub(r"[^a-zA-Z\s']", "", text)  # Remove non-alphabetic characters
        cleaned_text = re.sub(r"\b[a-zA-Z']\b", "", cleaned_text)  # Remove single characters
        # \s already matches "\n", so one substitution handles new lines and repeated spaces
        cleaned_text = re.sub(r"\s+", " ", cleaned_text)
        docs.append(cleaned_text)
    return docs

# Folders holding the negative and the positive reviews
neg_dir = '../NLP/TEXT/txt_sentoken/neg/'
pos_dir = '../NLP/TEXT/txt_sentoken/pos/'

# Read and clean both folders, negatives first
neg_docs, pos_docs = (load_docs(folder) for folder in (neg_dir, pos_dir))

In [3]:
# Display the first cleaned document loaded from neg_dir to sanity-check the preprocessing
neg_docs[0]

'in the line of duty is the critically praised series of television movies dealing with the reallife incidents that claimed lives of law enforcement officers in usa the twilight murders another one from the series is dealing with the case of gordon kahl played by rod steiger old farmer from north dakota who would rather spend year in prison than pay taxes to the despised government after being released he still refuses to pay taxes and the warrant is issued for his arrest when the marshals come to arrest him it turns out that kahl isn alone many poor farmers in rural northwest share his extremist antigovernment beliefs and the routine operation turns into shootout that would leave federal officers that that brings fbi on the scene and agent mayberly michael gross is supervising the manhunt however his efforts seem fruitless since kahl still has many supporters some of them even in the local law enforcement after betrayed thriller by costa gavras hollywood mostly ignored the disturbing 

In [4]:
# Number of negative and positive documents, in that order
print(*map(len, (neg_docs, pos_docs)))

1000 1000


In [5]:
# Positive documents come first, so the 1-labels precede the 0-labels
all_words = [*pos_docs, *neg_docs]
labels = [1 for _ in pos_docs] + [0 for _ in neg_docs]

In [6]:
# Split the data: 80% train / 20% test, stratified so both parts keep the class balance.
# A fixed random_state makes the split (and every result derived from it)
# reproducible after a kernel restart.
train_x, test_x, train_y, test_y = train_test_split(
    all_words, labels, test_size=0.2, stratify=labels, random_state=42
)

print(len(train_y), len(test_y))

1600 400


In [7]:
# Download NLTK tokenizer data (the 'punkt' package)
nltk.download('punkt')

# Step 1: lower-case every document, then split it into tokens with nltk
train_x_tokenized = list(map(nltk.word_tokenize, map(str.lower, train_x)))
test_x_tokenized = list(map(nltk.word_tokenize, map(str.lower, test_x)))

# Step 2: Build a simple vocabulary from the tokenized data
def build_vocab(tokenized_sentences, min_freq=1):
    """Map every sufficiently frequent token to a unique integer id.

    Ids are contiguous: the kept tokens receive 1..N in order of first
    appearance and the padding token ``"<PAD>"`` receives 0, so
    ``len(vocab)`` is always a valid embedding-table size.

    Args:
        tokenized_sentences: Iterable of token sequences.
        min_freq: Minimum number of occurrences (inclusive) a token needs
            in order to be kept.

    Returns:
        dict[str, int]: Token -> id mapping, including ``"<PAD>": 0``.
    """
    word_freq = defaultdict(int)
    for sentence in tokenized_sentences:
        for word in sentence:
            word_freq[word] += 1

    # Filter first and number afterwards. Numbering before filtering leaves
    # gaps, so the largest id could reach or exceed len(vocab) and index past
    # the end of an embedding table sized with len(vocab).
    kept_words = [
        word
        for word, freq in word_freq.items()
        if freq >= min_freq and word != "<PAD>"  # the padding id is reserved below
    ]
    vocab = {word: idx for idx, word in enumerate(kept_words, start=1)}
    vocab["<PAD>"] = 0  # Padding token
    return vocab

# The vocabulary is built from the training and the test tokens together
# NOTE(review): this lets test-set words into the vocabulary -- consider
# building it from the training split only.
combined_data = [*train_x_tokenized, *test_x_tokenized]
vocab = build_vocab(combined_data)

# Step 3: Convert the tokenized sentences to numerical format using the vocabulary
def tokenize(sentences, vocab):
    """Replace every token by its id in ``vocab``.

    Tokens missing from ``vocab`` are encoded with the id stored under
    ``"<PAD>"``.

    Args:
        sentences: Iterable of token sequences.
        vocab: Token -> id mapping that contains a ``"<PAD>"`` entry.

    Returns:
        list[list[int]]: One list of ids per input sentence.
    """
    encoded = []
    for sentence in sentences:
        ids = []
        for word in sentence:
            ids.append(vocab.get(word, vocab["<PAD>"]))
        encoded.append(ids)
    return encoded

# Encode both splits with the shared vocabulary (train first, then test)
train_x_tokenized_indices, test_x_tokenized_indices = (
    tokenize(split, vocab) for split in (train_x_tokenized, test_x_tokenized)
)

# Step 4: Pad the tokenized sequences to a fixed length (max_len)
def pad_sequences(tokenized_sentences, max_len):
    """Force every id sequence to ``max_len`` entries.

    Sequences longer than ``max_len`` keep only their first ``max_len``
    ids; shorter ones are extended on the right with zeros.

    Args:
        tokenized_sentences: Iterable of lists of ids.
        max_len: Target length of every returned sequence.

    Returns:
        list[list[int]]: The clipped / zero-padded sequences, in input order.
    """
    padded = []
    for sentence in tokenized_sentences:
        clipped = sentence[:max_len]
        if len(sentence) < max_len:
            clipped = clipped + [0] * (max_len - len(sentence))
        padded.append(clipped)
    return padded

# Fixed sequence length shared by both splits
# NOTE(review): with max_len = 5 only the first five tokens of each document
# reach the model, because pad_sequences slices with [:max_len].
max_len = 5

# Clip / zero-pad the id sequences of both splits
train_x_padded, test_x_padded = (
    pad_sequences(ids, max_len)
    for ids in (train_x_tokenized_indices, test_x_tokenized_indices)
)

# Token ids become int64 tensors
train_x_tensor = torch.tensor(train_x_padded, dtype=torch.int64)
test_x_tensor = torch.tensor(test_x_padded, dtype=torch.int64)

# Labels become float32 tensors
train_y_tensor = torch.tensor(train_y, dtype=torch.float)
test_y_tensor = torch.tensor(test_y, dtype=torch.float)

# Pair inputs with labels
train_dataset = TensorDataset(train_x_tensor, train_y_tensor)
test_dataset = TensorDataset(test_x_tensor, test_y_tensor)

# Mini-batches of two samples; only the training batches are shuffled
train_loader = DataLoader(dataset=train_dataset, batch_size=2, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=2, shuffle=False)

[nltk_data] Downloading package punkt to /home/ali/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
# Define the model
class TextClassificationModel(nn.Module):
    """Small 1-D CNN classifier over sequences of token ids.

    Pipeline: embedding -> Conv1d + ReLU -> dropout -> max-pool -> flatten
    -> linear + ReLU -> linear + sigmoid.

    Args:
        vocab_size: Number of rows of the embedding table; must be larger
            than the biggest token id fed to the model.
        embed_dim: Size of each embedding vector.
        num_classes: Number of output units; each one goes through a sigmoid.
        max_len: Length of every input sequence. The first linear layer is
            sized from it, so inputs must have exactly this length.

    Raises:
        ValueError: If ``max_len`` is too short for the convolution followed
            by the pooling to produce at least one output position.
    """

    CONV_CHANNELS = 32
    CONV_KERNEL_SIZE = 4
    POOL_KERNEL_SIZE = 2
    HIDDEN_UNITS = 10
    PAD_INDEX = 0  # id used for padding positions

    def __init__(self, vocab_size, embed_dim, num_classes, max_len):
        super().__init__()
        # Sequence length after the (unpadded, stride-1) convolution and
        # after the pooling (stride == kernel size, remainder dropped).
        conv_len = max_len - self.CONV_KERNEL_SIZE + 1
        pooled_len = conv_len // self.POOL_KERNEL_SIZE
        if pooled_len < 1:
            min_len = self.CONV_KERNEL_SIZE + self.POOL_KERNEL_SIZE - 1
            raise ValueError(f"max_len must be at least {min_len}, got {max_len}")

        # padding_idx pins the padding row to zeros and excludes it from
        # gradient updates, so padding never turns into a learned "word".
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=self.PAD_INDEX)
        self.conv1d = nn.Conv1d(
            in_channels=embed_dim,
            out_channels=self.CONV_CHANNELS,
            kernel_size=self.CONV_KERNEL_SIZE,
        )
        self.dropout = nn.Dropout(0.5)
        self.maxpool = nn.MaxPool1d(kernel_size=self.POOL_KERNEL_SIZE)
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(self.CONV_CHANNELS * pooled_len, self.HIDDEN_UNITS)
        self.fc2 = nn.Linear(self.HIDDEN_UNITS, num_classes)

    def forward(self, x):
        """Return sigmoid outputs of shape (batch_size, num_classes).

        Args:
            x: Integer tensor of token ids with shape (batch_size, max_len).
        """
        x = self.embedding(x)  # (batch_size, max_len, embed_dim)
        x = x.permute(0, 2, 1)  # Change shape to (batch_size, embed_dim, max_len)
        x = F.relu(self.conv1d(x))
        x = self.dropout(x)
        x = self.maxpool(x)
        x = self.flatten(x)
        x = F.relu(self.fc1(x))
        x = torch.sigmoid(self.fc2(x))
        return x

# Example usage
# The embedding table needs one row per token id, so size it from the largest
# id rather than from the number of entries (the two differ if ids have gaps).
vocab_size = max(vocab.values()) + 1
embed_dim = 100
num_classes = 1  # Binary classification (0 or 1)
# Read the sequence length off the padded training tensor so the model can
# never disagree with the data it is fed.
max_len = train_x_tensor.shape[1]

# Seed PyTorch so weight initialisation, dropout and batch shuffling are
# reproducible from one run to the next.
torch.manual_seed(42)

# Initialize the model
model = TextClassificationModel(vocab_size, embed_dim, num_classes, max_len)

# Training steps
def train_model(model, train_loader, criterion, optimizer, num_epochs=4):
    """Train ``model`` in place and print the mean batch loss after each epoch.

    Args:
        model: Module called as ``model(batch_x)``; expected to return one
            value per sample in a trailing dimension of size 1.
        train_loader: Iterable yielding ``(batch_x, batch_y)`` pairs.
        criterion: Loss called as ``criterion(predictions, targets)``.
        optimizer: Optimizer holding the parameters of ``model``.
        num_epochs: Number of passes over ``train_loader``.

    Returns:
        None. Progress is reported through ``print``.
    """
    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        num_batches = 0
        for batch_x, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_x)
            # Drop only the trailing singleton dimension. A bare squeeze()
            # would also remove the batch dimension of a 1-sample batch and
            # leave the predictions with a different shape than batch_y.
            loss = criterion(outputs.squeeze(-1), batch_y.float())
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            num_batches += 1
        # max(..., 1) keeps an empty loader from dividing by zero
        mean_loss = running_loss / max(num_batches, 1)
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {mean_loss:.4f}')

# Loss and optimizer
# BCELoss expects probabilities, which the model's final sigmoid provides
criterion = nn.BCELoss()  # Binary Cross-Entropy loss for binary classification
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model on the train_loader created during preprocessing; building
# a second, identical TensorDataset/DataLoader here was redundant.
train_model(model, train_loader, criterion, optimizer, num_epochs=6)


Epoch [1/6], Loss: 0.6947
Epoch [2/6], Loss: 0.6615
Epoch [3/6], Loss: 0.6119
Epoch [4/6], Loss: 0.5154
Epoch [5/6], Loss: 0.3997
Epoch [6/6], Loss: 0.3206
