In [1]:
import os
import re
from collections import defaultdict
import nltk
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset

In [2]:
def load_docs(directory):
    stop_words = set(stopwords.words('english'))
    docs = []
    for filename in os.listdir(directory):
        filepath = os.path.join(directory, filename)
        with open(filepath, 'r', encoding='utf-8') as file:
            text = file.read()
            cleaned_text = re.sub(r"[^a-zA-Z\s']", "", text)  # Remove non-alphabetic characters
            filtered_text = re.sub(r"\b[a-zA-Z']\b", "", cleaned_text)  # Remove single characters
            cleaned_text = re.sub(r"\n", " ", filtered_text)  # Replace new lines with space
            cleaned_text = re.sub(r"\s+", " ", cleaned_text)  # Replace multiple spaces with single space
            docs.append(cleaned_text)
    return docs

neg_dir = '../NLP/TEXT/txt_sentoken/neg/'
pos_dir = '../NLP/TEXT/txt_sentoken/pos/'

# Load negative and positive documents
neg_docs = load_docs(neg_dir)
pos_docs = load_docs(pos_dir)

In [3]:
print(len(neg_docs), len(pos_docs))

1000 1000


In [4]:
all_words = pos_docs + neg_docs
labels = [1] * len(pos_docs) + [0] * len(neg_docs)

In [5]:
# Split the data
train_x, test_x, train_y, test_y = train_test_split(all_words, labels, test_size=0.2, stratify=labels)

print(len(train_y),len(test_y))

1600 400


In [6]:
train_x[0]

"ingredients starving artist lusting after beautiful woman from his childhood synopsis great expectations is modernized version of charles dickens novel in the original charles dickens classic an orphan boy named pip learns about life through his friendship with an escaped convict his relationship with bitter old lady named miss havisham and his hopeless lifetime infatuation with havisham snobbish adopted daughter estella the gist of the story is that an anonymous benefactor sends pip to london for pip to become an upper class gentleman pip leaves for london with great expectations to be groomed into gentleman so that he can one day be classy enough to marry estella but life has way of complicating things pip becomes an arrogant until he learns just who his secret benefactor is in this new version of great expectations poor florida lad named finn ethan hawke with talent for drawing has an early memory of helping an escaped convict robert de niro but soon the focus of finn life becomes 

In [7]:
# Download NLTK tokenizer data
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/ali/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [8]:
# Step 1: Tokenize the sentences using nltk
train_x_tokenized = [nltk.word_tokenize(sentence.lower()) for sentence in train_x]
test_x_tokenized = [nltk.word_tokenize(sentence.lower()) for sentence in test_x]

In [9]:
train_x_tokenized[0]

['ingredients',
 'starving',
 'artist',
 'lusting',
 'after',
 'beautiful',
 'woman',
 'from',
 'his',
 'childhood',
 'synopsis',
 'great',
 'expectations',
 'is',
 'modernized',
 'version',
 'of',
 'charles',
 'dickens',
 'novel',
 'in',
 'the',
 'original',
 'charles',
 'dickens',
 'classic',
 'an',
 'orphan',
 'boy',
 'named',
 'pip',
 'learns',
 'about',
 'life',
 'through',
 'his',
 'friendship',
 'with',
 'an',
 'escaped',
 'convict',
 'his',
 'relationship',
 'with',
 'bitter',
 'old',
 'lady',
 'named',
 'miss',
 'havisham',
 'and',
 'his',
 'hopeless',
 'lifetime',
 'infatuation',
 'with',
 'havisham',
 'snobbish',
 'adopted',
 'daughter',
 'estella',
 'the',
 'gist',
 'of',
 'the',
 'story',
 'is',
 'that',
 'an',
 'anonymous',
 'benefactor',
 'sends',
 'pip',
 'to',
 'london',
 'for',
 'pip',
 'to',
 'become',
 'an',
 'upper',
 'class',
 'gentleman',
 'pip',
 'leaves',
 'for',
 'london',
 'with',
 'great',
 'expectations',
 'to',
 'be',
 'groomed',
 'into',
 'gentleman',
 's

In [10]:
# Step 2: Build a simple vocabulary from the tokenized data
def build_vocab(tokenized_sentences, min_freq=1):
    word_freq = defaultdict(int)
    for sentence in tokenized_sentences:
        for word in sentence:
            word_freq[word] += 1
    
    # Only keep words that appear more than `min_freq` times
    vocab = {word: idx + 1 for idx, (word, freq) in enumerate(word_freq.items()) if freq >= min_freq}
    vocab["<PAD>"] = 0  # Padding token
    return vocab

# Combine training and test data to build the vocabulary
combined_data = train_x_tokenized + test_x_tokenized
vocab = build_vocab(combined_data)

In [11]:
print("hello" , vocab["hello"])
print("world", vocab["world"])

hello 10841
world 1529


In [12]:
# Step 3: Convert the tokenized sentences to numerical format using the vocabulary
def tokenize(sentences, vocab):
    return [[vocab.get(word, vocab["<PAD>"]) for word in sentence] for sentence in sentences]

train_x_tokenized_indices = tokenize(train_x_tokenized, vocab)
test_x_tokenized_indices = tokenize(test_x_tokenized, vocab)

In [13]:
# Numeric form of first sentence:
train_x_tokenized_indices[0]

[1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 18,
 19,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 9,
 34,
 35,
 25,
 36,
 37,
 9,
 38,
 35,
 39,
 40,
 41,
 28,
 42,
 43,
 44,
 9,
 45,
 46,
 47,
 35,
 43,
 48,
 49,
 50,
 51,
 22,
 52,
 17,
 22,
 53,
 14,
 54,
 25,
 55,
 56,
 57,
 29,
 58,
 59,
 60,
 29,
 58,
 61,
 25,
 62,
 63,
 64,
 29,
 65,
 60,
 59,
 35,
 12,
 13,
 58,
 66,
 67,
 68,
 64,
 69,
 54,
 70,
 71,
 72,
 73,
 66,
 74,
 75,
 58,
 76,
 51,
 77,
 32,
 78,
 79,
 17,
 80,
 81,
 29,
 82,
 25,
 83,
 84,
 70,
 30,
 85,
 86,
 9,
 87,
 56,
 14,
 21,
 88,
 89,
 16,
 17,
 12,
 13,
 90,
 91,
 92,
 28,
 93,
 94,
 95,
 35,
 96,
 60,
 97,
 78,
 25,
 98,
 99,
 17,
 100,
 25,
 36,
 37,
 101,
 102,
 103,
 77,
 104,
 22,
 105,
 17,
 93,
 32,
 82,
 106,
 107,
 70,
 108,
 109,
 47,
 35,
 51,
 110,
 111,
 25,
 62,
 112,
 113,
 35,
 114,
 115,
 44,
 116,
 117,
 118,
 58,
 119,
 120,
 21,
 121,
 122,
 51,
 123,
 120,
 21,
 124,
 1

In [14]:
# Step 4: Pad the tokenized sequences to a fixed length (max_len)
def pad_sequences(tokenized_sentences, max_len):
    return [sentence[:max_len] + [0] * (max_len - len(sentence)) if len(sentence) < max_len else sentence[:max_len] for sentence in tokenized_sentences]

max_len = 50  # Set max sequence length
train_x_padded = pad_sequences(train_x_tokenized_indices, max_len)
test_x_padded = pad_sequences(test_x_tokenized_indices, max_len)

In [15]:
train_x_padded[2]

[72,
 17,
 22,
 401,
 410,
 21,
 22,
 411,
 412,
 143,
 413,
 412,
 143,
 414,
 14,
 360,
 415,
 58,
 412,
 143,
 416,
 21,
 54,
 151,
 417,
 418,
 17,
 419,
 420,
 143,
 421,
 148,
 422,
 423,
 424,
 425,
 426,
 427,
 44,
 428,
 429,
 430,
 394,
 44,
 418,
 17,
 431,
 432,
 433,
 21]

In [16]:
# Convert to PyTorch tensors
train_x_tensor = torch.tensor(train_x_padded, dtype=torch.long)
test_x_tensor = torch.tensor(test_x_padded, dtype=torch.long)

# Convert labels to tensors
train_y_tensor = torch.tensor(train_y, dtype=torch.float32)
test_y_tensor = torch.tensor(test_y, dtype=torch.float32)

# Create DataLoader
train_dataset = TensorDataset(train_x_tensor, train_y_tensor)
test_dataset = TensorDataset(test_x_tensor, test_y_tensor)

train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False)

In [17]:
# Define the model
class TextClassificationModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_classes, max_len):
        super(TextClassificationModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.conv1d = nn.Conv1d(in_channels=embed_dim, out_channels=32, kernel_size=4)
        self.dropout = nn.Dropout(0.5)
        self.maxpool = nn.MaxPool1d(kernel_size=2)
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(32 * ((max_len - 4 + 1) // 2), 10)
        self.fc2 = nn.Linear(10, num_classes)
    
    def forward(self, x):
        x = self.embedding(x)
        x = x.permute(0, 2, 1)  # Change shape to (batch_size, embed_dim, max_len)
        x = F.relu(self.conv1d(x))
        x = self.dropout(x)
        x = self.maxpool(x)
        x = self.flatten(x)
        x = F.relu(self.fc1(x))
        x = torch.sigmoid(self.fc2(x))
        return x

# Example usage
vocab_size = len(vocab)  # Use the actual vocabulary size
embed_dim = 100
num_classes = 1  # Binary classification (0 or 1)

# Initialize the model
model = TextClassificationModel(vocab_size, embed_dim, num_classes, max_len)

# Training steps
def train_model(model, train_loader, criterion, optimizer, num_epochs=10):
    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        for batch_x, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_x)
            loss = criterion(outputs.squeeze(), batch_y.float())  # Squeeze output for binary cross-entropy loss
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}')

# Loss and optimizer
criterion = nn.BCELoss()  # Binary Cross-Entropy loss for binary classification
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Create DataLoader
train_dataset = TensorDataset(train_x_tensor, train_y_tensor)
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)

# Train the model
train_model(model, train_loader, criterion, optimizer, num_epochs=12)


Epoch [1/12], Loss: 0.6980
Epoch [2/12], Loss: 0.6695
Epoch [3/12], Loss: 0.5330
Epoch [4/12], Loss: 0.2668
Epoch [5/12], Loss: 0.1560
Epoch [6/12], Loss: 0.1229
Epoch [7/12], Loss: 0.0950
Epoch [8/12], Loss: 0.0648
Epoch [9/12], Loss: 0.0567
Epoch [10/12], Loss: 0.0624
Epoch [11/12], Loss: 0.0483
Epoch [12/12], Loss: 0.0491


In [18]:
torch.save(model.state_dict(), 'pos_neg.pt')