# NLP Evolution for Sentiment Analysis - From Rules to Transformers

This notebook demonstrates how sentiment analysis has evolved in the field of Natural Language Processing (NLP), moving through different eras:

1. Simple Python using keyword matching
2. Regex-based pattern matching
3. Naive Bayes from scratch using token frequencies
4. RNN (Recurrent Neural Network) using PyTorch
5. Transformer model using PyTorch

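The keyword and regex approaches are rule-based and need no training; the Naive Bayes, RNN, and Transformer models are trained from scratch on a tiny four-sentence toy dataset. The goal is a clear educational guide, so treat the printed predictions as illustrations rather than meaningful results.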

In [None]:
# Section 1: Sentiment Analysis Using Simple Python and Keywords
# -------------------------------------------------------------
# This is the simplest form of sentiment analysis using fixed word lists.
# It checks for presence of positive or negative words in the sentence.
import re

def simple_sentiment_analysis(text):
    """Label ``text`` by comparing how many sentiment keywords it contains.

    Matching is a plain substring test on the lower-cased text, and every
    keyword scores at most one point no matter how often it appears. As a
    consequence 'sad' also matches inside a word such as 'saddle'.

    Returns 'Positive', 'Negative', or 'Neutral' when the scores are equal.
    """
    lexicon = {
        'Positive': ('good', 'happy', 'joy', 'awesome', 'excellent'),
        'Negative': ('bad', 'sad', 'pain', 'terrible', 'awful'),
    }
    lowered = text.lower()

    # Booleans add up as 0/1, so each score is the number of keywords present.
    scores = {
        label: sum(keyword in lowered for keyword in keywords)
        for label, keywords in lexicon.items()
    }

    margin = scores['Positive'] - scores['Negative']
    if margin == 0:
        return 'Neutral'
    return 'Positive' if margin > 0 else 'Negative'

# Demo: 'good' and 'happy' are both in positive_words and no negative keyword
# occurs in the sentence, so this prints 'Positive'.
print("Simple Python:", simple_sentiment_analysis("I feel good and happy today!"))

In [None]:
# Section 2: Using Regex-Based Features
# -------------------------------------
# Regular expressions help match word boundaries more accurately.

def regex_sentiment_analysis(text):
    """Label ``text`` by counting whole-word sentiment keyword matches.

    The ``\\b`` anchors restrict matches to complete words, matching is
    case-insensitive, and every occurrence counts (a repeated keyword scores
    more than once).

    Returns 'Positive', 'Negative', or 'Neutral' when the counts are equal.
    """
    def count_hits(pattern):
        # One point per non-overlapping match of the pattern in the text.
        return sum(1 for _ in re.finditer(pattern, text, flags=re.IGNORECASE))

    positive_hits = count_hits(r"\b(good|happy|joy|awesome|excellent)\b")
    negative_hits = count_hits(r"\b(bad|sad|pain|terrible|awful)\b")

    if positive_hits == negative_hits:
        return 'Neutral'
    return 'Positive' if positive_hits > negative_hits else 'Negative'

# Demo: 'terrible' and 'awful' both match the negative pattern and nothing
# matches the positive one, so this prints 'Negative'.
print("Regex-based:", regex_sentiment_analysis("This is a terrible and awful situation."))

In [None]:
# Section 3: Naive Bayes From Scratch (Basic NLP)
# ----------------------------------------------
# Introduces tokenization, vocabulary, and probability-based classification.
from collections import defaultdict
import math

class NaiveBayesSentiment:
    """Two-class ('positive' / 'negative') Naive Bayes text classifier.

    Despite their names, ``word_probs`` and ``class_probs`` store raw counts,
    not probabilities. Every count starts at 1 (add-one smoothing), and the
    counts are normalised into probabilities only inside ``predict``.
    """

    def __init__(self):
        # token -> per-class occurrence count; unseen tokens start at 1 per class.
        self.word_probs = defaultdict(lambda: {'positive': 1, 'negative': 1})
        # class label -> number of training documents (+1 smoothing).
        self.class_probs = {'positive': 1, 'negative': 1}
        # Every token seen during training.
        self.vocab = set()

    def tokenize(self, text):
        """Return the lower-cased word tokens of ``text`` (punctuation dropped)."""
        return re.findall(r'\b\w+\b', text.lower())

    def train(self, texts, labels):
        """Accumulate class and token counts from parallel ``texts``/``labels``.

        Each label must be 'positive' or 'negative'; any other value raises
        ``KeyError``. Calling ``train`` again adds to the existing counts.
        """
        for text, label in zip(texts, labels):
            self.class_probs[label] += 1
            tokens = self.tokenize(text)
            for token in tokens:
                self.word_probs[token][label] += 1
                self.vocab.add(token)

    def predict(self, text):
        """Return the label with the highest log-posterior for ``text``.

        Tokens that were never seen in training are ignored. On an exact tie
        (for example an empty text) the first label, 'positive', is returned.
        """
        # Filter once up front; this also avoids touching the defaultdict with
        # unknown tokens, which would silently insert them.
        known_tokens = [t for t in self.tokenize(text) if t in self.vocab]
        total = sum(self.class_probs.values())
        log_probs = {}
        for label, class_count in self.class_probs.items():
            # The per-class normaliser does not depend on the token being
            # scored, so compute it once per label instead of once per token.
            total_words = sum(self.word_probs[t][label] for t in self.vocab)
            log_prob = math.log(class_count / total)
            for token in known_tokens:
                log_prob += math.log(self.word_probs[token][label] / total_words)
            log_probs[label] = log_prob
        return max(log_probs, key=log_probs.get)

# Toy training set: two positive and two negative sentences, with labels in the
# same order as the texts.
texts = ["I love this movie", "This movie is terrible", "Happy and joyful experience", "Bad and sad ending"]
labels = ['positive', 'negative', 'positive', 'negative']

nb_model = NaiveBayesSentiment()
nb_model.train(texts, labels)
# 'what', 'a' and 'lovely' never occur in the training texts, so predict()
# skips them and scores only 'joyful', 'and' and 'experience'.
print("Naive Bayes:", nb_model.predict("What a joyful and lovely experience"))

In [None]:
# Section 4: RNN for Sentiment Analysis (from scratch using PyTorch)
# --------------------------------------------------------------------
# We use an RNN model for sentiment classification trained on a small dataset.
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# Basic Dataset
rnn_texts = ["I love this movie", "This movie is terrible", "Happy and joyful experience", "Bad and sad ending"]
rnn_labels = [1, 0, 1, 0]  # 1 for positive, 0 for negative

# Tokenization and Vocabulary
def tokenize(text):
    """Return the lower-cased word tokens of ``text`` with punctuation dropped.

    Uses the same word regex as the Naive Bayes section so that 'time!' and
    'time' map to the same token.
    """
    return re.findall(r'\b\w+\b', text.lower())

vocab = set(word for sentence in rnn_texts for word in tokenize(sentence))
# Index 0 is reserved for padding and index 1 for out-of-vocabulary words.
# Real words are numbered in sorted order so the mapping is identical on every
# run (set iteration order for strings changes between interpreter sessions).
word2idx = {'<PAD>': 0, '<UNK>': 1}
word2idx.update({word: idx for idx, word in enumerate(sorted(vocab), start=2)})

def encode(text):
    """Convert ``text`` to a list of vocabulary indices.

    Words that were not seen when the vocabulary was built map to the
    '<UNK>' index instead of raising ``KeyError``. With this tiny dataset the
    '<UNK>' embedding is never updated by training, so it only keeps the
    models runnable on unseen words.
    """
    unk_idx = word2idx['<UNK>']
    return [word2idx.get(word, unk_idx) for word in tokenize(text)]

class SentimentDataset(Dataset):
    """Map-style dataset pairing index-encoded sentences with their labels.

    ``texts`` are encoded once at construction time with the module-level
    ``encode`` function; ``labels`` are stored as a single tensor.
    """

    def __init__(self, texts, labels):
        encoded_sentences = []
        for sentence in texts:
            # One variable-length index tensor per sentence.
            encoded_sentences.append(torch.tensor(encode(sentence)))
        self.texts = encoded_sentences
        self.labels = torch.tensor(labels)

    def __len__(self):
        # The number of samples is defined by the labels tensor.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = (self.texts[idx], self.labels[idx])
        return sample

def collate_fn(batch):
    """Merge ``(token_ids, label)`` samples into one padded batch.

    Args:
        batch: sequence of ``(1-D index tensor, label)`` pairs.

    Returns:
        ``(padded, labels)`` where ``padded`` has one row per sample,
        right-padded with 0 (the '<PAD>' index) to the longest sequence in
        the batch, and ``labels`` is a 1-D tensor of the sample labels.
    """
    texts, labels = zip(*batch)
    # padding_value is spelled out because it has to agree with the index the
    # vocabulary reserves for '<PAD>'.
    padded = nn.utils.rnn.pad_sequence(texts, batch_first=True, padding_value=0)
    return padded, torch.tensor(labels)

# Wrap the toy sentences in a dataset and batch them two at a time; collate_fn
# pads each batch to its longest sentence. The same loader is reused by the
# Transformer section further down.
dataset = SentimentDataset(rnn_texts, rnn_labels)
loader = DataLoader(dataset, batch_size=2, collate_fn=collate_fn)

class RNNModel(nn.Module):
    """Embedding -> single-layer RNN -> linear -> sigmoid binary classifier.

    Args:
        vocab_size: number of rows in the embedding table; index 0 is
            treated as padding.
        embedding_dim: size of each token embedding.
        hidden_dim: size of the RNN hidden state.
    """

    def __init__(self, vocab_size, embedding_dim=10, hidden_dim=8):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        """Return the positive-class probability for each sequence in ``x``.

        ``x`` holds token indices, either ``(batch, seq_len)`` right-padded
        with 0 or a single unbatched 1-D sequence. The result has shape
        ``(batch, 1)`` (or ``(1,)`` for unbatched input).
        """
        embedded = self.embedding(x)
        if x.dim() == 1:
            # A single unbatched sequence carries no padding to skip.
            _, h_n = self.rnn(embedded)
        else:
            # Pack by true length so the final hidden state is taken at each
            # sequence's last real token. Without this the RNN keeps stepping
            # through trailing pad positions and the prediction depends on
            # how much padding the batch happened to need.
            # clamp keeps an all-padding row from producing length 0, which
            # pack_padded_sequence rejects; lengths must live on the CPU.
            lengths = x.ne(0).sum(dim=1).clamp(min=1).cpu()
            packed = nn.utils.rnn.pack_padded_sequence(
                embedded, lengths, batch_first=True, enforce_sorted=False
            )
            _, h_n = self.rnn(packed)
        # h_n is (num_layers=1, batch, hidden); drop the layer dimension.
        return torch.sigmoid(self.fc(h_n.squeeze(0)))

# One embedding row per vocabulary entry (including the special tokens).
model = RNNModel(len(word2idx))
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Training Loop
model.train()
for epoch in range(10):
    for inputs, labels in loader:
        # The model returns (batch, 1). Squeeze only the last dimension: a
        # bare .squeeze() would also drop the batch dimension when a batch
        # holds a single sample, and BCELoss would then reject the shape
        # mismatch with the (batch,) labels.
        outputs = model(inputs).squeeze(-1)
        loss = criterion(outputs, labels.float())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

# Test Prediction
def predict_sentiment_rnn(text):
    """Classify ``text`` with the trained module-level RNN ``model``.

    Returns 'Positive' when the predicted probability is above 0.5 and
    'Negative' otherwise.

    Raises:
        ValueError: if ``text`` contains no tokens, since the RNN cannot
            run on an empty sequence.
    """
    token_ids = encode(text)
    if not token_ids:
        raise ValueError("text must contain at least one token")
    model.eval()
    # dtype is explicit because nn.Embedding only accepts integer indices.
    encoded = torch.tensor(token_ids, dtype=torch.long).unsqueeze(0)
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        output = model(encoded)
    return 'Positive' if output.item() > 0.5 else 'Negative'

# Demo sentence. NOTE: 'what' and 'a' do not occur in rnn_texts, so they are
# not part of the vocabulary built from the training sentences.
print("RNN:", predict_sentiment_rnn("What a joyful experience"))

In [None]:
# Section 5: Transformer for Sentiment Analysis (PyTorch)
# --------------------------------------------------------
# Implementing a small transformer for sentiment classification
from torch.nn import TransformerEncoder, TransformerEncoderLayer

class TransformerModel(nn.Module):
    """Embedding -> one Transformer encoder layer -> mean pool -> sigmoid.

    Args:
        vocab_size: number of rows in the embedding table; index 0 is
            treated as padding.
        d_model: embedding / model width (must be divisible by ``nhead``).
        nhead: number of attention heads.

    NOTE: no positional encoding is added, so the model is insensitive to
    word order.
    """

    def __init__(self, vocab_size, d_model=16, nhead=2):
        super().__init__()
        # padding_idx=0 mirrors RNNModel: the pad row stays at zero.
        self.embedding = nn.Embedding(vocab_size, d_model, padding_idx=0)
        encoder_layer = TransformerEncoderLayer(d_model=d_model, nhead=nhead)
        self.transformer = TransformerEncoder(encoder_layer, num_layers=1)
        self.fc = nn.Linear(d_model, 1)

    def forward(self, x):
        """Return the positive-class probability for each sequence in ``x``.

        ``x`` is a ``(batch, seq_len)`` tensor of token indices, padded with
        0. The result has shape ``(batch, 1)``. Every row is expected to
        contain at least one non-padding token.
        """
        pad_mask = x.eq(0)  # (batch, seq_len), True where the token is padding
        hidden = self.embedding(x).permute(1, 0, 2)  # Transformer expects [seq_len, batch, embed_dim]
        # Keep attention from reading padding positions.
        hidden = self.transformer(hidden, src_key_padding_mask=pad_mask)
        # Average over real tokens only; a plain mean would mix in the
        # outputs at padding positions and make the prediction depend on how
        # much padding the batch needed.
        keep = pad_mask.logical_not().t().unsqueeze(-1).to(hidden.dtype)  # (seq_len, batch, 1)
        pooled = (hidden * keep).sum(dim=0) / keep.sum(dim=0).clamp(min=1.0)
        return torch.sigmoid(self.fc(pooled))

transformer_model = TransformerModel(len(word2idx))
# NOTE: rebinds the module-level `optimizer` and `criterion` names that the
# RNN section also uses.
optimizer = optim.Adam(transformer_model.parameters(), lr=0.01)
criterion = nn.BCELoss()

# Train Transformer Model
transformer_model.train()
for epoch in range(10):
    for inputs, labels in loader:
        # The model returns (batch, 1). Squeeze only the last dimension: a
        # bare .squeeze() would also drop the batch dimension when a batch
        # holds a single sample, and BCELoss would then reject the shape
        # mismatch with the (batch,) labels.
        outputs = transformer_model(inputs).squeeze(-1)
        loss = criterion(outputs, labels.float())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

# Prediction with Transformer
def predict_sentiment_transformer(text):
    """Classify ``text`` with the trained module-level ``transformer_model``.

    Returns 'Positive' when the predicted probability is above 0.5 and
    'Negative' otherwise.

    Raises:
        ValueError: if ``text`` contains no tokens, since there is nothing
            for the model to pool over.
    """
    token_ids = encode(text)
    if not token_ids:
        raise ValueError("text must contain at least one token")
    # eval() also switches off the encoder layer's dropout.
    transformer_model.eval()
    # dtype is explicit because nn.Embedding only accepts integer indices.
    encoded = torch.tensor(token_ids, dtype=torch.long).unsqueeze(0)
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        output = transformer_model(encoded)
    return 'Positive' if output.item() > 0.5 else 'Negative'

# Demo sentence. NOTE: of these words only 'and' and 'joyful' occur in
# rnn_texts; 'an', 'awesome' and 'time' are not in the training sentences.
print("Transformer:", predict_sentiment_transformer("An awesome and joyful time!"))