In [185]:
import torch
import torch.nn as nn
from torch.optim import Adam
import random

import numpy as np
from torchsummary import summary


In [186]:
train_data = []
train_label = []
for _ in range(50):
    start = random.randint(0, 97)  # Random start to generate diverse sequences
    seq = [start, start+1, start+2]
    train_data.append(seq)
    train_label.append(start+3)  # Next number in sequence

In [196]:
print(len(train_data))
print(len(train_label))
print(train_data[1])
print(train_label[1])

50
50
[96, 97, 98]
99


In [187]:
# Convert to PyTorch tensors
X = torch.FloatTensor(train_data).unsqueeze(-1)  # Shape: [50, 3, 1]
y = torch.FloatTensor(train_label)

# Simple RNN model
class SimpleRNN(nn.Module):
    def __init__(self):
        super().__init__()
        self.rnn = nn.RNN(input_size=1, hidden_size=16, batch_first=True)
        self.linear = nn.Linear(16, 1)
        
    def forward(self, x):
        output, _ = self.rnn(x)
        return self.linear(output[:, -1, :]).squeeze()

# Create and train model
model = SimpleRNN()

In [188]:
loss_fn = nn.MSELoss()
optimizer = Adam(model.parameters(), lr=0.01)

In [189]:
# Train
for epoch in range(500):
    model.train()
    optimizer.zero_grad()
    
    prediction = model(X)
    loss = loss_fn(prediction, y)
    
    loss.backward()
    optimizer.step()
    
    if epoch % 50 == 0:
        print(f'Epoch: {epoch} | Loss: {loss.item():.4f}')

Epoch: 0 | Loss: 3660.5479
Epoch: 50 | Loss: 2864.2576
Epoch: 100 | Loss: 2230.7095
Epoch: 150 | Loss: 1732.7688
Epoch: 200 | Loss: 1344.2004
Epoch: 250 | Loss: 1040.6659
Epoch: 300 | Loss: 805.0283
Epoch: 350 | Loss: 621.6252
Epoch: 400 | Loss: 480.4293
Epoch: 450 | Loss: 373.4489


In [190]:
# Test several sequences
test_sequences = [
    [7,8,9],
    [11,12,13],
    [20,21,22]
]

model.eval()
with torch.no_grad():
    for seq in test_sequences:
        test_input = torch.FloatTensor(seq).reshape(1, 3, 1)
        pred = model(test_input)
        print(f'\nSequence {seq} -> Predicted: {pred.item():.4f}, Expected: {seq[-1]+1}')


Sequence [7, 8, 9] -> Predicted: 10.0128, Expected: 10

Sequence [11, 12, 13] -> Predicted: 13.9411, Expected: 14

Sequence [20, 21, 22] -> Predicted: 22.9613, Expected: 23


# https://www.tensorflow.org/text/guide/word_embeddings

In [12]:
import torch
import torch.nn as nn

import kagglehub
import glob
import os
import pandas as pd

import tqdm

In [4]:
# Download latest version
path = kagglehub.dataset_download("lakshmi25npathi/imdb-dataset-of-50k-movie-reviews")

print("Path to dataset files:", path)

Path to dataset files: /home/azureuser/.cache/kagglehub/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews/versions/1


In [5]:
# probably want to change the name of the directory w/ "mv IMDB Dataset.csv dataset.csv" 
path = '/home/azureuser/.cache/kagglehub/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews/versions/1/IMDB Dataset.csv'
os.path.isfile(path)

True

In [6]:
dataset = pd.read_csv(path)
print(len(dataset))
print(dataset.shape)
pd.set_option('display.max_colwidth', 200)
dataset.head()

50000
(50000, 2)


Unnamed: 0,review,sentiment
0,"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me...",positive
1,"A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire p...",positive
2,"I thought this was a wonderful way to spend time on a too hot summer weekend, sitting in the air conditioned theater and watching a light-hearted comedy. The plot is simplistic, but the dialogue i...",positive
3,Basically there's a family where a little boy (Jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />This movie is slower than a soap opera... and suddenl...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is a visually stunning film to watch. Mr. Mattei offers us a vivid portrait about human relations. This is a movie that seems to be telling us what mone...",positive


In [7]:
def tokenizer(text):

    text = text.replace('<br />', ' ')
    text = text.replace('<br/>', ' ')
    text = text.replace('<br>', ' ')
    text = text.lower()    
    for punctuation in ".,!?:;:":
        text = text.replace(punctuation, f'') # Add space before and after the punctuation
    tokens = text.split()
    return tokens

text = "My name is John Doe!"
print(tokenizer(text))
    

['my', 'name', 'is', 'john', 'doe']


In [8]:
sample_review = dataset['review'][0]
print(tokenizer(sample_review))

['one', 'of', 'the', 'other', 'reviewers', 'has', 'mentioned', 'that', 'after', 'watching', 'just', '1', 'oz', 'episode', "you'll", 'be', 'hooked', 'they', 'are', 'right', 'as', 'this', 'is', 'exactly', 'what', 'happened', 'with', 'me', 'the', 'first', 'thing', 'that', 'struck', 'me', 'about', 'oz', 'was', 'its', 'brutality', 'and', 'unflinching', 'scenes', 'of', 'violence', 'which', 'set', 'in', 'right', 'from', 'the', 'word', 'go', 'trust', 'me', 'this', 'is', 'not', 'a', 'show', 'for', 'the', 'faint', 'hearted', 'or', 'timid', 'this', 'show', 'pulls', 'no', 'punches', 'with', 'regards', 'to', 'drugs', 'sex', 'or', 'violence', 'its', 'is', 'hardcore', 'in', 'the', 'classic', 'use', 'of', 'the', 'word', 'it', 'is', 'called', 'oz', 'as', 'that', 'is', 'the', 'nickname', 'given', 'to', 'the', 'oswald', 'maximum', 'security', 'state', 'penitentary', 'it', 'focuses', 'mainly', 'on', 'emerald', 'city', 'an', 'experimental', 'section', 'of', 'the', 'prison', 'where', 'all', 'the', 'cells', 

In [9]:
from collections import Counter

def build_vocab(texts, max_words=10000):
    word_counter = Counter()
    for text in texts:
        tokens = tokenizer(text)
        word_counter.update(tokens)

    vocab = ['<PAD>', '<UNK>']
    vocab += [word for word, count in word_counter.most_common(max_words)]

    word_to_idx = {word:idx for idx, word in enumerate(vocab)}
    return word_to_idx

In [None]:
vocab = build_vocab(dataset['review'])
print("Vocabulary size:", len(vocab))
vocab_20 = list(vocab.items())[:10]
for word, idx in vocab_20:
    print(f'{word}: {idx}')

In [75]:
#Create custom dataset 
from torch.utils.data import Dataset, DataLoader
class MovieReview(Dataset):
    def __init__(self, reviews, labels, vocab, max_len=200):
        self.reviews = list(reviews)
        self.labels = list(labels) 
        self.vocab = vocab
        self.max_len = max_len
    
    def __len__(self):
        return len(self.reviews)
    
    def __getitem__(self, idx):
        review = self.reviews[idx]
        label = self.labels[idx]
        tokens = tokenizer(review)[:self.max_len]

        indices = [self.vocab.get(token, 1) for token in tokens] 

        if len(indices) < self.max_len:
            indices += [0] * (self.max_len - len(indices))
        
        return {
            'input_ids': torch.tensor(indices),
            'labels': torch.tensor(1 if label == 'positive' else 0)
        }

In [79]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(dataset['review'], dataset['sentiment'])

In [85]:
print(f'x_train: {len(x_train)} x_test: {len(x_test)} y_train: {len(y_train)} y_test: {len(y_test)}')

x_train: 37500 x_test: 12500 y_train: 37500 y_test: 12500


In [86]:
train_dataset = MovieReview(x_train, y_train, vocab)
test_dataset = MovieReview(x_test, y_test, vocab)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

In [91]:
sample_batch = next(iter(train_loader))
print(sample_batch['input_ids'])
print(sample_batch['labels'])

tensor([[   9,    5,   51,  ...,    0,    0,    0],
        [  49,   20,   74,  ...,    0,    0,    0],
        [   8,   11,   50,  ...,   46,  146, 2950],
        ...,
        [   1,  228,   36,  ...,   42,    6,  924],
        [   9,  382, 6066,  ...,  763,   13, 5409],
        [   9,    5,   26,  ...,   63,    4, 2579]])
tensor([1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1,
        0, 1, 0, 0, 0, 0, 1, 1])


In [98]:
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters())

In [107]:
class SentimentLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim=200, hidden_dim=256, n_layers=2):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)

        self.lstm = nn.LSTM(input_size=embedding_dim, 
                            hidden_size=hidden_dim, 
                            num_layers=n_layers,
                            batch_first=True, 
                            bidirectional=True)
        
        self.linear1 = nn.Linear(hidden_dim * 2, 1)

        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        print("Input shape:", x.shape)

        embedded = self.dropout(self.embeddings(x))
        print("Embedded shape:", embedded.shape)


        lstm_out, _ = self.lstm(embedded)

        print(f'lstm output shape: {lstm_out.shape}')

        out = self.linear1(lstm_out[:, -1, :])
        print("Output shape:", out.shape)


        torch.sigmoid(out)

model = SentimentLSTM(vocab_size=len(vocab))
model

SentimentLSTM(
  (embeddings): Embedding(10000, 200)
  (lstm): LSTM(200, 256, num_layers=2, batch_first=True, bidirectional=True)
  (linear1): Linear(in_features=512, out_features=1, bias=True)
  (dropout): Dropout(p=0.3, inplace=False)
)

In [111]:
def train_model(model, train_loader, criterion, optimizer, epochs=5):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch in train_loader:
            # Get data
            inputs = batch['input_ids']
            labels = batch['labels'].float()
            
            # Zero gradients
            optimizer.zero_grad()
            
            # Forward pass
            outputs = model(inputs)
            outputs = outputs.squeeze()
            print(f'...{outputs.shape}')
            
            loss = criterion(outputs, labels)
            
            # Backward pass
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
            
        avg_loss = total_loss / len(train_loader)
        print(f'Epoch {epoch+1}, Loss: {avg_loss:.4f}')

In [112]:
train_model(model, train_loader, criterion, optimizer)

Input shape: torch.Size([32, 200])
Embedded shape: torch.Size([32, 200, 200])
lstm output shape: torch.Size([32, 200, 512])
Output shape: torch.Size([32, 1])


AttributeError: 'NoneType' object has no attribute 'squeeze'

In [11]:
import torch.nn as nn

class SentimentRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim=100, hidden_dim=256, n_layers=2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, 
                           batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, 1)
        self.dropout = nn.Dropout(0.3)
        
    def forward(self, x):
        embedded = self.dropout(self.embedding(x))
        lstm_out, _ = self.lstm(embedded)
        out = self.fc(lstm_out[:, -1, :])
        return torch.sigmoid(out)

# Create model and optimizer
model = SentimentRNN(len(vocab))
optimizer = torch.optim.Adam(model.parameters())

def train_model(model, train_loader, criterion, optimizer, epochs=5):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch_idx, batch in tqdm(enumerate(train_loader), desc='Training'):
            # Get data
            inputs = batch['input_ids']
            labels = batch['labels'].float()
            
            # Zero gradients
            optimizer.zero_grad()
            
            # Forward pass
            outputs = model(inputs)
            outputs = outputs.squeeze()
            
            # Calculate loss
            loss = criterion(outputs, labels)
            
            # Backward pass
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
            
            if batch_idx % 1 == 0:
                print(f'Epoch: {epoch+1}, Batch: {batch_idx}, Loss: {loss.item():.4f}')
        
        avg_loss = total_loss / len(train_loader)
        print(f'Epoch {epoch+1} Average Loss: {avg_loss:.4f}')

# Train
train_model(model, train_loader, criterion, optimizer)

NameError: name 'vocab' is not defined