In [None]:
!pip install portalocker



In [None]:
from torchtext.datasets import IMDB
from torchtext.data.utils import get_tokenizer
from collections import defaultdict, Counter
import random
import portalocker

In [None]:
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer
from torchtext.datasets import IMDB
from collections import Counter

# Tokenizer callable (torchtext 'basic_english' preset); yield_tokens below calls it on each text.
tokenizer = get_tokenizer('basic_english')

def yield_tokens(data_iter, special_tokens):
    """Lazily produce token lists for vocabulary construction.

    Each entry of ``special_tokens`` is emitted first as its own
    single-element list; afterwards every ``(label, text)`` pair drawn from
    ``data_iter`` contributes ``tokenizer(text)`` (the label is ignored).

    Args:
        data_iter: iterable of ``(label, text)`` pairs.
        special_tokens: iterable of special token strings.

    Yields:
        list: one list of tokens per special token, then one per text.
    """
    yield from ([special] for special in special_tokens)
    yield from (tokenizer(text) for _label, text in data_iter)

# Special tokens: the same list is both streamed through yield_tokens and
# passed to build_vocab_from_iterator as `specials`.
special_tokens = ["<unk>", "<pad>", "<start>", "<end>"]

# IMDB training split; consumed once below while building the vocabulary.
train_iter = IMDB(split='train')

vocab = build_vocab_from_iterator(yield_tokens(train_iter, special_tokens), specials=special_tokens)

# Use the "<unk>" index as the vocabulary's default index.
vocab.set_default_index(vocab["<unk>"])


In [None]:
def build_ngram_model(data, n=3):
    """Count n-gram continuations over a collection of token sequences.

    Args:
        data: iterable of sequences (each sliceable and supporting ``len``).
        n: n-gram order; the context is the ``n - 1`` items preceding the target.

    Returns:
        defaultdict mapping each context tuple to a ``Counter`` of the items
        observed immediately after that context, with raw counts.
    """
    model = defaultdict(Counter)
    context_len = n - 1
    for sentence in data:
        # Windows never cross sentence boundaries: each sentence is scanned on its own.
        for start in range(len(sentence) - context_len):
            stop = start + context_len
            model[tuple(sentence[start:stop])][sentence[stop]] += 1
    return model


In [None]:
import portalocker
# Manual sanity check that portalocker can take an exclusive lock on a file.
# NOTE(review): input() blocks until Enter is pressed, so this cell stalls an
# unattended "Run All".
with open("test.lock", "w") as lock_file:
    portalocker.lock(lock_file, portalocker.LOCK_EX)
    # No explicit unlock call is made; presumably the lock is released when the
    # file is closed on leaving the `with` block — confirm against portalocker docs.
    input("Lock acquired. Press Enter to release lock...")

# Print the path of the Python interpreter running this kernel.
import sys
print(sys.executable)


Lock acquired. Press Enter to release lock...
/usr/bin/python3


In [None]:
import torch.nn as nn

class LSTMModel(nn.Module):
    """Embedding -> LSTM -> linear head scored from the last time step only.

    Args:
        vocab_size: number of embedding rows and size of the output layer.
        embedding_dim: embedding width / LSTM input size.
        hidden_dim: LSTM hidden size.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # batch_first=True: the LSTM takes input with the batch dimension first.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        """Return the linear head applied to the LSTM output at the final time step."""
        sequence_output, _final_state = self.lstm(self.embedding(x))
        last_step = sequence_output[:, -1, :]
        return self.fc(last_step)


In [None]:
from torchtext.datasets import IMDB
from torchtext.data.utils import get_tokenizer
from collections import Counter

def load_data(data_type='train'):
    """Tokenize an IMDB split into one flat token list, preserving word order.

    The earlier implementation accumulated a ``Counter`` and then expanded it
    back into a list, which grouped identical tokens together and discarded
    the original word order. An n-gram model needs the order, so tokens are
    now appended in the sequence they occur.

    Args:
        data_type: split name forwarded to ``IMDB(split=...)``.

    Returns:
        list: every token of every review, in reading order. Reviews are
        concatenated back to back, so windows built over this list can span
        the boundary between two consecutive reviews.
    """
    tokenizer = get_tokenizer('basic_english')
    tokenized_text = []
    for _label, line in IMDB(split=data_type):
        tokenized_text.extend(tokenizer(line))
    return tokenized_text

# Flat token list for the IMDB training split (iterates the whole split).
tokenized_text = load_data('train')

In [None]:
def build_ngram_model(tokenized_text, n=3):
    """Build a conditional next-token probability table from a flat token list.

    Args:
        tokenized_text: sequence of tokens in reading order.
        n: n-gram order (>= 1); the context is the ``n - 1`` preceding tokens.

    Returns:
        dict mapping each context tuple to a dict ``{next_token: probability}``
        whose probabilities sum to 1 for that context. Empty when the input
        holds fewer than ``n`` tokens.

    Raises:
        ValueError: if ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be at least 1")

    context_len = n - 1
    counts = {}
    # There are len - n + 1 windows of n tokens; the previous bound of
    # len - n skipped the last one, so the final token was never a target.
    for i in range(len(tokenized_text) - n + 1):
        gram = tuple(tokenized_text[i:i + context_len])
        next_word = tokenized_text[i + context_len]
        followers = counts.setdefault(gram, {})
        followers[next_word] = followers.get(next_word, 0) + 1

    # Normalise raw counts into per-context probabilities.
    model = {}
    for gram, followers in counts.items():
        total = float(sum(followers.values()))
        model[gram] = {word: count / total for word, count in followers.items()}
    return model


In [None]:
import random

def generate_text(model, start_text, num_words=20, n=3):
    """Extend a seed phrase by sampling from an n-gram model.

    Args:
        model: mapping from context tuple (the last ``n - 1`` words) to a
            mapping ``{next_word: weight}``; weights may be counts or
            probabilities.
        start_text: seed phrase. It is split on whitespace only and not
            otherwise normalised, so seed words must match the model's tokens
            exactly to be found.
        num_words: maximum number of words to append.
        n: n-gram order the model was built with.

    Returns:
        str: the seed words followed by the generated words, space-joined.
        Generation stops early when the current context is not in the model.
    """
    result = start_text.split()
    context_len = n - 1
    for _ in range(num_words):
        # result[-0:] is the whole list, not an empty slice, so the unigram
        # case (context_len == 0) needs an explicit empty context.
        state = tuple(result[-context_len:]) if context_len > 0 else ()
        next_words = model.get(state)
        if not next_words:
            break
        candidates = list(next_words.keys())
        weights = list(next_words.values())
        result.append(random.choices(candidates, weights=weights)[0])
    return ' '.join(result)


In [None]:
# Reload the training tokens and fit a trigram model on them.
tokenized_text = load_data('train')
ngram_model = build_ngram_model(tokenized_text, n=3)

# Sample five continuations of the same seed phrase (up to 20 extra words each).
for _ in range(5):
    print(generate_text(ngram_model, "My favorite movie", num_words=20, n=3))


My favorite movie
My favorite movie
My favorite movie
My favorite movie
My favorite movie


In [None]:
from torch.utils.data import Dataset
import torch

class IMDBDataset(Dataset):
    """Next-token dataset built from ``(label, text)`` pairs.

    Every text is tokenized, mapped to vocabulary indices and wrapped in
    ``<start>`` / ``<end>`` markers. Tokens missing from the vocabulary map
    to the ``<unk>`` index.

    Args:
        data_iter: iterable of ``(label, text)`` pairs; consumed eagerly.
        vocab: token-to-index lookup. Either a dict-like object exposing
            ``.get`` or an object supporting ``token in vocab`` and
            ``vocab[token]`` (a torchtext ``Vocab`` has no ``.get`` method,
            which made the previous ``.get``-only lookup raise
            ``AttributeError``).
        tokenizer: callable turning a text string into a list of tokens.
    """

    def __init__(self, data_iter, vocab, tokenizer):
        self.vocab = vocab
        self.tokenizer = tokenizer
        # Marker indices are identical for every sample, so resolve them once.
        start_idx = self._lookup('<start>')
        end_idx = self._lookup('<end>')
        self.data = []
        for label, text in data_iter:
            body = [self._lookup(token) for token in self.tokenizer(text)]
            self.data.append(([start_idx] + body + [end_idx], label))

    def _lookup(self, token):
        """Return the index of ``token``, falling back to the ``<unk>`` index."""
        vocab = self.vocab
        if hasattr(vocab, 'get'):
            # Mapping-style vocabulary (e.g. a plain dict).
            return vocab.get(token, vocab.get('<unk>'))
        if token in vocab:
            return vocab[token]
        return vocab['<unk>']

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(input_sequence, target_sequence, label)`` for sample ``idx``.

        The target is the input shifted one position to the left: the input
        drops the final token and the target drops the first one.
        """
        numericalized_text, label = self.data[idx]
        input_sequence = torch.tensor(numericalized_text[:-1], dtype=torch.long)
        target_sequence = torch.tensor(numericalized_text[1:], dtype=torch.long)
        return input_sequence, target_sequence, label

In [None]:
# Model hyper-parameters; the output size follows the current `vocab` length.
embedding_dim = 100
hidden_dim = 256
vocab_size = len(vocab)

# NOTE(review): LSTMModel is defined more than once in this notebook with
# different signatures; this call passes three positional arguments.
model = LSTMModel(vocab_size, embedding_dim, hidden_dim)


In [None]:
from collections import Counter
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer
from torchtext.datasets import IMDB
from torchtext.vocab import Vocab

# Tokenizer callable (torchtext 'basic_english' preset) used by yield_tokens below.
tokenizer = get_tokenizer('basic_english')

def yield_tokens(data_iter):
    """Lazily yield ``tokenizer(text)`` for each ``(label, text)`` pair.

    Args:
        data_iter: iterable of ``(label, text)`` pairs; labels are ignored.

    Yields:
        The tokenizer's output for each text, in iteration order.
    """
    yield from (tokenizer(text) for _label, text in data_iter)

# IMDB training split; consumed once below while building the vocabulary.
train_iter = IMDB(split='train')

# Build the vocabulary from the tokenized training texts, with four special tokens.
vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=['<unk>', '<pad>', '<start>', '<end>'])

# Use the '<unk>' index as the vocabulary's default index.
vocab.set_default_index(vocab['<unk>'])

In [None]:
# Adam over the parameters of whichever `model` is currently bound in the kernel.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# Cross-entropy loss.
criterion = nn.CrossEntropyLoss()


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
import torch.optim as optim

class LSTMModel(nn.Module):
    """Embedding -> LSTM -> linear head applied at every time step.

    Args:
        vocab_size: number of embedding rows.
        embedding_dim: embedding width / LSTM input size.
        hidden_dim: LSTM hidden size.
        num_classes: size of the linear head's output.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # batch_first=True: the LSTM takes input with the batch dimension first.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        """Return the linear head applied to the LSTM output of every step."""
        hidden_states, _final_state = self.lstm(self.embedding(x))
        return self.fc(hidden_states)

class TextDataset(Dataset):
    """Index-aligned pairing of input sequences with their targets.

    Args:
        input_sequences: indexable collection of inputs.
        targets: indexable collection of targets, aligned with the inputs.
    """

    def __init__(self, input_sequences, targets):
        self.input_sequences, self.targets = input_sequences, targets

    def __len__(self):
        # The dataset length is taken from the inputs alone.
        return len(self.input_sequences)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` pair stored at position ``idx``."""
        sample = self.input_sequences[idx]
        target = self.targets[idx]
        return sample, target

def collate_fn(batch):
    """Pad a batch of ``(input_sequence, target)`` pairs to a common length.

    Args:
        batch: non-empty list of ``(input_sequence, target)`` pairs, each a
            1-D tensor or something ``torch.as_tensor`` can convert.

    Returns:
        tuple: ``(inputs, targets)`` padded with 0 to the longest sequence
        in the batch, batch dimension first.
    """
    input_sequences, targets = zip(*batch)
    # torch.as_tensor passes existing tensors through unchanged, whereas
    # torch.tensor(tensor) emits a copy-construct UserWarning for each item.
    input_sequences_padded = pad_sequence(
        [torch.as_tensor(seq) for seq in input_sequences], batch_first=True, padding_value=0)
    targets_padded = pad_sequence(
        [torch.as_tensor(tgt) for tgt in targets], batch_first=True, padding_value=0)
    return input_sequences_padded, targets_padded

# Sizes of the synthetic problem used to smoke-test the training loop.
vocab_size = 10000
num_classes = 20

# Random data: 1000 sequences of length 10, with one random class per position.
input_sequences = torch.randint(0, vocab_size, (1000, 10))
targets = torch.randint(0, num_classes, (1000, 10))

dataset = TextDataset(input_sequences, targets)
train_loader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

model = LSTMModel(vocab_size, embedding_dim=100, hidden_dim=256, num_classes=num_classes)
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()

# Mean training loss per epoch.
loss_values = []

num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    # Batch variables have their own names so the notebook-level
    # `input_sequences` / `targets` are not overwritten by the last batch.
    for batch_inputs, batch_targets in train_loader:
        batch_inputs, batch_targets = batch_inputs.long(), batch_targets.long()

        optimizer.zero_grad()
        predictions = model(batch_inputs)

        # Flatten batch and time so each position is scored as one sample.
        loss = criterion(predictions.view(-1, num_classes), batch_targets.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    average_loss = total_loss / len(train_loader)
    loss_values.append(average_loss)
    print(f'Epoch {epoch+1}, Loss: {average_loss}')

  input_sequences_padded = pad_sequence([torch.tensor(seq) for seq in input_sequences], batch_first=True, padding_value=0)
  targets_padded = pad_sequence([torch.tensor(tgt) for tgt in targets], batch_first=True, padding_value=0)


Epoch 1, Loss: 2.997724272310734
Epoch 2, Loss: 2.9617259427905083
Epoch 3, Loss: 2.9171161204576492
Epoch 4, Loss: 2.8302216082811356
Epoch 5, Loss: 2.6873877570033073


In [None]:
import os

# Read the whole text file into `data` when it exists; otherwise only report it.
file_path = '/content/bul.txt'
if not os.path.exists(file_path):
    # `data` is left unassigned on this branch.
    print("File not found. Please check the file path.")
else:
    with open(file_path, 'r', encoding='utf-8') as f:
        data = f.read()
    print("Dataset loaded successfully.")


File not found. Please check the file path.


In [None]:
!pip install gensim
import gensim.downloader as api

# Download the GloVe vectors on first use and load them in a single call.
# The earlier duplicate api.load(...) whose result was discarded is removed.
# NOTE: api.load returns a gensim vectors object, not a plain dict.
glove_embeddings = api.load('glove-wiki-gigaword-100')

# Get the word vector for a given word
word_vector = glove_embeddings.get_vector('word')



In [None]:
from gensim.models import KeyedVectors

In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip

--2024-03-20 03:50:07--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2024-03-20 03:50:07--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2024-03-20 03:50:08--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [None]:
!unzip glove.6B.zip

Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [None]:
# Check if the file exists at the specified path
!ls /content/glove.6B.100d.txt

# If the file does not exist, download it
if not os.path.exists('/content/glove.6B.100d.txt'):
    !wget http://nlp.stanford.edu/data/glove.6B.zip
    !unzip glove.6B.zip

# Update the file path if necessary
glove_path = '/content/glove.6B.100d.txt'

# Load the embeddings
glove_embeddings = load_glove_embeddings(glove_path)

/content/glove.6B.100d.txt


In [None]:
# Path of the GloVe vectors text file read by load_glove_embeddings below.
glove_path = '/content/glove.6B.100d.txt'
import numpy as np

def load_glove_embeddings(path):
    """Parse a whitespace-separated word-vector text file into a dict.

    Each non-blank line is expected to hold a word followed by its vector
    components. Blank lines are skipped; previously a blank or trailing
    empty line raised ``IndexError``.

    Args:
        path: location of the UTF-8 text file.

    Returns:
        dict mapping each word to a ``float32`` numpy array of its components.
    """
    embeddings = {}
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                continue
            word = values[0]
            embeddings[word] = np.asarray(values[1:], dtype='float32')
    return embeddings

# Parse the GloVe file into a {word: vector} dict (reads the whole file).
glove_embeddings = load_glove_embeddings(glove_path)
import torch
import torch.nn as nn

# Toy vocabulary for demonstrating pretrained-embedding initialisation.
# NOTE(review): this rebinds `vocab`, which earlier cells bind to a torchtext vocabulary.
vocab = ['hello', 'world', '<unk>', '<pad>']
vocab_size, embedding_dim = len(vocab), 100
weights_matrix = np.zeros((vocab_size, embedding_dim))

# One row per vocabulary entry: the GloVe vector when the word is known,
# otherwise a random normal vector (scale 0.6).
for i, word in enumerate(vocab):
    if word in glove_embeddings:
        weights_matrix[i] = glove_embeddings[word]
    else:
        weights_matrix[i] = np.random.normal(scale=0.6, size=(embedding_dim, ))

# Copy the assembled matrix into a fresh embedding layer; as the cell's last
# expression, the copied weight tensor is displayed.
embedding_layer = nn.Embedding(vocab_size, embedding_dim)
embedding_layer.weight.data.copy_(torch.from_numpy(weights_matrix))


tensor([[ 2.6688e-01,  3.9632e-01,  6.1690e-01, -7.7451e-01, -1.0390e-01,
          2.6697e-01,  2.7880e-01,  3.0992e-01,  5.4685e-03, -8.5256e-02,
          7.3602e-01, -9.8432e-02,  5.4790e-01, -3.0305e-02,  3.3479e-01,
          1.4094e-01, -7.0003e-03,  3.2569e-01,  2.2902e-01,  4.6557e-01,
         -1.9531e-01,  3.7491e-01, -7.1390e-01, -5.1775e-01,  7.7039e-01,
          1.0881e+00, -6.6011e-01, -1.6234e-01,  9.1190e-01,  2.1046e-01,
          4.7494e-02,  1.0019e+00,  1.1133e+00,  7.0094e-01, -8.6960e-02,
          4.7571e-01,  1.6360e-01, -4.4469e-01,  4.4690e-01, -9.3817e-01,
          1.3101e-02,  8.5964e-02, -6.7456e-01,  4.9662e-01, -3.7827e-02,
         -1.1038e-01, -2.8612e-01,  7.4606e-02, -3.1527e-01, -9.3774e-02,
         -5.7069e-01,  6.6865e-01,  4.5307e-01, -3.4154e-01, -7.1660e-01,
         -7.5273e-01,  7.5212e-02,  5.7903e-01, -1.1910e-01, -1.1379e-01,
         -1.0026e-01,  7.1341e-01, -1.1574e+00, -7.4026e-01,  4.0452e-01,
          1.8023e-01,  2.1449e-01,  3.