In [53]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader

from tqdm import tqdm

In [54]:
data = pd.read_csv("diffusion/train.csv")

In [55]:
data.head()

Unnamed: 0,prompt
0,"man waking up, dark and still room, cinematic ..."
1,Yate con familia feliz navegando por el mar ca...
2,"Many friendly alien race individuals. fantasy,..."
3,"theo james as cyclops, cyberpunk futuristic ne..."
4,Portrait of a beautiful woman with long hair o...


In [56]:
data = data.astype('str')

In [57]:
data['prompt'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 726362 entries, 0 to 726361
Series name: prompt
Non-Null Count   Dtype 
--------------   ----- 
726362 non-null  object
dtypes: object(1)
memory usage: 5.5+ MB


In [58]:
# 1. Text Cleaning (basic example, may need to be adjusted based on your dataset)
def clean_text(text):
    text = text.lower()
    text = text.replace('.', ' ').replace(',', ' ')
    text = ' '.join(text.split())  # Remove extra spaces
    return text

data['cleaned_text'] = data['prompt'].apply(clean_text)

In [59]:
data.head()

Unnamed: 0,prompt,cleaned_text
0,"man waking up, dark and still room, cinematic ...",man waking up dark and still room cinematic li...
1,Yate con familia feliz navegando por el mar ca...,yate con familia feliz navegando por el mar ca...
2,"Many friendly alien race individuals. fantasy,...",many friendly alien race individuals fantasy s...
3,"theo james as cyclops, cyberpunk futuristic ne...",theo james as cyclops cyberpunk futuristic neo...
4,Portrait of a beautiful woman with long hair o...,portrait of a beautiful woman with long hair o...


In [60]:
# 2. Tokenization
from collections import Counter
from itertools import chain

def tokenize(text):
    return text.split()

tokenized_texts = data['cleaned_text'].apply(tokenize)

In [61]:
word_counts = Counter(chain(*tokenized_texts[:10000]))
vocab = sorted(word_counts, key=word_counts.get, reverse=True)
word_to_idx = {word: idx + 1 for idx, word in enumerate(vocab)}
word_to_idx['<pad>'] = 0
idx_to_word = {idx: word for word, idx in word_to_idx.items()}
total_words = len(word_to_idx)

In [62]:
# 3. Indexing and Sequence Creation
input_sequences = []
for line in tokenized_texts[:10000]:
    token_list = [word_to_idx[word] for word in line]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

In [63]:
input_sequences

[[52, 10074],
 [52, 10074, 140],
 [52, 10074, 140, 62],
 [52, 10074, 140, 62, 3],
 [52, 10074, 140, 62, 3, 257],
 [52, 10074, 140, 62, 3, 257, 177],
 [52, 10074, 140, 62, 3, 257, 177, 33],
 [52, 10074, 140, 62, 3, 257, 177, 33, 53],
 [52, 10074, 140, 62, 3, 257, 177, 33, 53, 5104],
 [52, 10074, 140, 62, 3, 257, 177, 33, 53, 5104, 77],
 [52, 10074, 140, 62, 3, 257, 177, 33, 53, 5104, 77, 115],
 [52, 10074, 140, 62, 3, 257, 177, 33, 53, 5104, 77, 115, 59],
 [52, 10074, 140, 62, 3, 257, 177, 33, 53, 5104, 77, 115, 59, 1831],
 [52, 10074, 140, 62, 3, 257, 177, 33, 53, 5104, 77, 115, 59, 1831, 6],
 [52, 10074, 140, 62, 3, 257, 177, 33, 53, 5104, 77, 115, 59, 1831, 6, 6610],
 [52,
  10074,
  140,
  62,
  3,
  257,
  177,
  33,
  53,
  5104,
  77,
  115,
  59,
  1831,
  6,
  6610,
  6611],
 [52,
  10074,
  140,
  62,
  3,
  257,
  177,
  33,
  53,
  5104,
  77,
  115,
  59,
  1831,
  6,
  6610,
  6611,
  6],
 [52,
  10074,
  140,
  62,
  3,
  257,
  177,
  33,
  53,
  5104,
  77,
  115,
  59,

In [64]:
# 4. Padding
max_sequence_len = max([len(x) for x in input_sequences])
input_sequences = [seq + [0] * (max_sequence_len - len(seq)) for seq in input_sequences]

In [65]:
# 5. Prepare Labels
input_sequences = np.array(input_sequences)
X, y = input_sequences[:, :-1], input_sequences[:, -1]


In [66]:
X[:10]

array([[   52, 10074,     0, ...,     0,     0,     0],
       [   52, 10074,   140, ...,     0,     0,     0],
       [   52, 10074,   140, ...,     0,     0,     0],
       ...,
       [   52, 10074,   140, ...,     0,     0,     0],
       [   52, 10074,   140, ...,     0,     0,     0],
       [   52, 10074,   140, ...,     0,     0,     0]])

In [67]:
# 6. Train-Test Split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)

In [68]:
# 7. Dataset and DataLoader
class TextDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.long)
        self.y = torch.tensor(y, dtype=torch.long)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_dataset = TextDataset(X_train, y_train)
val_dataset = TextDataset(X_val, y_val)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=4)


In [69]:
# 8. Model Building
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, output_size, num_layers=2, dropout=0.2):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers=num_layers, dropout=dropout, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.lstm(x)
        x = self.dropout(x)
        x = self.fc(x[:, -1, :])
        return x


In [70]:
model = LSTMModel(total_words, 128, 256, total_words, num_layers=3, dropout=0.3)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [71]:
# 9. Model Training
def train(model, loader, criterion, optimizer, device):
    model.train()
    total_loss = 0
    for X_batch, y_batch in loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

In [72]:
def evaluate(model, loader, criterion, device):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for X_batch, y_batch in loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            total_loss += loss.item()
    return total_loss / len(loader)

In [73]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

In [74]:
device

device(type='cuda')

In [None]:
epochs = 30
for epoch in tqdm(range(epochs)):
    train_loss = train(model, train_loader, criterion, optimizer, device)
    val_loss = evaluate(model, val_loader, criterion, device)
    print(f'Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')

  3%|▎         | 1/30 [06:26<3:06:47, 386.46s/it]

Epoch 1/30, Train Loss: 0.0143, Val Loss: 0.0000


  7%|▋         | 2/30 [12:52<3:00:16, 386.29s/it]

Epoch 2/30, Train Loss: 0.0002, Val Loss: 0.0000


 13%|█▎        | 4/30 [25:45<2:47:22, 386.25s/it]

Epoch 4/30, Train Loss: 0.0001, Val Loss: 0.0000


 17%|█▋        | 5/30 [32:11<2:40:57, 386.28s/it]

Epoch 5/30, Train Loss: 0.0001, Val Loss: 0.0000


 20%|██        | 6/30 [38:37<2:34:30, 386.26s/it]

Epoch 6/30, Train Loss: 0.0001, Val Loss: 0.0000


 23%|██▎       | 7/30 [45:03<2:28:01, 386.16s/it]

Epoch 7/30, Train Loss: 0.0001, Val Loss: 0.0000


 27%|██▋       | 8/30 [51:29<2:21:35, 386.16s/it]

Epoch 8/30, Train Loss: 0.0002, Val Loss: 0.0000


 30%|███       | 9/30 [57:55<2:15:09, 386.17s/it]

Epoch 9/30, Train Loss: 0.0002, Val Loss: 0.0000


 33%|███▎      | 10/30 [1:04:21<2:08:42, 386.13s/it]

Epoch 10/30, Train Loss: 0.0002, Val Loss: 0.0000


 37%|███▋      | 11/30 [1:10:49<2:02:24, 386.53s/it]

Epoch 11/30, Train Loss: 0.0002, Val Loss: 0.0000


 40%|████      | 12/30 [1:17:15<1:55:56, 386.47s/it]

Epoch 12/30, Train Loss: 0.0002, Val Loss: 0.0000


 43%|████▎     | 13/30 [1:23:41<1:49:27, 386.35s/it]

Epoch 13/30, Train Loss: 0.0002, Val Loss: 0.0000


 47%|████▋     | 14/30 [1:30:07<1:43:00, 386.26s/it]

Epoch 14/30, Train Loss: 0.0002, Val Loss: 0.0000


 50%|█████     | 15/30 [1:36:33<1:36:32, 386.18s/it]

Epoch 15/30, Train Loss: 0.0001, Val Loss: 0.0000


 53%|█████▎    | 16/30 [1:43:00<1:30:06, 386.17s/it]

Epoch 16/30, Train Loss: 0.0002, Val Loss: 0.0000


 57%|█████▋    | 17/30 [1:49:26<1:23:40, 386.18s/it]

Epoch 17/30, Train Loss: 0.0002, Val Loss: 0.0000


 60%|██████    | 18/30 [1:55:52<1:17:13, 386.15s/it]

Epoch 18/30, Train Loss: 0.0002, Val Loss: 0.0000


 63%|██████▎   | 19/30 [2:02:18<1:10:47, 386.16s/it]

Epoch 19/30, Train Loss: 0.0002, Val Loss: 0.0000


 67%|██████▋   | 20/30 [2:08:44<1:04:20, 386.09s/it]

Epoch 20/30, Train Loss: 0.0002, Val Loss: 0.0000


 70%|███████   | 21/30 [2:15:10<57:54, 386.08s/it]  

Epoch 21/30, Train Loss: 0.0002, Val Loss: 0.0000


 73%|███████▎  | 22/30 [2:21:36<51:28, 386.06s/it]

Epoch 22/30, Train Loss: 0.0002, Val Loss: 0.0000


 77%|███████▋  | 23/30 [2:28:02<45:02, 386.02s/it]

Epoch 23/30, Train Loss: 0.0003, Val Loss: 0.0000


 80%|████████  | 24/30 [2:34:28<38:36, 386.10s/it]

Epoch 24/30, Train Loss: 0.0002, Val Loss: 0.0000


 83%|████████▎ | 25/30 [2:40:54<32:10, 386.06s/it]

Epoch 25/30, Train Loss: 0.0002, Val Loss: 0.0000


 87%|████████▋ | 26/30 [2:47:20<25:44, 386.10s/it]

Epoch 26/30, Train Loss: 0.0002, Val Loss: 0.0000


 90%|█████████ | 27/30 [2:53:46<19:18, 386.08s/it]

Epoch 27/30, Train Loss: 0.0002, Val Loss: 0.0000


 93%|█████████▎| 28/30 [3:00:12<12:52, 386.05s/it]

Epoch 28/30, Train Loss: 0.0003, Val Loss: 0.0000


 97%|█████████▋| 29/30 [3:06:38<06:26, 386.04s/it]

Epoch 29/30, Train Loss: 0.0002, Val Loss: 0.0000


100%|██████████| 30/30 [3:13:04<00:00, 386.16s/it]

Epoch 30/30, Train Loss: 0.0002, Val Loss: 0.0000





In [None]:
torch.save(model,"diffusion_prompt_model.pt")

In [79]:
# 10. Generate Text (example function)
def generate_text(model, seed_text, next_words, max_sequence_len, device):
    model.eval()
    words = seed_text.split()
    for _ in range(next_words):
        token_list = [word_to_idx[word] for word in words if word in word_to_idx]
        token_list = [0] * (max_sequence_len - len(token_list)) + token_list
        token_list = torch.tensor(token_list[-max_sequence_len:], dtype=torch.long).unsqueeze(0).to(device)
        with torch.no_grad():
            predicted = model(token_list)
            predicted_word_index = torch.argmax(predicted, axis=-1).item()
            if predicted_word_index == word_to_idx['<pad>']:
                break
            words.append(idx_to_word[predicted_word_index])
    return ' '.join(words)




In [80]:
print(generate_text(model, "Many friendly", 5, max_sequence_len, device))

Many friendly
