In [1]:
# mounting GoogleDrive for Colab
try:
 from google.colab import drive
 drive.mount('/content/gdrive', force_remount=True)
 FOLDERNAME = 'Friends_Generator'
 %cd /content/gdrive/My\ Drive/$FOLDERNAME
except ImportError:
 pass

Mounted at /content/gdrive
/content/gdrive/My Drive/Friends_Generator


#Scraping Raw Data

In [None]:
import requests
from bs4 import BeautifulSoup

In [None]:
#Run for each season (1 to 8 usable) [manually combining all of them (for quality control)]
season = 1            #To change
script = ''
for i in range(1,25):
  episode = str(season) + '0'*(i<10) + str(i) 
  url = f'https://www.oocities.org/friends_greatestsitcom4/script/{episode}.htm'

  response = requests.get(url, verify=False)
  soup = BeautifulSoup(response.text, 'html.parser')
  script += soup.get_text()

with open(f'script_{season}.txt', 'w') as file:
  file.write(script)

#Cleaning Data

In [None]:
with open('friends_script.txt', 'r') as file:
  script = file.read()

In [None]:
#Names of locutor are inconsistent
#line breaking is inconsistent
#quotation marks inconsistent

names = {'Rachel:' : 'RACHEL:',
        'Monica:' : 'MONICA:',
        'Phoebe:' : 'PHOEBA:',
        'Joey:' : 'JOEY:',
        'Chandler:' : 'CHANDLER:',
        'Ross:' : 'ROSS:',
        'RACH:' : 'RACHEL:',
        'MNCA:' : 'MONICA:',
        'PHOE:' : 'PHOEBE:',
        'CHAN:' : 'CHANDLER',
        'Janice:' : 'JANICE:',
        'Pete:' : 'PETE:',
        'All:' : 'ALL:',
        'The Girls:' : 'THE GIRLS:',
        'Kathy:' : 'KATHY:',
        'Emily:' : 'EMILY:',
        'Joshua:' : 'JOSHUA:',
        'Elizabeth:' : 'ELIZABETH:',
        'Mona:' : 'MONA:'
}

quotes = {'‘' : '"',
          '’' : '"',
          '“' : '"',
          '”' : '"'    
}

script = script.replace('\n\n', '\n')
script = script.replace('\t', ' ')
script = script.replace('�', '\'')
for name, replacement in names.items():
  script = script.replace(name, replacement)
for quote, replacement in quotes.items():
  script = script.replace(quote, replacement)

In [None]:
with open('friends_script.txt', 'w') as file:
  file.write(script)

#The Model

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm

In [3]:
batch_size = 64
block_size = 300 #max context size
max_iters = 20000 #training loop steps
eval_interval = 500 #every steps we eval loss
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200 #number of random batches used to eval loss
n_embd = 32 #number of embedding dimension
n_heads = 4 #number of attention heads
n_blocks = 6 #number of unit blocks in transformer
dropout = .1 #dropout in dropout layers in training

In [4]:
with open('friends_script.txt', 'r') as file:
  text = file.read()

In [5]:
#unique characters
chars = sorted(list(set(text)))
vocab_size = len(chars)

In [6]:
#mapping functions char to int (basic tokenizer)
stoi = {ch:i for i, ch in enumerate(chars)}
itos = {i:ch for i, ch in enumerate(chars)}

encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join([itos[i] for i in l])

In [7]:
#data loading and train/val split
data = torch.tensor(encode(text), dtype=torch.long)
train_data = data[:int(0.9*len(data))]
val_data = data[int(0.9*len(data)):]

In [8]:
def get_batch(split):
  #generate a random batch of data with input x and targets y
  data = train_data if split == 'train' else val_data
  ix = torch.randint(len(data) - block_size, (batch_size,))
  x = torch.stack([data[i:i+block_size] for i in ix])
  y = torch.stack([data[i+1:i+1+block_size] for i in ix])
  return x.to(device), y.to(device)

In [9]:
#function to give steady estimation of loss
@torch.no_grad()
def estimate_loss(model):
  out = {}
  model.eval()
  for split in ['train', 'val']:
    losses = torch.zeros(eval_iters)
    for k in range(eval_iters):
      X, Y = get_batch(split)
      logits, loss = model(X, Y)
      losses[k] = loss.item()
    out[split] = losses.mean()
  model.train()
  return out

In [10]:
class GPTLanguageModel(nn.Module):
  def __init__(self):
    super().__init__()
    self.token_embedding_table = nn.Embedding(vocab_size, n_embd)   #embedding token
    self.position_embedding_table = nn.Embedding(block_size, n_embd) #create an embedding for position idx too
    self.transformer = nn.Sequential(*[Block(n_embd=n_embd, n_heads=n_heads) for _ in range(n_blocks)])
    self.layernorm_final = nn.LayerNorm(n_embd)
    self.language_modeling_head = nn.Linear(n_embd, vocab_size)     #linear layer

  def forward(self, idx, targets=None):
    #idx and targets are (B,T)
    B, T = idx.shape
    tok_emb = self.token_embedding_table(idx)  #(B,T,C) doesn't give directly logits as we go through intermediate dimension of embedding
    pos_emb = self.position_embedding_table(torch.arange(T, device=device)) #(T,C)
    x = tok_emb + pos_emb   #(B,T,C) pos_emb are broadcasted along B dim
    x = self.transformer(x) #(B,T,C)
    x = self.layernorm_final(x) #(B,T,C)
    logits = self.language_modeling_head(x)    #(B,T,vocab_size)

    if targets != None:
      B, T, C = logits.shape
      logits = logits.view(B*T,C) #(B*T,C)
      targets = targets.view(B*T)
      loss = F.cross_entropy(logits, targets)
      return logits, loss
    return logits, None
  
  def generate(self, idx, max_new_tokens):      #Used to create new text (not in training phase)
    #idx is (B,T)
    for _ in range(max_new_tokens):
      idx_cond = idx[:, -block_size:]  #have to crop to the last block size of given context
      logits, loss = self(idx_cond) #get predictions
      logits = logits[:, -1, :] #focus on last time step (B,C) (history not used yet)
      probs = F.softmax(logits, dim=-1) #get probas (B,C)
      idx_next = torch.multinomial(probs, num_samples=1) #sample from the probs distribution (B,1)
      idx = torch.cat((idx, idx_next), dim=1) #append new index to running sequence (B,T+1)
    return idx

In [11]:
class Head(nn.Module):
  #One head of attention mechanism
  def __init__(self, head_size):
    super().__init__()
    self.key = nn.Linear(n_embd, head_size, bias=False)
    self.query = nn.Linear(n_embd, head_size, bias=False)
    self.value = nn.Linear(n_embd, head_size, bias=False)
    self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size))) #create the 'tril' variable from buffer as not a parameter of the model (lower triangular matrix of ones - will be useful for masking)
    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    B,T,C = x.shape
    k = self.key(x) #(B,T,C)
    q = self.query(x) #(B,T,C)
    v = self.value(x) #(B,T,C)

    #compute the weights to be used in attention 
    weights = q @ k.transpose(-2,-1) * torch.sqrt(torch.tensor(C)).item() #(B,T,C) @ (B,C,T) = (B,T,T) (also the paper introduces a 1/sqrt(C) factor to avoid (one-hot)-like weights after softmax)
    weights = weights.masked_fill(self.tril[:T,:T] == 0, float('-inf')) #masking upper triangular values as we can't communicate with future (B,T,T)
    weights = F.softmax(weights, dim=-1) #softmax to normalize (as probas) (B,T,T)
    weights = self.dropout(weights)

    #finnaly aggregate the values using these weights
    out = weights @ v #(B,T,T) @ (B,T,C) = (B,T,C)
    return out

In [12]:
class MultiHead(nn.Module):
#Multiple head model
  def __init__(self, num_heads, head_size):
    super().__init__()
    self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
    self.projection = nn.Linear(n_embd, n_embd)   #need to project the list back into shape for skip connection (+ with original x)
    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    out = torch.cat([h(x) for h in self.heads], dim=-1)
    out = self.projection(out)
    #out = self.dropout(x)
    return out

In [13]:
class FeedForward(nn.Module):
#Final part of decoder-only-transformer block
  def __init__(self, n_embd):
    super().__init__()
    self.net = nn.Sequential(
        nn.Linear(n_embd, 4*n_embd),    #factor 4 comes from paper
        nn.ReLU(),
        nn.Linear(4*n_embd, n_embd),  #here to project the list back into shape for skip connection (+ with original x)
        nn.Dropout(dropout),
    )

  def forward(self, x):
    return self.net(x)

In [14]:
class Block(nn.Module):
#(decoder only)-Transformer unit block
  def __init__(self, n_embd, n_heads):
    super().__init__()
    head_size = n_embd // n_heads
    self.multi_attention = MultiHead(n_heads, head_size)
    self.feedforward = FeedForward(n_embd)
    self.layernorm1 = nn.LayerNorm(n_embd)
    self.layernorm2 = nn.LayerNorm(n_embd)

  def forward(self, x):
    x = x + self.multi_attention(self.layernorm1(x))  #with skip connection (layer normalization is applied before skip connection unlike in the paper)
    x = x + self.feedforward(self.layernorm2(x))  #with skip connection (layer normalization is applied before skip connection unlike in the paper)
    return x

#Training

In [None]:
#Model
model = GPTLanguageModel().to(device)

In [None]:
learning_rate=1e-2
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

#training loop
for iter in tqdm(range(max_iters)):
  #update learning rate
  if iter % 5000 == 0 and iter > 0:
    learning_rate /= 10
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

  #evaluate loss
  if iter % eval_interval == 0:
    losses = estimate_loss(model)
    print(f"iter {iter}: train_loss={losses['train']:.3f}, val_loss={losses['val']:.3f}")

  xb, yb = get_batch('train')
  logits, loss = model(xb, yb)
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()

final_loss = estimate_loss(model)['val']

In [None]:
#save model
torch.save(model, f'model_{final_loss:.2f}.pth')

In [None]:
print(sum(p.numel() for p in model.parameters())/1e3, 'k parameters')

91.487 k parameters


#Loading and Dumping model in pickel

In [15]:
#Loading and evaluating model
weights_file = 'model_1.33.pth'

model = torch.load(weights_file, map_location=device)
model.eval()

estimate_loss(model)

{'train': tensor(1.3097), 'val': tensor(1.3312)}

In [19]:
import pickle

pickle_model = {
    'model':model,
    }

with open('model_pickel.pkl', 'wb') as file:
  pickle.dump(pickle_model, file)


#Tests

In [None]:
context = 'CHANDLER: Stop messing around Joey!\n'
input = torch.tensor(encode(context), dtype=torch.long).view(1,len(context)).to(device)

In [None]:
print(decode(model.generate(idx = input, max_new_tokens=1000)[0].tolist()))

CHANDLER: Stop messing around Joey!
Hey.
JOEY: Oh my!
JOEY: No-no, you're gonna wants minutes Ursival
to room.
CHANDLER: We tell take the gigleact it, I got that. So I'm from him lod bar-
Ben: Oh we don't got Ross on, why don"t you what"s shone. I"m that? (She was over gass in off allief?
[Monica sfids."
Joey.NER: He doesn'te doing much ended
you know, that Ross"s old much guys to at And! (To Joey enough.)
RACHEL: Thank we six about at the 
guy.
MONICA: What you have a hoor?
[Scene: Central Perk, Chandler are catce onoment to Sweas.
ROSS: Oh, closh. Ross's puting coffect bunies?)
ROSS: Yeah, it up on look to sreame anything to of the picting is that didno her sulibly lown"t asrwall prolm up! My Thank it!
CHANDLER: Shaceles me. I donkstad! I"m 1400 up.
Monica) Mof ofezelow I"m up.
JOEY: What about then with be you would gonna say the 17; deack.
JOEY: No! (Some faceming lask.) 
RACL: It wanna go to molle, perseuler, hads By.
We"ll: (spils.)
Ross, Russ, even bags. This right sorry.JUNICA: