In [1]:
import pandas as pd

In [2]:
# Load the CSV file
df = pd.read_csv("/home/sehar/schittVision/schitts_creek_combined_dialogues.csv")

In [3]:
print(df.shape[0])  # Total number of rows
print(df.isnull().sum())  # Count NaN values
print(df[df['Dialogue'].str.strip() == ''])  # Check for empty strings
df = df.reset_index(drop=True)

36011
Character    0
Dialogue     0
dtype: int64
Empty DataFrame
Columns: [Character, Dialogue]
Index: []


In [4]:
df = df.dropna(subset=['Dialogue'])  # Drop rows where Dialogue is NaN
df = df[df['Dialogue'].str.strip() != '']  # Drop rows with empty Dialogue


In [5]:
print(df.shape[0])  # Confirm the number of rows


36011


In [6]:
# View the first few rows
print(df.head())

  Character                                           Dialogue
0   unknown  Episode transcripts for the TV show "Schitt's ...
1   unknown                                      (Birds chirp)
2   unknown                                   (Footsteps echo)
3   unknown                                   (Footstep clomp)
4   unknown                                (Doors creaks open)


In [7]:
# Check for missing values
print(df.isnull().sum())

Character    0
Dialogue     0
dtype: int64


In [8]:
df.value_counts

<bound method DataFrame.value_counts of       Character                                           Dialogue
0       unknown  Episode transcripts for the TV show "Schitt's ...
1       unknown                                      (Birds chirp)
2       unknown                                   (Footsteps echo)
3       unknown                                   (Footstep clomp)
4       unknown                                (Doors creaks open)
...         ...                                                ...
36006   unknown                                          Love you.
36007   unknown                                      Johnny: Wait!
36008   unknown                                      Stop the car!
36009   unknown                                        What is it?
36010   unknown                         Just wanted one last look.

[36011 rows x 2 columns]>

In [9]:
# Drop rows with missing dialogues
df.dropna(subset=['Dialogue'], inplace=True)

In [10]:
df.value_counts

<bound method DataFrame.value_counts of       Character                                           Dialogue
0       unknown  Episode transcripts for the TV show "Schitt's ...
1       unknown                                      (Birds chirp)
2       unknown                                   (Footsteps echo)
3       unknown                                   (Footstep clomp)
4       unknown                                (Doors creaks open)
...         ...                                                ...
36006   unknown                                          Love you.
36007   unknown                                      Johnny: Wait!
36008   unknown                                      Stop the car!
36009   unknown                                        What is it?
36010   unknown                         Just wanted one last look.

[36011 rows x 2 columns]>

In [11]:

import re
import pandas as pd

# Helper function to clean the dialogue
def clean_dialogue(dialogue):
    # Remove unwanted characters or excessive spaces
    dialogue = re.sub(r'[^a-zA-Z0-9\s.,!?;\'\"-]', '', dialogue)
    dialogue = re.sub(r'\s+', ' ', dialogue)  # Replace multiple spaces with single space
    return dialogue.strip()

# Apply cleaning
df['Dialogue'] = df['Dialogue'].apply(clean_dialogue)

# Drop empty or very short dialogues
df = df[df['Dialogue'].apply(lambda x: len(x.split()) > 3)]  # Keeps dialogues with more than 3 words

# Save the cleaned CSV
df.to_csv('schitts_creek_dialogues_cleaned.csv', index=False)
print("Cleaned data saved to schitts_creek_dialogues_cleaned.csv")


Cleaned data saved to schitts_creek_dialogues_cleaned.csv


In [12]:
df.value_counts

<bound method DataFrame.value_counts of            Character                                           Dialogue
0            unknown  Episode transcripts for the TV show "Schitt's ...
7      Revenue agent  Missus Rose! There are people here from the go...
10     Revenue agent  John, I've been stripped of every morsel of pl...
11            Johnny             Well, how do you think I feel, Moira?!
12            Johnny                    Eli was family, for God's sake!
...              ...                                                ...
35999        unknown                                I'm so proud of us.
36003        unknown                                      - I love you!
36004        unknown                                  - I love you too.
36005        unknown                                 Moira We love you!
36010        unknown                         Just wanted one last look.

[24690 rows x 2 columns]>

In [13]:
# import pandas as pd

# # Load the dataset
# df = pd.read_csv('schitts_creek_dialogues_cleaned.csv')

# # Define the characters you want to keep
# wanted_characters = ['Moira', 'Johnny', 'David', 'Alexis']

# # Filter the dataset to include only the wanted characters
# df_filtered = df[df['Character'].isin(wanted_characters)]

# # Save the filtered dataset
# df_filtered.to_csv('schitts_creek_dialogues_main_characters.csv', index=False)
# print("Filtered data saved to schitts_creek_dialogues_main_characters.csv")

In [14]:
from transformers import AutoTokenizer
import torch

# Load the cleaned dataset
df = pd.read_csv('schitts_creek_dialogues_cleaned.csv')

# Initialize a tokenizer (e.g., BERT tokenizer)
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the dialogues
tokens = tokenizer(
    df['Dialogue'].tolist(),
    padding=True,  # Add padding to make sequences the same length
    truncation=True,  # Truncate sequences longer than the model's max length
    max_length=128,  # Define max sequence length
    return_tensors='pt'  # Return PyTorch tensors
)

# Inspect the tokenized output
print(tokens['input_ids'].shape)  # Shape: (num_samples, max_length)

# Save tokenized data if needed
torch.save(tokens, 'tokenized_data.pt')
print("Tokenized data saved to tokenized_data.pt")


  from .autonotebook import tqdm as notebook_tqdm


torch.Size([24690, 128])
Tokenized data saved to tokenized_data.pt


In [15]:
print(tokens)

{'input_ids': tensor([[  101,  2792, 24051,  ...,     0,     0,     0],
        [  101,  3335,  2271,  ...,     0,     0,     0],
        [  101,  2198,  1010,  ...,     0,     0,     0],
        ...,
        [  101,  1011,  1045,  ...,     0,     0,     0],
        [  101, 25175,  2527,  ...,     0,     0,     0],
        [  101,  2074,  2359,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}


In [16]:
from torch.utils.data import Dataset

class DialogueDatasetMLM(Dataset):
    def __init__(self, dialogues, tokenizer, max_length=128, mask_prob=0.15):
        self.dialogues = dialogues
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.mask_prob = mask_prob

    def __len__(self):
        return len(self.dialogues)

    def __getitem__(self, idx):
        #dialogue = self.dialogues[idx]
        encoding = tokens
        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)

        # Apply masking
        input_ids, labels = self._mask_tokens(input_ids)

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }

    def _mask_tokens(self, input_ids):
        labels = input_ids.clone()  # Clone input_ids for label generation

        # Create a probability matrix for masking
        probability_matrix = torch.full(input_ids.shape, self.mask_prob, device=input_ids.device)

        # Handle special tokens mask for each sequence in the batch
        special_tokens_mask = [
            self.tokenizer.get_special_tokens_mask(seq.tolist(), already_has_special_tokens=True)
            for seq in input_ids
        ]
        special_tokens_mask = torch.tensor(special_tokens_mask, dtype=torch.bool, device=input_ids.device)

        # Apply the mask to the probability matrix
        probability_matrix.masked_fill_(special_tokens_mask, value=0.0)  # Do not mask special tokens

        # Select tokens to mask
        masked_indices = torch.bernoulli(probability_matrix).bool()
        labels[~masked_indices] = -100  # Ignore unmasked tokens in the loss calculation

        # Replace 80% of masked tokens with [MASK]
        indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8, device=input_ids.device)).bool() & masked_indices
        input_ids[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)

        # Replace 10% of masked tokens with random tokens
        indices_random = (
            torch.bernoulli(torch.full(labels.shape, 0.5, device=input_ids.device)).bool()
            & masked_indices
            & ~indices_replaced
        )
        random_words = torch.randint(len(self.tokenizer), labels.shape, dtype=torch.long, device=input_ids.device)
        input_ids[indices_random] = random_words[indices_random]

        # The remaining 10% of masked tokens are left unchanged

        return input_ids, labels



In [1]:
import torch
print(torch.cuda.is_available())  # Should return True
print(torch.cuda.device_count())  # Should be > 0
print(torch.cuda.get_device_name(0))  # Should return your GPU name



  from .autonotebook import tqdm as notebook_tqdm


True
1
NVIDIA GeForce MX250


In [18]:
# Load dataset with MLM
import torch
import torch.nn as nn
from tqdm import tqdm
from torch.utils.data import DataLoader
dialogues = list(df['Dialogue'])  # Assuming df['Dialogue'] contains the dialogues
mlm_dataset = DialogueDatasetMLM(dialogues, tokenizer)

# Test a single sample
sample = mlm_dataset[0]
print("Input IDs:", sample["input_ids"])
print("Attention Mask:", sample["attention_mask"])
print("Labels:", sample["labels"])

mlm_loader = DataLoader(mlm_dataset, batch_size=16, shuffle=True)

# Define MLM Model (same transformer as before)
# Define the Custom Transformer Encoder Model
class TransformerEncoderMLM(nn.Module):
    def __init__(self, vocab_size, hidden_size=256, num_layers=4, num_heads=4, max_length=128):
        super(TransformerEncoderMLM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, hidden_size)
        self.positional_encoding = nn.Parameter(torch.zeros(1, max_length, hidden_size))
        encoder_layer = nn.TransformerEncoderLayer(d_model=hidden_size, nhead=num_heads)
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, input_ids, attention_mask):
        embeddings = self.embedding(input_ids) + self.positional_encoding[:, :input_ids.size(1), :]
        encoded = self.encoder(embeddings, src_key_padding_mask=~attention_mask.bool())
        logits = self.fc(encoded)
        return logits

# Initialize Model
vocab_size = tokenizer.vocab_size
model = TransformerEncoderMLM(vocab_size=vocab_size)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Training Setup
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
criterion = nn.CrossEntropyLoss()


# Training Loop
epochs = 3
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(mlm_loader, desc=f"Epoch {epoch+1}/{epochs}"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass
        print("Before Forward Pass")
        logits = model(input_ids, attention_mask)
        print("After Forward Pass")
        loss = criterion(logits.view(-1, vocab_size), labels.view(-1))

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss/len(mlm_loader)}")

# Save the Model
torch.save(model.state_dict(), 'schitts_creek_pretrained.pth')
print("Model saved!")

Input IDs: tensor([[  101,  2792, 24051,  ...,     0,     0,     0],
        [  101,  3335,  2271,  ...,     0,     0,     0],
        [  101,  2198,  1010,  ...,     0,     0,     0],
        ...,
        [  101,  1011,  1045,  ...,     0,     0,     0],
        [  101, 25175,  2527,  ...,     0,     0,     0],
        [  101,  2074, 24958,  ...,     0,     0,     0]])
Attention Mask: tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])
Labels: tensor([[-100, -100, -100,  ..., -100, -100, -100],
        [-100, -100, -100,  ..., -100, -100, -100],
        [-100, -100, -100,  ..., -100, -100, -100],
        ...,
        [-100, -100, -100,  ..., -100, -100, -100],
        [-100, -100, -100,  ..., -100, -100, -100],
        [-100, -100, 2359,  ..., -100, -100, -100]])


Epoch 1/3:   0%|          | 0/772 [00:00<?, ?it/s]