In [1]:
!pip install transformers
!pip install wandb
!jupyter labextension install @jupyter-widgets/jupyterlab-manager

You should consider upgrading via the '/usr/bin/python -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/usr/bin/python -m pip install --upgrade pip' command.[0m
Building jupyterlab assets (build:prod:minimize)


In [1]:
import re
import nltk
import wandb
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from nltk.stem import WordNetLemmatizer
from transformers import AutoModel, AutoTokenizer, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, Dataset

from tqdm import tqdm
import numpy as np
import random

nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [2]:
all = pd.read_parquet('./data/train_all.parquet')
orders = pd.read_parquet('./data/train_orders.parquet')
ancestors = pd.read_parquet('./data/train_ancestors.parquet')

In [3]:
# Uncomment this to subset the data in order to test training or validation logic.

# N_SAMPLES = 100
# sample_ids = random.sample(list(all['id'].unique()), N_SAMPLES)
# all = all.set_index('id').loc[sample_ids].reset_index()

In [3]:
# Orders dataframe currently contains cell orders as a string, i.e "a b c"
# We want to convert that into a list of strings: ["a", "b", "c"]
orders['cell_order'] = orders['cell_order'].str.split(' ').tolist()

In [4]:
stemmer = WordNetLemmatizer()

def links_to_word(text):
    return re.sub("https?:\/\/[^\s]+", " link ", text)

def no_char(text):
    text = re.sub(r"\s+[a-zA-Z]\s+", " ", text)
    text = re.sub(r"\^[a-zA-Z]\s+", " ", text)
    text = re.sub(r"\s+[a-zA-Z]$", " ", text)
    return text

def no_markdown_special(text):
    """Remove reserved markdown special characters.
    """
    return re.sub(r"[\.\*\+\-\_\>\<\~\(\)\[\]]", " ", text)

def no_html_tags(text):
    return re.sub("<.*?>", " ", text)

def no_multi_spaces(text):
    return re.sub(r"\s+", " ", text, flags=re.I)

def lemmatize(text):
    tokens = text.split()
    tokens = [stemmer.lemmatize(word) for word in tokens]
    return " ".join(tokens)

def underscore_to_space(text: str):
    text = text.replace("_", " ")
    text = text.replace("-", " ")
    return text

def no_markdown_special(text):
    try:
        text = text[0] + re.sub(r"(?<!\n)[\*\+\-\>]", " ", text[1:])
        text = re.sub(r"\(\)\[\]\{\}\<\>\~\|\`\.", " ", text)
    except IndexError:
        return ""
    return text

def code_preprocess(code):
    code = links_to_word(code)
    code = lemmatize(code)
    return code

def markdown_preprocess(code: str):
    """
    1. Replace new lines with unused token.
    2. Remove HTML Tags and special markdown symbols.
    3. Clear html tags first, then markdown...
    """
    code = code.replace("\n", "[unused1]")
    code = links_to_word(code)
    code = no_html_tags(code)
    code = no_markdown_special(code)
    code = no_multi_spaces(code)
    code = lemmatize(code)
    return code

def preprocessor(text: str, cell_type: str):
    return dict(code=code_preprocess, markdown=markdown_preprocess)[cell_type](text)

def sample_cells(cells, n):
    """
    Picking 20 cells for global context.
    """
    cells = [code_preprocess(cell) for cell in cells]
    if n >= len(cells):
        return [cell[:200] for cell in cells]
    else:
        results = []
        step = len(cells) / n
        idx = 0
        while int(np.round(idx)) < len(cells):
            results.append(cells[int(np.round(idx))])
            idx += step
        assert cells[0] in results
        if cells[-1] not in results:
            results[-1] = cells[-1]
        return results

def get_features(df):
    features = dict()

    # Group by notebook and loop through unique notebooks.
    for idx, sub_df in tqdm(df.groupby("id")):
        features[idx] = dict()

        # Get count of markdown cells in current notebook.
        total_md = sub_df[sub_df.cell_type == "markdown"].shape[0]

        # Get count of code cells in current notebook.
        code_sub_df = sub_df[sub_df.cell_type == "code"]
        total_code = code_sub_df.shape[0]

        # Sample 20 code cells.
        # codes = sample_cells(code_sub_df.source.values, 20)
        codes = code_sub_df.source.values
        features[idx]["total_code"] = total_code
        features[idx]["total_md"] = total_md
        features[idx]["codes"] = codes
    return features

In [5]:
total_max_len = 512
model_path = "./graphcodebert-base-model"
#tokenizer_path = "./graphcodebert-base-tokenizer"

In [21]:
class MarkdownModel(nn.Module):
    def __init__(self, model_path: str, markdown_model: str):
        super(MarkdownModel, self).__init__()
        self.code_model = AutoModel.from_pretrained(model_path)
        self.markdown_model = AutoModel.from_pretrained(markdown_model)
        #self.model = AutoModel.from_pretrained(model_path)
        #self.top = nn.Linear(769, 1)
#         self.code_model = AutoModel.from_pretrained(code_model)
#         self.markdown_model = AutoModel.from_pretrained(markdown_model)

        # Bert embeddings are 768-d + 1 for code cell percentage.
        self.top = nn.Linear(1536, 1)

    def forward(self, code_ids, code_mask, markdown_ids, markdown_mask):
        # Embeddings
        code_embeddings = self.code_model(code_ids, code_mask)[0]
        markdown_embeddings = self.markdown_model(markdown_ids, markdown_mask)[0]

        # Concatenate code embeddings with markdown.
        x = torch.cat((code_embeddings[:, 0, :], markdown_embeddings[:, 0, :]), 1)

        return self.top(x)
#     def forward(self, ids, mask, features):
#         # Embeddings
#         x = self.model(ids, mask)[0]
        
#         x = self.top(torch.cat((x[:, 0, :], features),1))
#         return x


class MarkdownDataset(Dataset):
    """Encapsulates Markdown dataset into a single object.

    :param markdown_rows: Pandas dataframe containing markdown content.
    :param features: Extra features (number code cells, 
    :param md_max_len: Maximum length of markdown tokenized embedding.
    :param total_max_len: Maximum Length of the tokenized input to bert.
    :param model_name: Name of pretrained bert base model.

    :attr code_model_name: Code bert model name.
    :attr markdown_model_name: Bert model name.
    :
    """
    def __init__(
        self,
        markdown_rows: pd.DataFrame,
        features: dict,
        total_max_len: int,
        md_max_len: int,
        code_model_name: str = 'microsoft/graphcodebert-base',
        markdown_model_name: str = 'bert-base-uncased'
    ):
        super().__init__()
        self.markdown_rows = markdown_rows.reset_index(drop=True)
        self.features = features
        self.md_max_len = md_max_len
        self.total_max_len = total_max_len
        self.markdown_model_name = markdown_model_name
        self.code_model_name = code_model_name
        self.code_tokenizer = AutoTokenizer.from_pretrained(
            self.code_model_name,
            do_lower_case=True,
            use_fast=True
        )
        self.markdown_tokenizer = AutoTokenizer.from_pretrained(
            self.markdown_model_name,
            do_lower_case=True,
            use_fast=True
                            )
         
#         self.markdown_tokenizer = AutoTokenizer.from_pretrained(
#             self.markdown_model_name,
#             do_lower_case=True,
#             use_fast=True

    def __getitem__(self, index):
        row = self.markdown_rows.iloc[index]
        
        # Encode markdown into embedding.
        markdown_inputs = self.markdown_tokenizer.encode_plus(
            row.source,
            None,
            add_special_tokens=True,
            max_length=self.md_max_len,
            padding="max_length",
            return_token_type_ids=True,
            truncation=True
        )
        
        # Encode code into embedding.
        # Batch encode does not like empty lists!
        code_cells = self.features[row.id]["codes"]
        code_inputs = self.code_tokenizer.batch_encode_plus(
            [str(cell) for cell in code_cells] if len(code_cells) > 0 else [''],
            add_special_tokens=True,
            max_length=23,
            padding="max_length",
            truncation=True
        )


        
        # Get markdown embedding tokens.
        markdown_ids = markdown_inputs['input_ids']
        markdown_ids = markdown_ids[:self.total_max_len]

        # Apply padding if code + markdown tokens is less than max.
        if len(markdown_ids) < self.total_max_len:
            markdown_ids = markdown_ids + [self.markdown_tokenizer.pad_token_id, ] * (self.total_max_len - len(markdown_ids))

        markdown_ids = torch.LongTensor(markdown_ids)

        # Get code embedding tokens.
        code_ids = list(np.array(code_inputs['input_ids']).flatten())
        code_ids = code_ids[:self.total_max_len]

        # Apply padding if code + markdown tokens is less than max.
        if len(code_ids) < self.total_max_len:
            code_ids = code_ids + [self.code_tokenizer.pad_token_id, ] * (self.total_max_len - len(code_ids))

        code_ids = torch.LongTensor(code_ids)
        
        # Markdown masks
        markdown_mask = markdown_inputs['attention_mask']
        markdown_mask = markdown_mask[:self.total_max_len]

        if len(markdown_mask) != self.total_max_len:
            markdown_mask = markdown_mask + [self.markdown_tokenizer.pad_token_id, ] * (self.total_max_len - len(markdown_mask))
        markdown_mask = torch.LongTensor(markdown_mask)

        # Do the same for the code attention mask.
        code_mask = markdown_inputs['attention_mask']
        code_mask = code_mask[:self.total_max_len]

        if len(code_mask) != self.total_max_len:
            code_mask = code_mask + [self.code_tokenizer.pad_token_id, ] * (self.total_max_len - len(code_mask))
        code_mask = torch.LongTensor(code_mask)

        # Tokens should be equal to the maximum length.
        assert len(markdown_ids) == self.total_max_len
        assert len(code_ids) == self.total_max_len

        # Tokens, attention mask, markdown percentage feature, and label.
        return code_ids, code_mask, markdown_ids, markdown_mask, torch.FloatTensor([row.pct_rank])

    def __len__(self):
        return self.markdown_rows.shape[0]

In [7]:
from bisect import bisect

"""
Pulled evaluation metric directly from Kaggle.
"""
def count_inversions(a):
    inversions = 0
    sorted_so_far = []
    for i, u in enumerate(a):
        j = bisect(sorted_so_far, u)
        inversions += i - j
        sorted_so_far.insert(j, u)
    return inversions

def kendall_tau(ground_truth, predictions):
    total_inversions = 0
    total_2max = 0  # twice the maximum possible inversions across all instances
    for gt, pred in zip(ground_truth, predictions):
        ranks = [gt.index(x) for x in pred]  # rank predicted order in terms of ground truth
        total_inversions += count_inversions(ranks)
        n = len(gt)
        total_2max += n * (n - 1)
    return 1 - 4 * total_inversions / total_2max

In [8]:
from sklearn.model_selection import GroupShuffleSplit

# Create label.
all['pct_rank'] = all['order'] / all.groupby("id")["cell"].transform("count")

VALID_RATIO = 0.3
TEST_RATIO = 0.1

train_splitter = GroupShuffleSplit(n_splits=1, test_size=VALID_RATIO+TEST_RATIO, random_state=0)
val_splitter = GroupShuffleSplit(n_splits=1, test_size=TEST_RATIO, random_state=0)

# Split into train, (val + test) - 60% - 40%.
train_ind, val_ind = next(train_splitter.split(all, groups=all["ancestor_id"]))

train_df = all.loc[train_ind].reset_index(drop=True)
train_features = get_features(train_df)

val_test_df = all.loc[val_ind].reset_index(drop=True)

# Split val into val, test - 90% - 10%.
val_ind, test_ind = next(val_splitter.split(val_test_df, groups=val_test_df["ancestor_id"]))

val_df = val_test_df.loc[val_ind].reset_index(drop=True)
val_features = get_features(val_df)

test_df = val_test_df.loc[test_ind].reset_index(drop=True)
test_features = get_features(test_df)

# Final sizes:
# Train - 60%
# Validation - 30%
# Test - 10%

100%|██████████| 83334/83334 [01:12<00:00, 1147.67it/s]
100%|██████████| 50455/50455 [00:44<00:00, 1137.65it/s]
100%|██████████| 5467/5467 [00:04<00:00, 1144.63it/s]


In [9]:
print(train_df.shape[0])
print(val_df.shape[0])
print(test_df.shape[0])

3819065
2300327
251254


In [12]:
markdown_train = train_df[train_df['cell_type'] == 'markdown']
markdown_val = val_df[val_df['cell_type'] == 'markdown']
markdown_test = test_df[test_df['cell_type'] == 'markdown']

train_ds = MarkdownDataset(
    markdown_train,
    features = train_features,
    total_max_len = 400,
    md_max_len = 200
)

val_ds = MarkdownDataset(
    markdown_val,
    features = val_features,
    total_max_len = 400,
    md_max_len = 200
)

test_ds = MarkdownDataset(
    markdown_test,
    features = test_features,
    total_max_len = 400,
    md_max_len = 200
)

train_loader = DataLoader(
    train_ds,
    batch_size=1,
    num_workers=0,
    pin_memory=False,
    drop_last=True,
    shuffle=True
)

val_loader = DataLoader(
    val_ds,
    batch_size=1,
    num_workers=0,
    pin_memory=False,
    drop_last=True,
    shuffle=False
)

test_loader = DataLoader(
    test_ds,
    batch_size=1,
    num_workers=0,
    pin_memory=False,
    drop_last=True,
    shuffle=False
)


In [14]:
import json
from datetime import datetime
from pathlib import Path
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import sys, os

MODEL_NAME = 'graphcodebert'

def train(
    model,
    train_loader,
    val_loader,
    model_name,
    epochs=1,
    lr=3e-5,
    patience = 5,
    use_wandb=False
):
    np.random.seed(0)

    early_stop_count = 0
    patience = 5
    best_loss = 1_000_000
    best_vloss = 1_000_000

   
    # Creating optimizer and lr schedulers
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    num_train_optimization_steps = int(epochs * len(train_loader) / 4)

    # To reproduce BertAdam specific behavior set correct_bias=False
    optimizer = AdamW(
        optimizer_grouped_parameters,
        #lr=3e-5,
        lr=lr,
        correct_bias=False
    )  

    # PyTorch scheduler
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0.05 * num_train_optimization_steps,
        num_training_steps=num_train_optimization_steps
    )  

    criterion = torch.nn.L1Loss()
    scaler = torch.cuda.amp.GradScaler()

    for e in range(epochs):
        model.train()
        tbar = tqdm(train_loader, file=sys.stdout)
        loss_list = []
        preds = []
        labels = []

        # Train
        for idx, data in enumerate(tbar):
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

            code_ids, code_mask, markdown_ids, markdown_mask, target = [dp.cuda() for dp in data]
            
            # Compute loss
            with torch.cuda.amp.autocast():
                pred = model(code_ids, code_mask, markdown_ids, markdown_mask)
                loss = criterion(pred, target)

            # Backprop
            scaler.scale(loss).backward()

            # Update optimizer and scheduler.
            if idx % 4 == 0 or idx == len(tbar) - 1:
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
                scheduler.step()

            loss_list.append(loss.detach().cpu().item())
            preds.append(pred.detach().cpu().numpy().ravel())
            labels.append(target.detach().cpu().numpy().ravel())

            # Compute mean loss.
            avg_loss = np.round(np.mean(loss_list), 4)

            if idx % 25_000 == 0 and avg_loss < best_loss:
                # Track best performance, and save the model's state
                best_loss = avg_loss
                model_path = 'models/{}_{}_{}_{}'.format(MODEL_NAME, timestamp, e, best_loss)
                torch.save(model.state_dict(), model_path)

            if idx % 1000 == 0 and avg_loss < best_loss:
                wandb.log({
                    'avg_loss': avg_loss,
                    'best_loss': best_loss
                })
                early_stop_count = 0
                
            # Early stopping
            if avg_loss > best_loss:
                early_stop_count += 1
                
                if early_stop_count > patience:
                    model_path = 'models/{}_{}_{}_{}'.format(MODEL_NAME, timestamp, e, best_loss)
                    torch.save(model.state_dict(), model_path)
                    break
                       
            # Update progress bar.
            tbar.set_description(f"Epoch {e + 1} Loss: {avg_loss} lr: {scheduler.get_last_lr()}")


    # Evaluation
    model.eval()

    tbar = tqdm(val_loader, file=sys.stdout)

    preds = []
    labels = []

    with torch.no_grad():
        for idx, data in enumerate(tbar):
            code_ids, code_mask, markdown_ids, markdown_mask, target = [dp.cuda() for dp in data]

            with torch.cuda.amp.autocast():
                pred = model(code_ids, code_mask, markdown_ids, markdown_mask)

            preds.append(pred.detach().cpu().numpy().ravel())
            labels.append(target.detach().cpu().numpy().ravel())

    _, y_pred = np.concatenate(labels), np.concatenate(preds)

    # Create a placeholder prediction.
    val_df["pred"] = val_df.groupby(["id", "cell_type"])["order"].rank(pct=True)
    
    # Replace pred column with predictions (only markdown cells since only markdown cells
    # are randomized).
    val_df.loc[val_df["cell_type"] == "markdown", "pred"] = y_pred
    
    # Sort based on the predicted ranks, then obtain the order of cells as a list.
    y_dummy = val_df.sort_values("pred").groupby('id')['cell'].apply(list)
    
    # Get predictions in the same format as actuals.
    prediction_cell_orders = y_dummy.to_frame()['cell']
    
    # Based on the notebook index, obtain the actual order from orders dataframe.
    actual_cell_orders = orders.set_index('id').loc[y_dummy.index]['cell_order']
    
    # Compute metric.
    kendall_tau_score = kendall_tau(actual_cell_orders, prediction_cell_orders)
    print("Preds score", kendall_tau_score)

    return model, y_pred

In [15]:
wandb.init(project="w266-project", entity="sotoodaa", name='code-markdown-model-graphcodebert')

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33msotoodaa[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [22]:
WANDB = True

MODEL_CONFIG = {
    'lr': 1e-5,
    'epochs': 1,
    'model_name': 'code-markdown-graphcodebert',
    'patience': 100
}

if WANDB:
    wandb.init(
        project="w266-project",
        entity="sotoodaa",
        name='code-markdown-graphcodebert',
        config=MODEL_CONFIG
    )
    
model = MarkdownModel('microsoft/graphcodebert-base', 'bert-base-uncased')
model = model.cuda()

if WANDB:
    wandb.watch(model, log_freq=1000)
    
model, y_pred = train(
    model,
    train_loader,
    val_loader,
    MODEL_CONFIG['model_name'],
    epochs=MODEL_CONFIG['epochs'],
    lr=MODEL_CONFIG['lr'],
    patience=MODEL_CONFIG['patience'],
    use_wandb=WANDB
)

VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.0166682058867688, max=1.0))…

Some weights of the model checkpoint at microsoft/graphcodebert-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at microsoft/graphcodebert-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to

  0%|          | 0/1295276 [00:00<?, ?it/s]



Epoch 1 Loss: 0.4043 lr: [1.235257968185931e-09, 1.235257968185931e-09]:   0%|          | 6/1295276 [00:02<176:08:40,  2.04it/s]
100%|██████████| 784816/784816 [6:51:35<00:00, 31.78it/s]    
Preds score 0.5439652214319002


In [43]:
wandb.watch(model, log_freq=1000)

[]

In [44]:
model, y_pred = train(model, train_loader, val_loader, epochs=1)

Epoch 1 Loss: 1.0733 lr: [3.705773904557793e-09, 3.705773904557793e-09]:   0%|          | 6/1295276 [00:02<129:35:05,  2.78it/s]  
100%|██████████| 784816/784816 [7:27:37<00:00, 29.22it/s]    
Preds score 0.4658720608634791


In [64]:
import gc
import torch

if model:
    model.cpu()
    del model
gc.collect()
torch.cuda.empty_cache()

[34m[1mwandb[0m: While tearing down the service manager. The following error has occured: [Errno 32] Broken pipe
