In [1]:
# =========================================================================================
# Libraries
# =========================================================================================
import os
import gc
import time
import math
import re
import random
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import torch
import random
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from torch.utils.checkpoint import checkpoint
import tokenizers
import transformers
from sklearn.model_selection import GroupKFold
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_cosine_schedule_with_warmup, DataCollatorWithPadding
%env TOKENIZERS_PARALLELISM=false
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

from tqdm.auto import tqdm
tqdm.pandas()

import warnings
warnings.filterwarnings("ignore")

env: TOKENIZERS_PARALLELISM=false


In [2]:
# =========================================================================================
# Configurations
# =========================================================================================
class CFG:
    """Static settings shared by the tokenizer, the data loaders and the kNN retrieval step."""
    # Worker processes used by each DataLoader built in get_neighbors.
    num_workers = 4
#     model = "../output/mpnet/exp_v3_64_new_f0"
    # NOTE(review): absolute, machine-specific checkpoint directory; the commented-out
    # relative path above looks like an earlier location - confirm which one is current.
    model = "/home/rohits/pv1/learning_equality/output/sentence-transformers/paraphrase-multilingual-mpnet-base-v2/exp_v3_64_new_f0"
    # Evaluated once, when the class body runs, so merely defining CFG already
    # loads the tokenizer files from `model`.
    tokenizer = AutoTokenizer.from_pretrained(model)
    batch_size = 64 #128
    # Number of nearest content items requested per topic (n_neighbors of the kNN model).
    top_n = 50
    # Not referenced anywhere else in the visible notebook.
    seed = 42
    # Token length every text is truncated/padded to in prepare_input.
    max_len=64

In [3]:
# define some helper functions and classes to aid with data traversal

def print_markdown(md):
    """Render the markdown source string ``md`` as rich output in the notebook."""
    rendered = Markdown(md)
    display(rendered)

class Topic:
    """Read-only view of one row of the global ``topics_df`` (indexed by topic id).

    Column values of the row are exposed as attributes through ``__getattr__``
    (for example ``topic.title``). Every navigation helper queries ``topics_df``
    again on each access; nothing is cached.

    Instances compare equal when their ids are equal and hash by id, so they
    can be stored in sets and used as dict keys.
    """

    def __init__(self, topic_id):
        self.id = topic_id

    @property
    def parent(self):
        """Parent ``Topic``, or ``None`` when the row's ``parent`` value is missing."""
        parent_id = topics_df.loc[self.id].parent
        if pd.isna(parent_id):
            return None
        return Topic(parent_id)

    @property
    def ancestors(self):
        """List of ancestors ordered from the direct parent up to the root."""
        ancestors = []
        parent = self.parent
        while parent is not None:
            ancestors.append(parent)
            parent = parent.parent
        return ancestors

    @property
    def siblings(self):
        """Other children of this topic's parent; empty for a topic without parent."""
        parent = self.parent
        if parent is None:
            return []
        return [topic for topic in parent.children if topic != self]

    @property
    def content(self):
        """``ContentItem`` objects correlated with this topic.

        NOTE(review): the lookup needs ``correlations_df`` to be indexed by
        topic id. The loader cell of this notebook reads correlations.csv with
        the default integer index, in which case the first branch never
        matches - confirm the intended index.
        """
        if self.id in correlations_df.index:
            return [ContentItem(content_id) for content_id in correlations_df.loc[self.id].content_ids.split()]
        # Kept as in the original: an empty tuple for topics flagged as having
        # content, an empty list otherwise.
        return tuple([]) if self.has_content else []

    def get_breadcrumbs(self, separator=" >> ", include_self=True, include_root=True):
        """Join the titles on the path from the root down to this topic.

        Args:
            separator: string placed between consecutive titles.
            include_self: append this topic's own title at the end.
            include_root: keep the top-most ancestor's title at the start.

        Returns:
            The joined title path (may be an empty string).
        """
        ancestors = self.ancestors
        if include_self:
            ancestors = [self] + ancestors
        if not include_root:
            # The list runs leaf -> root, so the root is the last element.
            ancestors = ancestors[:-1]
        return separator.join(reversed([a.title for a in ancestors]))

    @property
    def children(self):
        """Topics whose ``parent`` column equals this topic's id."""
        return [Topic(child_id) for child_id in topics_df[topics_df.parent == self.id].index]

    def subtree_markdown(self, depth=0):
        """Nested markdown bullet list of this topic, its descendants and their content."""
        markdown = "  " * depth + "- " + self.title + "\n"
        for child in self.children:
            markdown += child.subtree_markdown(depth=depth + 1)
        for content in self.content:
            markdown += ("  " * (depth + 1) + "- " + "[" + content.kind.title() + "] " + content.title) + "\n"
        return markdown

    def __eq__(self, other):
        if not isinstance(other, Topic):
            return False
        return self.id == other.id

    def __hash__(self):
        # Defining __eq__ alone sets __hash__ to None; hash by the same key
        # equality uses so instances stay usable in sets and dicts.
        return hash(self.id)

    def __getattr__(self, name):
        """Fall back to the topic's row in ``topics_df`` for unknown attributes.

        Raises:
            AttributeError: if the topic id or the column does not exist.
        """
        # Only called when normal lookup failed. If `id` itself is missing
        # (bare instance, e.g. during copy/unpickle) reading self.id below
        # would re-enter this method forever.
        if name == "id":
            raise AttributeError(name)
        try:
            return topics_df.loc[self.id][name]
        except KeyError:
            # The attribute protocol (hasattr, getattr with a default) only
            # understands AttributeError.
            raise AttributeError(
                f"{type(self).__name__!s} {self.id!r} has no attribute or column {name!r}"
            ) from None

    def __str__(self):
        return self.title

    def __repr__(self):
        return f"<Topic(id={self.id}, title=\"{self.title}\")>"


class ContentItem:
    """Read-only view of one row of the global ``content_df`` (indexed by content id).

    Column values of the row are exposed as attributes through ``__getattr__``
    (for example ``item.title``). Instances compare equal when their ids are
    equal and hash by id.
    """

    def __init__(self, content_id):
        self.id = content_id

    @property
    def topics(self):
        """Topics whose ``content_ids`` string mentions this item's id.

        NOTE(review): the matching rows' index labels are used as topic ids,
        so ``correlations_df`` must be indexed by topic id - confirm against
        the cell that loads it.
        """
        # regex=False: the id is a literal token, not a pattern.
        mentions = correlations_df.content_ids.str.contains(self.id, regex=False)
        return [Topic(topic_id) for topic_id in topics_df.loc[correlations_df[mentions].index].index]

    def __getattr__(self, name):
        """Fall back to the item's row in ``content_df`` for unknown attributes.

        Raises:
            AttributeError: if the content id or the column does not exist.
        """
        # Only called when normal lookup failed. If `id` itself is missing
        # (bare instance, e.g. during copy/unpickle) reading self.id below
        # would re-enter this method forever.
        if name == "id":
            raise AttributeError(name)
        try:
            return content_df.loc[self.id][name]
        except KeyError:
            # The attribute protocol (hasattr, getattr with a default) only
            # understands AttributeError.
            raise AttributeError(
                f"{type(self).__name__!s} {self.id!r} has no attribute or column {name!r}"
            ) from None

    def __str__(self):
        return self.title

    def __repr__(self):
        return f"<ContentItem(id={self.id}, title=\"{self.title}\")>"

    def __eq__(self, other):
        if not isinstance(other, ContentItem):
            return False
        return self.id == other.id

    def __hash__(self):
        # Defining __eq__ alone sets __hash__ to None; hash by the same key
        # equality uses so instances stay usable in sets and dicts.
        return hash(self.id)

    def get_all_breadcrumbs(self, separator=" >> ", include_root=True):
        """One breadcrumb string per topic this item belongs to, ending in the item's title.

        Args:
            separator: string placed between consecutive titles.
            include_root: forwarded to ``Topic.get_breadcrumbs``.
        """
        breadcrumbs = []
        for topic in self.topics:
            new_breadcrumb = topic.get_breadcrumbs(separator=separator, include_root=include_root)
            if new_breadcrumb:
                new_breadcrumb = new_breadcrumb + separator + self.title
            else:
                new_breadcrumb = self.title
            breadcrumbs.append(new_breadcrumb)
        return breadcrumbs

In [4]:
def get_breadcrumbs(row):
    """Breadcrumb path for a topics row, or "" when the row has no content.

    ``row`` must expose ``has_content`` and ``id``; the path is produced by
    ``Topic.get_breadcrumbs`` with its default arguments.
    """
    if not row.has_content:
        return ""
    return Topic(row.id).get_breadcrumbs()

In [5]:
# =========================================================================================
# Prepare input, tokenize
# =========================================================================================
def prepare_input(text, cfg):
    """Tokenize one string into fixed-length ``torch.long`` tensors.

    Args:
        text: the string to encode.
        cfg: object exposing ``tokenizer`` (with an ``encode_plus`` method)
            and ``max_len``.

    Returns:
        The mapping returned by the tokenizer, with every value replaced by a
        ``torch.long`` tensor. Sequences are truncated and padded to
        ``cfg.max_len`` tokens.
    """
    inputs = cfg.tokenizer.encode_plus(
        text,
        return_tensors=None,
        add_special_tokens=True,
        max_length=cfg.max_len,
        # `pad_to_max_length=True` is deprecated; this is its documented replacement.
        padding='max_length',
        truncation=True
    )

    # Values are replaced under existing keys only, so the mapping keeps its
    # size (and its original type) while being iterated.
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs

# =========================================================================================
# Unsupervised dataset
# =========================================================================================
class uns_dataset(Dataset):
    """Dataset yielding the tokenized ``title1`` text of each row of ``df``."""

    def __init__(self, df, cfg):
        self.cfg = cfg
        # Plain array of the composed texts; indexed positionally below.
        self.texts = df['title1'].values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        return prepare_input(self.texts[item], self.cfg)
    
# =========================================================================================
# Mean pooling class
# =========================================================================================
class MeanPooling(nn.Module):
    """Average the token vectors of each sequence, counting only unmasked positions."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        # Broadcast the mask over the hidden dimension so it can weight every feature.
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        weighted_sum = torch.sum(last_hidden_state * mask, 1)
        # Clamp so a fully masked sequence divides by 1e-9 instead of zero.
        token_count = torch.clamp(mask.sum(1), min=1e-9)
        return weighted_sum / token_count


# =========================================================================================
# Get embeddings
# =========================================================================================
def get_embeddings(loader, model, device):
    """Run ``model`` over every batch of ``loader`` and stack the outputs.

    Each batch is a mapping of tensors; its values are moved to ``device`` in
    place before the forward pass, which runs without gradient tracking.
    Returns one NumPy array with the per-batch outputs concatenated in order.
    """
    model.eval()
    batch_outputs = []
    for batch in tqdm(loader):
        for name in list(batch.keys()):
            batch[name] = batch[name].to(device)
        with torch.no_grad():
            embedded = model(batch)
        batch_outputs.append(embedded.to('cpu').numpy())
    return np.concatenate(batch_outputs)

# =========================================================================================
# Get the amount of positive classes based on the total
# =========================================================================================
def get_pos_score(y_true, y_pred):
    """Mean share of ground-truth ids that also appear in the predictions.

    Both arguments are pandas Series of whitespace-separated id strings,
    compared row by row. The result is rounded to 5 decimals.
    """
    def _as_id_set(ids):
        return set(ids.split())

    truth_sets = y_true.apply(_as_id_set)
    predicted_sets = y_pred.apply(_as_id_set)
    per_row_recall = np.array([
        len(truth & predicted) / len(truth)
        for truth, predicted in zip(truth_sets, predicted_sets)
    ])
    return round(np.mean(per_row_recall), 5)

In [10]:
from IPython.display import display, Markdown
from pathlib import Path

# Directory (relative to the notebook) from which topics.csv, content.csv and
# correlations.csv are read further down.
data_dir = Path('../data')

In [11]:
# Field markers that get_topic_text / get_content_text prepend to each part of
# the composed text.
# NOTE(review): "[LEVEl]" (lower-case "l") is spelled the same way in
# get_topic_text; do not "correct" one without the other.
# NOTE(review): this list is not passed to the tokenizer anywhere in the
# visible notebook - presumably the saved tokenizer already contains these
# tokens; confirm.
ADDITIONAL_SPECIAL_TOKENS = [
    "[TITLE]", "[DESC]", "[CATEGORY]", "[LEVEl]", "[PARENT]", "[TEXT]", "[KIND]"
]


# text
def preprocess(text):
    """Blank out numbering such as ``2.3.4.5`` / ``2.3.4.5:`` and a leading ``mm:ss`` stamp.

    Each match is replaced by a single space; the patterns are applied in
    order and the result is not stripped here.
    """
    substitutions = (
        # dotted numbering, including an optional word/spaces before the dot
        # and a trailing colon or dashes
        r'\w*\s*\d*\.\d*\s*\:*\s*\-*',
        # digits:digits prefix at the very start of the string
        r'^\d*\:\d*\s*',
    )
    for pattern in substitutions:
        text = re.sub(pattern, ' ', text)
    return text

def get_topic_text(row):
    """Compose the tagged model input for one topics row.

    Non-empty fields are concatenated in the order title, description,
    category, level, breadcrumb path, each introduced by its marker. Only the
    first sentence of the description is used, cut to 25 words when longer.
    """
    parts = []

    heading = row.title1.strip()
    if heading != "":
        parts.append("[TITLE]" + heading)

    if row.description != "":
        first_sentence = row.description.split(".")[0]
        words = first_sentence.split()
        if len(words) > 25:
            first_sentence = " ".join(words[:25])
        parts.append("[DESC]" + first_sentence)

    if row.category != "":
        parts.append("[CATEGORY]" + row.category)

    # Compared against "" (not truthiness) so that level 0 is still emitted.
    if row.level != "":
        parts.append("[LEVEl]" + str(row.level))

    if row.t1 != "":
        # Every breadcrumb separator becomes another [PARENT] marker.
        parts.append("[PARENT]" + "[PARENT]".join(row.t1.split(" >> ")))

    return "".join(parts)


def get_content_text(row):
    """Compose the tagged model input for one content row.

    Non-empty fields are concatenated in the order title, description, text,
    kind, each introduced by its marker. Description and text contribute only
    their first sentence, cut to 25 words when longer.
    """
    def _first_sentence(raw):
        # Text up to the first "."; whitespace is normalised only when truncating.
        sentence = raw.split(".")[0]
        words = sentence.split()
        return " ".join(words[:25]) if len(words) > 25 else sentence

    parts = []

    heading = row.title1.strip()
    if heading != "":
        parts.append("[TITLE]" + heading)

    if row.description != "":
        parts.append("[DESC]" + _first_sentence(row.description))

    if row.text != "":
        parts.append("[TEXT]" + _first_sentence(row.text))

    if row.kind != "":
        parts.append("[KIND]" + row.kind)

    return "".join(parts)

In [12]:
# Load the three competition tables into pandas DataFrames.
# topics.csv: first column becomes the index; only missing titles/descriptions
# are blanked here, so other columns (e.g. `parent`) keep their NaN values.
topics_df = pd.read_csv(data_dir / "topics.csv", index_col=0).fillna({"title": "", "description": ""})
# `title1` = title with numbering removed by preprocess(), then stripped.
topics_df['title1'] = topics_df.title.apply(lambda x: preprocess(x).strip())

# Keep the id as a regular column too; the index is dropped in a later cell.
topics_df['id'] = topics_df.index.values

# content.csv: first column becomes the index; every missing value becomes "".
content_df = pd.read_csv(data_dir / "content.csv", index_col=0).fillna("")
content_df['title1'] = content_df.title.apply(lambda x: preprocess(x).strip())
content_df['id'] = content_df.index.values
# correlations.csv is read with the default integer index; the later merge
# joins on its `topic_id` column.
correlations_df = pd.read_csv(data_dir / "correlations.csv")


In [None]:
# Breadcrumb path per topic (empty string for topics without content), stored
# under `context` and then renamed to `t1`, the name get_topic_text reads.
topics_df['context'] = topics_df.progress_apply(lambda x: get_breadcrumbs(x), axis=1)
topics_df.rename(columns = {'context': 't1'}, inplace=True)
# Blank out every remaining NaN; get_topic_text compares fields against "".
topics_df.fillna("", inplace=True)
# NOTE(review): both assignments read `title1` and write the tagged text back
# into `title1`, so running this cell twice nests the markers - re-run the
# loading cell first.
topics_df['title1'] = topics_df.progress_apply(lambda x: get_topic_text(x), axis=1)
content_df['title1'] = content_df.progress_apply(lambda x: get_content_text(x), axis=1)

  0%|          | 0/76972 [00:00<?, ?it/s]

In [None]:
# Drop the id index (the id is still available in the `id` column). After this
# point Topic/ContentItem lookups via `.loc[<id>]` no longer find their rows.
topics_df.reset_index(drop=True, inplace=True)

# Left join: every topic is kept; `content_ids` is filled only for topics that
# appear in correlations_df.
topics_df = topics_df.merge(
    correlations_df, left_on='id', right_on='topic_id', how='left'
)

# Give both frames a fresh 0..n-1 index.
topics_df.reset_index(drop=True, inplace=True)
content_df.reset_index(drop=True, inplace=True)

In [None]:
# Quick look at (rows, columns) after the merge with correlations_df.
topics_df.shape

In [None]:
# Keep only topics flagged as having content. The index is not reset here, so
# row labels are no longer contiguous.
topics_df = topics_df.loc[topics_df.has_content==True]

In [None]:
# Persist the prepared tables to the working directory, without the index column.
topics_df.to_csv("topics_filled.csv", index=False)
content_df.to_csv("content_filled.csv", index=False)

In [14]:
# =========================================================================================
# Build our training set
# =========================================================================================
def build_training_set(topics, content, cfg):
    """Pair every topic with its retrieved and ground-truth content ids.

    Args:
        topics: DataFrame with columns ``id``, ``title1``, ``language``,
            ``predictions`` and ``content_ids``; the last two hold content
            ids separated by single spaces.
        content: DataFrame indexed by content id with columns ``language``
            and ``title1``.
        cfg: unused; kept so existing calls keep working.

    Returns:
        DataFrame with one row per kept (topic, content) pair and the columns
        ``topics_ids``, ``content_ids``, ``title1`` (topic text), ``title2``
        (content text) and ``target`` (1 when the content id is part of the
        topic's ground truth, otherwise 0). Pairs whose topic and content
        languages differ are dropped, ground-truth pairs included. Within a
        topic, rows follow the order of ``predictions`` followed by any
        ground-truth ids that were not retrieved.

    Raises:
        KeyError: if a candidate id is missing from ``content``'s index.
    """
    # Plain dict lookups instead of one DataFrame row access per candidate.
    language_by_id = content['language'].to_dict()
    text_by_id = content['title1'].to_dict()

    topics_ids = []
    content_ids = []
    title1 = []
    title2 = []
    targets = []

    topic_columns = zip(
        topics['id'],
        topics['title1'],
        topics['language'],
        topics['predictions'],
        topics['content_ids'],
    )
    for topics_id, topics_title, topics_lng, predicted, truth in tqdm(topic_columns, total=len(topics)):
        ground_truth = truth.split(' ')
        truth_lookup = set(ground_truth)
        # dict.fromkeys removes duplicates while keeping first-seen order, so
        # the output no longer depends on string hash randomisation.
        candidates = dict.fromkeys(predicted.split(' ') + ground_truth)

        for candidate in candidates:
            if language_by_id[candidate] != topics_lng:
                continue
            topics_ids.append(topics_id)
            content_ids.append(candidate)
            title1.append(topics_title)
            title2.append(text_by_id[candidate])
            targets.append(1 if candidate in truth_lookup else 0)

    return pd.DataFrame(
        {'topics_ids': topics_ids,
         'content_ids': content_ids,
         'title1': title1,
         'title2': title2,
         'target': targets,
        }
    )

In [15]:
# =========================================================================================
# Get neighbors
# =========================================================================================
def get_neighbors(topics, content, cfg):
    """Attach the ``cfg.top_n`` nearest content ids to every topic.

    Both frames are embedded from their ``title1`` column with ``uns_model``;
    a cosine kNN index is fitted on the content embeddings and queried with
    the topic embeddings.

    Args:
        topics: DataFrame with a ``title1`` column. It is modified in place:
            a ``predictions`` column of space-separated content ids is added.
        content: DataFrame with ``title1`` and ``id`` columns.
        cfg: settings object providing ``tokenizer``, ``batch_size``,
            ``num_workers``, ``top_n`` and whatever ``uns_model`` reads.

    Returns:
        The same ``topics`` and ``content`` objects that were passed in.
    """
    # NOTE(review): `cp` and `NearestNeighbors` are used but never imported in
    # the visible notebook. CuPy / cuML are inferred from `cp.array` and from
    # the `.get()` call the original made on the kNN result - confirm.
    import cupy as cp
    from cuml.neighbors import NearestNeighbors

    def _make_loader(frame):
        # Order is preserved and no row is dropped, so embedding row i
        # corresponds to row i (by position) of `frame`.
        return DataLoader(
            uns_dataset(frame, cfg),
            batch_size = cfg.batch_size,
            shuffle = False,
            collate_fn = DataCollatorWithPadding(tokenizer = cfg.tokenizer, padding = 'longest'),
            num_workers = cfg.num_workers,
            pin_memory = True,
            drop_last = False
        )

    # Create unsupervised model to extract embeddings
    model = uns_model(cfg)
    model.to(device)

    # Predict topics, then content
    topics_preds = get_embeddings(_make_loader(topics), model, device)
    content_preds = get_embeddings(_make_loader(content), model, device)
    # Transfer predictions to gpu
    topics_preds_gpu = cp.array(topics_preds)
    content_preds_gpu = cp.array(content_preds)
    # Drop the encoder and the host copies first; emptying the CUDA cache only
    # helps once nothing references the model any more.
    del model, topics_preds, content_preds
    gc.collect()
    torch.cuda.empty_cache()

    # KNN model
    print(' ')
    print('Training KNN model...')
    neighbors_model = NearestNeighbors(n_neighbors = cfg.top_n, metric = 'cosine')
    neighbors_model.fit(content_preds_gpu)
    indices = neighbors_model.kneighbors(topics_preds_gpu, return_distance = False)

    # One device-to-host copy for the whole index matrix instead of one per row.
    indices_host = cp.asnumpy(indices)
    # kNN indices are row positions in the fitted matrix, so map them to ids by
    # position rather than by index label.
    content_id_by_position = content['id'].values
    predictions = [
        ' '.join(content_id_by_position[neighbor_rows])
        for neighbor_rows in tqdm(indices_host)
    ]
    topics['predictions'] = predictions

    # Release memory
    del topics_preds_gpu, content_preds_gpu, neighbors_model, predictions, indices, indices_host
    gc.collect()
    return topics, content

In [16]:
# Ids of the topics whose `category` is 'source'.
# NOTE(review): not referenced again in the visible notebook.
source_ids = topics_df.loc[topics_df.category == 'source'].id.values

In [17]:
class uns_model(nn.Module):
    """Transformer backbone loaded from ``cfg.model`` followed by masked mean pooling."""

    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        backbone_config = AutoConfig.from_pretrained(cfg.model)
        backbone_config.output_hidden_states = True
        self.config = backbone_config
        self.model = AutoModel.from_pretrained(cfg.model, config = backbone_config)
        self.pool = MeanPooling()

    def feature(self, inputs):
        """Pool the backbone's last hidden state using ``inputs['attention_mask']``."""
        hidden = self.model(**inputs).last_hidden_state
        return self.pool(hidden, inputs['attention_mask'])

    def forward(self, inputs):
        """Return the pooled embedding for a batch of tokenizer outputs."""
        return self.feature(inputs)

In [18]:
# Run nearest neighbors: adds a `predictions` column to the topics frame.
topics, content = get_neighbors(topics_df, content_df, CFG)
# Share of ground-truth ids found among the retrieved ids, averaged over
# topics - an upper bound for anything trained on these candidates.
pos_score = get_pos_score(topics['content_ids'], topics['predictions'])
print(f'Our max positive score is {pos_score}')
# We can delete correlations

# del correlations
gc.collect()
# Set id as index for content so build_training_set can look rows up by id.
# get_neighbors returns the frame it was given, so this also re-indexes content_df.
content.set_index('id', inplace = True)

  0%|          | 0/962 [00:00<?, ?it/s]

  0%|          | 0/2407 [00:00<?, ?it/s]

 
Training KNN model...


  0%|          | 0/61517 [00:00<?, ?it/s]

Our max positive score is 0.9719


In [19]:
# topics_df

In [20]:
# Build the (topic, candidate content) pair table from the retrieved neighbours
# plus the ground truth.
train = build_training_set(topics, content, CFG)
print(f'Our training set has {len(train)} rows')
# The table is written to disk in the next cell so another notebook can train on it.

  0%|          | 0/61517 [00:00<?, ?it/s]

Our training set has 2807052 rows


In [21]:
# Write the pair table to the working directory, without the index column.
train.to_csv("train_ret_mpnet_50_f4_64.csv", index=False)