<a href="https://colab.research.google.com/github/sayanbanerjee32/learning-equality-curriculum-recommendations/blob/main/lecr_inference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import os
is_colab = not os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')

In [3]:
if not is_colab:
    !cp -r /kaggle/input/lecr-sentence-transformer-topic-embedding/learning-equality-curriculum-recommendations/ /kaggle/working/
    !cp -r /kaggle/input/lecr-sentence-transformer-content-embd/learning-equality-curriculum-recommendations/ /kaggle/working/

In [4]:
# install fastkaggle if not available
try: import fastkaggle
except ModuleNotFoundError:
    !pip install -Uq fastkaggle
## install collab specific libraries
if is_colab:
    !pip install -Uq datasets
    !pip install -Uq transformers
    !pip install -Uq sentencepiece 
    !pip install -Uq pynvml

!pip install -Uq sentence_transformers
!pip install -Uq faiss-gpu

In [5]:
from fastkaggle import *
from pathlib import Path
import pandas as pd
import numpy as np # linear algebra
from tqdm.auto import tqdm
import gc
from functools import partial

import torch
from transformers import (AutoModelForSequenceClassification,
                          AutoTokenizer, 
                          DataCollatorWithPadding, 
                          TrainingArguments,
                          Trainer, 
                          IntervalStrategy)
from datasets import Dataset, load_from_disk

device = "cuda" if torch.cuda.is_available() else "cpu"
%env TOKENIZERS_PARALLELISM=true

env: TOKENIZERS_PARALLELISM=true


In [6]:
# config depending on whether this is running on kaggle or collab
# is_colab = True
comp = 'learning-equality-curriculum-recommendations'
if is_colab:
    model_save_path = Path('/content/drive/MyDrive/Colab Notebooks/models/'+comp)
    data_save_path = Path('/content/drive/MyDrive/Colab Notebooks/data/'+comp)
else:
    model_save_path = Path('/kaggle/working/'+comp) #+'/models')
    data_save_path = Path('/kaggle/working/' + comp)

In [7]:
# import colab libraries
if is_colab:
    from google.colab import output
    output.enable_custom_widget_manager()
    from google.colab import drive
    drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
# The Kaggle API client expects this file to be in ~/.kaggle,
# so lets move it there.
if is_colab:
    !mkdir ~/.kaggle
    !cp /content/drive/MyDrive/Kaggle_api_auth/kaggle.json ~/.kaggle/

mkdir: cannot create directory ‘/root/.kaggle’: File exists


In [9]:
# This permissions change avoids a warning on Kaggle tool startup.
if is_colab:
    !chmod 600 ~/.kaggle/kaggle.json

In [10]:
path = setup_comp(comp)
path

Path('learning-equality-curriculum-recommendations')

### read all input files

In [11]:
# load the data into pandas dataframes
topics_df = pd.read_csv(path / "topics.csv", index_col=0).fillna({"title": "", "description": ""})
content_df = pd.read_csv(path / "content.csv", index_col=0).fillna("")

In [12]:
## list of topics for submission
test_topics = pd.read_csv(path / "sample_submission.csv")['topic_id'].to_list()
test_topics

['t_00004da3a1b2',
 't_00068291e9a4',
 't_00069b63a70a',
 't_0006d41a73a8',
 't_4054df11a74e']

### Load saved topic embedding

In [13]:
topic_ds = load_from_disk(data_save_path / 'topic_embeddings_dataset.hf')

In [14]:
## test DS filter
topic_embeddings_dataset = topic_ds.filter(lambda example: example["id"] in test_topics)
topic_embeddings_dataset



Dataset({
    features: ['title', 'description', 'channel', 'category', 'level', 'language', 'parent', 'has_content', 'id', 'concat_text', 'embeddings'],
    num_rows: 5
})

In [15]:
### Need to check if topics are not there in existing topic embeddings
assert len(topic_embeddings_dataset) == len(test_topics)

### Load saved Content DS and faiss index

In [16]:
content_dataset = load_from_disk(data_save_path / 'content_dataset.hf')
content_dataset.load_faiss_index('embeddings', data_save_path / 'content_embeddings_index.faiss')
content_dataset

Dataset({
    features: ['title', 'description', 'kind', 'text', 'language', 'copyright_holder', 'license', 'id', 'concat_text'],
    num_rows: 38043
})

### Similarity search

In [17]:
SELECT_TOP_N = 25
## using range search in order to get the indices
faiss_index = content_dataset.get_index('embeddings').faiss_index
limits, distances, indices = faiss_index.range_search(x=np.array(topic_embeddings_dataset['embeddings'], dtype=np.float32).reshape(len(topic_embeddings_dataset),-1),thresh=0.97)
len(limits), len(indices), len(distances)

(6, 1888, 1888)

In [18]:
correlation_dict = {}
for i, j, t_id in zip(limits[:-1], limits[1:], topic_embeddings_dataset['id']):
    sorted_indices = [x for _, x in sorted(zip(distances[i:j], indices[i:j]))]
    correlation_dict[t_id] = ' '.join([content_dataset[int(ind)]['id'] for ind in sorted_indices[:SELECT_TOP_N]])

correlation_df = pd.DataFrame(correlation_dict.items(), columns = ['topic_id','content_ids']).set_index('topic_id')
print(correlation_df.info(null_counts = True))
correlation_df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, t_00004da3a1b2 to t_4054df11a74e
Data columns (total 1 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   content_ids  5 non-null      object
dtypes: object(1)
memory usage: 80.0+ bytes
None


  print(correlation_df.info(null_counts = True))


Unnamed: 0_level_0,content_ids
topic_id,Unnamed: 1_level_1
t_00004da3a1b2,c_b59bc701c446 c_0962b290dcd2 c_3adfc7d4ebb5 c...
t_00068291e9a4,c_85e7c0954384 c_ebb7fdf10a7e c_639ea2ef9c95 c...
t_00069b63a70a,c_31caf438f66a c_a6db0765d460 c_ac592c567023
t_0006d41a73a8,c_33699a2b161f c_2e165cfa1a05 c_9ad0e8f170e5 c...
t_4054df11a74e,c_f2d184a98231 c_3695c5dc1df6 c_52f9df7e611a c...


In [19]:
def process_correlations(correlations_df):
    correlations = correlations_df.copy()
    correlations.content_ids = correlations.content_ids.str.split()
    correlations = correlations.explode("content_ids").rename(columns={"content_ids": "content_id"}).reset_index()
    return correlations
correlations = process_correlations(correlation_df)
correlations.head()

Unnamed: 0,topic_id,content_id
0,t_00004da3a1b2,c_b59bc701c446
1,t_00004da3a1b2,c_0962b290dcd2
2,t_00004da3a1b2,c_3adfc7d4ebb5
3,t_00004da3a1b2,c_d9f2fdeff0aa
4,t_00004da3a1b2,c_0041092041cd


In [20]:
df = correlations.merge(topics_df,
                        left_on = 'topic_id',
                    right_on = 'id').merge(content_df,
                                                left_on = 'content_id',
                                            right_on = 'id',
                                            suffixes=('_topic', '_content'))
df.head()

Unnamed: 0,topic_id,content_id,title_topic,description_topic,channel,category,level,language_topic,parent,has_content,title_content,description_content,kind,text,language_content,copyright_holder,license
0,t_00004da3a1b2,c_b59bc701c446,Откриването на резисторите,"Изследване на материали, които предизвикват на...",000cf7,source,4,bg,t_16e29365b50d,True,Изчисляване на изрази с няколко променливи,Упражнявай решаване на изрази с две неизвестни...,exercise,,bg,,
1,t_00004da3a1b2,c_0962b290dcd2,Откриването на резисторите,"Изследване на материали, които предизвикват на...",000cf7,source,4,bg,t_16e29365b50d,True,Построяване образ на правоъгълник при осева си...,"Едно по-старо видео, в което Сал използва инте...",video,,bg,,
2,t_00004da3a1b2,c_3adfc7d4ebb5,Откриването на резисторите,"Изследване на материали, които предизвикват на...",000cf7,source,4,bg,t_16e29365b50d,True,Решаване на изрази с една променлива величина ...,Упражни заместването със стойност на променлив...,exercise,,bg,,
3,t_00004da3a1b2,c_d9f2fdeff0aa,Откриването на резисторите,"Изследване на материали, които предизвикват на...",000cf7,source,4,bg,t_16e29365b50d,True,Уравнения със скоби,Упражнявай решаване на задачи с прилагане на р...,exercise,,bg,,
4,t_00004da3a1b2,c_0041092041cd,Откриването на резисторите,"Изследване на материали, които предизвикват на...",000cf7,source,4,bg,t_16e29365b50d,True,Доказателство на формулата за намиране на коре...,Сал извежда формулата за намиране на корените ...,video,,bg,,


In [21]:
## all has contents - no breadcrums yet
df.has_content.value_counts()

True    93
Name: has_content, dtype: int64

In [22]:
### select only the ones that have same language
df = df.loc[df.language_content == df.language_topic]
df.shape

(81, 17)

In [23]:
### only required columns
req_col = ['text','title_content','description_content','title_topic','description_topic']
ds = Dataset.from_pandas(df[req_col])
ds

Dataset({
    features: ['text', 'title_content', 'description_content', 'title_topic', 'description_topic', '__index_level_0__'],
    num_rows: 81
})

In [24]:
# pre -process content text
# df['masked_content_txt'] = df[['title_content','text']].apply(lambda row: row.text.strip().replace(row.title_content.strip(),
#                                                                                                    '__MASKED__').strip().split("\n")[0][:100],
#                                                                axis = 1)
def clean_content_text(examples):
        return {
        'masked_content_txt': (
                                examples['text']#.strip()
                               .replace(examples['title_content'].strip(), '__MASKED__').strip()
                               .split("\n")[0][:100]
        )
    }

ds = ds.map(clean_content_text)

Map:   0%|          | 0/81 [00:00<?, ? examples/s]

In [25]:
# df['input'] =  "CONTENT: " + (df["title_content"]
#         + " \n "
#         + df["description_content"].str.strip()
#         + " \n "
#         + df["masked_content_txt"]).str.strip() + " TOPIC: " + (df["title_topic"]
#         + " \n "
#         + df["description_topic"].str.strip()
#         ).str.strip()
def create_input(examples):
    return {
                'input': "CONTENT: " + (examples["title_content"]
                + " \n "
                + examples["description_content"].strip()
                + " \n "
                + examples["masked_content_txt"]).strip() 
                + " \n "
                + "TOPIC: " + (examples["title_topic"]
                + " \n "
                + examples["description_topic"].strip()
                ).strip()
}

ds = ds.map(create_input)

Map:   0%|          | 0/81 [00:00<?, ? examples/s]

In [26]:
model_nm = 'xlm-roberta-base'
# 'bert-base-multilingual-cased'

In [27]:
tokz = AutoTokenizer.from_pretrained(model_nm)
tokz.model_max_length, tokz.is_fast

(512, True)

In [28]:
# helper function cleaning GPU memory
def report_gpu():
    if is_colab: print(torch.cuda.list_gpu_processes())
    gc.collect()
    torch.cuda.empty_cache()
report_gpu()

GPU:0
no processes are running


In [29]:
# def tok_func(batch, is_test = False): 
#     tokens = tokz(batch["input"], padding="longest", truncation=True)
#     return tokens
# tok_ds = ds.map(tok_func, batched=True)

raw_inputs = ds['input']
inputs = tokz(raw_inputs, padding="longest",truncation = True, return_tensors="pt")
inputs

{'input_ids': tensor([[     0,      6, 108369,  ...,      1,      1,      1],
        [     0,      6, 108369,  ...,      1,      1,      1],
        [     0,      6, 108369,  ...,      1,      1,      1],
        ...,
        [     0,      6, 108369,  ...,      8,     51,      2],
        [     0,      6, 108369,  ...,   3919,      5,      2],
        [     0,      6, 108369,  ..., 116287,      7,      2]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]])}

In [40]:
report_gpu()
model = AutoModelForSequenceClassification.from_pretrained(model_save_path).to(device)

GPU:0
no processes are running


In [44]:
batch_size = 128
predicted_lables = []
for i in range(0,len(raw_inputs),batch_size):
    inputs = tokz(raw_inputs[i: i + batch_size],
                  padding="longest",
                  truncation = True,
                  return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    sm_preds = torch.nn.functional.softmax(outputs.logits, dim=-1)
    predicted_lables.extend(torch.argmax(sm_preds, dim = -1).cpu().numpy())
    
len(predicted_lables)

81

In [47]:
### prepare final output
df['pred_label'] = predicted_lables
sub_df = df.loc[df['pred_label'] ==1,['topic_id', 'content_id']]
sub_df = sub_df.groupby('topic_id')['content_id'].transform(lambda x: ' '.join(x)).reset_index()
sub_df.to_csv('submission.csv', index = False)

In [48]:
sub_df.head()

Unnamed: 0,index,content_id


In [None]:
if not iskaggle:
    push_notebook('saan', comp,
                  title='LECR: XLMR - inference',
                  file='/content/drive/MyDrive/Colab Notebooks/lecr_inference.ipynb',
                  competition=comp, private=False, gpu=True)

Your kernel title does not resolve to the specified id. This may result in surprising behavior. We suggest making your title something that resolves to the specified id. See https://en.wikipedia.org/wiki/Clean_URL#Slug for more information on how slugs are determined.
Kernel version 1 successfully pushed.  Please check progress at https://www.kaggle.com/code/saansd2003/lecr-xlmr-classification-baseline
