# Imports

In [1]:
import sys, os
sys.path.append("../input/sentence-transformer-package/sentence-transformers-2.2.2/sentence-transformers-2.2.2") 
import sentence_transformers
from sentence_transformers import SentenceTransformer, CrossEncoder, util
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer, pipeline

In [2]:
import numpy as np
import pandas as pd
import string
import torch
import zipfile

# Loading translation model M2M100

In [3]:
model_name = 'facebook-m2m100-418M-model'
token_name = 'facebook-m2m100-418M-token'
get_model_path = '/kaggle/input/get-model-for-translation/'

with zipfile.ZipFile(f'{get_model_path}{model_name}.zip', 'r') as zip_ref_m:
    zip_ref_m.extractall('/kaggle/working')
    
with zipfile.ZipFile(f'{get_model_path}{token_name}.zip', 'r') as zip_ref_t:
    zip_ref_t.extractall('/kaggle/working')

# Loading dataframes

In [4]:
challenge_files_path = '/kaggle/input/learning-equality-curriculum-recommendations'
private_files_path = '/kaggle/input/learningequalityfiles'
model_files_path = '/kaggle/input/sentence-transformer-package'

print (f"\nLoading dataframes...")

for dirname, _, filenames in os.walk(challenge_files_path):
    for filename in filenames:
        filepath = os.path.join(dirname, filename)
        print (f"\nLoading dataframe from {filepath}...")
        df = pd.read_csv (filepath)
        if 'topics' in filepath:
            topics_df = df.fillna({"title": "", "description": ""})
            display(topics_df)
        elif 'sample_submission' in filepath:
            print (f"\nLoading 'sample' dataframe...")
            sample_df = df
            display(sample_df)
        elif 'correlations' in filepath:
            correlations_df = df.fillna({"title": "", "description": ""})
            display(correlations_df)
            print (f"\nCreating exploded correlations 'corr' dataframe")
            corr_df = correlations_df.copy()
            corr_df['content_ids'] = corr_df.content_ids.str.split(' ')
            corr_df = corr_df.explode('content_ids')
            display (corr_df)
        elif 'content' in filepath:
            contents_df = df.fillna({"title": "", "description": "", "text": ""})
            display(contents_df)
print (f"\nDataframes loaded.")


Loading dataframes...

Loading dataframe from /kaggle/input/learning-equality-curriculum-recommendations/sample_submission.csv...

Loading 'sample' dataframe...


Unnamed: 0,topic_id,content_ids
0,t_00004da3a1b2,c_1108dd0c7a5d c_376c5a8eb028 c_5bc0e1e2cba0 c...
1,t_00068291e9a4,c_639ea2ef9c95 c_89ce9367be10 c_ac1672cdcd2c c...
2,t_00069b63a70a,c_11a1dc0bfb99
3,t_0006d41a73a8,c_0c6473c3480d c_1c57a1316568 c_5e375cf14c47 c...
4,t_4054df11a74e,c_3695c5dc1df6 c_f2d184a98231



Loading dataframe from /kaggle/input/learning-equality-curriculum-recommendations/topics.csv...


Unnamed: 0,id,title,description,channel,category,level,language,parent,has_content
0,t_00004da3a1b2,Откриването на резисторите,"Изследване на материали, които предизвикват на...",000cf7,source,4,bg,t_16e29365b50d,True
1,t_000095e03056,Unit 3.3 Enlargements and Similarities,,b3f329,aligned,2,en,t_aa32fb6252dc,False
2,t_00068291e9a4,Entradas e saídas de uma função,Entenda um pouco mais sobre funções.,8e286a,source,4,pt,t_d14b6c2a2b70,True
3,t_00069b63a70a,Transcripts,,6e3ba4,source,3,en,t_4054df11a74e,True
4,t_0006d41a73a8,Графики на експоненциални функции (Алгебра 2 н...,Научи повече за графиките на сложните показате...,000cf7,source,4,bg,t_e2452e21d252,True
...,...,...,...,...,...,...,...,...,...
76967,t_fffb0bf2801d,4.3 Graph of functions,,e77b55,aligned,4,en,t_676e6a1a4dc7,False
76968,t_fffbe1d5d43c,Inscribed shapes problem solving,Use properties of inscribed angles to prove pr...,0c929f,source,4,sw,t_50145b9bab3f,True
76969,t_fffe14f1be1e,Lección 7,,6e90a7,aligned,6,es,t_d448c707984d,True
76970,t_fffe811a6da9,تحديد العلاقة بين الإحداثيّات القطبية والإحداث...,5b9e5ca86571f90499ea987f,9fd860,source,2,ar,t_5b4f3ba4eb7d,True



Loading dataframe from /kaggle/input/learning-equality-curriculum-recommendations/correlations.csv...


Unnamed: 0,topic_id,content_ids
0,t_00004da3a1b2,c_1108dd0c7a5d c_376c5a8eb028 c_5bc0e1e2cba0 c...
1,t_00068291e9a4,c_639ea2ef9c95 c_89ce9367be10 c_ac1672cdcd2c c...
2,t_00069b63a70a,c_11a1dc0bfb99
3,t_0006d41a73a8,c_0c6473c3480d c_1c57a1316568 c_5e375cf14c47 c...
4,t_0008768bdee6,c_34e1424229b4 c_7d1a964d66d5 c_aab93ee667f4
...,...,...
61512,t_fff830472691,c_61fb63326e5d c_8f224e321c87
61513,t_fff9e5407d13,c_026db653a269 c_0fb048a6412c c_20de77522603 c...
61514,t_fffbe1d5d43c,c_46f852a49c08 c_6659207b25d5
61515,t_fffe14f1be1e,c_cece166bad6a



Creating exploded correlations 'corr' dataframe


Unnamed: 0,topic_id,content_ids
0,t_00004da3a1b2,c_1108dd0c7a5d
0,t_00004da3a1b2,c_376c5a8eb028
0,t_00004da3a1b2,c_5bc0e1e2cba0
0,t_00004da3a1b2,c_76231f9d0b5e
1,t_00068291e9a4,c_639ea2ef9c95
...,...,...
61513,t_fff9e5407d13,c_d64037a72376
61514,t_fffbe1d5d43c,c_46f852a49c08
61514,t_fffbe1d5d43c,c_6659207b25d5
61515,t_fffe14f1be1e,c_cece166bad6a



Loading dataframe from /kaggle/input/learning-equality-curriculum-recommendations/content.csv...


Unnamed: 0,id,title,description,kind,text,language,copyright_holder,license
0,c_00002381196d,"Sumar números de varios dígitos: 48,029+233,930","Suma 48,029+233,930 mediante el algoritmo está...",video,,es,,
1,c_000087304a9e,Trovare i fattori di un numero,Sal trova i fattori di 120.\n\n,video,,it,,
2,c_0000ad142ddb,Sumar curvas de demanda,Cómo añadir curvas de demanda\n\n,video,,es,,
3,c_0000c03adc8d,Nado de aproximação,Neste vídeo você vai aprender o nado de aproxi...,document,\nNado de aproximação\nSaber nadar nas ondas ...,pt,Sikana Education,CC BY-NC-ND
4,c_00016694ea2a,geometry-m3-topic-a-overview.pdf,geometry-m3-topic-a-overview.pdf,document,Estándares Comunes del Estado de Nueva York\n\...,es,Engage NY,CC BY-NC-SA
...,...,...,...,...,...,...,...,...
154042,c_fffcbdd4de8b,2. 12: Diffusion,,html5,What will eventually happen to these dyes?\n\n...,en,CSU and Merlot,CC BY-NC-SA
154043,c_fffe15a2d069,Sommare facendo gruppi da 10,Sal somma 5+68 spezzando il 5 in un 2 e un 3.\n\n,video,,it,,
154044,c_fffed7b0d13a,Introdução à subtração,Sal fala sobre o que significa subtrair. Os ex...,video,,pt,,
154045,c_ffff04ba7ac7,SA of a Cone,,video,,en,,



Dataframes loaded.


# Choosing sample data

In [5]:
print ("\nDefining sampled dataset...")
use_submission_sample = True
if ~sample_df.empty and use_submission_sample:
    corr_df = corr_df[corr_df.topic_id.isin(sample_df.topic_id)]
    correlations_df = correlations_df[correlations_df.topic_id.isin(sample_df.topic_id)]
    topics_df = topics_df[topics_df.id.isin(sample_df.topic_id)]
    print (f"\nFiltered 'topics' to {len(topics_df)} samples and 'contents' to {len(contents_df)} samples")
else:
    topics_df = topics_df.sample(n=7000)
display (topics_df)
display (contents_df)
display (correlations_df)


Defining sampled dataset...

Filtered 'topics' to 5 samples and 'contents' to 154047 samples


Unnamed: 0,id,title,description,channel,category,level,language,parent,has_content
0,t_00004da3a1b2,Откриването на резисторите,"Изследване на материали, които предизвикват на...",000cf7,source,4,bg,t_16e29365b50d,True
2,t_00068291e9a4,Entradas e saídas de uma função,Entenda um pouco mais sobre funções.,8e286a,source,4,pt,t_d14b6c2a2b70,True
3,t_00069b63a70a,Transcripts,,6e3ba4,source,3,en,t_4054df11a74e,True
4,t_0006d41a73a8,Графики на експоненциални функции (Алгебра 2 н...,Научи повече за графиките на сложните показате...,000cf7,source,4,bg,t_e2452e21d252,True
19136,t_4054df11a74e,Flow Charts: Logical Thinking?,This lesson is focused on flow charts. It supp...,6e3ba4,source,2,en,t_acbbd893e6af,True


Unnamed: 0,id,title,description,kind,text,language,copyright_holder,license
0,c_00002381196d,"Sumar números de varios dígitos: 48,029+233,930","Suma 48,029+233,930 mediante el algoritmo está...",video,,es,,
1,c_000087304a9e,Trovare i fattori di un numero,Sal trova i fattori di 120.\n\n,video,,it,,
2,c_0000ad142ddb,Sumar curvas de demanda,Cómo añadir curvas de demanda\n\n,video,,es,,
3,c_0000c03adc8d,Nado de aproximação,Neste vídeo você vai aprender o nado de aproxi...,document,\nNado de aproximação\nSaber nadar nas ondas ...,pt,Sikana Education,CC BY-NC-ND
4,c_00016694ea2a,geometry-m3-topic-a-overview.pdf,geometry-m3-topic-a-overview.pdf,document,Estándares Comunes del Estado de Nueva York\n\...,es,Engage NY,CC BY-NC-SA
...,...,...,...,...,...,...,...,...
154042,c_fffcbdd4de8b,2. 12: Diffusion,,html5,What will eventually happen to these dyes?\n\n...,en,CSU and Merlot,CC BY-NC-SA
154043,c_fffe15a2d069,Sommare facendo gruppi da 10,Sal somma 5+68 spezzando il 5 in un 2 e un 3.\n\n,video,,it,,
154044,c_fffed7b0d13a,Introdução à subtração,Sal fala sobre o que significa subtrair. Os ex...,video,,pt,,
154045,c_ffff04ba7ac7,SA of a Cone,,video,,en,,


Unnamed: 0,topic_id,content_ids
0,t_00004da3a1b2,c_1108dd0c7a5d c_376c5a8eb028 c_5bc0e1e2cba0 c...
1,t_00068291e9a4,c_639ea2ef9c95 c_89ce9367be10 c_ac1672cdcd2c c...
2,t_00069b63a70a,c_11a1dc0bfb99
3,t_0006d41a73a8,c_0c6473c3480d c_1c57a1316568 c_5e375cf14c47 c...
15278,t_4054df11a74e,c_3695c5dc1df6 c_f2d184a98231


# Cleaning data

* Remove ponctuation and special chars from text fields
* Delete columns 'copyright_holder' and 'license' from 'contents'
* Filter 'topics' by 'has_content' = True
* Group 'topics' and 'contents' by language
* Change 'level' column from numbers to text

In [6]:
def clean_text(text_col):
    """
    Clean ponctuation and special chars from a dataframe column
    """
    punctuations = string.punctuation
    text_col = text_col.str.replace('\W', ' ', regex=True)
    for punct in string.punctuation:
        text_col = text_col.str.replace(punct, ' ', regex=True)
    return text_col.str.lower()

In [7]:
# Cleaning topics
levels = {1: 'Level 1', 2: 'Level 2', 3: 'Level 3', 4: 'Level 4', 5: 'Level 5', 6: 'Level 6', 7: 'Level 7', 
          8: 'Level 8', 9: 'Level 9', 10: 'Level 10', 0: 'Level 0'}
topics_cols = ['title', 'description']

print (f"\nCreating and cleaning topic features...")
topic_features = topics_df.copy()
topic_features = topic_features.replace ({'level': levels})
for col in topics_cols:
    topic_features[col] = clean_text(topic_features[col])
topic_features.sort_values (by='language', inplace=True)

#topics_features['sentences'] = topics_features[topics_cols].apply(lambda x: '.'.join(x.dropna().astype(str)), axis=1)
#topics_features = topics_features.drop(columns=['parent'] + topics_cols) 
print (f"\nCreated 'topic_features'")
display (topic_features)


Creating and cleaning topic features...

Created 'topic_features'


Unnamed: 0,id,title,description,channel,category,level,language,parent,has_content
0,t_00004da3a1b2,откриването на резисторите,изследване на материали които предизвикват на...,000cf7,source,Level 4,bg,t_16e29365b50d,True
4,t_0006d41a73a8,графики на експоненциални функции алгебра 2 н...,научи повече за графиките на сложните показате...,000cf7,source,Level 4,bg,t_e2452e21d252,True
3,t_00069b63a70a,transcripts,,6e3ba4,source,Level 3,en,t_4054df11a74e,True
19136,t_4054df11a74e,flow charts logical thinking,this lesson is focused on flow charts it supp...,6e3ba4,source,Level 2,en,t_acbbd893e6af,True
2,t_00068291e9a4,entradas e saídas de uma função,entenda um pouco mais sobre funções,8e286a,source,Level 4,pt,t_d14b6c2a2b70,True


In [8]:
# Cleaning contents
content_cols = ['title', 'description', 'text']

print (f"\nCreating and cleaning content features...")
content_features = contents_df.copy()
for col in content_cols:
    content_features[col] = clean_text(content_features[col])
#content_features['sentences'] =  content_features[content_cols].apply(lambda x: '.'.join(x.dropna().astype(str)), axis=1)
content_features.sort_values (by='language', inplace=True)
content_features.drop(columns=['copyright_holder', 'license'], inplace=True)
print (f"\nCreated 'content_features'")
display(content_features)


Creating and cleaning content features...

Created 'content_features'


Unnamed: 0,id,title,description,kind,text,language
133868,c_de43cff8dd60,المفردات والتراكيب,,exercise,ما المقصود بعبارة هل هناك من خطب هل هنا...,ar
63589,c_699d4bd5d5bb,المفردات والتراكيب,,exercise,ما ضد لا تمل في الجملة الآتية ت عيد الأ...,ar
17214,c_1cab88377df5,المفردات والتراكيب,,exercise,ما مرادف يتم م في ما يلي ي ق ف أ ح يان ا ...,ar
145550,c_f1b8123b975e,الس ف ر,يركب رامي الط ائرة للمر ة الأولى فيحق ق أمله...,html5,الس ف ر ب ح ر ص ش ديد ح ز م رامي أ م ...,ar
17202,c_1ca6079deb47,يعي ن معكوس مصفوفة مرب عة من الرتبة الثالثة با...,5ad46b8a6b9064043d8b4158,exercise,contentstorage 40cf62978d6581e03a5085...,ar
...,...,...,...,...,...,...
15012,c_18fc5ad44e5d,构建比较级和最高级,学习如何改装比较级和最高级修饰语,video,,zh
128758,c_d5e56175090a,主谓一致简介,主谓一致指匹配一个句子的主语和动词 这里介绍它是怎么一回事,video,,zh
32558,c_35f7982b250f,双臂和胯部热身运动,在本视频中 您将学会三个热身练习 锻炼您上半身的肌肉 尤其是双臂和胯部的肌肉,document,双臂和胯部热身运动 双臂和胯部热身运动 在本视频中 您将学会三个热身练习 锻炼您上半身的...,zh
15053,c_19148c5d7ab3,出血,微博 http www weibo com sikana 脸书 https www...,document,出血 本视频由红十字会和sikana共同出品 急救 出血 在这个视频您将学习怎样对出血伤...,zh


# Scoring: F2 score 

In [9]:
def calculate_F2score(pred_df, act_df):
    
    """
    Using predictions_df and actual_df as exploded correlation columns to calculate F1 score.
    Results show correct predicts, recall, precision and F2 score.
    Results also return the list of correct predicts, correct_df_
    """
    print ('\nCalculating scores...')
    correct_preds=[]
    correct_pairs=[]
    if pred_df.empty or act_df.empty:
        print ('\nOne or both dataframes are empty. Abort F2score calculation.')
        return None
    prediction_df=pred_df.copy()
    actual_df = act_df.copy()
    prediction_df.columns=['topic_id', 'content_ids_pred']
    actual_df.columns=['topic_id', 'content_ids_actual']
    df = pd.merge(prediction_df, actual_df, how='inner', on='topic_id')
    if df.empty:
        print ('\nNo matches between predictions and correlations. Abort F2score calculation.')
        return None
    for row in df.itertuples():
        counts = 0
        for id in row.content_ids_pred.split(' '):
            correct_pairs.append([row.topic_id, id])
            if id in row.content_ids_actual.split(' '):
                counts += 1 
        correct_preds.append (counts)
    df['correct_pred'] = correct_preds
    df['precision'] = df['correct_pred']/(df.content_ids_actual.str.len() + 1e-7)
    df['recall'] = df['correct_pred']/(df.content_ids_pred.str.len() + 1e-7)
    for beta in [0.5, 1, 2]:
        df['f'+str(beta)] = ((1 + beta**2) * df['precision'] * df['recall'])/((beta**2 * df['precision']) + df['recall'] + 1e-7) 
    print ('\nF2 score calculation finished.')

    return df, correct_pairs

# Getting matches: sentence transformer with retrain-rerank

In [10]:
def search(query, topic_embedding, corpus_embeddings, content_sentences, content_ids, cross_encoder):

    # passages = content_sentences

    ##### Semantic Search #####
    # find potentially relevant passages
    hits = util.semantic_search(topic_embedding, corpus_embeddings, top_k=top_k)
    hits = hits[0]  # Get the hits for the first query

    ##### Re-Ranking #####
    # Now, score all retrieved passages with the cross_encoder
    cross_inp = [[query, content_sentences[hit['corpus_id']]] for hit in hits]
    cross_scores = cross_encoder.predict(cross_inp)

    # Sort results by the cross-encoder scores
    for idx in range(len(cross_scores)):
        hits[idx]['cross-score'] = cross_scores[idx]

    # Output of top-30 hits from re-ranker
    hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)
    results = {}
    for hit in hits[0:30]:
        results[content_ids.iloc[hit['corpus_id']]] = content_sentences[hit['corpus_id']]
    return results

## Load models

In [11]:
if not torch.cuda.is_available():
    print("Warning: No GPU found. Please add GPU to your notebook")
else: 
    device = torch.cuda.current_device()
    
languages = topic_features.language.unique()
preds = {}
matches = {}
biencoder = '/kaggle/input/sentence-embedding-models/paraphrase-multilingual-mpnet-base-v2'
# truncate to 256 tokens, 10 matches
# /kaggle/input/sentence-embedding-models/paraphrase-MiniLM-L12-v2 -> score = 0.208 (f2 = 0.0031)
# /kaggle/input/sentence-embedding-models/paraphrase-distilroberta-base-v2 -> score 0.205 (f2 = 0.0061)
# /kaggle/input/sentence-embedding-models/paraphrase-multilingual-mpnet-base-v2 -> score 0.207 (f2 = 0.0046)
# truncate to 512 tokens, 10 matches
# /kaggle/input/sentence-embedding-models/paraphrase-MiniLM-L12-v2 -> score = 0.208 (f2 = 0.0030)
# /kaggle/input/sentence-embedding-models/paraphrase-TinyBERT-L6-v2 - (f2 = 0.0046)
# /kaggle/input/sentence-embedding-models/paraphrase-distilroberta-base-v2 -> score ... (f2 = ...)
# /kaggle/input/sentence-embedding-models/paraphrase-multilingual-mpnet-base-v2 -> score 0.19 (f1 = 0.0042)
#'multi-qa-MiniLM-L6-cos-v1'
crossencoder = '/kaggle/input/msmarcominilml6v2/ms-marco-MiniLM-L-6-v2'

print (f"\nGetting matches using bi-encoder {biencoder} and cross encoder {crossencoder}. Loading models...")

#Use a translation model to get all text in english
model = M2M100ForConditionalGeneration.from_pretrained(f"/kaggle/working/{model_name}")
tokenizer = M2M100Tokenizer.from_pretrained(f"/kaggle/working/{token_name}")
token_lenght = 512

#Use the Bi-Encoder to encode all contents, so that we can use it with semantic search
bi_encoder = SentenceTransformer(biencoder)
bi_encoder.max_seq_length = token_lenght     #Truncate long passages to 256 tokens
top_k = 15                          #Number of passages we want to retrieve with the bi-encoder

#Use a cross-encoder, to re-rank the results list to improve the quality
cross_encoder = CrossEncoder(crossencoder)
print (f"\nModels loaded.")


Getting matches using bi-encoder /kaggle/input/sentence-embedding-models/paraphrase-multilingual-mpnet-base-v2 and cross encoder /kaggle/input/msmarcominilml6v2/ms-marco-MiniLM-L-6-v2. Loading models...

Models loaded.


## Run calculations for each language

In [12]:
from tqdm import tqdm

print ('Topic languages:',languages)
content_sentences=[]
topic_sentences=[]
for lang in languages:
    print ('\nWorking on topics for language ', lang)
    tokenizer.src_lang = lang
    lang_content_sentences = content_features[content_features.language == lang].title.to_list()
    lang_topic_sentences = topic_features[topic_features.language == lang].title.to_list()
    
    print ('\nTranslating contents from ', lang, 'to en')
    translator = pipeline('translation', model=model, tokenizer=tokenizer,src_lang=lang, tgt_lang="en",device=device)
    for target_seq in tqdm(translator(lang_content_sentences, batch_size=32), total=len(lang_content_sentences)):
        content_sentences.append(str(target_seq.values()))
    
    print ('\nTranslating topics from ', lang, 'to en')
    for target_seq in tqdm(translator(lang_topic_sentences, batch_size=32), total=len(lang_content_sentences)):
        topic_sentences.append(str(target_seq.values()))

    if len(content_sentences) == 0:
        print ('\nNo contents for this language.')
        continue
    if len(topic_sentences) == 0:
        print ('\nNo topics for this language.')
        continue
        
    print ("\nCalculating 'content' embeddings...")

    # encode all contents into our vector space. This takes about 5 minutes (depends on your GPU speed)
    corpus_embeddings = bi_encoder.encode(content_sentences, convert_to_tensor=True, show_progress_bar=False)
    corpus_embeddings = corpus_embeddings.cuda()
    corpus_embeddings = util.normalize_embeddings(corpus_embeddings)
    
    print ("\nCalculating 'topic' embeddings...")
    
    # Encode the topics using the bi-encoder
    topic_embeddings = bi_encoder.encode(topic_sentences, convert_to_tensor=True, show_progress_bar=False)
    topic_embeddings = topic_embeddings.cuda()
    topic_embeddings = util.normalize_embeddings(topic_embeddings)
    
    print ("\nRunning matches...")
        
    for i, (topic_embed, query) in enumerate(tqdm(zip (topic_embeddings, topic_sentences), total=len(topic_sentences))):
            results = search(query,
                             topic_embed,
                             corpus_embeddings, 
                             content_sentences, 
                             content_features.id,
                             cross_encoder)
            matches[query] = results.values()
            preds[topic_features.iloc[i].id] = results.keys()
print ('\nEnd of calculating matches.')

Topic languages: ['bg' 'en' 'pt']

Working on topics for language  bg

Translating contents from  bg to en


100%|██████████| 6050/6050 [00:00<00:00, 383652.43it/s]



Translating topics from  bg to en


  0%|          | 2/6050 [00:00<00:01, 3442.19it/s]



Calculating 'content' embeddings...

Calculating 'topic' embeddings...

Running matches...


  0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 2/2 [00:00<00:00, 21.62it/s]


Working on topics for language  en

Translating contents from  en to en



100%|██████████| 65939/65939 [00:00<00:00, 315057.17it/s]



Translating topics from  en to en


  0%|          | 2/65939 [00:00<00:08, 8144.28it/s]



Calculating 'content' embeddings...

Calculating 'topic' embeddings...

Running matches...


  0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

 75%|███████▌  | 3/4 [00:00<00:00, 23.50it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 4/4 [00:00<00:00, 24.93it/s]


Working on topics for language  pt

Translating contents from  pt to en



100%|██████████| 10435/10435 [00:00<00:00, 453160.10it/s]



Translating topics from  pt to en


  0%|          | 1/10435 [00:00<00:04, 2406.37it/s]



Calculating 'content' embeddings...

Calculating 'topic' embeddings...

Running matches...


  0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

 60%|██████    | 3/5 [00:00<00:00, 27.86it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 5/5 [00:00<00:00, 26.91it/s]


End of calculating matches.





In [13]:
topic_sentences

["dict_values(['Discovery of Resistants'])",
 "dict_values(['Graphics of Exponential Functions Algebra Level 2'])",
 "dict_values(['Transcripts'])",
 "dict_values(['Flow Charts Logical Thinking'])",
 "dict_values(['Entrance and Exit of a Function'])"]

In [14]:
# predictions in exploded format
preds_df = pd.DataFrame(zip (list(preds.keys()), [list(vals) for vals in preds.values()]), columns=['id','content_ids']).explode('content_ids')

# predictions in submissions format
df_preds_aux = pd.DataFrame(zip (list(preds.keys()), (' '.join(list(preds[key])) for key in preds.keys())), columns=['id','content_ids'])

predicts_submission = pd.DataFrame(topics_df.id).merge (df_preds_aux, how ='left', on = 'id')
predicts_submission.rename(columns={'id':'topic_id'}, inplace=True)
predicts_submission.fillna(' ', inplace=True)

predicts_submission

Unnamed: 0,topic_id,content_ids
0,t_00004da3a1b2,c_6ccf8ebedca0 c_6056836ab4d2 c_235243772b59 c...
1,t_00068291e9a4,c_19acc0fa60ff c_e9d2e37faec3 c_ee583e1c7719 c...
2,t_00069b63a70a,c_dc1b86b850d5 c_697f734db142 c_47751542d30c c...
3,t_0006d41a73a8,c_6f144e2652bd c_307a804f3d18 c_9668b9a9325a c...
4,t_4054df11a74e,c_4eb753dd3d38 c_9dee2afab0a6 c_16c19ad9f109 c...


In [15]:
#Check if all content from correlations is in current contents_df
#contents_in_corr = correlations_df.content_ids.str.split(' ').explode().isin(list(contents_df.id))

# Model evaluation

In [16]:
score, true_matches = calculate_F2score(predicts_submission.sort_values('topic_id'), correlations_df)
if score is not None:
    display(score)
    print ('F2 mean score:', score.f2.mean())


Calculating scores...

F2 score calculation finished.


Unnamed: 0,topic_id,content_ids_pred,content_ids_actual,correct_pred,precision,recall,f0.5,f1,f2
0,t_00004da3a1b2,c_6ccf8ebedca0 c_6056836ab4d2 c_235243772b59 c...,c_1108dd0c7a5d c_376c5a8eb028 c_5bc0e1e2cba0 c...,0,0.0,0.0,0.0,0.0,0.0
1,t_00068291e9a4,c_19acc0fa60ff c_e9d2e37faec3 c_ee583e1c7719 c...,c_639ea2ef9c95 c_89ce9367be10 c_ac1672cdcd2c c...,0,0.0,0.0,0.0,0.0,0.0
2,t_00069b63a70a,c_dc1b86b850d5 c_697f734db142 c_47751542d30c c...,c_11a1dc0bfb99,0,0.0,0.0,0.0,0.0,0.0
3,t_0006d41a73a8,c_6f144e2652bd c_307a804f3d18 c_9668b9a9325a c...,c_0c6473c3480d c_1c57a1316568 c_5e375cf14c47 c...,0,0.0,0.0,0.0,0.0,0.0
4,t_4054df11a74e,c_4eb753dd3d38 c_9dee2afab0a6 c_16c19ad9f109 c...,c_3695c5dc1df6 c_f2d184a98231,0,0.0,0.0,0.0,0.0,0.0


F2 mean score: 0.0


# Submission

In [17]:
predicts_submission.to_csv('submission.csv', index=False)