# Kaggle Challenge - Learning Equality

# Data

## Imports

In [1]:
!pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- done
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25l- \ | done
[?25h  Created wheel for sentence-transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125938 sha256=c3e72c41b0090736fa80f8ffe1b98f2b5947c2e9707c4df2d4f654d36fc0b3ed
  Stored in directory: /root/.cache/pip/wheels/bf/06/fb/d59c1e5bd1dac7f6cf61ec0036cc3a10ab8fecaa6b2c3d3ee9
Successfully built sentence-transformers
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-2.2.2
[0m

In [2]:
import pandas as pd
import numpy as np
import os
import string

# Data

# Data Collection

## Load dataframes

In [3]:
challenge_files_path = '/kaggle/input/learning-equality-curriculum-recommendations'
private_files_path = '/kaggle/input/learningequalityfiles'
model_files_path = '/kaggle/input/sentence-transformer-package'

print (f"\nLoading dataframes...")

for dirname, _, filenames in os.walk(challenge_files_path):
    for filename in filenames:
        filepath = os.path.join(dirname, filename)
        print (f"\nLoading dataframe from {filepath}...")
        df = pd.read_csv (filepath)
        if 'topics' in filepath:
            topics_df = df.fillna({"title": "", "description": ""})
            display(topics_df)
        elif 'sample_submission' in filepath:
            print (f"\nLoading 'sample' dataframe...")
            sample_df = df
            display(sample_df)
        elif 'correlations' in filepath:
            correlations_df = df.fillna({"title": "", "description": ""})
            display(correlations_df)
            print (f"\nCreating exploded correlations 'corr' dataframe")
            corr_df = correlations_df.copy()
            corr_df['content_ids'] = corr_df.content_ids.str.split(' ')
            corr_df = corr_df.explode('content_ids')
            display (corr_df)
        elif 'content' in filepath:
            contents_df = df.fillna({"title": "", "description": "", "text": ""})
            display(contents_df)
print (f"\nDataframes loaded.")


Loading dataframes...

Loading dataframe from /kaggle/input/learning-equality-curriculum-recommendations/sample_submission.csv...

Loading 'sample' dataframe...


Unnamed: 0,topic_id,content_ids
0,t_00004da3a1b2,c_1108dd0c7a5d c_376c5a8eb028 c_5bc0e1e2cba0 c...
1,t_00068291e9a4,c_639ea2ef9c95 c_89ce9367be10 c_ac1672cdcd2c c...
2,t_00069b63a70a,c_11a1dc0bfb99
3,t_0006d41a73a8,c_0c6473c3480d c_1c57a1316568 c_5e375cf14c47 c...
4,t_4054df11a74e,c_3695c5dc1df6 c_f2d184a98231



Loading dataframe from /kaggle/input/learning-equality-curriculum-recommendations/topics.csv...


Unnamed: 0,id,title,description,channel,category,level,language,parent,has_content
0,t_00004da3a1b2,Откриването на резисторите,"Изследване на материали, които предизвикват на...",000cf7,source,4,bg,t_16e29365b50d,True
1,t_000095e03056,Unit 3.3 Enlargements and Similarities,,b3f329,aligned,2,en,t_aa32fb6252dc,False
2,t_00068291e9a4,Entradas e saídas de uma função,Entenda um pouco mais sobre funções.,8e286a,source,4,pt,t_d14b6c2a2b70,True
3,t_00069b63a70a,Transcripts,,6e3ba4,source,3,en,t_4054df11a74e,True
4,t_0006d41a73a8,Графики на експоненциални функции (Алгебра 2 н...,Научи повече за графиките на сложните показате...,000cf7,source,4,bg,t_e2452e21d252,True
...,...,...,...,...,...,...,...,...,...
76967,t_fffb0bf2801d,4.3 Graph of functions,,e77b55,aligned,4,en,t_676e6a1a4dc7,False
76968,t_fffbe1d5d43c,Inscribed shapes problem solving,Use properties of inscribed angles to prove pr...,0c929f,source,4,sw,t_50145b9bab3f,True
76969,t_fffe14f1be1e,Lección 7,,6e90a7,aligned,6,es,t_d448c707984d,True
76970,t_fffe811a6da9,تحديد العلاقة بين الإحداثيّات القطبية والإحداث...,5b9e5ca86571f90499ea987f,9fd860,source,2,ar,t_5b4f3ba4eb7d,True



Loading dataframe from /kaggle/input/learning-equality-curriculum-recommendations/correlations.csv...


Unnamed: 0,topic_id,content_ids
0,t_00004da3a1b2,c_1108dd0c7a5d c_376c5a8eb028 c_5bc0e1e2cba0 c...
1,t_00068291e9a4,c_639ea2ef9c95 c_89ce9367be10 c_ac1672cdcd2c c...
2,t_00069b63a70a,c_11a1dc0bfb99
3,t_0006d41a73a8,c_0c6473c3480d c_1c57a1316568 c_5e375cf14c47 c...
4,t_0008768bdee6,c_34e1424229b4 c_7d1a964d66d5 c_aab93ee667f4
...,...,...
61512,t_fff830472691,c_61fb63326e5d c_8f224e321c87
61513,t_fff9e5407d13,c_026db653a269 c_0fb048a6412c c_20de77522603 c...
61514,t_fffbe1d5d43c,c_46f852a49c08 c_6659207b25d5
61515,t_fffe14f1be1e,c_cece166bad6a



Creating exploded correlations 'corr' dataframe


Unnamed: 0,topic_id,content_ids
0,t_00004da3a1b2,c_1108dd0c7a5d
0,t_00004da3a1b2,c_376c5a8eb028
0,t_00004da3a1b2,c_5bc0e1e2cba0
0,t_00004da3a1b2,c_76231f9d0b5e
1,t_00068291e9a4,c_639ea2ef9c95
...,...,...
61513,t_fff9e5407d13,c_d64037a72376
61514,t_fffbe1d5d43c,c_46f852a49c08
61514,t_fffbe1d5d43c,c_6659207b25d5
61515,t_fffe14f1be1e,c_cece166bad6a



Loading dataframe from /kaggle/input/learning-equality-curriculum-recommendations/content.csv...


Unnamed: 0,id,title,description,kind,text,language,copyright_holder,license
0,c_00002381196d,"Sumar números de varios dígitos: 48,029+233,930","Suma 48,029+233,930 mediante el algoritmo está...",video,,es,,
1,c_000087304a9e,Trovare i fattori di un numero,Sal trova i fattori di 120.\n\n,video,,it,,
2,c_0000ad142ddb,Sumar curvas de demanda,Cómo añadir curvas de demanda\n\n,video,,es,,
3,c_0000c03adc8d,Nado de aproximação,Neste vídeo você vai aprender o nado de aproxi...,document,\nNado de aproximação\nSaber nadar nas ondas ...,pt,Sikana Education,CC BY-NC-ND
4,c_00016694ea2a,geometry-m3-topic-a-overview.pdf,geometry-m3-topic-a-overview.pdf,document,Estándares Comunes del Estado de Nueva York\n\...,es,Engage NY,CC BY-NC-SA
...,...,...,...,...,...,...,...,...
154042,c_fffcbdd4de8b,2. 12: Diffusion,,html5,What will eventually happen to these dyes?\n\n...,en,CSU and Merlot,CC BY-NC-SA
154043,c_fffe15a2d069,Sommare facendo gruppi da 10,Sal somma 5+68 spezzando il 5 in un 2 e un 3.\n\n,video,,it,,
154044,c_fffed7b0d13a,Introdução à subtração,Sal fala sobre o que significa subtrair. Os ex...,video,,pt,,
154045,c_ffff04ba7ac7,SA of a Cone,,video,,en,,



Dataframes loaded.


## Data Cleaning

## Data cleaning functions

In [4]:
# Clean up text
def clean_text(text_col):
    """
    Clean ponctuation and special chars from a dataframe column
    """
    punctuations = string.punctuation
    text_col = text_col.str.replace('\W', ' ', regex=True)
    for punct in string.punctuation:
        text_col = text_col.str.replace(punct, ' ', regex=True)
    return text_col

In [5]:
# Cleaning topics
levels = {1: 'Level 1', 2: 'Level 2', 3: 'Level 3', 4: 'Level 4', 5: 'Level 5', 6: 'Level 6', 7: 'Level 7', 
          8: 'Level 8', 9: 'Level 9', 10: 'Level 10', 0: 'Level 0'}
topics_cols = ['title', 'description']

print (f"\nCreating and cleaning topic features...")
topic_features = topics_df.copy()[topics_df.has_content == True]
topic_features = topic_features.replace ({'level': levels})
for col in topics_cols:
    topic_features[col] = clean_text(topic_features[col])
topic_features.sort_values (by='language', inplace=True)
topic_features['topic_sentences'] = topic_features[topics_cols].apply(lambda x: '.'.join(x.dropna().astype(str)), axis=1)
topic_features = topic_features.drop(columns=['parent'] + topics_cols) 
print (f"\nCreated 'topic_features'")
display (topic_features)


Creating and cleaning topic features...

Created 'topic_features'


Unnamed: 0,id,channel,category,level,language,has_content,topic_sentences
76970,t_fffe811a6da9,9fd860,source,Level 2,ar,True,تحديد العلاقة بين الإحداثي ات القطبية والإحداث...
36893,t_7b5ca5d6bf6a,5139e9,source,Level 7,ar,True,Everyday Phrases .Everyday English Phrases ...
36781,t_7af06b12a79f,7b47c5,source,Level 4,ar,True,ر سال ة م ن الم ن فى.
3964,t_0d308a58d685,9fd860,source,Level 3,ar,True,يرسم المنحنى المتجم ع الصاعد والمنحنى المتجم ع...
36771,t_7ae88c23c58d,5139e9,source,Level 5,ar,True,طريقة كتابة المصفوفات في البرنامج عملي.
...,...,...,...,...,...,...,...
59851,t_c7de37ff1773,f83dcf,source,Level 4,zh,True,长方体和正方体的体积.掌握体积的概念以及认识体积单位 理解计算公式的推导过程 掌握体积计算的...
22956,t_4d3ac853a17f,f83dcf,source,Level 4,zh,True,分数和整数的乘法应用题.通过一些练习来解决将分数乘以整数的应用题
33496,t_6fe539174250,da1fa7,source,Level 3,zh,True,如何准备种子.
30147,t_64e46e13bd1f,f83dcf,source,Level 4,zh,True,不进位加法.计算两位数加两位数的不进位加法


In [6]:
# Cleaning contents
content_cols = ['title', 'description']

print (f"\nCreating and cleaning content features...")
content_features = contents_df.copy()
for col in content_cols:
    content_features[col] = clean_text(content_features[col])
#content_features['sentences'] =  content_features[content_cols].apply(lambda x: '.'.join(x.dropna().astype(str)), axis=1)
content_features.sort_values (by='language', inplace=True)
content_features.drop(columns=['copyright_holder', 'license'], inplace=True)
content_features['content_sentences'] = content_features[content_cols].apply(lambda x: '.'.join(x.dropna().astype(str)), axis=1) 
print (f"\nCreated 'content_features'")
display(content_features)


Creating and cleaning content features...

Created 'content_features'


Unnamed: 0,id,title,description,kind,text,language,content_sentences
133868,c_de43cff8dd60,المفردات والتراكيب,,exercise,"ما المقصود بعبارة: ""هل هناك من خطب""؟\n\n- هل ه...",ar,المفردات والتراكيب.
63589,c_699d4bd5d5bb,المفردات والتراكيب,,exercise,"ما ضدّ ""لا تملّ"" في الجملة الآتية: ""تُعيدُ الأ...",ar,المفردات والتراكيب.
17214,c_1cab88377df5,المفردات والتراكيب,,exercise,"ما مرادف ""يتمّم"" في ما يلي: ""يَقِفُ أَحْيانًا ...",ar,المفردات والتراكيب.
145550,c_f1b8123b975e,الس ف ر,يركب رامي الط ائرة للمر ة الأولى فيحق ق أمله...,html5,"السَّفَرُ\n\nبِحِرْصٍ شَديدٍ، حَزَمَ ""رامي"" أَ...",ar,الس ف ر .يركب رامي الط ائرة للمر ة الأولى ف...
17202,c_1ca6079deb47,يعي ن معكوس مصفوفة مرب عة من الرتبة الثالثة با...,5ad46b8a6b9064043d8b4158,exercise,![](${☣ CONTENTSTORAGE}/40cf62978d6581e03a5085...,ar,يعي ن معكوس مصفوفة مرب عة من الرتبة الثالثة با...
...,...,...,...,...,...,...,...
15012,c_18fc5ad44e5d,构建比较级和最高级,学习如何改装比较级和最高级修饰语,video,,zh,构建比较级和最高级.学习如何改装比较级和最高级修饰语
128758,c_d5e56175090a,主谓一致简介,主谓一致指匹配一个句子的主语和动词 这里介绍它是怎么一回事,video,,zh,主谓一致简介.主谓一致指匹配一个句子的主语和动词 这里介绍它是怎么一回事
32558,c_35f7982b250f,双臂和胯部热身运动,在本视频中 您将学会三个热身练习 锻炼您上半身的肌肉 尤其是双臂和胯部的肌肉,document,\n双臂和胯部热身运动\n双臂和胯部热身运动\n在本视频中，您将学会三个热身练习，锻炼您上...,zh,双臂和胯部热身运动.在本视频中 您将学会三个热身练习 锻炼您上半身的肌肉 尤其是双臂和胯部的肌肉
15053,c_19148c5d7ab3,出血,微博 http www weibo com sikana 脸书 https www...,document,\n出血\n本视频由红十字会和Sikana共同出品\n急救\n出血\n在这个视频您将学习怎...,zh,出血.微博 http www weibo com sikana 脸书 https ...


# Setting Up Features

## Features

*   Filter by category (do not use 'aligned', use only for validation)
*   Clean strings: ponctuation and special chars (\n\t...)
*   Merge columns in 'df_topics' from 'topic_cols' into 1 sentence (Sentence1)
*   Merge columns in 'df_content' from 'topic_cols' into 1 sentence (Sentence1)
*   Merge both datasets using y (correlations.csv)

By merging both sentences column using correlations, we get a pair of similar sentences with size of y

In [7]:
features = topic_features.merge (corr_df, left_on = 'id', right_on='topic_id', how='left')
features = features.merge (content_features, left_on='content_ids', right_on='id', how='left')
features = features[['topic_sentences', 'content_sentences']]
n = len(features)
n

279919

In [8]:
import gc

n = 100000
features = features.sample(n=n)
del contents_df
del topics_df
gc.collect()

128

In [9]:
train_features = features[:int(n*0.8)]
test_features = features[int(n*0.8+1):]

# Model

## Model params

In [10]:
backbone = 'all-MiniLM-L6-v2' #explore also msmarco models and multilanguage: paraphrase-multilingual-mpnet-base-v2
#backbone = 'paraphrase-multilingual-mpnet-base-v2'
batch_size = 64
shuffle = True
num_epochs = 4
warmup_steps = int(len(features) * num_epochs * 0.2)
warmup_steps
print ("\nLoaded model training parameters...")


Loaded model training parameters...


# Sentence Transformer train

- Using 'all-MiniLM-L6-v2' as from discussions it seems to be best model, but may use other models
- Using pair of sentences and label
- Sentence1: topic_title + topic_description
- Sentence2: content title + content description + content text

In [11]:
from sentence_transformers import SentenceTransformer, InputExample, losses, evaluation
from torch.utils.data import DataLoader

print (f"\nLoading model {backbone}...")
model = SentenceTransformer(backbone)
train_sentences = []
matches = len(train_features)

print ("Converting 'features' to proper format...")
for i in range(matches):
    train_sentences.append(InputExample(texts=[train_features.iloc[i, 0], train_features.iloc[i, 1]]))

train_dataloader = DataLoader(train_sentences, shuffle=shuffle, batch_size=batch_size)

#print ("Setting multi GPU process...")
#pool = model.start_multi_process_pool()

print ("\nDefining model loss function...")
train_loss = losses.MultipleNegativesRankingLoss(model=model)
print (f"\nAll set to train model")


Loading model all-MiniLM-L6-v2...


Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

Converting 'features' to proper format...

Defining model loss function...

All set to train model


In [12]:
print (f"\nTraining model with {warmup_steps} sentences and {train_loss}...")

labels = pd.Series (np.ones(matches))
print (labels)
#evaluator = evaluation.MSEEvaluator(features.iloc[:, 0].to_list(), features.iloc[:, 1].to_list(), teacher_model=model)
model.fit(train_objectives=[(train_dataloader, train_loss)],
          epochs=num_epochs,
          output_path=f"/kaggle/working/ST-{backbone}-trained",
          warmup_steps=warmup_steps)

print ("Model saved.")


Training model with 80000 sentences and MultipleNegativesRankingLoss(
  (model): SentenceTransformer(
    (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
    (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
    (2): Normalize()
  )
  (cross_entropy_loss): CrossEntropyLoss()
)...
0        1.0
1        1.0
2        1.0
3        1.0
4        1.0
        ... 
79995    1.0
79996    1.0
79997    1.0
79998    1.0
79999    1.0
Length: 80000, dtype: float64


Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1250 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1250 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1250 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1250 [00:00<?, ?it/s]

Model saved.


In [13]:
labels2 = pd.Series (np.ones(len(test_features)))
evaluator = evaluation.BinaryClassificationEvaluator(test_features.iloc[:, 0].to_list(), test_features.iloc[:, 1].to_list(), labels=labels2)
model.evaluate (evaluator)

1.0

In [14]:
# MSE: -0.4197012633085251 4 epochs 64 batch size (mas agora crasha sem memória)
# MSE: -0.4188443999737501 2 epochs (mas agora crasha sem memória)
# MSE: -0.8855 2 epochs 32 batch size