<a href="https://www.kaggle.com/code/sofiamatias/learning-equality-challenge-semanticsearch?scriptVersionId=121602210" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Imports

In [None]:
import sys, os
sys.path.append("../input/sentence-transformer-package/sentence-transformers-2.2.2/sentence-transformers-2.2.2") 
import sentence_transformers
from sentence_transformers import SentenceTransformer, CrossEncoder, util

In [None]:
import numpy as np
import pandas as pd
import string
import torch

# Loading dataframes

In [None]:
challenge_files_path = '/kaggle/input/learning-equality-curriculum-recommendations'
private_files_path = '/kaggle/input/learningequalityfiles'
model_files_path = '/kaggle/input/sentence-transformer-package'

print(f"\nLoading dataframes...")

# Walk the competition input folder, read every file as CSV and route it to
# the matching dataframe based on a substring of its full path.
for dirname, _, filenames in os.walk(challenge_files_path):
    for filename in filenames:
        filepath = os.path.join(dirname, filename)
        print(f"\nLoading dataframe from {filepath}...")
        df = pd.read_csv(filepath)

        if 'topics' in filepath:
            # Missing text fields become empty strings.
            topics_df = df.fillna({"title": "", "description": ""})
            display(topics_df)
            continue

        if 'sample_submission' in filepath:
            print(f"\nLoading 'sample' dataframe...")
            sample_df = df
            display(sample_df)
            continue

        if 'correlations' in filepath:
            correlations_df = df.fillna({"title": "", "description": ""})
            display(correlations_df)
            print(f"\nCreating exploded correlations 'corr' dataframe")
            # One row per (topic_id, content_id) pair instead of a
            # space-separated list of content ids per topic.
            corr_df = correlations_df.assign(
                content_ids=correlations_df.content_ids.str.split(' ')
            ).explode('content_ids')
            display(corr_df)
            continue

        if 'content' in filepath:
            contents_df = df.fillna({"title": "", "description": "", "text": ""})
            display(contents_df)

print(f"\nDataframes loaded.")

# Choosing sample data

In [None]:
print("\nDefining sampled dataset...")
use_submission_sample = True
samples = 1000

# NOTE: `~` on a Python bool is a bitwise NOT (~True == -2, ~False == -1), so
# the former `~sample_df.empty` was always truthy; `not` is the logical test.
if use_submission_sample and not sample_df.empty:
    # Restrict everything to the topics listed in the sample submission.
    corr_df = corr_df[corr_df.topic_id.isin(sample_df.topic_id)]
    correlations_df = correlations_df[correlations_df.topic_id.isin(sample_df.topic_id)]
    topics_df = topics_df[topics_df.id.isin(sample_df.topic_id)]
    samples = 5
    print(f"\nFiltered 'topics' to {len(topics_df)} samples and 'contents' to {len(contents_df)} samples")
else:
    # Random subset of topics that have content; never ask for more rows than
    # exist, since DataFrame.sample raises in that case.
    topics_with_content = topics_df[topics_df.has_content == True]
    topics_df = topics_with_content.sample(n=min(samples, len(topics_with_content)))
    corr_df = corr_df[corr_df.topic_id.isin(topics_df.id)]
    correlations_df = correlations_df[correlations_df.topic_id.isin(topics_df.id)]

display(topics_df)
display(contents_df)
display(correlations_df)

# Cleaning data

* Remove punctuation and special characters from text fields
* Delete columns 'copyright_holder' and 'license' from 'contents'
* Filter 'topics' by 'has_content' = True
* Group 'topics' and 'contents' by language
* Change 'level' column from numbers to text

In [None]:
def clean_text(text_col):
    """
    Replace punctuation and special characters in a string column with
    spaces and lower-case the result.

    Every character that is not a letter or a digit (including the
    underscore, which the regex class ``\\W`` alone would keep) is replaced
    by a single space, so the length of each string is preserved.

    Parameters
    ----------
    text_col : pandas.Series
        Column of strings to clean.

    Returns
    -------
    pandas.Series
        Cleaned, lower-cased column.
    """
    # A single pass with one character class replaces the former loop over
    # string.punctuation, which fed characters such as '(' , '[' or '$' to
    # the regex engine as patterns of their own.
    return text_col.str.replace(r'[\W_]', ' ', regex=True).str.lower()

In [None]:
# Cleaning topics
# Map the numeric 'level' values 0..10 to their text labels.
levels = {number: f'Level {number}' for number in range(11)}
topics_cols = ['title', 'description']

print(f"\nCreating and cleaning topic features...")
# replace() returns a new frame, so 'topics_df' itself is left untouched.
topic_features = topics_df.replace({'level': levels})
for col in topics_cols:
    topic_features[col] = clean_text(topic_features[col])
topic_features = topic_features.sort_values(by='language')

#topics_features['sentences'] = topics_features[topics_cols].apply(lambda x: '.'.join(x.dropna().astype(str)), axis=1)
#topics_features = topics_features.drop(columns=['parent'] + topics_cols) 
print(f"\nCreated 'topic_features'")
display(topic_features)

In [None]:
# Cleaning contents
content_cols = ['title']

print(f"\nCreating and cleaning content features...")
# Work on a copy so 'contents_df' keeps the raw text.
content_features = contents_df.copy()
for col in content_cols:
    content_features[col] = clean_text(content_features[col])
#content_features['sentences'] =  content_features[content_cols].apply(lambda x: '.'.join(x.dropna().astype(str)), axis=1)
# Order by language and drop the licensing columns.
content_features = (
    content_features
    .sort_values(by='language')
    .drop(columns=['copyright_holder', 'license'])
)
print(f"\nCreated 'content_features'")
display(content_features)

# Scoring: F2 score 

In [None]:
def calculate_F2score(pred_df, act_df):
    """
    Compute per-topic precision, recall and F2 score of predicted content ids.

    Both inputs must have exactly two columns: the topic id followed by a
    space-separated string of content ids (the submission format). They are
    inner-joined on the topic id, so only topics present in both frames are
    scored.

    Parameters
    ----------
    pred_df : pandas.DataFrame
        Predictions, columns (topic id, content ids).
    act_df : pandas.DataFrame
        Ground truth, columns (topic id, content ids).

    Returns
    -------
    pandas.DataFrame or None
        One row per scored topic with columns 'topic_id',
        'content_ids_pred', 'content_ids_actual', 'tp', 'fp', 'fn',
        'precision', 'recall' and 'f2'. ``None`` when either input is empty
        or no topic id is shared. 'precision' / 'recall' are NaN where their
        denominator is zero.
    """
    print ('\nCalculating scores...')
    if pred_df.empty or act_df.empty:
        print ('\nOne or both dataframes are empty. Abort F2score calculation.')
        return None
    prediction_df = pred_df.copy()
    actual_df = act_df.copy()
    prediction_df.columns = ['topic_id', 'content_ids_pred']
    actual_df.columns = ['topic_id', 'content_ids_actual']
    df = pd.merge(prediction_df, actual_df, how='inner', on='topic_id')
    if df.empty:
        print ('\nNo matches between predictions and correlations. Abort F2score calculation.')
        return None

    def _as_id_set(value):
        # Missing values (None / NaN) count as "no ids" instead of crashing.
        return set(value.split()) if isinstance(value, str) else set()

    # Build each id set once and reuse it for tp / fp / fn.
    pred_sets = [_as_id_set(value) for value in df['content_ids_pred']]
    actual_sets = [_as_id_set(value) for value in df['content_ids_actual']]
    df['tp'] = [len(actual & pred) for pred, actual in zip(pred_sets, actual_sets)]
    df['fp'] = [len(pred - actual) for pred, actual in zip(pred_sets, actual_sets)]
    df['fn'] = [len(actual - pred) for pred, actual in zip(pred_sets, actual_sets)]

    df['precision'] = df['tp'] / (df['tp'] + df['fp'])
    df['recall'] = df['tp'] / (df['tp'] + df['fn'])
    # F-beta with beta = 2: 5*tp / (5*tp + 4*fn + fp), divided through by 5.
    df['f2'] = df['tp'] / (df['tp'] + 0.2 * df['fp'] + 0.8 * df['fn'])
    print ('\nF2 score calculation finished.')

    return df

# Getting matches: sentence transformer with retrieve & re-rank

In [None]:
def search(query, topic_embedding, corpus_embeddings, content_sentences, content_ids, cross_encoder, top_k):
    """
    Retrieve candidate contents for one topic and re-rank them.

    The topic embedding is matched against the corpus embeddings with
    ``util.semantic_search``; the retrieved candidates are then scored as
    (query, content sentence) pairs by ``cross_encoder`` and sorted by that
    score, best first.

    Parameters
    ----------
    query : topic text passed to the cross-encoder.
    topic_embedding : embedding of the topic.
    corpus_embeddings : embeddings of all candidate contents.
    content_sentences : content texts, indexable by corpus position.
    content_ids : object supporting ``.iloc`` that gives the content id at
        a corpus position.
    cross_encoder : object whose ``predict`` scores a list of text pairs.
    top_k : number of candidates to retrieve and to return.

    Returns
    -------
    dict
        Content id -> content sentence, in re-ranked order.
    """
    # Bi-encoder retrieval; only one query is sent, so keep its hit list.
    candidates = util.semantic_search(topic_embedding, corpus_embeddings, top_k=top_k)[0]

    # Cross-encoder scoring of every (query, candidate text) pair.
    pairs = [[query, content_sentences[candidate['corpus_id']]] for candidate in candidates]
    pair_scores = cross_encoder.predict(pairs)
    for candidate, pair_score in zip(candidates, pair_scores):
        candidate['cross-score'] = pair_score

    # Best cross-encoder score first, truncated to top_k.
    ranked = sorted(candidates, key=lambda candidate: candidate['cross-score'], reverse=True)
    return {
        content_ids.iloc[candidate['corpus_id']]: content_sentences[candidate['corpus_id']]
        for candidate in ranked[:top_k]
    }

In [None]:
from tqdm import tqdm

gpu_on = True

if not torch.cuda.is_available():
    print("Warning: No GPU found. Please add GPU to your notebook")
    gpu_on = False

languages = topic_features.language.unique()
print(languages)
preds = {}      # topic id -> predicted content ids (best first)
matches = {}    # topic title -> matched content titles
biencoder = "/kaggle/input/learning-equality-st-train-sm/ST-all-MiniLM-L6-v2-trained"
crossencoder = '/kaggle/input/msmarcominilml6v2/ms-marco-MiniLM-L-6-v2'

print(f"\nGetting matches using bi-encoder {biencoder} and cross encoder {crossencoder}...")

# Use the Bi-Encoder to encode all contents, so that we can use it with semantic search
bi_encoder = SentenceTransformer(biencoder)
bi_encoder.max_seq_length = 256    # Truncate long passages to 256 tokens
top_k = 5                          # Number of passages retrieved with the bi-encoder

# Use a cross-encoder, to re-rank the results list to improve the quality
cross_encoder = CrossEncoder(crossencoder)

# Topics are only matched against contents written in the same language.
for lang in languages:
    print('\nWorking on topics for language ', lang)
    content_sentences = content_features[content_features.language == lang]
    topic_sentences = topic_features[topic_features.language == lang]

    if len(content_sentences) == 0:
        print('\nNo contents for this language.')
        continue
    if len(topic_sentences) == 0:
        print('\nNo topics for this language.')
        continue

    # Materialise the text/id lists once per language. They were previously
    # rebuilt for every single topic inside the matching loop.
    content_titles = content_sentences.title.to_list()
    topic_titles = topic_sentences.title.to_list()
    topic_ids = topic_sentences.id.to_list()

    print("\nCalculating 'content' embeddings...")

    # Encode all contents into the vector space.
    corpus_embeddings = bi_encoder.encode(content_titles, convert_to_tensor=True, show_progress_bar=False)

    # NOTE(review): normalisation only happens on the GPU path; presumably
    # harmless because semantic_search scores with cosine similarity by
    # default -- confirm against the sentence-transformers version in use.
    if gpu_on:
        corpus_embeddings = corpus_embeddings.cuda()
        corpus_embeddings = util.normalize_embeddings(corpus_embeddings)

    print("\nCalculating 'topic' embeddings...")

    # Encode the topics using the bi-encoder
    topic_embeddings = bi_encoder.encode(topic_titles, convert_to_tensor=True, show_progress_bar=False)

    if gpu_on:
        topic_embeddings = topic_embeddings.cuda()
        topic_embeddings = util.normalize_embeddings(topic_embeddings)

    print("\nRunning matches...")

    for topic_id, topic_embed, query in tqdm(zip(topic_ids, topic_embeddings, topic_titles),
                                             total=len(topic_titles)):
        results = search(query,
                         topic_embed,
                         corpus_embeddings,
                         content_titles,
                         content_sentences.id,
                         cross_encoder,
                         top_k)
        # Topics sharing a title overwrite each other in 'matches';
        # 'preds' is keyed by the topic id.
        matches[query] = results.values()
        preds[topic_id] = results.keys()
print('\nEnd of calculating matches.')

In [None]:
# predictions in exploded format
# One row per (topic id, predicted content id).
exploded_rows = [(topic_id, list(content_ids)) for topic_id, content_ids in preds.items()]
preds_df = pd.DataFrame(exploded_rows, columns=['id', 'content_ids']).explode('content_ids')

# predictions in submissions format: one row per topic, ids joined by spaces
joined_rows = [(topic_id, ' '.join(content_ids)) for topic_id, content_ids in preds.items()]
df_preds_aux = pd.DataFrame(joined_rows, columns=['id', 'content_ids'])

# Left-join on the topic list so topics without any prediction are kept;
# their missing content ids are filled with a single space.
predicts_submission = (
    topics_df[['id']]
    .merge(df_preds_aux, how='left', on='id')
    .rename(columns={'id': 'topic_id'})
    .fillna(' ')
)

predicts_submission
#predicts_submission.content_ids.apply (lambda x: len(x.split()))

# Model evaluation

In [None]:
# Set to False to skip scoring against the known correlations.
scoring = True
if scoring:
    score = calculate_F2score(predicts_submission.sort_values('topic_id'), correlations_df)
    # calculate_F2score returns None when there is nothing to compare.
    if score is not None:
        display(score)
        contents_to_match = corr_df.merge(topics_df.id, how='inner', left_on='topic_id', right_on='id')
        summary = (
            ('F2 mean score:', score.f2.mean()),
            ('Correct predictions:', score.tp.sum()),
            ('Topics to match:', len(topics_df)),
            ('Contents to match:', len(contents_to_match)),
        )
        for label, value in summary:
            print(label, value)

# Submissions

In [None]:
#predicts_submission.to_csv('submission.csv', index=False)

# Cleaning wrong matches with LGBM

In [None]:
# Switch for the optional LightGBM stage: the cells below only build
# 'X_test', load/train a model and filter the predictions when this is True.
clean_wrong_matches = False

## Calculating features 'X_test' for predictions

In [None]:
def categorize_features (X: pd.DataFrame):
    """
    Convert every object-dtype column of ``X`` to the 'category' dtype.

    The frame is modified in place and also returned.
    """
    object_columns = [name for name, dtype in X.dtypes.items() if dtype == 'object']
    for name in object_columns:
        X[name] = X[name].astype("category")
    return X
    
def get_y_class (y):
    """Turn scores into boolean classes: True where the score is above 0.5."""
    above_threshold = y > 0.5
    return above_threshold.astype("bool")

In [None]:
if clean_wrong_matches:
    # Attach topic attributes, then content attributes, to every predicted
    # (topic, content) pair and keep only the columns used as features.
    X_test = (
        preds_df
        .merge(topic_features, how='inner', on='id')
        .drop(columns=['language', 'has_content', 'description'])
        .rename(columns={'id': 'topic_id', 'title': 'topic_title'})
        .merge(content_features, how='inner', left_on='content_ids', right_on='id')
        .drop(columns=['title', 'description', 'text', 'id'])
    )
    X_test = categorize_features(X_test)
    display(X_test, X_test.dtypes)

In [None]:
#corr_df
#preds_df = preds_df.merge (corr_df, how='left', on = 'content_ids')
#preds_df['match'] = (preds_df.id == preds_df.topic_id)
#preds_df

# Get model (load existing model or train model)

In [None]:
import lightgbm as lgb

# Look for a previously saved model. 'model_path' stays None when the folder
# is missing or holds no files, so we fall through to training instead of
# reading an undefined (or stale) path. As before, the last file found wins.
model_path = None
if clean_wrong_matches and os.path.exists(private_files_path):
    for dirname, _, filenames in os.walk(private_files_path):
        for filename in filenames:
            model_path = os.path.join(dirname, filename)

if model_path is not None:
    model = lgb.Booster(model_file=model_path)
    print ('Loaded model from :', model_path)
elif clean_wrong_matches:
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import confusion_matrix, precision_score
    import optuna
    import datetime

    objective = 'binary'
    boosting_type = 'dart'

    def objective_lgbm(trial):
        """Optuna objective: train on 'train_data', return precision on the validation split."""
        param = {
            'boosting_type': boosting_type,
            'objective': objective,
            'is_unbalance': 'true',
            'metric': 'precision_score',
            'pos_bagging_fraction': trial.suggest_float('pos_bagging_fraction',0.1,1),
            'neg_bagging_fraction': trial.suggest_float('neg_bagging_fraction',0.1,1),
            'num_leaves': trial.suggest_int('num_leaves', 10,60),
            'max_depth': trial.suggest_int('max_depth', 10,60),
            'min_split_gain': trial.suggest_float('min_split_gain',0.1,1),
            'colsample_bytree': trial.suggest_float('colsample_bytree',0.1,1),
            'reg_alpha' : trial.suggest_float('reg_alpha',0.1,10),
            'reg_lambda': trial.suggest_float('reg_lambda',0.1,10),
            'n_estimators': trial.suggest_int('n_estimators', 150,350),
            'learning_rate': trial.suggest_float('learning_rate', 0.01,0.1),
            'verbosity': -1
        }
        num_boost_rounds = trial.suggest_int('num_boost_rounds', 50,400)
        # 'train_data', 'X_val' and 'y_val' are defined further down, before
        # the study is run.
        model = lgb.train(param, train_data, num_boost_rounds)
        preds = model.predict(X_val)
        pred_labels = np.rint(preds)
        score = round(precision_score(y_val, pred_labels),4)
        return score

    # Build 'mix_matches' from true and false topic-content matches.
    # NOTE(review): the left merge yields one row per topic that owns the
    # content, so a correct prediction for a content shared by several topics
    # also appears in the "false" pool -- confirm whether that is intended.
    df_aux = preds_df.merge (corr_df, how='left', on='content_ids')
    is_true_match = df_aux['id'] == df_aux['topic_id']
    true_matches = df_aux[is_true_match]
    true_matches_val = len(true_matches)
    false_pool = df_aux[~is_true_match]
    # Never request more rows than exist; DataFrame.sample raises otherwise.
    false_matches = false_pool.sample(n=min(samples, len(false_pool)))
    mix_matches = pd.concat([true_matches, false_matches])
    mix_matches['y'] = (mix_matches.id == mix_matches.topic_id)
    mix_matches.drop (columns='topic_id', inplace=True)

    # Building X features and y target. The label travels through both inner
    # merges and is split off afterwards, so X and y always have the same
    # rows in the same order even if a merge drops or reorders rows.
    X = mix_matches.merge (topic_features, how='inner', on='id')
    X.drop (columns = ['language', 'has_content', 'description'], inplace=True)
    X.rename (columns = {'id': 'topic_id', 'title': 'topic_title'}, inplace=True)
    X = X.merge (content_features, how='inner', left_on='content_ids', right_on='id')
    X.drop (columns = ['title', 'description', 'text', 'id'], inplace=True)
    y = X.pop('y')

    # preparing X and y for model: changing dtype to "category" and creating train/val sets
    X = categorize_features (X)

    print(f"Training features of shape {X.shape}")
    display (X)
    print(f"Training labels of shape {y.shape}")
    display (y)

    X_train, X_val, y_train, y_val = train_test_split (X, y, test_size = 0.3, random_state=42)

    train_data = lgb.Dataset(X_train, label = y_train)

    # get best hyperparameters
    study_lgbm = optuna.create_study(direction = 'maximize',study_name = "LGBM")
    study_lgbm.optimize(objective_lgbm, n_trials=50)

    trial_lgbm = study_lgbm.best_trial
    print("Model Accuracy --> ",trial_lgbm.value)
    print("Model's Best parameters --> ",trial_lgbm.params)

    # fit model and get score; work on a copy so the study's recorded
    # parameters are not modified
    best_params = dict(trial_lgbm.params)
    num_boost_rounds = best_params.pop('num_boost_rounds')
    best_params['boosting_type'] = boosting_type
    best_params['objective'] = objective
    best_params['is_unbalance'] = True
    best_params['verbosity'] = -1
    print("Using parameters --> ",best_params)
    model = lgb.train(best_params, train_data, num_boost_rounds)
    pred_model = model.predict(X_val)
    pred_model = get_y_class (pred_model)
    # The reported score is the precision on the validation split.
    score = precision_score(y_val, pred_model)
    print('\nLightGBM Model accuracy score: {0:0.4f}'.format(score))
    print('\nConfusion Matrix : \n' + str(confusion_matrix(y_val,pred_model)))

    # save model
    date = datetime.datetime.now().strftime('%d%m%y-%H%M')
    model_filename = f"lgb-classifier-{date}.txt"
    model.save_model (model_filename)
    print ('\nSaved model as ', model_filename)

# New Predictions

In [None]:
if clean_wrong_matches:
    # get predictions
    preds_lgbm = model.predict(X_test)
    preds_lgbm = get_y_class (preds_lgbm)
    print ('\nPredictions: ', preds_lgbm[-50:])
    print (pd.DataFrame(preds_lgbm).value_counts())
    # put predictions in final format: keep only the pairs the classifier
    # accepted.
    # NOTE(review): the boolean mask is applied by position, which assumes
    # 'X_test' has exactly one row per row of 'preds_df', in the same order.
    df_preds_aux = preds_df[preds_lgbm]
    print (df_preds_aux.groupby('id').count())
    # Join the surviving content ids of each topic into one space-separated
    # string. Selecting the column by name avoids relying on the grouping
    # column being handed to the applied function.
    df_preds_aux = df_preds_aux.groupby('id')['content_ids'].agg(' '.join).reset_index()
    print (df_preds_aux)
    # Left-join on the topic list so topics left without predictions stay in
    # the submission with a single-space placeholder.
    predicts_submission = pd.DataFrame(topics_df.id).merge (df_preds_aux, how ='left', on = 'id')
    predicts_submission.rename(columns={'id':'topic_id'}, inplace=True)
    predicts_submission.fillna(' ', inplace=True)

    display(predicts_submission)

    # submissions
    predicts_submission.to_csv('submission.csv', index=False)

In [None]:
# Number of predicted content ids per topic.
predicts_submission['content_ids'].map(lambda ids: len(ids.split()))

# Final Score

In [None]:
# Re-score only when the LightGBM filtering stage actually ran.
if clean_wrong_matches and scoring:
    score = calculate_F2score(predicts_submission.sort_values('topic_id'), correlations_df)
    # calculate_F2score returns None when there is nothing to compare.
    if score is not None:
        display(score)
        contents_to_match = corr_df.merge(topics_df.id, how='inner', left_on='topic_id', right_on='id')
        summary = (
            ('F2 mean score:', score.f2.mean()),
            ('Correct predictions:', score.tp.sum()),
            ('Topics to match:', len(topics_df)),
            ('False positives:', score.fp.sum()),
            ('False negatives:', score.fn.sum()),
            ('Contents to match:', len(contents_to_match)),
        )
        for label, value in summary:
            print(label, value)

In [None]:
# Write the final predictions to 'submission.csv' without the index column.
predicts_submission.to_csv('submission.csv', index=False)