In [1]:
import polars as pl

# Parquet file with the predictions to verify (lives in the experiment folder).
predictions_path = (
    '~/experiments/Inference_Test_2024-04-15_16-17-03'
    '/predictions.parquet'
)
# Parquet file with the test-set behaviors the predictions are checked against.
behaviors_path = (
    '/mnt/ebs_volume/recsys_challenge/dataset/ebnerd_testset'
    '/test/behaviors.parquet'
)

In [2]:
# Load the predictions once; every later check reuses this frame.
predictions_df = pl.read_parquet(source=predictions_path)
predictions_df.head(n=5)

impression_id,user_id,article,prediction
u32,u32,i32,f64
0,245884,9791786,0.253133
0,1491159,9791480,0.332636
0,1431182,9793094,0.218526
0,1503152,9791774,0.202196
0,2406666,9792367,0.301963


In [3]:
# Keep only the rows with impression_id 0 and, per user, count the articles
# and collect the predictions into a list.
beyond_acc_predictions = (
    predictions_df
    .filter(pl.col('impression_id') == 0)
    .group_by('user_id')
    .agg(
        pl.col('article').count(),
        pl.col('prediction'),
    )
)
beyond_acc_predictions

user_id,article,prediction
u32,u32,list[f64]
532845,250,"[0.198607, 0.156493, … 0.194264]"
1378311,250,"[0.240602, 0.369223, … 0.209994]"
1534038,250,"[0.23461, 0.473656, … 0.470568]"
1803539,250,"[0.525871, 0.536508, … 0.495367]"
2037003,250,"[0.188874, 0.61741, … 0.255359]"
…,…,…
1584208,250,"[0.240352, 0.192291, … 0.263981]"
1521432,250,"[0.163147, 0.206981, … 0.446518]"
257163,250,"[0.190553, 0.179975, … 0.187893]"
367936,250,"[0.22766, 0.223993, … 0.344541]"


If the predictions are correct, there should be 200000 impressions with id 0 and each of them should have 250 samples

In [4]:
# (number of grouped users, distinct per-user article counts)
(
    beyond_acc_predictions.height,
    beyond_acc_predictions.get_column('article').unique().to_list(),
)

(200000, [250])

In [5]:
# Read only the columns needed for the checks and turn every in-view article
# into its own row, giving one (impression_id, user_id, article) triplet per row.
behaviors = (
    pl.read_parquet(
        behaviors_path,
        columns=['impression_id', 'user_id', 'article_ids_inview'],
    )
    .explode('article_ids_inview')
)
behaviors.head(n=5)

impression_id,user_id,article_ids_inview
u32,u32,i32
6451339,35982,9796527
6451339,35982,7851321
6451339,35982,9798805
6451339,35982,9795150
6451339,35982,9531110


Checking that each triplet (impression_id, user_id, article_ids_inview) has a prediction. If the predictions dataframe contains all the triplets, there should be no null predictions after the left join.

In [6]:
# Left-join the predictions onto the behaviors triplets and count how many
# triplets ended up without a prediction (expected: 0).
(
    behaviors
    .join(
        predictions_df,
        how='left',
        left_on=['impression_id', 'user_id', 'article_ids_inview'],
        right_on=['impression_id', 'user_id', 'article'],
    )
    .select(pl.col('prediction').is_null().sum())
    .item(0, 0)
)

0

Is the reordering correct?

In [7]:
# Rebuild, for every (impression_id, user_id), the list of prediction ranks in
# the same article order as the behaviors frame:
#   1. tag each behaviors row with its position ('index'),
#   2. attach the prediction of each triplet,
#   3. restore the original row order and regroup without reordering,
#   4. replace the scores of every group by their descending rank.
# NOTE(review): rank() is called with its default tie-handling; if that default
# averages tied ranks, ties would give fractional values that the Int16 cast
# truncates -- confirm against the polars documentation.
ordered_predictions = (
    behaviors
    .with_row_index()
    .join(
        predictions_df,
        how='left',
        left_on=['impression_id', 'user_id', 'article_ids_inview'],
        right_on=['impression_id', 'user_id', 'article'],
    )
    .sort('index')
    .group_by(['impression_id', 'user_id'], maintain_order=True)
    .agg(
        pl.col('prediction'),
        pl.col('article_ids_inview'),
    )
    .with_columns(
        pl.col('prediction')
        .list.eval(pl.element().rank(descending=True))
        .cast(pl.List(pl.Int16))
    )
)

The prediction dataframe can be in a different order, so it needs to be rearranged to match the order of the articles in the initial behaviors list.

In [8]:
# Show one impression three ways: article order in behaviors, ranks computed
# directly on the predictions frame, and the reordered rank list.
display(
    behaviors.filter(pl.col('impression_id') == 6451339),
    predictions_df
    .filter(pl.col('impression_id') == 6451339)
    .with_columns(pl.col('prediction').rank(descending=True)),
    ordered_predictions.filter(pl.col('impression_id') == 6451339),
)

impression_id,user_id,article_ids_inview
u32,u32,i32
6451339,35982,9796527
6451339,35982,7851321
6451339,35982,9798805
6451339,35982,9795150
6451339,35982,9531110
6451339,35982,9798526
6451339,35982,9798682
6451339,35982,9796198
6451339,35982,9492777


impression_id,user_id,article,prediction
u32,u32,i32,f64
6451339,35982,9795150,6.0
6451339,35982,9492777,8.0
6451339,35982,9798526,3.0
6451339,35982,9531110,5.0
6451339,35982,9796198,7.0
6451339,35982,9798805,2.0
6451339,35982,7851321,9.0
6451339,35982,9796527,4.0
6451339,35982,9798682,1.0


impression_id,user_id,prediction,article_ids_inview
u32,u32,list[i16],list[i32]
6451339,35982,"[4, 9, … 8]","[9796527, 7851321, … 9492777]"


Checking if the joins have maintained the beyond accuracy samples

In [9]:
# For the impression_id 0 rows, add the length of each rank list as 'len'.
(
    ordered_predictions
    .filter(pl.col('impression_id') == 0)
    .with_columns(len=pl.col('prediction').list.len())
    .head()
)

impression_id,user_id,prediction,article_ids_inview,len
u32,u32,list[i16],list[i32],u32
0,1049297,"[195, 144, … 248]","[9793163, 9793069, … 9789545]",250
0,231624,"[229, 193, … 248]","[9793163, 9793069, … 9789545]",250
0,716356,"[189, 132, … 243]","[9793163, 9793069, … 9789545]",250
0,1440307,"[164, 113, … 238]","[9793163, 9793069, … 9789545]",250
0,1822406,"[189, 156, … 246]","[9793163, 9793069, … 9789545]",250


# Checking the predictions.txt file

To run this section, the file needs to be unzipped first.

In [4]:
# Reference file used for the comparison.
sample_submission_path = (
    '/mnt/ebs_volume/recsys_challenge/dataset'
    '/sample_submission.txt'
)
# Text file under test (same experiment folder as the parquet predictions).
submission_file_path = (
    '/home/ubuntu/experiments/Inference_Test_2024-04-15_16-17-03'
    '/predictions.txt'
)

Checking that the predictions file and the sample submission contain the same ids (and counting the number of impressions with id 0 in each file)

In [5]:
def read_ids_from_file(file_path):
    """Collect the IDs found in a submission-style text file.

    Every non-blank line is expected to start with an ID followed by
    whitespace and the rest of the record (e.g. ``6451339 [4,9,2]``).

    Parameters
    ----------
    file_path : str
        Path of the text file to scan.

    Returns
    -------
    tuple[set[str], int]
        The set of distinct IDs (as strings) and the number of lines whose
        ID is exactly ``'0'``.
    """
    ids = set()
    count_zeros = 0
    with open(file_path, 'r') as file:
        for line in file:
            fields = line.split(maxsplit=1)
            if not fields:
                # Blank or whitespace-only line: it carries no ID. The old
                # `len(parts) >= 1` guard was always true, so such lines
                # used to register the empty string as an ID.
                continue
            current_id = fields[0]
            ids.add(current_id)
            if current_id == '0':
                count_zeros += 1
    return ids, count_zeros

def find_missing_ids(file1_path, file2_path):
    """Report which IDs each of two files lacks with respect to the other.

    Also prints, on one line, the number of ID-0 lines found in the first
    and in the second file.

    Returns
    -------
    tuple[set[str], set[str]]
        ``(IDs present only in file 2, IDs present only in file 1)``, i.e.
        what is missing from file 1 followed by what is missing from file 2.
    """
    first_ids, first_zero_lines = read_ids_from_file(file1_path)
    second_ids, second_zero_lines = read_ids_from_file(file2_path)
    print(first_zero_lines, second_zero_lines)

    absent_from_first = second_ids - first_ids
    absent_from_second = first_ids - second_ids
    return absent_from_first, absent_from_second

# File 1 is the submission under test, file 2 the sample submission, so the
# first returned set holds the IDs the submission lacks.
missing_file1, missing_file2 = find_missing_ids(
    submission_file_path,
    sample_submission_path,
)
print(
    "Missing IDs in predictions w.r.t. sample submission:",
    missing_file1,
)
print(
    "IDs present in predictions but not in sample submission:",
    missing_file2,
)

200000 200000
Missing IDs in predictions w.r.t. sample submission: set()
IDs present in predictions but not in sample submission: set()


Checking that the lists have the same length for every id. The function prints an error message when it finds a mismatch, so no output is expected if the two files agree.

In [7]:
def _read_list_lengths(file_path):
    """Map every ID of a submission-style file to the lengths of its lists.

    Each non-blank line is expected to look like ``<id> [e1,e2,...]``.
    An ID may appear on several lines, so for every ID a ``Counter`` of
    ``list length -> number of lines`` is kept instead of a single value
    that later lines would overwrite.
    """
    from collections import Counter, defaultdict

    lengths_by_id = defaultdict(Counter)
    with open(file_path, 'r') as file:
        for line in file:
            stripped = line.strip()
            if not stripped:
                continue  # tolerate blank lines (e.g. a trailing newline)
            id_, _, elements_str = stripped.partition(' ')
            elements_str = elements_str.strip().strip('[]')
            # ''.split(',') yields [''], which would count an empty list as
            # one element, so handle the empty case explicitly.
            n_elements = len(elements_str.split(',')) if elements_str else 0
            lengths_by_id[id_][n_elements] += 1
    return dict(lengths_by_id)


def _format_lengths(length_counts):
    """Render the lengths of one ID for an error message.

    A single line is shown as a plain number; repeated IDs are shown as a
    ``{length: number of lines}`` mapping.
    """
    if sum(length_counts.values()) == 1:
        return str(next(iter(length_counts)))
    return str(dict(sorted(length_counts.items())))


def validate_files(file1_path, file2_path):
    """Check that two submission-style files list the same IDs with lists of
    the same length.

    IDs that occur on several lines are compared as a whole: both files must
    contain the same number of lines of each length for that ID, regardless
    of line order.

    Parameters
    ----------
    file1_path, file2_path : str
        Paths of the two text files to compare.

    Returns
    -------
    bool or None
        ``False`` (after printing an error) at the first discrepancy found;
        ``None`` when the files agree, so that a notebook cell ending with
        the call shows no output.
    """
    id_lists_file1 = _read_list_lengths(file1_path)
    id_lists_file2 = _read_list_lengths(file2_path)

    if id_lists_file1.keys() != id_lists_file2.keys():
        print("Error: IDs in the two files do not match!")
        return False

    for id_, lengths1 in id_lists_file1.items():
        lengths2 = id_lists_file2[id_]
        if lengths1 != lengths2:
            print(f"Error: Lists for ID {id_} have different lengths ({_format_lengths(lengths1)}, {_format_lengths(lengths2)})!")
            return False

    return None

# validate_files returns False as soon as it finds a discrepancy; assert on it
# so that a failed check stops "Run All" instead of only being displayed.
assert validate_files(submission_file_path, sample_submission_path) is not False, \
    "predictions file does not match the sample submission"

Quick and dirty check: print the first line of the file to see if the format of the predictions is ok

In [8]:
# Print only the first line of the submission file to eyeball its format.
with open(submission_file_path) as file:
    first_line = next(file, None)  # None when the file is empty

if first_line is not None:
    print(first_line)

6451339 [4,9,2,6,5,3,1,7,8]

