## Download Data

In [52]:
import requests
import zipfile
import os

# URL of the ZIP archive and the local paths it is saved / unpacked to
url = "https://ebnerd-dataset.s3.eu-west-1.amazonaws.com/ebnerd_demo.zip"
output_zip_path = "ebnerd_demo.zip"
extract_to_path = 'ebnerd_demo'

try:
    print("Download file...")
    # The context manager releases the streamed connection even if writing fails;
    # the (connect, read) timeout keeps the cell from hanging forever on a stalled server.
    with requests.get(url, stream=True, timeout=(10, 60)) as response:
        response.raise_for_status()
        with open(output_zip_path, "wb") as file:
            # Stream the body to disk in 8 KiB chunks instead of loading it into memory
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)
    print(f"File : {output_zip_path}")

    # Extract the ZIP file
    print("extract file...")
    with zipfile.ZipFile(output_zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to_path)
    print(f"File extract to: {extract_to_path}")

except requests.exceptions.RequestException as e:
    # Network / HTTP failures (including timeouts) are reported, not re-raised
    print(f"erorr: {e}")
except zipfile.BadZipFile:
    # The downloaded file could not be opened as a ZIP archive
    print("Not Valid File ZIP.")


Download file...
File : ebnerd_demo.zip
extract file...
File extract to: ebnerd_demo


## Open Dataset

In [53]:
import pandas as pd
import numpy as np

# Read the history tables from the train and validation folders, then stack them
df_1, df_2 = (
    pd.read_parquet(history_file)
    for history_file in (
        'ebnerd_demo/train/history.parquet',
        'ebnerd_demo/validation/history.parquet',
    )
)
df = pd.concat([df_1, df_2])

In [54]:
# Explode the four list-valued columns together so each element gets its own row
df = df.explode(
    [
        'impression_time_fixed',
        'scroll_percentage_fixed',
        'article_id_fixed',
        'read_time_fixed',
    ]
)

In [55]:
# Shorten the "*_fixed" column names to the names used in the rest of the notebook
df = df.rename(
    columns={
        'impression_time_fixed': 'timestamp',
        'scroll_percentage_fixed': 'scroll_percentage',
        'article_id_fixed': 'item_id',
        'read_time_fixed': 'read_time',
    }
)

In [56]:
# Preview the first five rows of the exploded, renamed frame
df.head(n=5)

Unnamed: 0,user_id,timestamp,scroll_percentage,item_id,read_time
0,13538,2023-04-27 10:17:43,100.0,9738663,17.0
0,13538,2023-04-27 10:18:01,35.0,9738569,12.0
0,13538,2023-04-27 10:18:13,100.0,9738663,4.0
0,13538,2023-04-27 10:18:17,24.0,9738490,5.0
0,13538,2023-04-27 10:18:23,100.0,9738663,4.0


In [57]:
# Summary statistics of the interaction table in its current state
num_user, num_item = df['user_id'].nunique(), df['item_id'].nunique()
interaction = df.shape[0]
# Sparsity is computed as 1 - rows / (distinct users * distinct items)
sparcity = 1 - interaction / (num_user * num_item)

print(
    f'Number of users: {num_user}',
    f'Number of items: {num_item}',
    f'Number of interactions: {interaction}',
    f'Sparsity: {sparcity}',
    sep='\n',
)

Number of users: 1935
Number of items: 5349
Number of interactions: 498376
Sparsity: 0.9518491949278838


## Delete Null Values

In [58]:
# Discard every row that has a missing value in any column
df = df.dropna(axis=0, how='any')

In [59]:
# Summary statistics after rows with missing values were removed
num_user, num_item = df['user_id'].nunique(), df['item_id'].nunique()
interaction = df.shape[0]
# Sparsity is computed as 1 - rows / (distinct users * distinct items)
sparcity = 1 - interaction / (num_user * num_item)

print(
    f'Number of users: {num_user}',
    f'Number of items: {num_item}',
    f'Number of interactions: {interaction}',
    f'Sparsity: {sparcity}',
    sep='\n',
)

Number of users: 1935
Number of items: 5029
Number of interactions: 448466
Sparsity: 0.9539142225736722


## Delete Duplicate Values

In [60]:
# Keep only the first row of every (user, item) pair
df = df.drop_duplicates(subset=['user_id', 'item_id'], keep='first')

In [61]:
# Summary statistics after duplicate (user, item) pairs were removed
num_user, num_item = df['user_id'].nunique(), df['item_id'].nunique()
interaction = df.shape[0]
# Sparsity is computed as 1 - rows / (distinct users * distinct items)
sparcity = 1 - interaction / (num_user * num_item)

print(
    f'Number of users: {num_user}',
    f'Number of items: {num_item}',
    f'Number of interactions: {interaction}',
    f'Sparsity: {sparcity}',
    sep='\n',
)

Number of users: 1935
Number of items: 5029
Number of interactions: 273220
Sparsity: 0.9719230530108831


## Delete users and items with too few interactions

In [62]:
UN = 10  # minimum number of rows a user must have to be kept
IN = 5   # minimum number of rows an item must have to be kept

# Removing sparse items can push a user back below UN (and vice versa), so the
# two filters are repeated until a full pass removes nothing. At that point
# every remaining user has >= UN rows and every remaining item has >= IN rows.
while True:
    rows_before = len(df)

    # Drop users with fewer than UN rows, then items with fewer than IN rows
    df = df[df.groupby('user_id')['user_id'].transform('count') >= UN]
    df = df[df.groupby('item_id')['item_id'].transform('count') >= IN]

    # Stop at the fixed point, or as soon as nothing is left to filter
    if df.empty or len(df) == rows_before:
        break

In [63]:
# Summary statistics after the minimum-interaction filter
num_user, num_item = df['user_id'].nunique(), df['item_id'].nunique()
interaction = df.shape[0]
# Sparsity is computed as 1 - rows / (distinct users * distinct items)
sparcity = 1 - interaction / (num_user * num_item)

print(
    f'Number of users: {num_user}',
    f'Number of items: {num_item}',
    f'Number of interactions: {interaction}',
    f'Sparsity: {sparcity}',
    sep='\n',
)

Number of users: 1723
Number of items: 3247
Number of interactions: 269250
Sparsity: 0.9518730714596857


## Generate Internal ID

In [64]:
# Distinct raw identifiers, in order of first appearance in the frame
u_ids = df['user_id'].unique().tolist()
i_ids = df['item_id'].unique().tolist()

# Map every raw identifier to a contiguous zero-based internal index
user_dict = {raw_id: internal_id for internal_id, raw_id in enumerate(u_ids)}
item_dict = {raw_id: internal_id for internal_id, raw_id in enumerate(i_ids)}

# Build a new frame instead of assigning columns in place: `df` is the result of
# boolean filtering, and in-place column assignment on such a frame can raise
# SettingWithCopyWarning depending on the pandas version.
df = df.assign(
    user_id=df['user_id'].map(user_dict),
    item_id=df['item_id'].map(item_dict),
)

In [65]:
# Order rows by timestamp, then replace the index with a fresh 0..n-1 range
df = df.sort_values(by=['timestamp'])
df = df.reset_index(drop=True)

In [66]:
# Summary statistics after remapping ids and sorting by timestamp
num_user, num_item = df['user_id'].nunique(), df['item_id'].nunique()
interaction = df.shape[0]
# Sparsity is computed as 1 - rows / (distinct users * distinct items)
sparcity = 1 - interaction / (num_user * num_item)

print(
    f'Number of users: {num_user}',
    f'Number of items: {num_item}',
    f'Number of interactions: {interaction}',
    f'Sparsity: {sparcity}',
    sep='\n',
)

Number of users: 1723
Number of items: 3247
Number of interactions: 269250
Sparsity: 0.9518730714596857


In [67]:
# Preview the first five rows after id remapping and timestamp sorting
df.head(n=5)

Unnamed: 0,user_id,timestamp,scroll_percentage,item_id,read_time
0,1107,2023-04-27 07:00:05,35.0,591,163.0
1,1107,2023-04-27 07:00:08,100.0,750,1197.0
2,640,2023-04-27 07:00:25,17.0,818,5.0
3,739,2023-04-27 07:00:26,43.0,2301,2.0
4,739,2023-04-27 07:00:28,100.0,1418,3.0


## Train Test Validation Split

In [68]:
from lenskit.crossfold import sample_rows

def split_train_test(df, test_size=0.2):
    """Split ``df`` into a train part and a test part by sampling rows.

    Args:
        df: Table of interactions to split.
        test_size: Fraction of the rows of ``df`` requested for the test part;
            the row count is truncated to an integer.

    Returns:
        A ``(train, test)`` tuple as produced by ``sample_rows``.
    """
    # NOTE(review): no seed is passed, so the split presumably differs between
    # runs -- confirm whether reproducibility is required.
    n_test_rows = int(len(df) * test_size)
    train_part, test_part = sample_rows(df, None, n_test_rows)
    return train_part, test_part

In [69]:
# First hold out the test rows from the full table ...
train_valid_df, test_df = split_train_test(df)

# ... then carve the validation rows out of what remains
train_df, valid_df = split_train_test(train_valid_df)

In [70]:
# Row counts of the train / validation / test splits
tuple(map(len, (train_df, valid_df, test_df)))

(172320, 43080, 53850)

In [71]:
# Distinct users and items present in the training split
tuple(train_df[column].nunique() for column in ('user_id', 'item_id'))

(1723, 3246)

In [72]:
# Distinct users and items present in the validation split
tuple(valid_df[column].nunique() for column in ('user_id', 'item_id'))

(1700, 3155)

In [73]:
# Distinct users and items present in the test split
tuple(test_df[column].nunique() for column in ('user_id', 'item_id'))

(1714, 3173)

## Save Data

In [74]:
# Single definition of the output directory, reused for every file written below
output_dir = '../data/ebnerd'
os.makedirs(output_dir, exist_ok=True)

# Write each split as CSV without the positional index column
train_df.to_csv(f'{output_dir}/train_df.csv', index=False)
valid_df.to_csv(f'{output_dir}/valid_df.csv', index=False)
test_df.to_csv(f'{output_dir}/test_df.csv', index=False)