In [8]:
from tqdm import tqdm
import pandas as pandas
import random

In [None]:
# Number of hourly click files to load. The load step iterates over
# range(NUMBER_OF_SUBSETS), i.e. clicks_hour_000.csv through clicks_hour_384.csv.
NUMBER_OF_SUBSETS = 385

In [None]:
# 1. Read every hourly click file and stack them into one frame
print('1. Concatenating all of the csv files...')
clicks_csv = []
for hour in range(NUMBER_OF_SUBSETS):
    # File names carry a zero-padded, three-digit hour index (000, 001, ...)
    hourly_clicks = pandas.read_csv(f'archive/clicks/clicks/clicks_hour_{str(hour).zfill(3)}.csv')
    clicks_csv.append(hourly_clicks)
all_clicks = pandas.concat(clicks_csv)

In [None]:
# Checkpoint 1. Write the concatenated clicks to disk (the row index is not written)
print('--- Saving checkpoint 1. Combined csv file...')
all_clicks.to_csv(path_or_buf='globo_all_clicks.csv', index=False)

In [None]:
# 2. Project the combined frame down to the four columns used by the later steps
print('2. Only keeping relevant columns...')
only_relevant_columns = all_clicks.loc[
    :,
    ['session_id', 'session_size', 'click_article_id', 'click_timestamp'],
]

In [None]:
# 3. Keep only the rows whose session holds two or more clicks
print('3. Discarding sessions with less than 2 clicks...')
all_clicks_with_at_least_2_clicks = only_relevant_columns.loc[
    lambda frame: frame['session_size'] > 1
]

In [None]:
# 4. Normalize item ids starting from 1
print('4. Normalizing item ids starting from 1...')

# Map every distinct click_article_id to a dense integer id starting from 1.
# Category codes are assigned from the sorted distinct values (0-based), so the
# rows do not need to be sorted first; adding 1 shifts the ids to start at 1.
#
# assign/drop return a new frame instead of mutating the filtered frame from
# step 3 in place, which avoids pandas' SettingWithCopyWarning. The resulting
# column order is unchanged: session_id, session_size, click_timestamp, item_id.
all_clicks_with_at_least_2_clicks = (
    all_clicks_with_at_least_2_clicks
    .assign(item_id=lambda frame: frame['click_article_id'].astype('category').cat.codes + 1)
    .drop(columns='click_article_id')
)

In [None]:
# Checkpoint 2. Write the frame with normalized item ids to disk (the row index is not written)
print('--- Saving checkpoint 2. Normalized csv file...')
all_clicks_with_at_least_2_clicks.to_csv(path_or_buf='globo_normalized_items.csv', index=False)

In [None]:
# 5. Create master sequence file
print('5. Creating master sequence file...')

# Retrieve from the checkpoint, rename click_timestamp -> timestamp, and
# replace session_id with dense integer codes starting from 0 so that the
# codes can be used as session indices. Category codes are assigned from the
# sorted distinct session ids, so no explicit sort is needed before encoding.
globo_normalized_items = (
    pandas.read_csv('globo_normalized_items.csv')
    .rename(columns={'click_timestamp': 'timestamp'})
    .assign(session=lambda frame: frame['session_id'].astype('category').cat.codes)
    .drop(columns='session_id')
)

# The codes are dense (0 .. n-1), so the number of distinct codes is the number
# of sessions. Counting them does not depend on which row happens to be last.
number_of_sessions = globo_normalized_items['session'].nunique()

print(f'Number of sessions: {number_of_sessions}')

# Order all clicks by timestamp once (stable sort), then group by session.
# groupby keeps the row order inside each group, so every group is already in
# ascending timestamp order, and sort=True yields the sessions in ascending
# code order. This replaces a full-frame scan per session.
items_by_session = (
    globo_normalized_items
    .sort_values(by='timestamp', kind='mergesort')
    .groupby('session', sort=True)['item_id']
)

with open('globo_sequences.txt', 'w') as f:
    for session, session_items in tqdm(items_by_session, total=number_of_sessions):
        # Item ids of this session, in click order
        session_sequence = session_items.tolist()

        # Write every prefix of length >= 2 on its own line, items separated by commas.
        # For example, the sequence [1, 2, 3, 4] produces the lines
        # "1,2", "1,2,3" and "1,2,3,4".
        for prefix_length in range(2, len(session_sequence) + 1):
            prefix = ','.join(map(str, session_sequence[:prefix_length]))
            f.write(f'{prefix}\n')

In [9]:
def split_dataset(filename, ratio=0.9, seed=None):
    """Shuffle the lines of a sequence file and split them into train/test sets.

    Each line of the file is one sequence of comma-separated item ids. After
    shuffling, the first ``int(ratio * number_of_lines)`` lines become the
    training set and the rest become the testing set. Testing sequences that
    contain any item which never appears in the training set are dropped.

    Args:
        filename: Path of the text file to read, one sequence per line.
        ratio: Fraction of the lines assigned to the training set.
        seed: Optional seed for a reproducible shuffle. When ``None`` the
            module-level ``random`` generator is used.

    Returns:
        A ``(training_set, filtered_testing_set)`` tuple of lists of lines.
        The lines are returned as read from the file, including their line
        endings.
    """
    with open(filename, "r") as f:
        lines = f.readlines()

    # A dedicated generator keeps a seeded shuffle from touching global state.
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(lines)

    # Split the shuffled lines into a training set and a testing set
    training_set_size = int(ratio * len(lines))
    training_set = lines[:training_set_size]
    testing_set = lines[training_set_size:]

    # Collect every item that occurs in the training set
    item_set = set()
    for line in training_set:
        item_set.update(line.strip().split(","))

    # Keep a testing sequence only when ALL of its items occur in the training
    # set. (An isdisjoint() test here would keep exactly the opposite: the
    # sequences that share no item with the training set.)
    filtered_testing_set = [
        line for line in testing_set
        if set(line.strip().split(",")) <= item_set
    ]

    return training_set, filtered_testing_set

In [11]:
# 6. Splitting the dataset into a training set and a testing set
print("6. Splitting the dataset into a training set and a testing set...")

# The split shuffles the sequences with the `random` module. Seed the global
# generator first so that re-running the notebook reproduces the same files.
SPLIT_SEED = 42
random.seed(SPLIT_SEED)

training_set, testing_set = split_dataset('globo_sequences.txt')

# Each returned line still carries its line ending, so writelines() emits one
# sequence per line without adding separators.
for output_path, sequences in (
    ('globo_training_set.txt', training_set),
    ('globo_testing_set.txt', testing_set),
):
    with open(output_path, 'w') as f:
        f.writelines(sequences)