In [1]:
import pandas as pd

# Load the tab-separated behavior log. Column names are supplied explicitly
# through `names`, so the first line of the file is read as data.
behaviors = pd.read_csv(
    'behaviors.tsv',
    sep="\t",
    names=[
        'impression_id',
        'user_id',
        'time',
        'history',
        'impressions',
    ],
)

In [2]:
# (rows, columns) of the loaded behavior log.
behaviors.shape

(2232748, 5)

In [3]:
# Preview the first four behavior rows.
behaviors.head(4)

Unnamed: 0,impression_id,user_id,time,history,impressions
0,1,U87243,11/10/2019 11:30:54 AM,N8668 N39081 N65259 N79529 N73408 N43615 N2937...,N78206-0 N26368-0 N7578-0 N58592-0 N19858-0 N5...
1,2,U598644,11/12/2019 1:45:29 PM,N56056 N8726 N70353 N67998 N83823 N111108 N107...,N47996-0 N82719-0 N117066-0 N8491-0 N123784-0 ...
2,3,U532401,11/13/2019 11:23:03 AM,N128643 N87446 N122948 N9375 N82348 N129412 N5...,N103852-0 N53474-0 N127836-0 N47925-1
3,4,U593596,11/12/2019 12:24:09 PM,N31043 N39592 N4104 N8223 N114581 N92747 N1207...,N38902-0 N76434-0 N71593-0 N100073-0 N108736-0...


In [4]:
# The `ratings.csv` file is generated with the following algorithm:
#
# * For each behavior (row) in the `behaviors.tsv` file:
#   * For each impression in the behavior's space-separated impressions:
#     * If the impression is a click (ends with `-1`),
#       create a record `user_id, article_id, 1`.
#     * If the impression is not a click (ends with `-0`),
#       create a record `user_id, article_id, -0.5`.
#     * Items that a user hasn't seen produce no record; per the original
#       author's note, `librec-auto` assumes zeroes for them.


def build_ratings(behaviors: pd.DataFrame) -> pd.DataFrame:
    """Expand each behavior row into one rating record per labelled impression.

    :param behaviors: DataFrame with a `user_id` column and an `impressions`
        column holding space-separated tokens of the form `<item_id>-<label>`.
    :return: DataFrame with the columns `user_id`, `item_id` and
        `numeric_rating`. Within each behavior row the clicked items
        (suffix `-1`, rating 1) come first, followed by the non-clicked items
        (suffix `-0`, rating -0.5). Tokens with any other suffix are skipped.
    """
    # Three flat column lists instead of one small list per record: with tens
    # of millions of records this avoids allocating a Python list for each.
    user_ids = []
    item_ids = []
    numeric_ratings = []

    # Order matters: clicks are emitted before non-clicks for every row.
    label_ratings = (('-1', 1), ('-0', -0.5))

    # Iterate the two needed columns directly; `iterrows()` would build a
    # Series object for every single row.
    for user_id, impression_text in zip(behaviors['user_id'], behaviors['impressions']):
        tokens = impression_text.split(' ')
        for suffix, rating in label_ratings:
            # Strip the two-character label suffix to get the item id.
            matched_items = [token[:-2] for token in tokens if token[-2:] == suffix]
            user_ids.extend([user_id] * len(matched_items))
            item_ids.extend(matched_items)
            numeric_ratings.extend([rating] * len(matched_items))

    return pd.DataFrame({
        'user_id': user_ids,
        'item_id': item_ids,
        'numeric_rating': numeric_ratings,
    })


ratings = build_ratings(behaviors)

ratings.shape

(83507374, 3)

In [5]:
def filter_by_freq(df: pd.DataFrame, column: str,
                   min_freq: int) -> pd.DataFrame:
    """Keep only the rows whose value in `column` occurs often enough.

    :param df: DataFrame to be filtered.
    :param column: Name of the column whose value frequencies are checked.
    :param min_freq: Inclusive lower bound on how many times a value must
        occur in `column` for its rows to be kept.
    :return: The rows of `df` whose `column` value occurs at least
        `min_freq` times.
    """
    # Occurrence count per distinct value; the values form the index.
    occurrences = df[column].value_counts()
    # Values that reach the (inclusive) threshold.
    accepted_values = occurrences.loc[occurrences >= min_freq].index
    # Boolean row mask selecting the rows that carry an accepted value.
    keep_rows = df[column].isin(accepted_values)
    return df.loc[keep_rows]


# Minimum number of ratings every remaining user and item must have.
# The bound is inclusive: `filter_by_freq` keeps values whose count is
# greater than or equal to the threshold.
MIN_RATINGS = 50

filtered_ratings = ratings

# Removing sparse users can push items below the threshold and vice versa,
# so alternate the two filters until a full pass removes no rows.
while True:
    rows_before = filtered_ratings.shape[0]
    print('filtering ratings, current size:', rows_before)

    # Only keep users with at least MIN_RATINGS ratings.
    filtered_ratings = filter_by_freq(filtered_ratings, 'user_id', MIN_RATINGS)

    # Only keep items with at least MIN_RATINGS ratings.
    filtered_ratings = filter_by_freq(filtered_ratings, 'item_id', MIN_RATINGS)

    rows_after = filtered_ratings.shape[0]

    # Filtering never adds rows, so an unchanged size means a fixed point.
    if rows_after >= rows_before:
        break


filtered_ratings.shape

filtering ratings, current size: 83507374
filtering ratings, current size: 77121260
filtering ratings, current size: 77106008


(77106008, 3)

In [6]:
# Report how many distinct users and items remain after frequency filtering.
for label, column in (('users', 'user_id'), ('items', 'item_id')):
    distinct_count = filtered_ratings[column].nunique(dropna=False)
    print(distinct_count, label)

418047 users
13393 items


In [7]:
# Number of rating records per item in the filtered data.
freq = filtered_ratings['item_id'].value_counts()

# TODO: check for duplicate user/item pairs
# NOTE(review): if the same user/item pair can occur in more than one behavior
# row, these counts are records rather than distinct users — confirm.

# Persist the per-item counts for inspection outside the notebook.
freq.to_csv('frequency.csv')

In [8]:
# Non-null value count per column of the filtered ratings.
filtered_ratings.count()

user_id           77106008
item_id           77106008
numeric_rating    77106008
dtype: int64

In [9]:
# Preview the first two filtered rating records.
filtered_ratings.head(2)

Unnamed: 0,user_id,item_id,numeric_rating
0,U87243,N94157,1.0
1,U87243,N78699,1.0


In [10]:
# Write the ratings as CSV without a header row or index column: once in full,
# and once as a sample made of the first 50,000 rows.
export_targets = (
    ('librec-auto-study/data/ratings.csv', filtered_ratings),
    ('librec-auto-study/data/ratings-sample.csv', filtered_ratings.head(50000)),
)
for csv_path, frame in export_targets:
    frame.to_csv(csv_path, header=False, index=False)