In [2]:
import pickle
import os
import pandas as pd
import pyarrow
import pyarrow.parquet as pa

from sklearn.preprocessing import MultiLabelBinarizer, MinMaxScaler
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster, cophenet
from scipy.spatial.distance import pdist, squareform
import numpy as np
import math

# Test-mode switch. When True, the cells below emit debug prints and limit
# their output (e.g. printing / processing a single user) so that the
# notebook's output buffer is not exceeded.
TEST = True

In [3]:
# Read in the user history: a dictionary holding the history of every user,
# i.e. {<user_id>: {<article_id>, ...}}
user_history_path = "../DATA/user_history_selection/histories_pickle.txt"
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
# The context manager closes the file handle that a bare open() call would leak.
with open(user_history_path, 'rb') as handle:
    user_history = pickle.load(handle)

In [5]:
# TEST PRINT: show only the first K users of the history to keep the output small
if TEST:
    K = 1
    res = {uid: user_history[uid] for uid in list(user_history)[:K]}
    print(res)

{'2444624': {'9758057', '9745661', '9765551', '9761635', '9759681', '9753811', '9749886', '9744897', '9741802', '9767336', '9759109', '9743846', '9745016', '9743746', '9740223', '9742635', '9742440', '9744055', '9768635', '9751564', '9750189', '9740332', '9728166', '9748186', '9747796', '9747267', '9756287', '9738490', '9751385', '9754996', '9767507', '9768260', '9751030', '9739179', '9759353', '9743702', '9756297', '9735824', '9746868', '9750726', '9761359', '9761768', '9768829', '9765871', '9747411', '9764461', '9736862', '9740249', '9738760', '9745034', '9760962', '9745912', '9759461', '9770333', '9759929', '9757533', '9750304', '9741288', '9751764', '9749873', '9757876', '9756107', '9739272', '9742737', '9750718', '9744114', '9751962', '9740580', '9755010', '9766238', '9741415', '9750039', '9765798', '9760288', '9761419', '9749869', '9740161', '9747404', '9744301', '9745931', '9742764', '9759885', '9753684', '9763398', '9738868', '9743229', '9767908', '9756295', '9761699', '9754000

In [6]:
# Read in the per-user article information (pickled dictionary keyed by user id)
user_articles_info_path = "../DATA/user_history_selection/final_cluster_data.txt"
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
# The context manager closes the file handle that a bare open() call would leak.
with open(user_articles_info_path, 'rb') as handle:
    user_articles_info = pickle.load(handle)

In [7]:
# TEST PRINT: show only the first K users of the article info to keep the output small
if TEST:
    K = 1
    res = {uid: user_articles_info[uid] for uid in list(user_articles_info)[:K]}
    print(res)

{'2444624': {'topics': [array(['Underholdning', 'Film og tv'], dtype=object), array(['Kriminalitet', 'Bedrageri', 'Erhverv', 'Privat virksomhed',
       'Kendt', 'Økonomi', 'Politik', 'International politik'],
      dtype=object), array(['Kendt', 'Begivenhed', 'Sport', 'Sportsbegivenhed', 'Cykling'],
      dtype=object), array(['International politik', 'Konflikt og krig', 'Væbnet konflikt',
       'Politik'], dtype=object), array(['Transportmiddel', 'Bil'], dtype=object), array(['Kendt', 'Begivenhed', 'Sport', 'Sportsbegivenhed', 'Motorsport'],
      dtype=object), array(['Erhverv', 'Privat virksomhed', 'Politik', 'International politik'],
      dtype=object), array(['Transportmiddel', 'Bil', 'Katastrofe', 'Større transportmiddel'],
      dtype=object), array(['Kriminalitet', 'Kendt', 'Økonomi', 'Politik', 'National politik'],
      dtype=object), array(['Kriminalitet', 'Personfarlig kriminalitet'], dtype=object), array(['Kriminalitet', 'Erhverv', 'Privat virksomhed', 'Sport', 'Fodbold

In [8]:
# Report for how many users article information was loaded
print(len(user_articles_info), "users")

19704 users


In [9]:
if TEST:
    # Inspect the full record of the first user in the dictionary
    example_user_id = list(user_articles_info)[0]
    print("User", example_user_id)
    print(user_articles_info[example_user_id])

User 2444624
{'topics': [array(['Underholdning', 'Film og tv'], dtype=object), array(['Kriminalitet', 'Bedrageri', 'Erhverv', 'Privat virksomhed',
       'Kendt', 'Økonomi', 'Politik', 'International politik'],
      dtype=object), array(['Kendt', 'Begivenhed', 'Sport', 'Sportsbegivenhed', 'Cykling'],
      dtype=object), array(['International politik', 'Konflikt og krig', 'Væbnet konflikt',
       'Politik'], dtype=object), array(['Transportmiddel', 'Bil'], dtype=object), array(['Kendt', 'Begivenhed', 'Sport', 'Sportsbegivenhed', 'Motorsport'],
      dtype=object), array(['Erhverv', 'Privat virksomhed', 'Politik', 'International politik'],
      dtype=object), array(['Transportmiddel', 'Bil', 'Katastrofe', 'Større transportmiddel'],
      dtype=object), array(['Kriminalitet', 'Kendt', 'Økonomi', 'Politik', 'National politik'],
      dtype=object), array(['Kriminalitet', 'Personfarlig kriminalitet'], dtype=object), array(['Kriminalitet', 'Erhverv', 'Privat virksomhed', 'Sport', 'Fodbol

In [13]:
# One row per user; the columns are the per-user fields of user_articles_info
df_ua_info = pd.DataFrame(user_articles_info).T

if TEST:
    display(df_ua_info)


def _mean_page_views(page_views):
    """Return the NaN-ignoring mean of one user's page views (NaN if none are known)."""
    views = np.asarray(page_views, dtype=float)
    # Handle the empty / all-NaN history explicitly: np.nanmean would return NaN
    # as well, but only after emitting a "Mean of empty slice" RuntimeWarning.
    if views.size == 0 or np.isnan(views).all():
        return np.nan
    return np.nanmean(views)


# Compute what the average 'page_views' is for a user given the whole dataset,
# i.e. the average over all users of the average 'page_views' per user history.
# The array is built in one go instead of np.append in a loop, which copies the
# whole array on every iteration.
count_user_avg_page_views = np.array(
    [_mean_page_views(page_views) for page_views in df_ua_info["page_views"]]
)

# Users without any known page views contribute NaN and are skipped here
avg_avg_user_page_views = np.nanmean(count_user_avg_page_views)
print("On average, the average 'page_views' per user is", avg_avg_user_page_views)

Unnamed: 0,topics,sentiment,page_views,time_published,index
2444624,"[[Underholdning, Film og tv], [Kriminalitet, B...","[0.868399977684021, 0.9979000091552734, 0.8687...","[nan, 152285.0, 91552.0, 88676.0, 77458.0, 501...","[2022-12-25 08:41:59, 2023-05-16 16:44:29, 202...","{0: '9558040', 1: '9767557', 2: '9760386', 3: ..."
2462626,"[[Samfund], [Kendt, Underholdning, Film og tv]...","[0.9352999925613403, 0.5896999835968018, 0.709...","[126400.0, 28998.0, 76735.0, 139009.0, 762726....","[2023-05-20 17:24:27, 2023-04-09 18:00:44, 202...","{0: '9773877', 1: '9700156', 2: '9780181', 3: ..."
1619335,"[[Begivenhed, Sport, Sportsbegivenhed, Motorsp...","[0.9265000224113464, 0.9352999925613403, 0.563...","[83925.0, 126400.0, 87151.0, 77903.0, 91406.0,...","[2023-05-17 11:16:30, 2023-05-20 17:24:27, 202...","{0: '9769891', 1: '9773877', 2: '9740551', 3: ..."
1867788,"[[Samfund], [Samfund, Bæredygtighed og klima, ...","[0.9352999925613403, 0.6398000121116638, 0.749...","[126400.0, 32410.0, 99376.0, 76735.0, 88676.0,...","[2023-05-20 17:24:27, 2023-05-16 10:44:34, 202...","{0: '9773877', 1: '9767725', 2: '9739333', 3: ..."
2408610,"[[Transportmiddel, Konflikt og krig, Større tr...","[0.7325999736785889, 0.868399977684021, 0.9979...","[208126.0, nan, 152285.0, 158562.0, 111243.0, ...","[2023-04-30 18:35:33, 2022-12-25 08:41:59, 202...","{0: '9743574', 1: '9558040', 2: '9767557', 3: ..."
...,...,...,...,...,...
295336,"[[Erhverv, Privat virksomhed, Økonomi], [Begiv...","[0.9803000092506409, 0.902899980545044, 0.9176...","[44211.0, 42601.0, 60109.0, 52040.0, 38253.0, ...","[2023-05-10 06:08:15, 2023-05-16 18:34:45, 202...","{0: '9757218', 1: '9767642', 2: '9763799', 3: ..."
380901,"[[Samfund], [Kriminalitet, Bedrageri, Erhverv,...","[0.9352999925613403, 0.9979000091552734, 0.927...","[126400.0, 152285.0, 82846.0, 76735.0, 91552.0...","[2023-05-20 17:24:27, 2023-05-16 16:44:29, 202...","{0: '9773877', 1: '9767557', 2: '9766242', 3: ..."
1133867,"[[Uddannelse, Grundskole], [Kendt, Livsstil, U...","[0.9247999787330627, 0.9952999949455261, 0.995...","[80438.0, 109994.0, 54769.0, 57249.0, 63143.0,...","[2023-05-24 18:02:09, 2023-05-21 17:05:37, 202...","{0: '9779705', 1: '9775142', 2: '9775873', 3: ..."
1221343,"[[Kriminalitet, Bedrageri, Erhverv, Privat vir...","[0.9979000091552734, 0.9273999929428101, 0.907...","[152285.0, 82846.0, 139009.0, 69125.0, 183259....","[2023-05-16 16:44:29, 2023-05-16 02:54:55, 202...","{0: '9767557', 1: '9766242', 2: '9773887', 3: ..."


On average, the average 'page_views' per user is 110880.13820090744


  user_avg_page_views = np.nanmean(np.array(page_views))


In [74]:
import warnings
warnings.filterwarnings('ignore')

from tqdm import tqdm
from time import sleep
import random
from collections import defaultdict, OrderedDict

# Dictionary containing all the relevant clustered articles per user:
# {<user_id>: {<cluster_id>: OrderedDict(<article_id> -> <time_published>)}}
user_clustered_articles = {}


def insert_user_history_clusters(user_id, df, data_index_to_article_id):
    """Record the clustered articles of one user in ``user_clustered_articles``.

    For every distinct value in ``df['cluster']`` the rows of that cluster are
    sorted by ``time_published`` (most recent first) and inserted, in that
    order, into ``user_clustered_articles[user_id][<cluster>]``.

    Parameters
    ----------
    user_id
        Key under which the user's clusters are stored.
    df : pandas.DataFrame
        Must provide the columns ``cluster`` and ``time_published``.
    data_index_to_article_id
        Mapping from a row label of ``df`` to the corresponding article id.

    Notes
    -----
    The stored value is ``df.loc[[idx], ['time_published']].values[0]``, i.e. a
    one-element array rather than a scalar timestamp.
    When the global ``TEST`` flag is set, intermediate results are printed.
    """
    cluster_ids = df.cluster.unique()

    if TEST:
        print("Clusters user", user_id, ":", cluster_ids)

    for c in cluster_ids:
        if TEST:
            print("Cluster:", c)

        # Rows belonging to this cluster, newest publication first
        in_cluster = df['cluster'] == c
        ca_history = df.loc[in_cluster].copy().sort_values(by='time_published', ascending=False)

        if TEST:
            display(ca_history)

        article_id_indices = list(ca_history.index)

        if TEST:
            print("Cluster article indices df:", article_id_indices)

        # Save every article of the cluster as <article_id> -> <time_published>
        for idx in article_id_indices:
            article_id = data_index_to_article_id[idx]
            timestamp = df.loc[[idx], ['time_published']].values[0]

            if TEST:
                print(timestamp)

            # Create the per-user and per-cluster containers on first use
            per_user = user_clustered_articles.setdefault(user_id, {})
            per_cluster = per_user.setdefault(c, OrderedDict())
            per_cluster[article_id] = timestamp

        if TEST:
            for cluster, articles in user_clustered_articles[user_id].items():
                print("Cluster", cluster)
                for article_id, timestamp in articles.items():
                    print("Article id", article_id, ", timestamp", timestamp)

            print(user_clustered_articles[user_id])

def gower_distance(X):
    """Return condensed pairwise distances built from per-column distances.

    For every numeric column the pairwise distance is divided by the column's
    range (peak-to-peak; a zero range is replaced by 1). Non-numeric columns
    use the Hamming distance. The per-column distances are summed and the
    square root of that sum is returned.

    Parameters
    ----------
    X : numpy.ndarray
        2-D feature matrix, one row per sample.

    Returns
    -------
    numpy.ndarray
        Condensed distance vector in the layout produced by ``pdist``.
    """
    individual_variable_distances = []
    for col in range(X.shape[1]):
        if np.issubdtype(X[:, col].dtype, np.number):
            range_ = np.ptp(X[:, col])
            if range_ == 0:
                range_ = 1  # avoid division by zero
            individual_variable_distances.append(pdist(X[:, col].reshape(-1, 1), metric='euclidean') / range_)
        else:
            individual_variable_distances.append(pdist(X[:, col].reshape(-1, 1), metric='hamming'))
    return np.sqrt(sum(individual_variable_distances))


# User that is inspected in test mode (used only when present in the data)
TEST_USER_ID = '756293'

if TEST:
    # Plotting is only needed for the test-mode dendrogram
    import matplotlib.pyplot as plt
    N = 1
else:
    N = len(user_articles_info)

pbar = tqdm(list(user_articles_info.keys())[:N])

# Loop through all users
for user_id in pbar:
    # In test mode always look at the same user; fall back to the iterated
    # user when the hard-coded id is missing instead of raising a KeyError.
    if TEST and TEST_USER_ID in user_articles_info:
        user_id = TEST_USER_ID

    pbar.set_description("Processing user %s" % user_id)

    user_info = user_articles_info[user_id]

    # Maps a row index of the DataFrame below to the article id
    data_index_to_article_id = user_info['index']

    data = {'topics': user_info['topics'],
            'sentiment': user_info['sentiment'],
            'page_views': user_info['page_views'],
            'time_published': user_info['time_published']}

    # Convert to DataFrame
    df = pd.DataFrame(data)

    # Calculate the average of these columns ignoring NaN values
    avg = df[['sentiment', 'page_views', 'time_published']].mean(skipna=True)

    # When the average page view is NaN, take the dataset-wide average of the
    # users' average 'page_views' (this case only occurs when none of the
    # user's articles has a known 'page_views' value)
    if math.isnan(avg['page_views']):
        avg['page_views'] = avg_avg_user_page_views

    # Imputation by the mean
    df = df.fillna({'sentiment': avg['sentiment'], 'page_views': avg['page_views'], 'time_published': avg['time_published']})

    # User history containing only one article needs no clustering, hardcode data and continue
    if len(df.index) == 1:
        df['cluster'] = [1]
        insert_user_history_clusters(user_id, df, data_index_to_article_id)
        continue

    # Multi-Label Binarizer for 'topics'
    mlb = MultiLabelBinarizer()
    # Normalizer for 'page_views'
    scaler = MinMaxScaler()

    topics_encoded = mlb.fit_transform(df['topics'])
    page_views_normalized = scaler.fit_transform(df[['page_views']])

    # Convert 'time_published' to datetime and extract the hour.
    # The previous call passed both unit='s' and format='%y-%m-%d %H:%M:%S';
    # pandas rejects that combination for non-datetime input, and '%y' (2-digit
    # year) does not match 4-digit years. Presumably it only worked because the
    # column is already datetime-typed -- letting pandas infer covers both cases.
    df['time_published'] = pd.to_datetime(df['time_published'])
    df['hour'] = df['time_published'].dt.hour

    # Normalize the hour (0-23) to be between 0 and 1
    hour_normalized = df['hour'] / 23.0
    hour_normalized = hour_normalized.values.reshape(-1, 1)

    # Combine all features
    features = np.hstack((topics_encoded, df[['sentiment']].values, page_views_normalized, hour_normalized))

    # Calculate Gower distance matrix (square form)
    gower_dist_matrix = squareform(gower_distance(features))

    # Perform hierarchical clustering
    # NOTE(review): linkage() and the pdist() call below receive the *square*
    # distance matrix. SciPy documents 2-D input to linkage as observation
    # vectors, so the rows of the distance matrix are clustered as features.
    # Left unchanged because passing the condensed distances would alter every
    # cluster assignment and the threshold below -- confirm the intent.
    Z = linkage(gower_dist_matrix, method='ward')

    # Calculate cophenetic distances
    c, coph_dists = cophenet(Z, pdist(gower_dist_matrix))

    # Distance threshold for cutting the dendrogram: the largest cophenetic
    # distance rounded down
    max_d = math.floor(max(coph_dists))

    if TEST:
        print("Max cophenetic distance threshold =", max_d)

        # Plot the dendrogram
        plt.figure(figsize=(10, 7))
        dendrogram(Z)
        plt.title('Hierarchical Clustering Dendrogram')
        plt.xlabel('Sample index')
        plt.ylabel('Distance')
        plt.show()

    # Determine cluster assignments
    clusters = fcluster(Z, max_d, criterion='distance')

    df['cluster'] = clusters

    if TEST:
        display(df)

    insert_user_history_clusters(user_id, df, data_index_to_article_id)

Processing user 1957332: 100%|██████████| 19704/19704 [1:03:59<00:00,  5.13it/s]


In [75]:
import pickle

# Write data: clustered user articles history
# Structure:
#   {<user_id>:
#       {<cluster_id>:
#           {<article_id>: <timestamp_published>,
#            <article_id>: <timestamp_published>,
#            ...,
#            <article_id>: <timestamp_published>}
#       }
#   }
clustered_history_path = '../DATA/user_history_selection/user_clustered_articles_history.pickle'
with open(clustered_history_path, 'wb') as out_file:
    pickle.dump(user_clustered_articles, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [77]:
# Test: read the written pickle back and print its first K users as a sanity check
with open('../DATA/user_history_selection/user_clustered_articles_history.pickle', 'rb') as in_file:
    file = pickle.load(in_file)

K = 1
res = {uid: file[uid] for uid in list(file)[:K]}
print(res)

{'2444624': {2: OrderedDict([('9770882', array(['2023-05-18T04:37:32.000000000'], dtype='datetime64[ns]')), ('9770638', array(['2023-05-17T18:59:40.000000000'], dtype='datetime64[ns]')), ('9769580', array(['2023-05-17T18:31:50.000000000'], dtype='datetime64[ns]')), ('9762288', array(['2023-05-17T18:31:40.000000000'], dtype='datetime64[ns]')), ('9770604', array(['2023-05-17T17:57:43.000000000'], dtype='datetime64[ns]')), ('9769917', array(['2023-05-17T16:02:08.000000000'], dtype='datetime64[ns]')), ('9770146', array(['2023-05-17T15:40:48.000000000'], dtype='datetime64[ns]')), ('9770425', array(['2023-05-17T15:37:25.000000000'], dtype='datetime64[ns]')), ('9770328', array(['2023-05-17T14:22:46.000000000'], dtype='datetime64[ns]')), ('9770037', array(['2023-05-17T12:21:29.000000000'], dtype='datetime64[ns]')), ('9770030', array(['2023-05-17T12:15:37.000000000'], dtype='datetime64[ns]')), ('9770006', array(['2023-05-17T11:54:33.000000000'], dtype='datetime64[ns]')), ('9769994', array(['202

## DEMO

In [None]:
import matplotlib.pyplot as plt


def gower_distance(X):
    """Return condensed pairwise distances built from per-column distances.

    Numeric columns contribute their pairwise distance divided by the column
    range (a zero range is replaced by 1); other columns contribute the Hamming
    distance. The result is the square root of the summed contributions.
    """
    per_column = []
    for j in range(X.shape[1]):
        column = X[:, j]
        column_2d = column.reshape(-1, 1)
        if not np.issubdtype(column.dtype, np.number):
            per_column.append(pdist(column_2d, metric='hamming'))
            continue
        spread = np.ptp(column)
        if spread == 0:
            spread = 1  # avoid division by zero
        per_column.append(pdist(column_2d, metric='euclidean') / spread)
    return np.sqrt(sum(per_column))


# Small hand-made sample: four articles with topics, sentiment, page views and
# publication time (Unix timestamps in seconds)
data = {
    'article_id': [1234, 13255, 223943, 221432],
    'topics': [['politics', 'economy'], ['technology'], ['health', 'science'], ['sports']],
    'sentiment': [-1, 0, 1, 0],
    'page_views': [100, 200, 300, 150],
    'time_published': [1609459200, 1609545600, 1609632000, 1609718400]
}
df = pd.DataFrame(data)

# One-hot encode the topic lists
mlb = MultiLabelBinarizer()
topics_encoded = mlb.fit_transform(df['topics'])

# Scale 'page_views' with a min-max scaler
scaler = MinMaxScaler()
page_views_normalized = scaler.fit_transform(df[['page_views']])

# Publication hour (0-23), scaled to lie between 0 and 1
df['time_published'] = pd.to_datetime(df['time_published'], unit='s')
df['hour'] = df['time_published'].dt.hour
hour_normalized = (df['hour'] / 23.0).values.reshape(-1, 1)

# Feature matrix: topics | sentiment | page views | hour
features = np.hstack([topics_encoded, df[['sentiment']].values, page_views_normalized, hour_normalized])

# Square-form distance matrix, then hierarchical clustering on it
gower_dist_matrix = squareform(gower_distance(features))
Z = linkage(gower_dist_matrix, method='ward')

# Dendrogram of the clustering
plt.figure(figsize=(10, 7))
dendrogram(Z)
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('Sample index')
plt.ylabel('Distance')
plt.show()

# Cophenetic distances and the rounded-up / rounded-down maximum
c, coph_dists = cophenet(Z, pdist(gower_dist_matrix))
largest_coph_dist = max(coph_dists)

print(coph_dists)
print(math.ceil(largest_coph_dist))
print(math.floor(largest_coph_dist))

# Cut the dendrogram at a fixed distance (set this value based on the dendrogram)
max_d = 3
clusters = fcluster(Z, max_d, criterion='distance')
df['cluster'] = clusters

print(df)
display(df)