In [19]:
import pickle
import os
import pandas as pd
import pyarrow
import pyarrow.parquet as pa

from sklearn.preprocessing import MultiLabelBinarizer, MinMaxScaler
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster, cophenet
from scipy.spatial.distance import pdist, squareform
import numpy as np
import math

# Debug switch read by the cells below: when True, the preview prints run, the
# per-user loop is limited to the first user, and the dendrogram is plotted.
TEST = False

In [20]:
# Read in user history, i.e. dictionary containing all histories per user {<user_id>: {<article_id>, ...}}
user_history_path = "../DATA/user_history_selection/histories_pickle.txt"
# NOTE: unpickling can execute arbitrary code, so this file must come from a trusted source.
# The context manager closes the file handle as soon as loading finishes
# (the bare open() call used previously left it open until garbage collection).
with open(user_history_path, 'rb') as user_history_file:
    user_history = pickle.load(user_history_file)

In [21]:
# Debug preview: show the first K (user_id, history) entries of user_history
if TEST:
    K = 5
    res = dict(list(user_history.items())[:K])
    print(res)

In [22]:
# Read in the per-user article feature data used for clustering.
user_articles_info_path = "../DATA/user_history_selection/final_cluster_data.txt"
# NOTE: unpickling can execute arbitrary code, so this file must come from a trusted source.
# The context manager closes the file handle as soon as loading finishes
# (the bare open() call used previously left it open until garbage collection).
with open(user_articles_info_path, 'rb') as user_articles_info_file:
    user_articles_info = pickle.load(user_articles_info_file)

In [23]:
# Debug preview: show the first K (user_id, article data) entries of user_articles_info
if TEST:
    K = 1
    res = dict(list(user_articles_info.items())[:K])
    print(res)

In [24]:
# Report how many entries (one per user id) were loaded into user_articles_info
print(len(user_articles_info), "users")

19704 users


In [25]:
# Debug preview: print the stored article data of one hard-coded example user.
# Raises KeyError if that user id is not present in user_articles_info.
if TEST:
    example_user_id = '2444624'
    print(user_articles_info[example_user_id])

In [None]:
import warnings
warnings.filterwarnings('ignore')

from tqdm import tqdm
from time import sleep
import random
from collections import defaultdict, OrderedDict

# Dictionary containing all the relevant clustered articles per user:
# {<user_id>: {<cluster_id>: OrderedDict({<article_id>: <time_published>})}}
# `dict` is used as the default factory instead of a lambda: it yields the same
# empty-dict default, but keeps the container picklable (lambdas cannot be pickled).
user_clustered_articles = defaultdict(dict)

# Handle data (i.e. articles) with missing feature values
def handle_missing_values(df):
    """Return a new DataFrame holding only the rows of `df` without missing values.

    A row is removed as soon as a single one of its columns is missing.
    The input frame itself is left untouched.
    """
    complete_rows_only = df.dropna(axis='index', how='any')
    return complete_rows_only

# Insert per user its cluster ids and corresponding articles
def insert_user_history_clusters(user_id, df, data_index_to_article_id):
    """Record, for one user, which articles belong to which cluster.

    Results are written into the module-level `user_clustered_articles`:
    `user_clustered_articles[user_id][cluster][article_id] = timestamp`, where each
    cluster maps to an OrderedDict filled in order of descending `time_published`.

    Parameters
    ----------
    user_id : key under which the clusters are stored.
    df : DataFrame with at least the columns `cluster` and `time_published`.
    data_index_to_article_id : lookup from a row label of `df` to the article id.

    Note: the stored timestamp is the first row of a 2-D `.values` array, i.e. a
    1-D array rather than a scalar.
    """
    cluster_labels = df.cluster.unique()

    if TEST:
        print("Clusters user", user_id, ":", cluster_labels)

    for label in cluster_labels:
        if TEST:
            print("Cluster:", label)

        # Rows of this cluster, most recently published first
        in_cluster = df['cluster'] == label
        newest_first = df.loc[in_cluster].copy().sort_values(by='time_published', ascending=False)

        if TEST:
            display(newest_first)

        row_labels = list(newest_first.index)

        if TEST:
            print("Cluster article indices df:", row_labels)

        # Store <article_id> -> <time_published> for every article of the cluster
        for row_label in row_labels:
            article_id = data_index_to_article_id[row_label]
            published = df.loc[[row_label], ['time_published']].values[0]

            if TEST:
                print(published)

            # Create the cluster's OrderedDict on first use, then add the article
            articles_by_cluster = user_clustered_articles[user_id]
            articles_by_cluster.setdefault(label, OrderedDict())[article_id] = published

        if TEST:
            for stored_label, stored_articles in user_clustered_articles[user_id].items():
                print("Cluster", stored_label)
                for stored_id, stored_time in stored_articles.items():
                    print("Article id", stored_id, ", timestamp", stored_time)

            print(user_clustered_articles[user_id])

# In TEST mode only the first user is processed
if TEST:
    N = 1
else:
    N = len(user_articles_info)


# Defined once, before the user loop: the function uses no per-user state, so
# re-creating it on every iteration was unnecessary work.
def gower_distance(X):
    """Combine per-column pairwise distances of X into one condensed distance vector.

    For every numeric column the pairwise absolute difference is divided by the
    column's range (max - min, or 1 when the column is constant). Non-numeric
    columns use the Hamming distance instead. The per-column vectors are summed
    and the element-wise square root of that sum is returned.

    Parameters
    ----------
    X : 2-D array, one row per sample and one column per feature.

    Returns
    -------
    1-D condensed distance vector in `pdist` order.

    NOTE(review): the textbook Gower coefficient averages the per-feature
    distances; this variant returns sqrt(sum) instead - confirm this is intended.
    """
    individual_variable_distances = []
    for col in range(X.shape[1]):
        if np.issubdtype(X[:, col].dtype, np.number):
            range_ = np.ptp(X[:, col])
            if range_ == 0:
                range_ = 1  # avoid division by zero
            individual_variable_distances.append(pdist(X[:, col].reshape(-1, 1), metric='euclidean') / range_)
        else:
            individual_variable_distances.append(pdist(X[:, col].reshape(-1, 1), metric='hamming'))
    return np.sqrt(sum(individual_variable_distances))


pbar = tqdm(list(user_articles_info.keys())[:N])

# Loop through all users
for user_id in pbar:
    pbar.set_description("Processing user %s" % user_id)

    # Article data of this user
    data = user_articles_info[user_id]

    # Lookup from DataFrame row label to article id (used when storing the clusters)
    data_index_to_article_id = data['index']

    data = {'topics': data['topics'],
            'sentiment': data['sentiment'],
            'page_views': data['page_views'],
            'time_published': data['time_published']}

    # Convert to DataFrame and drop articles with missing feature values
    df = pd.DataFrame(data)
    df = handle_missing_values(df)

    # A user without any complete article has nothing to cluster. Skipping here
    # prevents MinMaxScaler / linkage from raising on empty input and aborting
    # the whole (multi-hour) run.
    if df.empty:
        continue

    # Convert 'time_published' to datetime and extract the hour
    # NOTE(review): pandas rejects `unit` and `format` given together whenever it
    # actually has to parse the values, so this call presumably only succeeds because
    # the column is already datetime-typed - TODO confirm and drop the unused argument.
    df['time_published'] = pd.to_datetime(df['time_published'], unit='s', format='%y-%m-%d %H:%M:%S')
    df['hour'] = df['time_published'].dt.hour

    # Hierarchical clustering needs at least two observations; a single article
    # simply forms its own cluster (fcluster numbers clusters starting at 1).
    if len(df) == 1:
        df['cluster'] = 1
        insert_user_history_clusters(user_id, df, data_index_to_article_id)
        continue

    # Multi-Label Binarizer for 'topics'
    mlb = MultiLabelBinarizer()
    # Normalizer for 'page_views'
    scaler = MinMaxScaler()

    topics_encoded = mlb.fit_transform(df['topics'])
    page_views_normalized = scaler.fit_transform(df[['page_views']])

    # Normalize the hour (0-23) to be between 0 and 1
    hour_normalized = df['hour'] / 23.0
    hour_normalized = hour_normalized.values.reshape(-1, 1)

    # Combine all features
    features = np.hstack((topics_encoded, df[['sentiment']].values, page_views_normalized, hour_normalized))

    # Calculate Gower distance matrix (square form)
    gower_dist_matrix = squareform(gower_distance(features))

    # Perform hierarchical clustering
    # NOTE(review): SciPy's linkage treats a 2-D input as observation vectors, not as a
    # distance matrix, so each row of the square matrix is clustered as a feature vector
    # here (SciPy emits a ClusterWarning for this, which the global warnings filter hides).
    # Passing the condensed vector would change every cluster assignment and the threshold
    # below, so the call is left as-is - TODO confirm the intended behaviour.
    Z = linkage(gower_dist_matrix, method='ward')

    # Calculate cophenetic distances (c is the cophenetic correlation coefficient)
    c, coph_dists = cophenet(Z, pdist(gower_dist_matrix))

    # Cut the tree at the largest cophenetic distance, rounded down
    max_d = math.floor(coph_dists.max())

    if TEST:
        print("Max cophenetic distance threshold =", max_d)

        # Plot the dendrogram
        import matplotlib.pyplot as plt
        plt.figure(figsize=(10, 7))
        dendrogram(Z)
        plt.title('Hierarchical Clustering Dendrogram')
        plt.xlabel('Sample index')
        plt.ylabel('Distance')
        plt.show()

    # Determine cluster assignments
    clusters = fcluster(Z, max_d, criterion='distance')

    df['cluster'] = clusters

    if TEST:
        display(df)

    insert_user_history_clusters(user_id, df, data_index_to_article_id)

Processing user 260338:   2%|▎            | 463/19704 [03:03<3:55:50,  1.36it/s]

## DEMO

In [None]:
# Toy dataset: four articles with the same feature columns as the per-user data
data = {
    'article_id': [1234, 13255, 223943, 221432],
    'topics': [['politics', 'economy'], ['technology'], ['health', 'science'], ['sports']],
    'sentiment': [-1, 0, 1, 0],
    'page_views': [100, 200, 300, 150],
    'time_published': [1609459200, 1609545600, 1609632000, 1609718400]  # Unix timestamps
}
df = pd.DataFrame(data)

# Encoders: one binary column per topic, page views rescaled to [0, 1]
mlb = MultiLabelBinarizer()
scaler = MinMaxScaler()
topics_encoded = mlb.fit_transform(df['topics'])
page_views_normalized = scaler.fit_transform(df[['page_views']])

# Epoch seconds -> datetime, then keep the hour of day as its own column
df['time_published'] = pd.to_datetime(df['time_published'], unit='s')
df['hour'] = df['time_published'].dt.hour

# Hour of day (0-23) scaled to [0, 1], shaped as a single column
hour_normalized = (df['hour'] / 23.0).values.reshape(-1, 1)

# Feature matrix: topic indicators | sentiment | page views | hour
features = np.hstack([topics_encoded, df[['sentiment']].values, page_views_normalized, hour_normalized])


def gower_distance(X):
    """Combine per-column pairwise distances of X into one condensed distance vector.

    Numeric columns contribute their pairwise absolute difference divided by the
    column range (1 for a constant column); other columns contribute the Hamming
    distance. The square root of the summed contributions is returned.
    """
    per_feature = []
    for j in range(X.shape[1]):
        column = X[:, j]
        as_column_vector = column.reshape(-1, 1)
        if not np.issubdtype(column.dtype, np.number):
            per_feature.append(pdist(as_column_vector, metric='hamming'))
            continue
        spread = np.ptp(column)
        if spread == 0:
            spread = 1  # constant column: avoid dividing by zero
        per_feature.append(pdist(as_column_vector, metric='euclidean') / spread)
    return np.sqrt(sum(per_feature))


# Square-form distance matrix between all articles
gower_dist_matrix = squareform(gower_distance(features))

# Ward hierarchical clustering, fed with the square matrix
Z = linkage(gower_dist_matrix, method='ward')

# Dendrogram of the merge tree
import matplotlib.pyplot as plt
plt.figure(figsize=(10, 7))
dendrogram(Z)
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('Sample index')
plt.ylabel('Distance')
plt.show()

# Cophenetic correlation (c) and cophenetic distances of the tree
c, coph_dists = cophenet(Z, pdist(gower_dist_matrix))

# Inspect the distances and the rounded-up / rounded-down maximum
print(coph_dists)
print(math.ceil(max(coph_dists)))
print(math.floor(max(coph_dists)))

# Cut the tree at a hand-picked distance threshold (chosen by reading the dendrogram)
max_d = 3
clusters = fcluster(Z, max_d, criterion='distance')

df['cluster'] = clusters

print(df)
display(df)