In [None]:
## ==================== ALIGNMENT =====================
## compute user alignment and issue alignment based on
## the previously computed users_blocks.csv,
## together with sbm_stats.csv and trend2topic.csv
## ====================================================

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.metrics import pairwise_distances
from scipy.cluster import hierarchy

In [None]:
def compute_user_alignment_numpy(A1, A2):
    """Return the alignment score of two block-assignment vectors.

    Only positions at which both ``A1`` and ``A2`` are nonzero are compared.
    Every compared position contributes +1 when the two values are equal and
    -1 when they differ; the total is divided by the number of compared
    positions, so the score lies between -1 and 1.

    Returns ``np.nan`` when there is no position at which both are nonzero.
    """
    ## positions where both vectors carry a nonzero entry
    shared = np.nonzero((A1 != 0) & (A2 != 0))[0]
    n_shared = len(shared)

    ## nothing to compare -> the score is undefined
    if n_shared == 0:
        return np.nan

    ## agreements minus disagreements, normalised by the number of comparisons
    n_agree = np.sum(A1[shared] == A2[shared])
    n_disagree = n_shared - n_agree
    return (n_agree - n_disagree) / n_shared

def hierarchical_clustering(similarity_matrix,N_clusters=2,precomputed=False):
    """Complete-linkage clustering of the rows of a square similarity matrix.

    Parameters
    ----------
    similarity_matrix : square np.ndarray
        Pairwise similarities. They are turned into distances as
        ``1 - similarity``; NaN distances are replaced by 0 (``np.nan_to_num``).
    N_clusters : int
        Maximum number of flat clusters (``fcluster`` with ``criterion='maxclust'``).
    precomputed : bool, default False
        How the distances are handed to ``scipy.cluster.hierarchy.linkage``.
        ``False`` keeps the historical behaviour: the square matrix is passed
        as-is, which scipy interprets as one observation vector per row (rows
        are then compared by Euclidean distance). ``True`` passes the condensed
        upper triangle instead, so the linkage is built on the ``1 - similarity``
        values themselves.
        NOTE(review): ``True`` is probably what was intended, but switching the
        default changes the cluster labels that later cells map to left/right
        by hand, so it is opt-in.

    Returns
    -------
    cluster_labels : np.ndarray
        Flat cluster id of every row, as returned by ``fcluster``.
    sorted_matrix : np.ndarray
        ``similarity_matrix`` with rows and columns reordered by cluster label.
    """
    distance_matrix = np.nan_to_num(1-similarity_matrix)
    if precomputed:
        ## local import: squareform is not part of the notebook's import cell
        from scipy.spatial.distance import squareform
        ## condensed form = upper triangle; checks=False tolerates a nonzero diagonal
        linkage_input = squareform(distance_matrix, checks=False)
    else:
        linkage_input = distance_matrix
    Z = hierarchy.linkage(linkage_input, method='complete')
    ## cut the dendrogram to obtain at most N_clusters flat clusters
    cluster_labels = hierarchy.fcluster(Z, t=N_clusters, criterion='maxclust')
    ## rearrange rows and columns of the matrix based on cluster labels
    sorted_indices = np.argsort(cluster_labels)
    sorted_matrix = similarity_matrix[np.ix_(sorted_indices,sorted_indices)]
    return cluster_labels,sorted_matrix

def optimal_leaf_sort(matrix,precomputed=False):
    """Reorder a square similarity matrix by optimal dendrogram leaf ordering.

    Parameters
    ----------
    matrix : square np.ndarray
        Pairwise similarities. They are turned into distances as
        ``1 - matrix``; NaN distances are replaced by 0 (``np.nan_to_num``).
    precomputed : bool, default False
        How the distances are handed to scipy's ``linkage`` and
        ``optimal_leaf_ordering``. ``False`` keeps the historical behaviour:
        the square matrix is passed as-is, which scipy interprets as one
        observation vector per row. ``True`` passes the condensed upper
        triangle, so the ``1 - matrix`` values themselves are the distances.
        NOTE(review): ``True`` is probably what was intended; it is opt-in so
        that existing figures do not change silently.

    Returns
    -------
    (sorted_matrix, order) : (np.ndarray, np.ndarray)
        ``matrix`` with rows and columns permuted by ``order``, and ``order``
        itself (the leaf order of the complete-linkage dendrogram).
    """
    distance_matrix = np.nan_to_num(1-matrix)
    if precomputed:
        ## local import: squareform is not part of the notebook's import cell
        from scipy.spatial.distance import squareform
        ## condensed form = upper triangle; checks=False tolerates a nonzero diagonal
        linkage_input = squareform(distance_matrix, checks=False)
    else:
        linkage_input = distance_matrix
    Z = hierarchy.linkage(linkage_input, method='complete')
    ## the leaf ordering must be given the same distances the linkage was built from
    order = hierarchy.leaves_list(hierarchy.optimal_leaf_ordering(Z, linkage_input))
    return matrix[np.ix_(order,order)],order

def compute_ideological_score(utm_user,utm_left,utm_right):
    """Score one user between a "left" and a "right" reference group.

    ``utm_user`` is compared with every row of ``utm_left`` and of ``utm_right``
    via ``compute_user_alignment_numpy``. The NaN-ignoring mean alignment is
    taken per group, and the result is half the difference
    ``mean(right) - mean(left)``.
    """
    def _alignment_with_user(member_row):
        ## alignment of the scored user with one member of a reference group
        return compute_user_alignment_numpy(utm_user, member_row)

    def _mean_group_alignment(group_matrix):
        ## one alignment value per row of the group, averaged while skipping NaNs
        per_member = np.apply_along_axis(_alignment_with_user, 1, group_matrix)
        return np.nanmean(per_member)

    left_mean = _mean_group_alignment(utm_left)
    right_mean = _mean_group_alignment(utm_right)
    return 0.5 * (right_mean-left_mean)

def print_top_users_of_cluster(cluster_labels,user_indices,sortby='out_degree_total',N=100):
    """Print, for every cluster, its size and the screen names of its top users.

    Reads the notebook-level ``userdf`` (looked up by label with ``.loc``).

    Parameters
    ----------
    cluster_labels : array-like
        Cluster id per user; ids are taken to be 1..K (the loop matches
        label ``i+1`` for ``i`` in ``0..K-1``).
    user_indices : array-like
        ``userdf`` index labels, aligned position by position with
        ``cluster_labels``. Lists, arrays and pandas Index objects all work.
    sortby : str
        ``userdf`` column used to rank the users of a cluster (descending).
    N : int
        Maximum number of screen names printed per cluster.

    Note: the printed header numbers the clusters from 0, i.e. "Cluster i"
    lists the users whose label is ``i+1``. The printed range
    ``position - position+N_elements`` is the running offset of the cluster
    when users are laid out cluster after cluster.
    """
    ## plain lists cannot be indexed with an index array, so normalise first
    cluster_labels = np.asarray(cluster_labels)
    user_indices = np.asarray(user_indices)
    N_clusters = len(set(cluster_labels))
    position = 0
    for i in np.arange(N_clusters):
        ## positions of the users assigned to cluster label i+1
        members = np.where(cluster_labels==i+1)[0]
        N_elements = len(members)
        print(f"Cluster {i} ({N_elements} users) | {position} - {position+N_elements}")
        print(userdf.loc[user_indices[members]].sort_values(by=sortby,ascending=False)['screen_name'][:N])
        print()
        position += N_elements

In [None]:
## load the user cluster assignments (user_id is read as a string)
userdf = pd.read_csv("./output/users_blocks.csv", dtype={'user_id': str})
sbm_stats = pd.read_csv("./output/sbm/sbm_stats.csv")

## lookup table: trend name -> row label of that trend in sbm_stats
trend2idx = {trend: row_label for row_label, trend in sbm_stats['trend'].items()}

## dimensions of the user-trends-matrix built further below
N_users = userdf.shape[0]
N_trends = len(trend2idx)

In [None]:
## add topics
## trend -> topic table, indexed by the trend name
trend2topic = pd.read_csv("./data/trend2topic.csv").set_index('trend')

## attach the main topic of every trend to the SBM statistics
sbm_stats['main_topic'] = sbm_stats['trend'].map(trend2topic['main_topic'])

## sorted list of the distinct topics that occur (value_counts skips missing values)
topic_counts = sbm_stats['main_topic'].value_counts()
topics = sorted(topic_counts.index)
N_topics = len(topics)

In [None]:
## build the user-trends-matrix
## entry [user, trend] = block of the user in that trend, 0 where nothing is recorded
user_trend_mat = np.zeros((N_users,N_trends),dtype=int)

## 'trends' and 'blocks' are parallel "|"-separated lists, one pair per user row
user_records = zip(userdf.index,userdf['trends'],userdf['blocks'])
for user_idx,trends_field,blocks_field in tqdm(user_records,total=userdf.shape[0]):
    trend_names = trends_field.split("|")
    block_ids = [int(token) for token in blocks_field.split("|")]
    for position in range(len(trend_names)):
        trend_idx = trend2idx[trend_names[position]]
        block = block_ids[position]
        user_trend_mat[user_idx,trend_idx] = block

In [None]:
## compute pairwise user alignment
N_sample = 1000
RANDOM_SEED = 42  ## fixed seed: makes the random user sample reproducible across runs

## the N_sample users with the highest total in-degree / out-degree
high_id_indices = userdf.sort_values(by=['in_degree_total'],ascending=False)[:N_sample].index
high_od_indices = userdf.sort_values(by=['out_degree_total'],ascending=False)[:N_sample].index 
## union of the two groups (each user once)
both_indices = list(set(list(high_id_indices) + list(high_od_indices)))
## random sample among users with at least 10 trends
## (raises if fewer than N_sample users qualify)
random_indices = userdf[userdf['N_trends_total'] >= 10].sample(N_sample,random_state=RANDOM_SEED).index

## 'similarity_matrix' starts as a placeholder (0) and is filled in by the loop below
results_dict = {'influencers':{'indices':high_id_indices,
                               'similarity_matrix':0},
                'multipliers':{'indices':high_od_indices,
                               'similarity_matrix':0},
                'randomusers':{'indices':random_indices,
                               'similarity_matrix':0},
                'both':{'indices':both_indices,
                        'similarity_matrix':0}
               }

for user_type in tqdm(['influencers','multipliers','randomusers','both']):
    indices = results_dict[user_type]['indices']
    ## indexing with a list/Index of rows already returns a new array, no extra copy needed
    user_trend_mat_sample = user_trend_mat[indices]
    ## the custom metric is a similarity, so the result is a pairwise alignment matrix
    user_alignment_mat = pairwise_distances(user_trend_mat_sample,metric=compute_user_alignment_numpy,n_jobs=1)
    results_dict[user_type]['similarity_matrix'] = user_alignment_mat

In [None]:
## cluster
## split every user group into two clusters; each call yields
## (cluster labels, similarity matrix reordered by cluster label)
(cl_inf,sm_inf),(cl_mul,sm_mul),(cl_ran,sm_ran),(cl_bot,sm_bot) = (
    hierarchical_clustering(results_dict[group_name]['similarity_matrix'],N_clusters=2)
    for group_name in ("influencers","multipliers","randomusers","both")
)

In [None]:
## get the clustering right (left/right assignment)
## this only makes sense if you know the users...
## the cluster number that stands for "left" / "right" is set by hand for each group

## influencers: cluster 2 -> left, cluster 1 -> right
indices_influencers = results_dict["influencers"]["indices"]
left_influencers_idx = list(indices_influencers[np.flatnonzero(cl_inf==2)])
right_influencers_idx = list(indices_influencers[np.flatnonzero(cl_inf==1)])

## multipliers: cluster 1 -> left, cluster 2 -> right
indices_multipliers = results_dict["multipliers"]["indices"]
left_multipliers_idx = list(indices_multipliers[np.flatnonzero(cl_mul==1)])
right_multipliers_idx = list(indices_multipliers[np.flatnonzero(cl_mul==2)])

## both: cluster 2 -> left, cluster 1 -> right
indices_both = np.array(results_dict["both"]["indices"])
left_both_idx = list(indices_both[np.flatnonzero(cl_bot==2)])
right_both_idx = list(indices_both[np.flatnonzero(cl_bot==1)])

## store the assignments; the "both" group is stored as arrays, the other two as lists
results_dict["influencers"].update(left=left_influencers_idx,right=right_influencers_idx)
results_dict["multipliers"].update(left=left_multipliers_idx,right=right_multipliers_idx)
results_dict["both"].update(left=np.array(left_both_idx),right=np.array(right_both_idx))

In [None]:
## compute the user alignment to each cluster
## reference groups: the "left" and "right" members of the combined ("both") user set;
## they do not depend on user_type, so they are built once
utm_left = user_trend_mat[results_dict["both"]["left"]]
utm_right = user_trend_mat[results_dict["both"]["right"]]

## per-topic trend columns and the matching slices of the two reference groups;
## none of this depends on the individual user, so it is computed once up front
topic_columns = {}
utm_left_by_topic = {}
utm_right_by_topic = {}
for topic in topics:
    ## get the indices of the topics
    topic_indices = sbm_stats[sbm_stats['main_topic'] == topic].index
    topic_columns[topic] = topic_indices
    utm_left_by_topic[topic] = utm_left[:,topic_indices]
    utm_right_by_topic[topic] = utm_right[:,topic_indices]

for user_type in ['influencers','multipliers','randomusers']:
    
    user_set = results_dict[user_type]['indices']
    user_dict = {}
    for user_idx in tqdm(user_set):
        utm_user = user_trend_mat[user_idx]
        score_global = compute_ideological_score(utm_user,utm_left,utm_right)
        ud = {}
        ud['user_id'] = userdf.iloc[user_idx]['user_id']
        # ud['screen_name'] = userdf.iloc[user_idx]['screen_name']
        ud['global_score'] = score_global
        for topic in topics:
            ## same score, restricted to the trends of this topic
            utm_user_topic = utm_user[topic_columns[topic]]
            topic_score = compute_ideological_score(utm_user_topic,utm_left_by_topic[topic],utm_right_by_topic[topic])
            ud[topic] = topic_score
        user_dict[user_idx] = ud
    ## one row per user: user_id, global_score and one column per topic
    alignment = pd.DataFrame(user_dict).T
    ## the transpose leaves all columns with object dtype; make every score column numeric
    for score_column in ['global_score'] + topics:
        alignment[score_column] = alignment[score_column].astype(float)
    results_dict[user_type]['alignment'] = alignment

In [None]:
## compute the topic alignment matrix
## entry [i, j] = NaN-ignoring mean over users of (score on topic i) * (score on topic j)
for user_type in ['influencers','multipliers','randomusers']:
    alignment = results_dict[user_type]['alignment']
    topic_alignment_mat = np.zeros((N_topics,N_topics))
    for i,topic1 in enumerate(topics):
        for j,topic2 in enumerate(topics):
            topic_alignment_mat[i,j] = np.nanmean(alignment[topic1] * alignment[topic2])
    results_dict[user_type]['topic_alignment_mat'] = topic_alignment_mat

In [None]:
## issue alignment plot (1)
## one heatmap per user group: rows are users sorted by global score,
## columns are the global score followed by the per-topic scores
columns = ['global_score'] + topics
handles = ['Global'] + topics

ce = results_dict['influencers']['alignment'].copy()
ce = ce.sort_values(by='global_score')
utm_influencers = np.array(ce[columns],dtype=float)

ce = results_dict['multipliers']['alignment'].copy()
ce = ce.sort_values(by='global_score')
utm_multipliers = np.array(ce[columns],dtype=float)

## one colour scale for both panels, so that the single colorbar is valid for each of them
## (without it every imshow autoscales on its own data)
score_vmin = min(np.nanmin(utm_influencers),np.nanmin(utm_multipliers))
score_vmax = max(np.nanmax(utm_influencers),np.nanmax(utm_multipliers))

## the matrices are much taller than wide; a small aspect keeps the panels readable
aspect = .016

fig = plt.figure(figsize=(13,6))

## two heatmap panels plus a narrow column for the colorbar
gs = fig.add_gridspec(1, 3, width_ratios=[1, 1, 0.03], height_ratios=[1])

ax1 = fig.add_subplot(gs[0, 0])
ax2 = fig.add_subplot(gs[0, 1])

pos = ax1.imshow(utm_influencers,aspect=aspect,interpolation='nearest',vmin=score_vmin,vmax=score_vmax)
ax1.set_title("Influencers",loc='left',fontdict={'weight':'bold'})

ax1.set_xticks(np.arange(N_topics+1),np.array(handles),rotation=45,ha='right')
ax1.set_yticks([])

ax2.imshow(utm_multipliers,aspect=aspect,interpolation='nearest',vmin=score_vmin,vmax=score_vmax)
ax2.set_title("Multipliers",loc='left',fontdict={'weight':'bold'})

ax2.set_xticks(np.arange(N_topics+1),np.array(handles),rotation=45,ha='right')
ax2.set_yticks([])

ax1.set_ylabel("user index")

cax = fig.add_subplot(gs[0, 2])

## add the colorbar
cbar = fig.colorbar(pos, cax=cax)
cbar.ax.set_aspect(10.3)

plt.tight_layout(pad=-1.5)

plt.show()

In [None]:
## issue alignment plot (2)
## topic-by-topic alignment matrices, rows/columns arranged by optimal leaf ordering

mat1,order1 = optimal_leaf_sort(results_dict['influencers']['topic_alignment_mat'])
mat2,order2 = optimal_leaf_sort(results_dict['multipliers']['topic_alignment_mat'])

## reverse both axes of each matrix, and the leaf orders along with them
mat1,order1 = np.flip(mat1),order1[::-1]
mat2,order2 = np.flip(mat2),order2[::-1]

## common upper colour limit for the two panels
vmax = max([np.max(mat1),np.max(mat2)])

fig = plt.figure(figsize=(18,8))

## two heatmap panels plus a narrow column for the colorbar
gs = fig.add_gridspec(1, 3, width_ratios=[1, 1, .03], height_ratios=[1])

ax1 = fig.add_subplot(gs[0, 0])
ax2 = fig.add_subplot(gs[0, 1])

## both panels are drawn the same way; only data, leaf order and title differ
topic_names = np.array(topics)
tick_positions = np.arange(N_topics)
panels = ((ax1,mat1,order1,"Influencers"),
          (ax2,mat2,order2,"Multipliers"))
for panel_ax,panel_mat,panel_order,panel_title in panels:
    pos = panel_ax.imshow(panel_mat,interpolation='nearest',vmin=0,vmax=vmax)
    panel_ax.set_title(panel_title,loc='left',fontdict={'weight':'bold'})
    panel_ax.set_yticks(tick_positions,topic_names[panel_order])
    panel_ax.set_xticks(tick_positions,topic_names[panel_order],rotation=45,ha='right')

cax = fig.add_subplot(gs[0, 2])

## add the colorbar (both panels share vmin/vmax, so either image can drive it)
cbar = fig.colorbar(pos, cax=cax)

fig.tight_layout(h_pad=1)

plt.show()

In [None]:
## user alignment plot
## pairwise user alignment matrices, users ordered by their global score

for user_type in ['influencers','multipliers','randomusers']:
    user_idx_order = results_dict[user_type]['alignment'].sort_values(by='global_score').index
    sample_indices = results_dict[user_type]['indices']
    ## user index -> row/column position in the similarity matrix
    ## (sized from the sample itself instead of a hard-coded 1000)
    user_idx2idx = dict(zip(sample_indices,np.arange(len(sample_indices))))
    matrix_order = [user_idx2idx[i] for i in user_idx_order]
    matrix = results_dict[user_type]['similarity_matrix']
    matrix_ordered = matrix[np.ix_(matrix_order,matrix_order)]
    results_dict[user_type]['similarity_matrix_ordered'] = matrix_ordered
    
mat1 = results_dict['influencers']['similarity_matrix_ordered']
mat2 = results_dict['multipliers']['similarity_matrix_ordered']
mat3 = results_dict['randomusers']['similarity_matrix_ordered']

## one colour scale for all three panels, so that the single colorbar is valid for each of them
## (without it every imshow autoscales on its own data)
sim_vmin = min(np.nanmin(m) for m in (mat1,mat2,mat3))
sim_vmax = max(np.nanmax(m) for m in (mat1,mat2,mat3))

fig = plt.figure(figsize=(12,6))    

## three heatmap panels plus a narrow column for the colorbar
gs = fig.add_gridspec(1, 4, width_ratios=[1, 1, 1, 0.03], height_ratios=[1])

ax1 = fig.add_subplot(gs[0, 0])
ax2 = fig.add_subplot(gs[0, 1])
ax3 = fig.add_subplot(gs[0, 2])

ax1.set_title("Influencers",loc='left',weight='bold')
pos = ax1.imshow(mat1,cmap='viridis',interpolation='nearest',rasterized=True,vmin=sim_vmin,vmax=sim_vmax)
ax1.set_xticks([])
ax1.set_yticks([])

ax2.set_title("Multipliers",loc='left',weight='bold')
pos = ax2.imshow(mat2,cmap='viridis',interpolation='nearest',rasterized=True,vmin=sim_vmin,vmax=sim_vmax)
ax2.set_xticks([])
ax2.set_yticks([])


ax3.set_title(r"Random user sample ($N_{trends} \geq 10$)",loc='left',weight='bold')
pos = ax3.imshow(mat3,cmap='viridis',interpolation='nearest',rasterized=True,vmin=sim_vmin,vmax=sim_vmax)
ax3.set_xticks([])
ax3.set_yticks([])

ax1.set_ylabel("user index")
ax1.set_xlabel("user index")
ax2.set_xlabel("user index")
ax3.set_xlabel("user index")


cax = fig.add_subplot(gs[0, 3])

## add the colorbar
cbar = fig.colorbar(pos, cax=cax)
cbar.ax.set_aspect(16.7)

plt.tight_layout()

plt.show()
plt.close()