In [None]:
import itertools
import json

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
from pyvis.network import Network
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from scipy.spatial.distance import pdist, squareform
from sklearn.cluster import KMeans

from utils import (
    save_as_json,
    get_graph_from_matrix_customized_color,
    append_cluster_color,
    get_cluster_plot,
    get_graph_from_cluster_data_without_color,
    get_cluster_overview,
    append_mean_coefficients_per_cluster,
    get_mean_sentiment_corr,
)

# Do not truncate long cell values (e.g. URLs) when frames are displayed.
pd.set_option("display.max_colwidth", None)

# Input matrix from which the graphs below are built.
matrix = pd.read_json(
    "../graph_network/plots_graphs_or_dfs_archive/matrix_iv_29-04-24.json"
)

# Previously exported clustering results.
# NOTE(review): all four names are reassigned by later cells before they are
# read, so these loads may be redundant — confirm.
graph_clustering_shortest_path = pd.read_json(
    "../graph_network/graph_clustering_shortest_path_03-05-24.json"
)
graph_clustering_e_b_c = pd.read_json(
    "../graph_network/graph_clustering_edge_betweenness_centrality_03-05-24.json"
)
# NOTE(review): the variable suffixes (_3, _4) differ from the file-name
# suffixes (_2, _3) — confirm which numbering is intended.
url_clustering_3 = pd.read_json(
    "../graph_network/url_clustering_2_04-05-24.json"
)
url_clustering_4 = pd.read_json(
    "../graph_network/url_clustering_3_04-05-24.json"
)

In [None]:
# Per-page data set; later cells read its `url` column to attach cluster ids.
# NOTE(review): presumably also holds the sentiment scores — confirm schema.
senti = pd.read_json("../graph_network/full_data_iv_29-04-24.json")

In [None]:
# Build the graph from `matrix`. `G` is used as a networkx graph further down;
# `N` is rendered to an HTML file (presumably a pyvis Network — TODO confirm).
G, N = get_graph_from_matrix_customized_color(matrix)
N.show("graph_sentiment_coloring_iv.html")

# Clustering based on Shortest-Path

### Function demonstration using the Karate Club graph  

In [None]:
# Demonstration on Zachary's karate club graph.
# Demo-specific names are used on purpose: assigning to `G` here would replace
# the graph built from `matrix` above, and the cells after this demo would then
# cluster the karate club graph instead (presumably not intended — confirm).
karate_graph = nx.karate_club_graph()
karate_nodes = list(karate_graph.nodes())
karate_index = {node: pos for pos, node in enumerate(karate_nodes)}

# Pairwise hop counts between all nodes.
karate_distances = np.zeros((len(karate_nodes), len(karate_nodes)))
for source, lengths in nx.all_pairs_shortest_path_length(karate_graph):
    row = karate_index[source]
    for target, hops in lengths.items():
        karate_distances[row, karate_index[target]] = hops

# `squareform` turns the symmetric matrix into the condensed form that
# `linkage` expects.
# NOTE(review): SciPy documents Ward linkage as correctly defined only for
# Euclidean distances; hop counts are not Euclidean — confirm the method choice.
karate_linkage = linkage(squareform(karate_distances), method='ward')

# Show the dendrogram
fig, ax = plt.subplots()
dendrogram(karate_linkage, ax=ax)
ax.set_title("Dendrogram")
ax.set_xlabel('Node Index')
ax.set_ylabel('Distance')
plt.show()

In [None]:
# Work on an undirected version of the graph and drop nodes without any edge.
G = G.to_undirected()
# Collect the isolated nodes into a list before removing them from the graph.
isolates = list(nx.isolates(G))
G.remove_nodes_from(isolates)

In [None]:
def _distance_matrix_from_path_lengths(nodes, path_lengths):
    """Return a square matrix of shortest-path lengths between ``nodes``.

    Parameters
    ----------
    nodes : list
        Node labels; row/column ``i`` of the result belongs to ``nodes[i]``.
    path_lengths : dict
        ``path_lengths[a][b]`` is the hop count from ``a`` to ``b``; pairs
        without a connecting path are absent.

    Returns
    -------
    numpy.ndarray
        ``len(nodes) x len(nodes)`` matrix. Pairs without a path get
        ``largest finite length + 1`` instead of 0, so unreachable nodes are
        treated as farther apart than any connected pair rather than as
        identical.
    """
    index = {node: pos for pos, node in enumerate(nodes)}
    distances = np.full((len(nodes), len(nodes)), np.inf)
    for source in nodes:
        row = index[source]
        for target, hops in path_lengths[source].items():
            distances[row, index[target]] = hops

    # linkage() needs finite values, so replace "no path" by a finite penalty.
    unreachable = np.isinf(distances)
    if unreachable.any():
        distances[unreachable] = distances[~unreachable].max() + 1
    return distances


path_lengths = dict(nx.all_pairs_shortest_path_length(G))
nodes = list(G.nodes())
distance_matrix = _distance_matrix_from_path_lengths(nodes, path_lengths)

# squareform converts the symmetric matrix to the condensed form linkage expects.
# NOTE(review): SciPy documents Ward linkage as correctly defined only for
# Euclidean distances; hop counts are not Euclidean — confirm the method choice.
linked = linkage(squareform(distance_matrix), method='ward')

In [None]:
# Height at which the dendrogram is coloured and the cut line is drawn.
cut_height = 6

fig, ax = plt.subplots(figsize=(21, 9))
dendrogram(linked, color_threshold=cut_height, ax=ax)
ax.set_ylim(0, 10)
ax.set_xticks([])  # hide the leaf labels
ax.axhline(y=cut_height, color='r', linestyle='--', label=f'Distance = {cut_height}')
ax.set_xlabel('Subpages of Mobiliar-Webpage')
ax.set_ylabel('Distance')
ax.legend()
plt.show()

In [None]:
# Cut the hierarchy at a fixed distance and remember each node's cluster label
# as a plain Python int.
max_distance = 6
cluster_labels = fcluster(linked, t=max_distance, criterion='distance')
node_to_cluster = dict(zip(nodes, map(int, cluster_labels)))

In [None]:
# Copy of `senti` with the cluster label of each page's URL; URLs that are not
# keys of `node_to_cluster` get NaN.
graph_clustering_shortest_path = senti.assign(
    cluster_id=senti['url'].map(node_to_cluster)
)

### Coloring

In [None]:
# Row count before and after dropping rows without a cluster id.
print("Rows count:", len(graph_clustering_shortest_path))
graph_clustering_shortest_path = graph_clustering_shortest_path.dropna(
    subset=["cluster_id"]
)
print("Rows count:", len(graph_clustering_shortest_path))

In [None]:
# Per-cluster overview, largest clusters first.
df = get_cluster_overview(graph_clustering_shortest_path, "cluster_id").sort_values("cluster_size", ascending=False)
df

In [None]:
def get_bar_plot(df, cluster_col):
    """Draw a vertical bar chart of cluster sizes.

    Defined in this cell because the notebook otherwise only defines
    ``get_bar_plot`` in later cells, so this call raised ``NameError`` on a
    fresh kernel.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``cluster_col``, ``"<prefix>_size"`` and
        ``"color_<cluster_col>"``, where ``<prefix>`` is ``cluster_col``
        without its last three characters (e.g. ``"cluster_id"`` ->
        ``"cluster_size"``).
        NOTE(review): presumably the output of ``get_cluster_overview`` —
        confirm it provides the colour column.
    cluster_col : str
        Name of the cluster id column.
    """
    size_col = f'{cluster_col[:-3]}_size'
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.bar(df[cluster_col].astype(str), df[size_col], color=df[f"color_{cluster_col}"])
    ax.grid(True, which='both', linestyle='--', linewidth=0.5)
    ax.set_xlabel('Cluster ID')
    ax.set_ylabel('Number of Nodes in Cluster')
    plt.show()


get_bar_plot(df, "cluster_id")

In [None]:
# Disabled manual palette (cluster id -> hex colour), kept for reference:
# specific_colors = {
#     # 1: "red", 2: "green", 3: "blue", 4: "yellow", 5: "orange", 6: "purple", 7:"winered"
#     5: "#f5e505", 10: "#05f531", 3: "#f50515", 4: "#2d05f7", 12: "#7c02f5", 6: "#f5a505", 7:"#c20aff", 9:"#4a1a1a", 2:"#02eff7", 8:"#144026", 1:"#79801d"
# }
# append_cluster_color(graph_clustering_shortest_path, "cluster_id", specific_colors)
get_cluster_plot(graph_clustering_shortest_path, "cluster_id")

### Create Graph

In [None]:
# Disabled alternatives (coloured variant / reload of an earlier export):
# G, N = get_graph_from_cluster_data(matrix, graph_clustering_shortest_path, "cluster_id")
# graph_clustering_shortest_path = pd.read_json("../graph_network/graph_clustering_shortest_path_04-05-24.json")
# NOTE(review): this reassigns `G`; later cells that use `G` see this graph.
G, N = get_graph_from_cluster_data_without_color(matrix, graph_clustering_shortest_path, "cluster_id") 
N.show("graph_clustering_shortest_path.html")

### Stats

In [None]:
# Number of distinct clusters left after dropping unclustered rows.
graph_clustering_shortest_path["cluster_id"].nunique()

In [None]:
# NOTE(review): `get_correlation_df_per_cluster` is not in the `utils` import
# list (which has `get_mean_sentiment_corr`) and is not defined in the visible
# notebook, so this raises NameError on a fresh kernel — confirm the helper.
correlation_of_mean_sentiment_per_cluster, correlation_matrix_per_cluster = get_correlation_df_per_cluster(graph_clustering_shortest_path, "cluster_id")
correlation_of_mean_sentiment_per_cluster

In [None]:
# Per-cluster overview in the order returned by the helper (unsorted).
get_cluster_overview(graph_clustering_shortest_path, "cluster_id")

## Edge Betweenness Centrality – Girvan–Newman Algorithm

In [None]:
# Demonstration on Zachary's karate club graph.
# A demo-specific name is used on purpose: assigning to `G` here would replace
# the graph the following cells operate on.
karate_graph = nx.karate_club_graph()

# Betweenness centrality of every node, as a single-feature column for k-means
centrality = nx.betweenness_centrality(karate_graph)
centrality_values = np.array(list(centrality.values())).reshape(-1, 1)

# Cluster the centrality values with k-means; seeded so re-runs give the same labels
kmeans = KMeans(n_clusters=2, n_init=10, random_state=0)
kmeans.fit(centrality_values)
labels = kmeans.labels_

# Colour the nodes by their k-means label and draw the graph (seeded layout)
pos = nx.spring_layout(karate_graph, seed=0)
nx.draw(karate_graph, pos, node_color=labels, with_labels=True, cmap='viridis')


In [None]:
# Work on an undirected version of the graph and drop nodes without any edge.
G = G.to_undirected()
# Collect the isolated nodes into a list before removing them from the graph.
isolates = list(nx.isolates(G))
G.remove_nodes_from(isolates)

In [None]:
def most_central_edge(G):
    """Return the edge of ``G`` with the highest edge betweenness centrality.

    If several edges share the highest score, the first one in the iteration
    order of the centrality dict is returned.
    """
    scores = nx.edge_betweenness_centrality(G)
    best_edge, _ = max(scores.items(), key=lambda item: item[1])
    return best_edge

# Upper bound on the number of communities in the partition that is kept.
k = 20
comp = nx.community.girvan_newman(G, most_valuable_edge=most_central_edge)

# Keep the last (finest) partition that has at most `k` communities.
communities = None
for partition in comp:
    if len(partition) > k:
        break
    communities = partition
    if len(partition) == k:
        # Every further partition has more communities, so stop here and
        # skip computing another (expensive) level.
        break

if communities is None:
    raise ValueError(
        f"Girvan-Newman produced no partition with at most {k} communities"
    )

In [None]:
# Map every node to the position of its community within `communities`.
community_id = {node: idx for idx, community in enumerate(communities) for node in community}

In [None]:
# Copy of `senti` with the community index of each page's URL; URLs that are
# not keys of `community_id` get NaN.
graph_clustering_e_b_c = senti.assign(
    commun_id=senti['url'].map(community_id)
)

### Coloring

In [None]:
# Row count before and after dropping rows without a community id.
print("Rows count:", len(graph_clustering_e_b_c))
graph_clustering_e_b_c = graph_clustering_e_b_c.dropna(
    subset=["commun_id"]
)
print("Rows count:", len(graph_clustering_e_b_c))

In [None]:
# Disabled manual palette (community id -> hex colour), kept for reference:
# specific_colors = {
#     5: "#f5e505", 0: "#05f531", 7: "#f50515", 2: "#2d05f7", 8: "#7c02f5", 1: "#f5a505", 4:"#c20aff"
# }
# append_cluster_color(graph_clustering_e_b_c, "commun_id",specific_colors)
get_cluster_plot(graph_clustering_e_b_c, "commun_id")

### Create Graph

In [None]:
# save_as_json(graph_clustering_e_b_c, "graph_clustering_edge_betweenness_centrality")
# NOTE(review): this reload replaces the frame computed in the cells above with
# the export dated 03-05-24, so the graph and the stats below describe the
# archived result, not the current run — confirm this is intended.
graph_clustering_e_b_c = pd.read_json("../graph_network/graph_clustering_edge_betweenness_centrality_03-05-24.json")
G, N = get_graph_from_cluster_data_without_color(matrix, graph_clustering_e_b_c, "commun_id") 
N.show("graph_clustering_edge_betweenness_centrality.html")

### Stats

In [None]:
# Per-community overview, largest communities first.
# NOTE(review): reuses the name `df` from the shortest-path section.
df = get_cluster_overview(graph_clustering_e_b_c, "commun_id").sort_values("commun_size", ascending=False)
df

In [None]:
def get_bar_plot(df, cluster_col):
    """Draw a vertical bar chart of cluster sizes.

    NOTE(review): a later cell redefines ``get_bar_plot`` as a horizontal
    chart; which version runs depends on cell execution order.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``cluster_col``, ``"<prefix>_size"`` and
        ``"color_<cluster_col>"``, where ``<prefix>`` is ``cluster_col``
        without its last three characters (e.g. ``"commun_id"`` ->
        ``"commun_size"``).
    cluster_col : str
        Name of the cluster id column.
    """
    size_col = f'{cluster_col[:-3]}_size'
    fig, ax = plt.subplots(figsize=(12, 6))  # figure size
    ax.bar(df[cluster_col].astype(str), df[size_col], color=df[f"color_{cluster_col}"])
    ax.grid(True, which='both', linestyle='--', linewidth=0.5)
    ax.set_xlabel('Cluster ID')
    ax.set_ylabel('Number of Nodes in Cluster')
    plt.show()

# Community sizes from the overview frame `df` computed above.
get_bar_plot(df, "commun_id")

In [None]:
# Printed explicitly: a cell only displays its last expression, so the bare
# `nunique()` call was silently discarded before.
print("Number of communities:", graph_clustering_e_b_c["commun_id"].nunique())

# NOTE(review): `get_correlation_df_per_cluster` is not in the `utils` import
# list (which has `get_mean_sentiment_corr`) and is not defined in the visible
# notebook, so this raises NameError on a fresh kernel — confirm the helper.
correlation_of_mean_sentiment_per_cluster, correlation_matrix_per_cluster = get_correlation_df_per_cluster(graph_clustering_e_b_c, "commun_id")
correlation_of_mean_sentiment_per_cluster

In [None]:
# Per-community overview in the order returned by the helper (unsorted).
get_cluster_overview(graph_clustering_e_b_c, "commun_id")

# URL Clustering

### Helper Functions

In [None]:
def get_url_path(url):
    """Split ``url`` with ``urllib.parse.urlparse`` and return ``(path, netloc)``.

    Note the order: the path comes first, the network location second.
    """
    from urllib.parse import urlparse

    # ParseResult unpacks as (scheme, netloc, path, params, query, fragment).
    _scheme, netloc, path, *_rest = urlparse(url)
    return path, netloc

def get_url_list(url):
    """Return ``[domain, segment, ...]`` for ``url``.

    Leading/trailing slashes of ``url`` are stripped before parsing. The first
    element is always the network location (possibly an empty string); it is
    followed by the non-empty, whitespace-stripped path segments.
    """
    path, domain = get_url_path(url.strip("/"))
    stripped_segments = (segment.strip() for segment in path.split("/"))
    return [domain, *(segment for segment in stripped_segments if segment)]

def append_url_list_to_df(df, url_col="url"):
    """Add a ``url_elements_list`` column holding ``get_url_list`` of ``url_col``.

    The column is written into ``df`` itself (in place); the same frame is
    returned for convenience.
    """
    url_elements = df[url_col].map(get_url_list)
    df["url_elements_list"] = url_elements
    return df
    
def get_list_length(df, url_list_col):
    """Add a ``list_length`` column with ``len()`` of each ``url_list_col`` entry.

    The column is written into ``df`` itself (in place); the same frame is
    returned for convenience.
    """
    lengths = df[url_list_col].map(len)
    df['list_length'] = lengths
    return df

# Adds `url_elements_list` to `senti` in place (the helper returns the same
# frame). Copies of `senti` taken in earlier cells do not get this column.
senti = append_url_list_to_df(senti)

### Clustering

In [None]:
def group_by_list_element(df, column_name, position):
    """Group rows whose lists share the same first ``position + 1`` elements.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    column_name : str
        Column holding one list per row.
    position : int
        Zero-based index of the last list element that must match.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with an integer ``group_id`` column. Ids start at 1 and
        are handed out in row order. Rows sharing the same prefix share an id;
        a row whose list has no element at ``position`` gets an id of its own.

    Ids are assigned by row position, not by index label, so frames with a
    non-unique index are handled correctly.
    """
    prefix_to_group = {}
    group_ids = []
    next_group_id = 1

    for elements in df[column_name]:
        if position < len(elements):
            prefix = tuple(elements[:position + 1])
            group_id = prefix_to_group.get(prefix)
            if group_id is None:
                group_id = next_group_id
                prefix_to_group[prefix] = group_id
                next_group_id += 1
        else:
            # List too short for the requested position: singleton group.
            group_id = next_group_id
            next_group_id += 1
        group_ids.append(group_id)

    df_copy = df.copy()
    df_copy["group_id"] = np.asarray(group_ids, dtype=np.int64)
    return df_copy

In [None]:
# `position` is zero-based, so position p compares the first p + 1 list elements.
# NOTE(review): `_3`/`_4` match the number of compared elements, whereas `_1`
# matches the position argument — confirm the intended naming.
url_clustering_1 = group_by_list_element(senti, "url_elements_list", 1)
url_clustering_3 = group_by_list_element(senti, "url_elements_list", 2)
url_clustering_4 = group_by_list_element(senti, "url_elements_list", 3)

In [None]:
# Number of groups for position 1 (same call as `url_clustering_1`).
# NOTE(review): presumably re-run by hand with other positions to collect the
# numbers plotted in the next cell — confirm.
url_clustering_ = group_by_list_element(senti, "url_elements_list", 1)
url_clustering_["group_id"].nunique()

In [None]:
# Hard-coded number of groups per list element position.
# NOTE(review): presumably collected by hand from `group_by_list_element`
# runs — confirm the source of these values.
x = list(range(11))
y = [0, 13, 42, 128, 294, 1179, 1226, 1561, 1702, 1735, 1746]

fig, ax = plt.subplots()
ax.plot(x, y, marker='o')
ax.set_xlabel('List Element Position')
ax.set_ylabel('Number of Groups')
ax.set_title('')
ax.grid()
plt.show()

### Coloring

In [None]:
# Disabled manual palette (group id -> hex colour), kept for reference:
# specific_colors = {
#    13: "#f5e505", 11: "#05f531", 14: "#f50515", 8: "#2d05f7", 2: "#f5a505", 16: "#7c02f5", 4:"#c20aff", 12:"#4a1a1a", 7:"#02eff7", 10:"#144026", 32:"#79801d"
# }
# append_cluster_color(url_clustering_3, "group_id", specific_colors)

get_cluster_plot(url_clustering_3, "group_id")

# Disabled export of the grouping:
# save_as_json(url_clustering_3, "url_clustering_3")

In [None]:
# Disabled manual palette (group id -> hex colour), kept for reference:
# specific_colors = {
#    176: "#f5e505", 166: "#05f531", 108: "#f50515", 90: "#2d05f7", 12: "#f5a505", 271: "#7c02f5", 283:"#c20aff", 20:"#4a1a1a", 99:"#02eff7", 162:"#144026", 69:"#79801d"
# }
# append_cluster_color(url_clustering_4, "group_id", specific_colors)

get_cluster_plot(url_clustering_4, "group_id")
# Disabled export of the grouping:
# save_as_json(url_clustering_4, "url_clustering_4")

### Create Graph

In [None]:
# Disabled reload of an earlier export:
# graph_url_clustering_4 = pd.read_json("../graph_network/url_clustering_4_04-05-24.json")
# Graph built from `matrix` and the four-element URL grouping; reassigns `G`, `N`.
G, N = get_graph_from_cluster_data_without_color(matrix, url_clustering_4, "group_id") 
N.show("graph_url_clustering_4.html")

In [None]:
# Disabled reload of an earlier export:
# graph_url_clustering_3 = pd.read_json("../graph_network/url_clustering_3_04-05-24.json")
# Graph built from `matrix` and the three-element URL grouping; reassigns `G`, `N`.
G, N = get_graph_from_cluster_data_without_color(matrix, url_clustering_3, "group_id") 
N.show("graph_url_clustering_3.html")

In [None]:
def get_identical_url_elements_col(df, cluster_col, position):
    """Return the cluster overview extended by each cluster's shared URL prefix.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``cluster_col`` and ``url_elements_list``; not modified.
    cluster_col : str
        Cluster id column, e.g. ``"group_id"``. The size column is derived as
        ``cluster_col`` without its last three characters plus ``"_size"``
        (``"group_id"`` -> ``"group_size"``), the convention used throughout
        this notebook.
    position : int
        Number of leading URL elements to report per cluster.

    Returns
    -------
    pandas.DataFrame
        ``get_cluster_overview(df, cluster_col)`` with an added
        ``identical_url_elements`` column (taken from the first row of each
        cluster; ``[None] * position`` if that row's list is shorter than
        ``position``), sorted by cluster size in descending order. The final
        ``reset_index()`` keeps the previous index as an ``index`` column.
    """
    size_col = f"{cluster_col[:-3]}_size"
    overview = get_cluster_overview(df, cluster_col)

    def _leading_elements(elements):
        if len(elements) >= position:
            return elements[:position]
        return [None] * position

    in_overview = df[cluster_col].isin(overview[cluster_col])
    # .assign works on a new frame, so neither the caller's frame nor a
    # filtered slice is written to.
    prefix_per_cluster = (
        df.loc[in_overview, [cluster_col, 'url_elements_list']]
        .assign(identical_url_elements=lambda part: part['url_elements_list'].apply(_leading_elements))
        .groupby(cluster_col)['identical_url_elements']
        .first()
        .reset_index()
    )

    overview = pd.merge(overview, prefix_per_cluster, on=cluster_col, how='left')
    return overview.sort_values(size_col, ascending=False).reset_index()

In [None]:
def get_bar_plot(df, cluster_col):
    """Draw a horizontal bar chart of cluster sizes labelled by URL prefix.

    NOTE(review): this redefinition replaces the vertical ``get_bar_plot``
    defined earlier in the notebook; re-running an earlier call after this
    cell uses this version and then needs an ``identical_url_elements`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``identical_url_elements``, ``"<prefix>_size"`` and
        ``"color_<cluster_col>"``, where ``<prefix>`` is ``cluster_col``
        without its last three characters.
    cluster_col : str
        Name of the cluster id column.
    """
    size_col = f'{cluster_col[:-3]}_size'
    fig, ax = plt.subplots(figsize=(12, 6))  # figure size
    ax.barh(df['identical_url_elements'].astype(str), df[size_col], color=df[f"color_{cluster_col}"])

    # First row on top, bars growing to the left, tick labels on the right.
    ax.invert_yaxis()
    ax.invert_xaxis()
    ax.yaxis.set_label_position("right")
    ax.yaxis.tick_right()

    ax.set_xlabel('Number of Nodes in Cluster')
    ax.grid(True, which='both', linestyle='--', linewidth=0.5)

    # Lay out last so the right-hand tick labels are taken into account.
    fig.tight_layout()
    plt.show()

In [None]:
# Overview of the four-element URL grouping; show the ten largest groups with
# the prefix column left-aligned.
df4 = get_identical_url_elements_col(url_clustering_4, "group_id", 4)
df4[:10].style.set_properties(subset=['identical_url_elements'], **{'text-align': 'left'})

In [None]:
# NOTE(review): plots every group in `df4`, not only the ten shown above.
get_bar_plot(df4, "group_id")

In [None]:
# Overview of the three-element URL grouping; show the ten largest groups with
# the prefix column left-aligned.
df3 = get_identical_url_elements_col(url_clustering_3, "group_id", 3)
df3[:10].style.set_properties(subset=['identical_url_elements'], **{'text-align': 'left'})

In [None]:
# NOTE(review): plots every group in `df3`, not only the ten shown above.
get_bar_plot(df3, "group_id")

In [None]:
def plot_regression_multi(df):
    """Scatter each sentiment column against its cluster average with a linear fit.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``negative``/``neutral``/``positive`` and the matching
        ``avg_cluster_neg``/``avg_cluster_neu``/``avg_cluster_pos`` columns.
        Rows with a missing value in a pair are ignored for that pair.

    A regression line is only drawn for pairs with at least two distinct x
    values; otherwise a straight-line fit is undefined and only the points
    are shown.
    """
    columns_pairs = [
        ("negative", "avg_cluster_neg", 'red'),
        ("neutral", "avg_cluster_neu", 'blue'),
        ("positive", "avg_cluster_pos", 'green')
    ]

    fig, ax = plt.subplots(figsize=(10, 10))
    for x_col, y_col, color in columns_pairs:
        df_filtered = df.dropna(subset=[x_col, y_col])
        x = df_filtered[x_col]
        y = df_filtered[y_col]
        ax.scatter(x, y, color=color, label=f'{x_col} vs {y_col}', s=1)

        if x.nunique() < 2:
            continue  # not enough data for a degree-1 fit

        m, b = np.polyfit(x, y, 1)
        # A straight line is fully described by its two end points.
        line_x = np.array([x.min(), x.max()])
        ax.plot(line_x, m * line_x + b, color=color)

    ax.set_xlabel('Value')
    ax.set_ylabel('Average Cluster Value')
    ax.set_title('Regression Plot of Different Sentiment Coefficients vs. Average Cluster Metrics')
    ax.legend()

    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    plt.show()
    
# Regression plot for the four-element URL grouping.
# NOTE(review): presumably `append_mean_coefficients_per_cluster` adds the
# `avg_cluster_*` columns the plot reads — confirm.
plot_regression_multi(append_mean_coefficients_per_cluster(url_clustering_4, "group_id"))