In [None]:
import pandas as pd
import numpy as np
from utils import get_graph_from_cluster_data_without_color, get_graph_from_matrix_customized_color, save_as_json
# Do not truncate long cell values when a DataFrame is displayed.
pd.set_option("display.max_colwidth", None)

# Load the full data set from its JSON export (path is relative to the working directory).
full_data_path = "../graph_network/full_data_iv_29-04-24.json"
df = pd.read_json(full_data_path)


## Matrix build
To build the matrix, we need to list every single linked URL on the X-axis. The individual pages that were scraped are listed on the Y-axis.

In the first step, we need the URL of each scraped page in the form of a list. These can be found in the "url" column.

In the next step, we extract each individual URL from the column of URLs linked on the page; if it is not already in the all_linked_page_urls list, it is added.

In [None]:
def get_connection_matrix(pandas_dataframe, column_containing_list_of_all_linked_pages_per_url):
    """Build a boolean link matrix between all scraped pages.

    The result has one row and one column per scraped page. The cell in row
    ``r`` and column ``c`` is ``True`` when the URL ``c`` occurs in the linked
    pages of page ``r``. The first column, ``"url"``, holds the URL of the page
    that the row describes.

    Parameters
    ----------
    pandas_dataframe : pandas.DataFrame
        Must contain a ``"url"`` column and the column named by
        ``column_containing_list_of_all_linked_pages_per_url``.
    column_containing_list_of_all_linked_pages_per_url : str
        Name of the column that holds, for every page, the collection of URLs
        that page links to.

    Returns
    -------
    pandas.DataFrame
        Columns are ``["url", <url 1>, <url 2>, ...]`` in the order of the
        ``"url"`` column of the input; the index is a fresh 0..n-1 range.
        An input without rows yields an empty frame with only a ``"url"``
        column.

    Raises
    ------
    AssertionError
        If the first scraped URL does not start with ``"https://"``.
    """
    all_scraped_page_urls = pandas_dataframe["url"].tolist()

    # Nothing scraped: return an empty matrix instead of failing on the
    # positional checks below.
    if not all_scraped_page_urls:
        return pd.DataFrame(columns=["url"])

    column_names = list(all_scraped_page_urls)
    if column_names[0] != "url":
        column_names.insert(0, "url")

    assert column_names[0] == "url", "first Element must be 'url'."
    assert column_names[1].startswith("https://"), "second Element must start with 'https://...'." 

    linked_pages_per_url = pandas_dataframe[
        column_containing_list_of_all_linked_pages_per_url
    ].tolist()
    target_urls = column_names[1:]

    # Pair every URL with its linked pages by position. This works for any
    # DataFrame index (not only the default 0..n-1 one) and does not depend on
    # looking the URL up again.
    rows = []
    for url, linked_pages in zip(all_scraped_page_urls, linked_pages_per_url):
        row = [url]
        row.extend(target in linked_pages for target in target_urls)
        rows.append(row)

    # Build the frame once; appending with pd.concat per row copies the whole
    # matrix on every iteration.
    return pd.DataFrame(rows, columns=column_names)

In [None]:
# def get_all_page_urls_of_linkedpages_column(pandas_dataframe, column_of_lists_with_linked_pages="linkedpages"):
#     all_linked_page_urls = []
#     for i in pandas_dataframe.index:
#         list_of_linked_pages_per_url = pandas_dataframe.loc[i, column_of_lists_with_linked_pages]
#         for linkedpage in list_of_linked_pages_per_url:
#             if linkedpage not in all_linked_page_urls:
#                 all_linked_page_urls.append(linkedpage)
#     return all_linked_page_urls

# all_linked_page_urls = get_all_page_urls_of_linkedpages_column(df)
# len(all_linked_page_urls)

In [None]:
# URLs of all pages in the loaded data set; the cell displays how many there are.
all_scraped_page_urls = df["url"].tolist()
len(all_scraped_page_urls)

## Helper Functions

In [None]:
def get_color(input_vector):
    """Blend three weights into a ``"#rrggbb"`` hex colour string.

    The first weight scales pure red, the second pure yellow and the third
    pure green; the weighted colours are summed channel by channel.

    Parameters
    ----------
    input_vector : sequence of three numbers
        Weights for red, yellow and green, read by position (0, 1, 2).

    Returns
    -------
    str
        The blended colour as ``"#"`` followed by two lowercase hex digits per
        channel. Each channel is truncated (not rounded) to an integer in
        0..255.
    """
    red = np.array([1, 0, 0])
    yellow = np.array([1, 1, 0])
    green = np.array([0, 1, 0])

    rgb = input_vector[0] * red + input_vector[1] * yellow + input_vector[2] * green

    # Clamp every channel to [0, 1]. Without this, weights that sum to more
    # than 1 (or are negative) would format as three hex digits or with a
    # minus sign and produce an invalid colour string.
    rgb = np.clip(rgb, 0.0, 1.0)

    return "#" + "".join(f"{int(channel * 255):02x}" for channel in rgb)


def add_color_column_of_df_to_matrix(matrix, dataframe):
    """Attach a per-URL colour, derived from sentiment scores, to ``matrix``.

    For every row of ``dataframe`` the ``"negative"``, ``"neutral"`` and
    ``"positive"`` values are passed (in that order) to ``get_color``. The
    resulting colour is joined onto ``matrix`` via the shared ``"url"`` column.

    Note that ``dataframe`` is modified in place: a ``"color"`` column is
    added to (or overwritten on) the object passed in.

    Parameters
    ----------
    matrix : pandas.DataFrame
        Frame with a ``"url"`` column.
    dataframe : pandas.DataFrame
        Frame with ``"url"``, ``"negative"``, ``"neutral"`` and ``"positive"``
        columns.

    Returns
    -------
    pandas.DataFrame
        ``matrix`` left-merged with the ``"url"``/``"color"`` pairs, so rows of
        ``matrix`` without a match in ``dataframe`` get a missing colour.
    """
    score_columns = ["negative", "neutral", "positive"]

    # Convert each row to a plain array first so get_color can index it by position.
    score_vectors = dataframe[score_columns].apply(np.array, axis=1)
    dataframe["color"] = score_vectors.apply(get_color)

    url_colors = dataframe[["url", "color"]]
    return pd.merge(left=matrix, right=url_colors, how="left", on=["url"])


def get_matrix_and_full_attributes_dataset(sentiment_data, scrape_data):
    """Build the coloured link matrix and the combined per-page attribute table.

    Parameters
    ----------
    sentiment_data : pandas.DataFrame
        Needs the columns ``"url"``, ``"pagetitle"``, ``"negative"``,
        ``"neutral"``, ``"positive"`` and ``"text"``. It is handed to
        ``add_color_column_of_df_to_matrix``, which adds a ``"color"`` column
        to it in place.
    scrape_data : pandas.DataFrame
        Needs the columns ``"url"`` and ``"linkedpages"``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The link matrix of ``scrape_data`` with a ``"color"`` column appended,
        and the attribute table with the columns ``url``, ``pagetitle``,
        ``negative``, ``neutral``, ``positive``, ``color``, ``text`` and
        ``linkedpages``.
    """
    attributes = pd.merge(
        sentiment_data, scrape_data[["url", "linkedpages"]], how="left", on=["url"]
    )
    # sentiment_data may already carry a "color" column from an earlier call
    # (the colouring helper adds it in place). Left in, it would clash with the
    # colour merged below, pandas would rename both to color_x / color_y, and
    # the column selection at the end would fail with a KeyError.
    attributes = attributes.drop(columns=["color"], errors="ignore")

    connection_matrix = get_connection_matrix(scrape_data, "linkedpages")
    matrix_with_color = add_color_column_of_df_to_matrix(connection_matrix, sentiment_data)

    full_attributes = pd.merge(
        attributes, matrix_with_color[["url", "color"]], how="left", on=["url"]
    )
    full_attributes = full_attributes[
        [
            "url",
            "pagetitle",
            "negative",
            "neutral",
            "positive",
            "color",
            "text",
            "linkedpages",
        ]
    ]
    return matrix_with_color, full_attributes

In [None]:
# Toy data set: four pages A-D and the pages each of them links to.
# "H" is linked from A and D but is not one of the listed pages, so it does not
# get a matrix column.
# NOTE: this cell reuses the names `df` and `all_scraped_page_urls` from the
# cells above.
A = ["D", "H"]
B = ["A", "C"]
C = ["A", "B", "C", "D"]
D = ["B", "C", "H"]

data = {
    "url": ["A", "B", "C", "D"],
    "linkedpages": [A, B, C, D],
}
df = pd.DataFrame(data)

all_scraped_page_urls = df["url"].tolist()
column_names = ["url"] + all_scraped_page_urls

# One row per page: the page itself followed by a True/False flag per listed
# page. All rows are collected first and the frame is built once, instead of
# re-copying the matrix with pd.concat for every row.
rows = [
    [page_url] + [target in linked_pages for target in all_scraped_page_urls]
    for page_url, linked_pages in zip(df["url"], df["linkedpages"])
]
matrix = pd.DataFrame(rows, columns=column_names)

# carac = pd.DataFrame({ 'page_url':['A', 'B', 'C', 'D'], 'color':[(0,0.5,0),(0.86,0,0),(1,1,0),(0.8,0.25,0)]})
matrix

In [None]:
# Toy sentiment scores for the four toy pages, one value per page and class.
data = dict(
    url=["A", "B", "C", "D"],
    negative=[0.4, 0.8, 0.0, 0.1],
    neutral=[0.2, 0.2, 0.1, 0.8],
    positive=[0.4, 0.0, 0.9, 0.1],
)
sentiment_data = pd.DataFrame(data=data)
sentiment_data

In [None]:
# Try the colouring helper on the toy matrix and the toy sentiment scores.
test_matrix = add_color_column_of_df_to_matrix(matrix=matrix, dataframe=sentiment_data)
test_matrix

## Run code on Scraped Data

In [None]:
# Input files, relative to the working directory.
sentiment_json_path = "../sentiment-analyser/saiv_28-04-24.json"
scrape_json_path = "../web-crawler/scrapy_mobiliar/mobiscraper/spiders/scrape_archive/full_scrape_IV.json"

sentiment_data = pd.read_json(sentiment_json_path)
scrape_data = pd.read_json(scrape_json_path)

# Link matrix with a colour per page, plus the combined attribute table.
matrix_with_color_column, full_data = get_matrix_and_full_attributes_dataset(
    sentiment_data, scrape_data
)

# Create Graph

In [None]:
def get_graph_from_matrix_customized_color(
    matrix_df, color_col="color", edge_color="#87edec"
):
    """Turn a link matrix into a directed networkx graph and a pyvis network.

    Every row of ``matrix_df`` becomes a node named after the row's ``"url"``
    value and coloured with the row's ``color_col`` value. For every cell of
    that row that equals ``True``, an edge is added from the row's URL to the
    name of the cell's column.

    Parameters
    ----------
    matrix_df : pandas.DataFrame
        Link matrix with a ``"url"`` column, a colour column and one column per
        link target.
    color_col : str, default "color"
        Name of the column holding each node's colour.
    edge_color : str, default "#87edec"
        Colour assigned to every edge.

    Returns
    -------
    tuple
        ``(G, N)``: the ``networkx.DiGraph`` and the pyvis ``Network`` that was
        filled from it.
    """
    import networkx as nx
    from pyvis.network import Network

    fallback_color = "gray"

    G = nx.DiGraph()
    for _, row in matrix_df.iterrows():
        start_node = row["url"]

        # A page without a colour (e.g. no match in a left merge) carries
        # None/NaN here; use the fallback instead of passing that on as a
        # colour. NaN is the only float that is not equal to itself.
        node_color = row[color_col]
        if node_color is None or (
            isinstance(node_color, float) and node_color != node_color
        ):
            node_color = fallback_color
        G.add_node(start_node, color=node_color, size=120)

        # Read the cells from the row that is already at hand rather than
        # looking each one up again with matrix_df.loc. The explicit
        # comparison with True is deliberate: the url and colour cells hold
        # non-empty strings, which are truthy but must not create edges.
        for column, cell in row.items():
            if cell == True:  # noqa: E712
                G.add_edge(start_node, column, color=edge_color)

    N = Network(
        height="1500px",
        width="100%",
        bgcolor="#222222",
        font_color="white",
        directed=True,
        notebook=False,
    )
    N.barnes_hut(
        gravity=-80000,
        central_gravity=0.3,
        spring_length=250,
        spring_strength=0.001,
        damping=0.09,
        overlap=0,
    )
    N.from_nx(G)

    N.show_buttons()

    # Re-apply the colours stored on the networkx nodes; a node that has no
    # colour attribute at all gets the fallback.
    for node in N.nodes:
        node_id = node["id"]
        node["color"] = G.nodes[node_id].get("color", fallback_color)
    return G, N

In [None]:
# Build the graph from the coloured matrix and write it to an HTML file.
# NOTE(review): the Network is created with notebook=False; some pyvis releases
# may expect the same flag on show() - confirm against the installed version.
output_html = "graph_sentiment_coloring_iv.html"
G, N = get_graph_from_matrix_customized_color(
    matrix_with_color_column,
    color_col="color",
    edge_color="#018786",
)
N.show(output_html)