In [1]:
import pandas as pd
import numpy as np

# Input file with the scraped pages; later cells read its "url" and
# "linkedpages" columns.
RATGEBER_JSON = "ratgeber_pages_url.json"

# Do not truncate cell contents when a frame is displayed.
pd.set_option("display.max_colwidth", None)
df = pd.read_json(RATGEBER_JSON)

## Matrix build
To build the matrix, we list every distinct linked URL along the X-axis (the columns). The scraped pages themselves are listed along the Y-axis (the rows).

In the first step, we need the URL of each scraped page in the form of a list. These can be found in the "url" column.

In the next step, we extract each individual URL from the column of URLs linked on each page. If a URL is not already in the `all_linked_page_urls` list, it is added.

In [2]:
# One entry per row of df, taken from its "url" column, in row order.
all_scraped_page_urls = df["url"].tolist()
len(all_scraped_page_urls)

148

In [3]:
def get_all_page_urls_of_linkedpages_column(pandas_dataframe, column_of_lists_with_linked_pages="linkedpages"):
    """Collect every distinct linked URL, ordered by first appearance.

    Parameters
    ----------
    pandas_dataframe : pandas.DataFrame
        Frame whose rows are walked from top to bottom.
    column_of_lists_with_linked_pages : str, default "linkedpages"
        Name of the column whose cells each hold an iterable of linked URLs.
        The URLs must be hashable (plain strings are).

    Returns
    -------
    list
        A new list containing each linked URL exactly once, in the order in
        which it is first encountered.
    """
    # A dict keeps insertion order and has constant-time membership checks,
    # so deduplication is linear instead of rescanning a list for every URL.
    unique_urls = {}
    # Iterate over the column values directly rather than via `.loc[label, col]`:
    # with repeated index labels `.loc` returns a Series of lists, not one list.
    for linked_pages in pandas_dataframe[column_of_lists_with_linked_pages]:
        for linked_page in linked_pages:
            unique_urls.setdefault(linked_page, None)
    return list(unique_urls)

# Distinct link targets found in the "linkedpages" column of df.
all_linked_page_urls = get_all_page_urls_of_linkedpages_column(
    df,
    column_of_lists_with_linked_pages="linkedpages",
)
len(all_linked_page_urls)

568

In [None]:
# Edge list: one (scraped page URL, linked URL) tuple per entry of each
# row's "linkedpages" list, in row order. Repeated links are kept.
liste_der_verbindungen = [
    (source_url, linked_url)
    for source_url, linked_urls in zip(df["url"], df["linkedpages"])
    for linked_url in linked_urls
]
len(liste_der_verbindungen)

## Fill Matrix

Next, we fill the matrix. For each scraped page (row), a cell is True if the page's content links to the URL of that column (X-axis), and False otherwise.

In [9]:
# Toy example used to work out the matrix logic before applying it to the
# real data: eight possible link targets (the columns) and three "pages",
# each given as the list of targets it links to.
liste_mit_allen_linked_urls = ["A", "B", "C", "D", "E", "F", "G", "H"]
page_url_1 = ["A", "H", "D"]
page_url_2 = ["D", "F"]
page_url_3 = ["C", "E", "F", "H"]
liste_aller_page_urls = [page_url_1, page_url_2, page_url_3]

# Build every row first and create the frame once. Growing a frame with
# pd.concat inside the loop copies all previous rows on each iteration, and
# concatenating onto an empty frame can emit a FutureWarning in recent pandas.
zeilen = [
    {buchstabe: (buchstabe in page_url) for buchstabe in liste_mit_allen_linked_urls}
    for page_url in liste_aller_page_urls
]
test_matrix = pd.DataFrame(zeilen, columns=liste_mit_allen_linked_urls)

# Label each row with the name of the page it describes.
test_matrix = test_matrix.assign(page_url=['page_url_1', 'page_url_2', 'page_url_3'])
test_matrix

Unnamed: 0,A,B,C,D,E,F,G,H,page_url
0,True,False,False,True,False,False,False,True,page_url_1
1,False,False,False,True,False,True,False,False,page_url_2
2,False,False,True,False,True,True,False,True,page_url_3


In [None]:
# NOTE(review): superseded draft. The same steps are implemented by the
# get_connection_matrix function defined in a later cell; this cell is fully
# commented out and does nothing when run.
# all_linked_page_urls = get_all_page_urls_of_linkedpages_column(df)

# if all_linked_page_urls[0] != "page_url":
#     all_linked_page_urls.insert(0, "page_url")

# assert all_linked_page_urls[0] == "page_url", "first Element must be 'page_url'."
# assert all_linked_page_urls[1].startswith("https://"), "second Element must start with 'https://...'." 


# matrix = pd.DataFrame(columns=all_linked_page_urls)
# all_scraped_page_urls = df.url.tolist()


# for page_url in all_scraped_page_urls:
#     idx = all_scraped_page_urls.index(page_url)
#     list_of_linked_pages_per_url = df.loc[df.index[df["url"]==page_url].tolist(), "linkedpages"]
#     is_in_list_of_all_linked_urls = [(linked_page in list_of_linked_pages_per_url[idx]) for linked_page in all_linked_page_urls]
#     is_in_list_of_all_linked_urls[0] = page_url
#     new_row = dict(zip(matrix.columns, is_in_list_of_all_linked_urls))      
#     matrix = pd.concat([matrix, pd.DataFrame([new_row])], ignore_index=True)

# matrix

In [5]:
def get_connection_matrix(pandas_dataframe, column_containing_list_of_all_linked_pages_per_url):
    """Build the page-by-link connection matrix.

    Parameters
    ----------
    pandas_dataframe : pandas.DataFrame
        Frame with a "url" column (the scraped page) and a column of linked
        URLs per page.
    column_containing_list_of_all_linked_pages_per_url : str
        Name of the column whose cells each hold an iterable of the URLs
        linked from that row's page. The URLs must be hashable.

    Returns
    -------
    pandas.DataFrame
        One row per input row, in input order, with a fresh 0..n-1 index.
        The first column, "page_url", holds the row's own URL. Every other
        column is named after one distinct linked URL and holds True if the
        row's page links to it, False otherwise.

    Raises
    ------
    AssertionError
        If the first linked URL found does not start with "https://".
    """
    link_column = column_containing_list_of_all_linked_pages_per_url

    # Pass the caller's column name through; previously the helper always fell
    # back to its default column regardless of this argument.
    matrix_columns = list(get_all_page_urls_of_linkedpages_column(pandas_dataframe, link_column))

    # Guarding the emptiness check lets an input without any links produce a
    # matrix that has only the "page_url" column instead of raising IndexError.
    if not matrix_columns or matrix_columns[0] != "page_url":
        matrix_columns.insert(0, "page_url")

    assert matrix_columns[0] == "page_url", "first Element must be 'page_url'."
    if len(matrix_columns) > 1:
        assert matrix_columns[1].startswith("https://"), "second Element must start with 'https://...'."

    # Pair each page with its own link list by walking both columns together.
    # This is independent of the frame's index labels and gives duplicate URLs
    # their own row's links rather than those of the first occurrence.
    records = []
    for page_url, linked_pages in zip(pandas_dataframe["url"], pandas_dataframe[link_column]):
        linked_lookup = set(linked_pages)  # constant-time membership per column
        row = {column: (column in linked_lookup) for column in matrix_columns}
        row[matrix_columns[0]] = page_url  # first column carries the page's own URL
        records.append(row)

    # Create the frame once instead of concatenating one row at a time.
    return pd.DataFrame(records, columns=matrix_columns)

# Re-read the scraped pages and build the full connection matrix from the
# "linkedpages" column.
df = pd.read_json("ratgeber_pages_url.json")
matrix = get_connection_matrix(df, "linkedpages")

In [6]:

# Sanity check on the first row: count how often each distinct value occurs
# across its cells (the True/False flags plus the row's own page URL).
first_page_row = matrix.iloc[0]
first_page_row.value_counts()

0
False                                                                               509
True                                                                                 59
https://www.mobiliar.ch/versicherungen-und-vorsorge/wohnen-und-eigentum/ratgeber      1
Name: count, dtype: int64

# Graph

In [None]:
import networkx as nx
import matplotlib.pyplot as plt

In [None]:
# Directed graph for the toy matrix: each entry of the 'page_url' column is
# a node, and every True cell adds an edge from that row's page to the
# cell's column label.
G = nx.DiGraph()
G.add_nodes_from(test_matrix['page_url'])

for _, row in test_matrix.iterrows():
    start_node = row["page_url"]
    # `== True` is deliberate: the row also contains the page_url string,
    # which is truthy but must not produce an edge.
    G.add_edges_from(
        (start_node, column) for column, value in row.items() if value == True
    )

print(G.edges)
print(G.nodes)

In [None]:
# Draw the current graph G with node labels on a 10x6 inch figure.
draw_options = dict(with_labels=True, node_color='skyblue', node_size=500, font_size=8)

plt.figure(figsize=(10, 6))
nx.draw(G, **draw_options)
plt.title('Graph der Verbindungen')
plt.show()

In [None]:
# Directed link graph for the real data: each entry of the 'page_url' column
# is a node, and every True cell adds an edge from that row's page to the
# URL that names the cell's column.
G = nx.DiGraph()
G.add_nodes_from(matrix['page_url'])

for _, row in matrix.iterrows():
    start_node = row["page_url"]
    # Read the values from the row already in hand instead of doing one
    # `.loc` lookup per cell. `== True` is deliberate: the row also holds the
    # page URL string, which is truthy but must not produce an edge.
    # The per-edge debug print was removed; it wrote one line per True cell.
    G.add_edges_from(
        (start_node, column) for column, value in row.items() if value == True
    )

In [None]:
# Draw the current graph G with node labels on a 10x6 inch figure.
draw_options = dict(with_labels=True, node_color='skyblue', node_size=500, font_size=8)

plt.figure(figsize=(10, 6))
nx.draw(G, **draw_options)
plt.title('Graph der Verbindungen')
plt.show()
