In [5]:
import os
from functools import reduce
from dotenv import load_dotenv
import numpy as np
import pandas as pd
import seaborn as sns

# Load variables from a local .env file into the process environment.
load_dotenv()

# Root directory of the project; every result file lives under <PROJECT_PATH>/results.
project_path = os.getenv("PROJECT_PATH")
if not project_path:
    # Fail here with a clear message instead of silently building paths that start with "None".
    raise RuntimeError("PROJECT_PATH is not set; define it in the environment or in a .env file.")


def _result_file(name):
    """Return the path of the file `name` inside the project's results directory."""
    # os.path.join picks the separator of the current platform.
    return os.path.join(project_path, "results", name)


# Dijkstra result files, one per edge direction.
dijkstra_natural_file = _result_file("dijkstra_natural.csv")
dijkstra_reverse_file = _result_file("dijkstra_reverse.csv")
dijkstra_undirected_file = _result_file("dijkstra_undirected.csv")

# Centrality / PageRank result files, natural direction.
centrality_betweenness_natural_file = _result_file("centrality_betweenness_natural.csv")
centrality_degree_natural_file = _result_file("centrality_degree_natural.csv")
centrality_eigenvektor_natural_file = _result_file("centrality_eigenvektor_natural.csv")
pagerank_natural_file = _result_file("pagerank_natural.csv")

# Centrality / PageRank result files, reverse direction.
centrality_betweenness_reverse_file = _result_file("centrality_betweenness_reverse.csv")
centrality_degree_reverse_file = _result_file("centrality_degree_reverse.csv")
centrality_eigenvektor_reverse_file = _result_file("centrality_eigenvektor_reverse.csv")
pagerank_reverse_file = _result_file("pagerank_reverse.csv")

# Centrality / PageRank result files, undirected graph.
centrality_betweenness_undirected_file = _result_file("centrality_betweenness_undirected.csv")
centrality_degree_undirected_file = _result_file("centrality_degree_undirected.csv")
centrality_eigenvektor_undirected_file = _result_file("centrality_eigenvektor_undirected.csv")
pagerank_undirected_file = _result_file("pagerank_undirected.csv")

local_clustering_coefficient_file = _result_file("local_clustering_coefficient.csv")

# Output file that holds all metrics joined into a single table.
merged_file = _result_file("centrality_and_pagerank.csv")

In [None]:
# Dijkstra results for one edge direction: summary statistics of the "avg" column
# and a strip + violin plot of its distribution, split by the redirect / new flags.
direction = "reverse"  # one of "natural", "reverse", "undirected"

# Select the input from `direction` so the data always matches the output file names.
dijkstra_files = {
    "natural": dijkstra_natural_file,
    "reverse": dijkstra_reverse_file,
    "undirected": dijkstra_undirected_file,
}

# os.environ[...] raises a KeyError naming the variable if PROJECT_PATH is missing.
results_dir = os.path.join(os.environ["PROJECT_PATH"], "results")
sssp_stats_dir = os.path.join(results_dir, "pandas", "sssp")
sssp_image_dir = os.path.join(results_dir, "images", "sssp")
os.makedirs(sssp_stats_dir, exist_ok=True)
os.makedirs(sssp_image_dir, exist_ok=True)

df = pd.read_csv(dijkstra_files[direction]).assign(subset="all")
# Treat an "avg" of 0 as missing (presumably nodes without any computed path - TODO confirm).
# Assign the result back: the chained `df["avg"].replace(..., inplace=True)` form does not
# update `df` when pandas Copy-on-Write is active.
df["avg"] = df["avg"].replace(0, np.nan)
df = df.sort_values(by="avg")
print(f"NaNs: {df['avg'].isna().sum()}")
df["avg"].describe(percentiles=[0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99]).to_csv(
    os.path.join(sssp_stats_dir, f"{direction}_describe.csv")
)

summary_columns = ["linked_by", "avg", "articlelen"]

# Group by a scalar key so get_group(True/False) keeps working on pandas >= 2.2,
# and select columns with a list (tuple selection raises on pandas >= 2.0).
df_redirect = df.groupby("is_redirect")
print(df_redirect[summary_columns].describe())

df_is_redirect = df_redirect.get_group(True).assign(subset="redirect")
df_no_redirect = df_redirect.get_group(False).assign(subset="no redirect")

df_new = df.groupby("is_new")
print(df_new[summary_columns].describe())

df_is_new = df_new.get_group(True).assign(subset="new")
df_not_new = df_new.get_group(False).assign(subset="not new")

# Long-form frame with one "subset" label per row; every row appears once per subset
# it belongs to, so the index is rebuilt to avoid duplicate labels.
cdf = pd.concat(
    [df, df_no_redirect, df_is_redirect, df_not_new, df_is_new],
    ignore_index=True,
)

sns.set_theme(style="whitegrid")

# Draw the raw points first, then the violins on the same axes.
ax = sns.stripplot(x="subset", y="avg", data=cdf, color="black", alpha=0.035, size=3, jitter=0.2)
# `showmeans` is a boxplot option; violinplot does not support it, so it is not passed here.
ax = sns.violinplot(x="subset", y="avg", data=cdf, ax=ax)

fig = ax.get_figure()

fig.savefig(os.path.join(sssp_image_dir, f"{direction}.png"), dpi=1200)

# Centrality and PageRank
## Combine the result CSVs into one file

In [None]:
def _read_metric(path, column, keep_title=False):
    """Read one headerless three-column result CSV as (id[, title], <column>).

    The third column of the file is named `column`. The title column is
    dropped unless `keep_title` is true.
    """
    return pd.read_csv(
        path,
        header=None,
        usecols=None if keep_title else [0, 2],
        names=["id", "title", column],
    )


# The first frame is the only one that keeps "title", so the merged table carries it once.
df_betw_natural = _read_metric(centrality_betweenness_natural_file, "betweenness_centrality_natural", keep_title=True)
df_deg_natural = _read_metric(centrality_degree_natural_file, "degree_centrality_natural")
df_ev_natural = _read_metric(centrality_eigenvektor_natural_file, "eigenvektor_centrality_natural")
df_rank_natural = _read_metric(pagerank_natural_file, "pagerank_natural")

df_betw_reverse = _read_metric(centrality_betweenness_reverse_file, "betweenness_centrality_reverse")
df_deg_reverse = _read_metric(centrality_degree_reverse_file, "degree_centrality_reverse")
df_ev_reverse = _read_metric(centrality_eigenvektor_reverse_file, "eigenvektor_centrality_reverse")
df_rank_reverse = _read_metric(pagerank_reverse_file, "pagerank_reverse")

df_betw_undirected = _read_metric(centrality_betweenness_undirected_file, "betweenness_centrality_undirected")
df_deg_undirected = _read_metric(centrality_degree_undirected_file, "degree_centrality_undirected")
df_ev_undirected = _read_metric(centrality_eigenvektor_undirected_file, "eigenvektor_centrality_undirected")
df_rank_undirected = _read_metric(pagerank_undirected_file, "pagerank_undirected")

df_local_clustering_coefficient = _read_metric(local_clustering_coefficient_file, "local_clustering_coefficient")
data_frames = [df_betw_natural,
               df_deg_natural,
               df_ev_natural,
               df_rank_natural,
               df_betw_reverse,
               df_deg_reverse,
               df_ev_reverse,
               df_rank_reverse,
               df_betw_undirected,
               df_deg_undirected,
               df_ev_undirected,
               df_rank_undirected,
               df_local_clustering_coefficient
               ]
# The metric column is the last column of every frame.
metric_columns = [frame.columns[-1] for frame in data_frames]

print("a")  # progress marker: all inputs loaded
# Outer join on "id": an id missing from one input gets NaN in that metric.
df_merged = reduce(lambda left, right: pd.merge(left, right, on=["id"], how="outer"), data_frames)
print("b")  # progress marker: merge finished
df_merged.to_csv(merged_file, sep=',', na_rep='?', index=False)
df_merged = None  # drop the reference (presumably to free memory before the re-read)
# Missing values were written as '?'; map them back to NaN in the metric columns,
# otherwise any column containing one is loaded as text instead of numbers.
df_c = pd.read_csv(merged_file, na_values={column: ["?"] for column in metric_columns})
print("c")  # progress marker: merged file reloaded

# os.environ[...] raises a KeyError naming the variable if PROJECT_PATH is missing.
pandas_dir = os.path.join(os.environ["PROJECT_PATH"], "results", "pandas")
os.makedirs(pandas_dir, exist_ok=True)

# Statistics only make sense for numeric columns; this excludes the string "title"
# column, on which corr() raises (pandas >= 2.0) and sum() concatenates text.
df_numeric = df_c.select_dtypes(include="number")
df_numeric.describe(percentiles=[0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99]).to_csv(os.path.join(pandas_dir, "describe.csv"))
df_numeric.corr().to_csv(os.path.join(pandas_dir, "correlation.csv"))
df_numeric.sum().to_csv(os.path.join(pandas_dir, "sum.csv"))

a
b
c


In [43]:
# Reload the merged metrics table. It is written with na_rep='?', so '?' has to be
# mapped back to NaN; otherwise columns containing it are loaded as text.
# The mapping skips "title", where '?' could be a legitimate value.
merged_columns = pd.read_csv(merged_file, nrows=0).columns
df_c = pd.read_csv(
    merged_file,
    na_values={column: ["?"] for column in merged_columns if column != "title"},
)

In [None]:
# Export summary statistics, pairwise correlations and column sums of the merged table.
# os.environ[...] raises a KeyError naming the variable if PROJECT_PATH is missing.
pandas_dir = os.path.join(os.environ["PROJECT_PATH"], "results", "pandas")
os.makedirs(pandas_dir, exist_ok=True)

# Restrict to numeric columns: corr() raises on string columns (pandas >= 2.0)
# and sum() would concatenate the text of the "title" column.
df_numeric = df_c.select_dtypes(include="number")
df_numeric.describe(percentiles=[0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99]).to_csv(os.path.join(pandas_dir, "describe.csv"))
df_numeric.corr().to_csv(os.path.join(pandas_dir, "correlation.csv"))
df_numeric.sum().to_csv(os.path.join(pandas_dir, "sum.csv"))

In [34]:
# Total of the "pagerank_natural" column; the recorded output of this cell is ~1.0.
df_c.loc[:, "pagerank_natural"].sum()


0.9999999999880178