In [17]:
import networkx as nx
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# The 'seaborn' style was renamed to 'seaborn-v0_8' in Matplotlib 3.6 and the old
# name was removed later; use whichever name this Matplotlib installation offers.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import itertools

In [None]:
# run this for google colab
# ! git clone https://github.com/stepkurniawan/network-analysis-wikipedia-journals.git
# df_journal = pd.read_csv('/content/network-analysis-wikipedia-journals/files/df_journal.csv')

In [18]:
# Raw string: the previous literal contained the invalid escape sequence "\D"
# ("\Beka\Documents"), which newer Python versions warn about. The resulting path is unchanged.
# NOTE(review): this is a machine-specific absolute path; adjust it for your checkout.
df_journal = pd.read_csv(r'C:\Users\Beka\Documents\GitHub\network-analysis-wikipedia-journals\files\df_journal.csv')

In [19]:
# Display the loaded table of (page_title, journal_name) rows.
df_journal

Unnamed: 0,page_title,journal_name
0,overdeepening,special paper 337
1,ballitore,the economic history review
2,straffan,the economic history review
3,the true confessions of charlotte doyle,the horn book magazine
4,carbohydrate sulfotransferase,faseb j
...,...,...
1996716,muscle contraction,br j sports med
1996717,darrell issa,open medicine john willinsky
1996718,seth rollins,pro wrestling illustrated
1996719,chia seed,nutr hosp


In [49]:
# Number of rows per journal name, stored in the `num_citation` column.
df_num_citations = (
    df_journal[['journal_name']]
    .value_counts()
    .reset_index(name='num_citation')
)
df_num_citations.head(10)

Unnamed: 0,journal_name,num_citation
0,nature,35475
1,journal of biological chemistry,30200
2,pnas,29544
3,science,25924
4,plos one,12602
...,...,...
176510,bull inst fr afr noire a,1
176511,httpwwwluxurytraveladvisorcom,1
176512,httpwwwluxurytravelmagazinecom,1
176513,newcastle herald,1


In [38]:
# df_num_citations.to_csv(r'df_num_citations.csv', index = False)

# Draw Network

In [None]:
# helper function used to draw the networks
def draw_network(graph, seed=42, node_scale=100):
    """Draw ``graph`` with a spring layout, sizing nodes by degree and edges by weight.

    Args:
        graph: NetworkX graph in which every edge has a ``'weight'`` attribute.
        seed: Seed handed to ``nx.spring_layout`` so the layout is reproducible.
        node_scale: Marker size drawn per unit of node degree.

    Raises:
        KeyError: If an edge has no ``'weight'`` attribute.
    """
    # Fix the node and edge order once so the per-node sizes and per-edge widths
    # line up with what nx.draw actually draws.
    nodes = list(graph.nodes)
    edges = list(graph.edges)
    degree = dict(graph.degree)

    nx.draw(
        graph,
        pos=nx.spring_layout(graph, seed=seed),
        with_labels=True,
        nodelist=nodes,
        # node_size needs one entry per *node*; the previous per-edge list had the
        # wrong length. max(..., 1) keeps degree-0 nodes visible.
        node_size=[max(degree[n], 1) * node_scale for n in nodes],
        edgelist=edges,
        width=[graph[u][v]['weight'] for u, v in edges],
    )

In [21]:
# Count the distinct values in each column of df_journal.
df_journal.nunique()

page_title      486710
journal_name    176515
dtype: int64

In [22]:
# One entry per page title, holding the list of journal names found on that page.
df_groupby = (
    df_journal
    .groupby('page_title')['journal_name']
    .apply(list)
)
df_groupby

page_title
!women art revolution                               [library journal, film comment, film internati...
"a" is for alibi                                                                    [library journal]
"as the old sing, so pipe the young" (jan steen)    [simiolus netherlands quarterly for the histor...
"baby lollipops" murder                             [florida supreme court, florida supreme court,...
"believing women" in islam                             [international journal of middle east studies]
                                                                          ...                        
‘aziziya                                            [bulletin of the american meteorological society]
‘elepaio                                            [conservation genetics, ornithological monogra...
‘eua rail                                           [proceedings of the biological society of wash...
‘ofa likiliki                                       [winfo tonga news l

In [26]:
# Build every unordered pair of journals that appear on the same page.
# A journal listed more than once on a page is paired with itself as well,
# because itertools.combinations works on positions, not on distinct values.

journal_relation_list = []

# Iterate over the lists directly: df_groupby is indexed by page title, so an
# integer lookup such as df_groupby[0] depends on pandas' deprecated positional
# fallback. This also avoids shadowing the builtin `iter`.
for journals in df_groupby:
    journal_relation_list.extend(itertools.combinations(journals, 2))


In [34]:
# Raw pairs, one row per (journal1, journal2) combination.
df1 = pd.DataFrame(journal_relation_list, columns=['journal1', 'journal2'])
df1['source'] = df1['journal1'].astype(str)
df1['target'] = df1['journal2'].astype(str)

# Sort the two names within each row so that (a, b) and (b, a) become the same pair.
df2 = pd.DataFrame(
    np.sort(df1[['source', 'target']].values),
    index=df1.index,
    columns=['source', 'target'],
)
# Write the sorted names back into df1 as well.
df1[['source', 'target']] = df2
df2

Unnamed: 0,journal1,journal2
0,film comment,library journal
1,film international,library journal
2,film comment,film international
3,artibus et historiae,simiolus netherlands quarterly for the history...
4,florida supreme court,florida supreme court
...,...,...
22751345,auk,ornithological monographs
22751346,auk,condor
22751347,winfo tonga news letter,women today pacific
22751348,iwtc womens globalnet,winfo tonga news letter


In [35]:
# Count how often each (source, target) pair occurs; the count is stored as `weight`.
df_edges = (
    df2[['source', 'target']]
    .value_counts()
    .reset_index(name='weight')
)
df_edges.head(20)

Unnamed: 0,journal1,journal2,weight
0,lloyds list,lloyds list,314693
1,billboard,billboard,97850
2,new lloyds list,new lloyds list,91271
3,journal of biological chemistry,journal of biological chemistry,61268
4,nature,science,51706
5,nature,pnas,43631
6,gaceta de madrid,gaceta de madrid,41051
7,nature,nature,40593
8,journal of biological chemistry,pnas,40297
9,cretaceous research,cretaceous research,34379


In [None]:
# Fixed random_state so the same 100 edges are drawn on every run.
df_edges_sample = df_edges.sample(100, random_state=42)
# df_edges_sample

In [None]:
# Count the distinct values in each column of df_edges.
df_edges.nunique()

In [36]:
# edges before filtering 

# df_edges.to_csv(r'df_edges.csv') 

Creating a NetworkX graph from the edge DataFrame

In [None]:
# Build the graph from the edge table; every column other than the two
# end-point columns is attached to the edges as an attribute.
G = nx.from_pandas_edgelist(
    df_edges,
    source='source',
    target='target',
    edge_attr=True,
)

In [None]:
# test export back to DF
# NOTE(review): this rebinds df_edges to the edge list exported from G, replacing
# the value_counts result built earlier; later cells see this version.

df_edges = nx.to_pandas_edgelist(G)
# df_edges

In [67]:
# NOTE: df2 has no `weight` column; the threshold filter has to run on df_edges.
# df3 = df_edges[df_edges.weight>=1500]
# df3

Unnamed: 0,journal1,journal2,weight
0,lloyds list,lloyds list,314693
1,billboard,billboard,97850
2,new lloyds list,new lloyds list,91271
3,journal of biological chemistry,journal of biological chemistry,61268
4,nature,science,51706
...,...,...,...
675,acta palaeontologica polonica,palaeogeography palaeoclimatology palaeoecology,1505
676,acta palaeontologica polonica,neues jahrbuch fxfcr geologie und palxe4ontolo...,1504
677,geochimica et cosmochimica acta,nature,1504
678,journal of biological chemistry,the journal of experimental medicine,1503


# Path Finder

In [41]:
def minimal_pathfinder(G, r = float("inf")):
    """Reduce ``G`` to its Pathfinder network, PFNET(r, q = n - 1).

    Edge weights are treated as distances (smaller means closer). The length of
    a path is the Minkowski r-norm of its edge weights: their sum for ``r = 1``
    and their maximum for ``r = inf``. A link is kept only if no path between
    its end points is shorter than the link itself.

    Args:
    -----
    G [networkX graph]:
        Graph to filter links from. Every edge needs a 'weight' attribute.
        ``G`` is only read, never modified.
    r [float]:
        "r" parameter as in the paper (Minkowski exponent).

    Returns:
    -----
    PFNET [networkX graph]:
        Graph containing only the PFNET links. It has all nodes of ``G``
        (including their attributes) and is directed iff ``G`` is directed.
        Self-loops are never part of the result.

    Raises:
    -----
    KeyError:
        If an edge has no 'weight' attribute.
    """

    import networkx as nx

    INF = float("inf")
    directed = G.is_directed()
    nodes = list(G.nodes())

    # How the lengths of two consecutive path segments are combined.
    # For r = inf the Minkowski norm degenerates to the maximum; the previous
    # implementation added the segments in this case, which is r = 1.
    if r == INF:
        combine = max
    else:
        def combine(a, b):
            return (a ** r + b ** r) ** (1 / r)

    # Sparse adjacency "matrix" W: a missing entry means "no direct link",
    # i.e. an infinite distance. Self-loops are skipped because the diagonal
    # can never contribute a PFNET link.
    W = {u: {} for u in nodes}
    for i, j, d in G.edges(data=True):
        if i == j:
            continue
        w = d['weight']
        if w < W[i].get(j, INF):
            W[i][j] = w
        # An undirected link can be travelled in both directions.
        if not directed and w < W[j].get(i, INF):
            W[j][i] = w

    # Shortest r-distances, seeded with the direct links and relaxed over every
    # intermediate node k (Floyd-Warshall scheme).
    dist = {u: dict(W[u]) for u in nodes}
    for k in nodes:
        from_k = dist[k]
        for i in nodes:
            if i == k:
                continue
            d_ik = dist[i].get(k)
            if d_ik is None:
                continue  # i cannot reach k, so k cannot shorten anything from i
            row = dist[i]
            # Only targets reachable from k can be improved by going through k.
            for j, d_kj in from_k.items():
                if j == i:
                    continue
                candidate = combine(d_ik, d_kj)
                if candidate < row.get(j, INF):
                    row[j] = candidate

    # Result graph of the same kind (directed / undirected) with the same nodes.
    PFNET = nx.DiGraph() if directed else nx.Graph()
    PFNET.add_nodes_from(G.nodes(data=True))

    # Keep link (i, j) only if its own weight is still the shortest distance.
    for i in nodes:
        for j, w in W[i].items():
            if dist[i][j] == w:
                PFNET.add_edge(i, j, weight=w)

    return PFNET

In [None]:
# Draw the graph built from df_edges.
draw_network(G)

In [None]:
G_pf = minimal_pathfinder(G)
# nx.info() was removed in NetworkX 3.0; print the graph sizes directly so the
# cell works on old and new NetworkX versions alike.
print(f"G:    {G.number_of_nodes()} nodes, {G.number_of_edges()} edges")
print(f"G_pf: {G_pf.number_of_nodes()} nodes, {G_pf.number_of_edges()} edges")

In [None]:
# Draw the graph returned by minimal_pathfinder.
draw_network(G_pf)

Edges Table

In [None]:
# Export the edges of the PFNET graph to a DataFrame and display it.
df_pf_edges = nx.to_pandas_edgelist(G_pf)
df_pf_edges

In [None]:
# edges after PFNET filtering 

# df_pf_edges.to_csv(r'df_pf_edges.csv', index = False) 

In [None]:
# edges after pfnet filtering and trim: keep only rows whose weight is at least 1500

df_pf_trim_edges = df_pf_edges.loc[df_pf_edges['weight'] >= 1500]
# df_pf_trim_edges.to_csv(r'df_pf_trim_edges.csv', index = False)

Nodes tables

In [None]:
# before pf

# Every journal name that occurs as an edge end point.
unique_journal_list = np.unique(df_edges[['source', 'target']].values)
print(len(unique_journal_list))

# from list to DF

df_unique_journal = pd.DataFrame(unique_journal_list, columns = ['journal_name'])


# create node df: attach the citation count of each journal (left join keeps every journal)

df_node_size = pd.merge(df_unique_journal, 
                     df_num_citations, 
                     on ='journal_name', 
                     how ='left')

# TODO create df_nodelist by appending areas.
# The merge that stood here used a placeholder instead of a DataFrame name, which is
# a syntax error and stopped the whole cell from running. The `areas` table is only
# loaded further down in the notebook; once it is available before this cell, enable:
# df_node_area = pd.merge(df_node_size, areas, on ='journal_name', how ='left')
df_node_size


In [None]:
# after PF

# Every journal name that occurs as an end point of a PFNET edge.
unique_journal_list_pf = np.unique(df_pf_edges[['source', 'target']].values)
print(len(unique_journal_list_pf))

# from list to DF

df_unique_journal_pf = pd.DataFrame(unique_journal_list_pf, columns = ['journal_name'])

# attach the citation count of each journal (left join keeps every journal)
df_node_size_pf = pd.merge(df_unique_journal_pf, 
                     df_num_citations, 
                     on ='journal_name', 
                     how ='left')

# TODO create df_nodelist by appending areas.
# The merge that stood here used a placeholder instead of a DataFrame name, which is
# a syntax error and stopped the whole cell from running. The `areas` table is only
# loaded further down in the notebook; once it is available before this cell, enable:
# df_nodelist_pf = pd.merge(df_node_size_pf, areas, on ='journal_name', how ='left')
df_node_size_pf

In [None]:
# after trim

# TODO 

# Gephi CSV export

In [None]:
# edge before pf
# `df_edgelist` is never defined in this notebook; the edge list before PFNET
# filtering is `df_edges`. The output file name is kept.
df_edges.to_csv(r'df_edgelist.csv', index = False)

In [None]:
# edge after pf
# `df_pf_edgelist` is never defined in this notebook; the edge list after PFNET
# filtering is `df_pf_edges`. The output file name is kept.
df_pf_edges.to_csv(r'df_pf_edgelist.csv', index = False)

In [None]:
# edge after pf and trim
# Writes the weight-filtered PFNET edges to df_pf_trim_edges.csv without the index column.
df_pf_trim_edges.to_csv(r'df_pf_trim_edges.csv', index = False)

In [54]:
# Display the per-journal citation counts.
df_num_citations

Unnamed: 0,journal_name,num_citation
0,nature,35475
1,journal of biological chemistry,30200
2,pnas,29544
3,science,25924
4,plos one,12602
...,...,...
176510,bull inst fr afr noire a,1
176511,httpwwwluxurytraveladvisorcom,1
176512,httpwwwluxurytravelmagazinecom,1
176513,newcastle herald,1


In [68]:
# `df3` only existed in a commented-out cell, so build it here: edges with a weight
# of at least 1500.
df3 = df_edges[df_edges.weight >= 1500]
# The edge end points are stored in the `source` / `target` columns.
unique_journal_list = np.unique(df3[['source', 'target']].values)
print(len(unique_journal_list))

249


In [69]:
# Single-column frame of the unique journal names, ready to be merged on `journal_name`.
df_unique_journal = pd.DataFrame({'journal_name': unique_journal_list})
df_unique_journal

Unnamed: 0,journal_name
0,acta palaeontologica polonica
1,acta palaeontologica sinica
2,alcheringa an australasian journal of palaeont...
3,ameghiniana
4,american journal of botany
...,...
244,virology
245,wwwuniprotorg
246,zookeys
247,zoological journal of the linnean society


In [70]:
# Node table: attach each journal's citation count.
# The left join keeps every row of df_unique_journal, even without a match.
df_node_size = df_unique_journal.merge(
    df_num_citations,
    on='journal_name',
    how='left',
)
df_node_size

Unnamed: 0,journal_name,num_citation
0,acta palaeontologica polonica,1316.0
1,acta palaeontologica sinica,87.0
2,alcheringa an australasian journal of palaeont...,339.0
3,ameghiniana,305.0
4,american journal of botany,1937.0
...,...,...
244,virology,968.0
245,wwwuniprotorg,3133.0
246,zookeys,2266.0
247,zoological journal of the linnean society,3304.0


In [71]:
# Count the distinct values in each column of df_node_size.
df_node_size.nunique()

journal_name    249
num_citation    238
dtype: int64

In [85]:
# Load the journal/area table from a semicolon-separated CSV file.
areas = pd.read_csv(
    r'C:\Users\Beka\Documents\GitHub\network-analysis-wikipedia-journals\files2\areas.csv',
    sep=";",
)
areas

Unnamed: 0,journal_name,area
0,21st Century Music,SSH
1,2D Materials,PS
2,3 Biotech,LS & PS
3,3D Printing and Additive Manufacturing,PS
4,3D Printing in Medicine,PS & HS
...,...,...
43011,ZWR,HS
43012,Zygon,SSH
43013,Zygote,LS
43014,Zywienie Czlowieka i Metabolizm,LS & HS


In [86]:
# Normalise the journal names: lower-case them, then strip surrounding whitespace.
areas['journal_name'] = areas['journal_name'].str.lower().str.strip()
areas

Unnamed: 0,journal_name,area
0,21st century music,SSH
1,2d materials,PS
2,3 biotech,LS & PS
3,3d printing and additive manufacturing,PS
4,3d printing in medicine,PS & HS
...,...,...
43011,zwr,HS
43012,zygon,SSH
43013,zygote,LS
43014,zywienie czlowieka i metabolizm,LS & HS


In [87]:
# Attach the area of each journal. The left join keeps every row of df_node_size;
# journals without a match in `areas` end up with a missing `area`.
df_node_area = df_node_size.merge(areas, on='journal_name', how='left')
df_node_area

Unnamed: 0,journal_name,num_citation,area
0,acta palaeontologica polonica,1316.0,PS
1,acta palaeontologica sinica,87.0,
2,alcheringa an australasian journal of palaeont...,339.0,
3,ameghiniana,305.0,LS & PS
4,american journal of botany,1937.0,LS
...,...,...,...
244,virology,968.0,LS
245,wwwuniprotorg,3133.0,
246,zookeys,2266.0,LS
247,zoological journal of the linnean society,3304.0,LS


In [88]:
# Number of missing values per column of df_node_area.
df_node_area.isna().sum()

journal_name      0
num_citation      1
area            111
dtype: int64

In [89]:
# Write the node table to df_node_area.xlsx without the index column.
df_node_area.to_excel(r'df_node_area.xlsx', index = False)

In [47]:
# df_node_size.to_csv("node_size.csv", index=False)