In [1]:
import networkx as nx
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import operator
import random as rd

In [2]:
# Load the weighted author-citation edge list; header=0 takes the column
# names from the first spreadsheet row.
# NOTE(review): absolute, machine-specific path -- the notebook only runs
# where this file exists at exactly this location.
data = pd.read_excel('C:/Users/HP/Desktop/Author_EdgeList_Weighted.xlsx', header = 0)

# Keep only the two endpoint columns for the unweighted graph.
g = data[['Citing Author', 'Cited Author']]

# Build the directed graph (edge direction: citing author -> cited author)
# in a single call instead of materialising one Series per row with
# iterrows(). A pair that appears more than once still yields one edge.
# NOTE(review): `graph` is not read by any later cell visible in this
# notebook; the weighted graphs are built separately from `data`.
graph = nx.from_pandas_edgelist(
    g,
    source='Citing Author',
    target='Cited Author',
    create_using=nx.DiGraph(),
)

# Last expression of the cell: display the loaded table.
data

Unnamed: 0,Citing Author,Cited Author,No of Citations,Total Scores
0,"Sporleder, Caroline","Jing, Hongyan",1,0.000
1,"Sporleder, Caroline","McKeown, Kathleen",1,0.000
2,"Lapata, Mirella","Jing, Hongyan",1,0.000
3,"Lapata, Mirella","McKeown, Kathleen",1,0.000
4,"Tanaka, Hideki","Jing, Hongyan",2,1.125
...,...,...,...,...
18227,"Zhang, Ruiqiang","Ratnaparkhi, Adwait",1,0.625
18228,"Marquez, Lluis","Ratnaparkhi, Adwait",1,0.000
18229,"Rodriguez, Horacio","Ratnaparkhi, Adwait",1,0.000
18230,"Carmona, Josep","Ratnaparkhi, Adwait",1,0.000


In [3]:
# Damping factor used for both PageRank runs in this cell.
ALPHA_55 = 0.55

# Two directed graphs over the same citing -> cited author pairs. They differ
# only in the edge attribute 'weight':
#   graph_citations -> raw 'No of Citations'
#   graph_polarity  -> raw 'Total Scores'
# The columns are used exactly as loaded; no normalisation is applied here.
graph_citations = nx.DiGraph()
graph_polarity = nx.DiGraph()

# Bulk-insert (source, target, weight) triples instead of building a Series
# per row with iterrows(). As with repeated add_edge calls, a later row for
# the same author pair overwrites the weight set by an earlier row.
graph_citations.add_weighted_edges_from(
    zip(data['Citing Author'], data['Cited Author'], data['No of Citations'])
)
graph_polarity.add_weighted_edges_from(
    zip(data['Citing Author'], data['Cited Author'], data['Total Scores'])
)

# Calculate PageRank for the two graphs (edge attribute 'weight' is the
# default weight key of nx.pagerank).
page_rank_citations = nx.pagerank(graph_citations, alpha=ALPHA_55)
page_rank_polarity = nx.pagerank(graph_polarity, alpha=ALPHA_55)

# Sort the (author, score) pairs by score, highest first.
sorted_page_rank_citations = sorted(page_rank_citations.items(), key=operator.itemgetter(1), reverse=True)
sorted_page_rank_polarity = sorted(page_rank_polarity.items(), key=operator.itemgetter(1), reverse=True)

# Print the ten highest-ranked authors for each weighting.
print("PageRank based on No. of Citations:")
for node, score in sorted_page_rank_citations[:10]:
    print(f"{node}: {score}")

print("\nPageRank based on Polarity Score:")
for node, score in sorted_page_rank_polarity[:10]:
    print(f"{node}: {score}")


PageRank based on No. of Citations:
Della Pietra, Vincent J.: 0.02734788078053505
Della Pietra, Stephen: 0.022892391947078636
Marcus, Mitch: 0.021287242309729833
Collins, Michael: 0.01870762461080031
Brown, Peter F.: 0.017642666664365786
Mercer, Robert L.: 0.017642666664365786
Santorini, Beatrice: 0.01592824955495648
Marcinkiewicz, Mary Ann: 0.01592824955495648
Carletta, Jean: 0.014364783693697461
Yarowsky, David: 0.013350336476351957

PageRank based on Polarity Score:
Della Pietra, Vincent J.: 0.03420710528258872
Brown, Peter F.: 0.028657124918793027
Mercer, Robert L.: 0.028657124918793027
Della Pietra, Stephen: 0.025175853084413554
Marcus, Mitch: 0.018663829941085452
Koehn, Philipp: 0.015527183803549326
Yarowsky, David: 0.01488367728598362
Och, Franz Josef: 0.013106370671946609
Santorini, Beatrice: 0.013070925340552488
Marcinkiewicz, Mary Ann: 0.013070925340552488


In [4]:
# One frame per ranking, built from the (author, score) lists sorted earlier.
df_citations = pd.DataFrame(sorted_page_rank_citations, columns=['Author', 'PageRank_Citations'])
df_polarity = pd.DataFrame(sorted_page_rank_polarity, columns=['Author', 'PageRank_Polarity'])

# Outer-join on 'Author' so an author present in only one ranking is kept.
# An outer merge does not promise to keep the left frame's row order
# (pandas >= 2.2 returns the join keys sorted lexicographically), so the
# ranking order is restored explicitly: highest citation PageRank first,
# stable for ties, authors without a citation score last.
result_df = (
    pd.merge(df_citations, df_polarity, on='Author', how='outer')
    .sort_values('PageRank_Citations', ascending=False, kind='stable')
    .reset_index(drop=True)
)

# In-degree of every node of graph_citations, as (author, in_degree) pairs.
incoming_edges_count = graph_citations.in_degree()

# Flag per author: 1 when nobody cites them (in-degree 0), otherwise 0.
zero_incoming_edges_dict = {
    author: int(in_degree == 0) for author, in_degree in incoming_edges_count
}

# Convert the dictionary to a two-column DataFrame.
zero_incoming_edges_df = pd.DataFrame(
    list(zero_incoming_edges_dict.items()),
    columns=['Author', 'Zero Incoming Edges'],
)

# Left merge keeps result_df's rows and their order while attaching the flag.
result_df = pd.merge(result_df, zero_incoming_edges_df, on='Author', how='left')

# Every node of graph_citations has a flag, so a missing value can only
# belong to an author absent from that graph; such rows get 0. The cast
# keeps the column integer-typed even if the merge introduced NaN.
result_df['Zero Incoming Edges'] = result_df['Zero Incoming Edges'].fillna(0).astype('int64')

# Print the combined result with the 'Zero Incoming Edges' column
print("Combined PageRank Results with 'Zero Incoming Edges' column:")
print(result_df.head(10))

Combined PageRank Results with 'Zero Incoming Edges' column:
                     Author  PageRank_Citations  PageRank_Polarity  \
0  Della Pietra, Vincent J.            0.027348           0.034207   
1     Della Pietra, Stephen            0.022892           0.025176   
2             Marcus, Mitch            0.021287           0.018664   
3          Collins, Michael            0.018708           0.009177   
4           Brown, Peter F.            0.017643           0.028657   
5         Mercer, Robert L.            0.017643           0.028657   
6       Santorini, Beatrice            0.015928           0.013071   
7   Marcinkiewicz, Mary Ann            0.015928           0.013071   
8            Carletta, Jean            0.014365           0.004175   
9           Yarowsky, David            0.013350           0.014884   

   Zero Incoming Edges  
0                    0  
1                    0  
2                    0  
3                    0  
4                    0  
5                 

In [5]:
# Write the combined table to Excel without the positional index column.
# This saves whatever `result_df` holds at the moment the cell runs, and the
# name is reassigned further down the notebook -- run the cells in order.
output_path_55 = 'C:/Users/HP/Desktop/Author_PageRank_55.xlsx'
result_df.to_excel(output_path_55, index=False)

In [6]:
# Re-run PageRank on both weighted graphs with damping factor 0.85.
page_rank_citations_85 = nx.pagerank(graph_citations, alpha=0.85)
page_rank_polarity_85 = nx.pagerank(graph_polarity, alpha=0.85)

# Order the (author, score) pairs by score, highest first. These names are
# reused from the earlier ranking cell and are overwritten here.
sorted_page_rank_citations = sorted(
    page_rank_citations_85.items(),
    key=lambda pair: pair[1],
    reverse=True,
)
sorted_page_rank_polarity = sorted(
    page_rank_polarity_85.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

# Report the ten highest-ranked authors under each weighting.
for heading, ranking in (
    ("PageRank based on No. of Citations:", sorted_page_rank_citations),
    ("\nPageRank based on Polarity Score:", sorted_page_rank_polarity),
):
    print(heading)
    for node, score in ranking[:10]:
        print(f"{node}: {score}")


PageRank based on No. of Citations:
Della Pietra, Vincent J.: 0.04220735664385795
Della Pietra, Stephen: 0.0350502680160838
Brown, Peter F.: 0.028587550394954545
Mercer, Robert L.: 0.028587550394954545
Marcus, Mitch: 0.02705504561948661
Collins, Michael: 0.025196927077287823
Santorini, Beatrice: 0.02061823137276275
Marcinkiewicz, Mary Ann: 0.02061823137276275
Och, Franz Josef: 0.018305995548066355
Yarowsky, David: 0.017077522575370173

PageRank based on Polarity Score:
Navigli, Roberto: 0.1094871436113544
Della Pietra, Vincent J.: 0.05828538779120406
Och, Franz Josef: 0.050234660857491654
Della Pietra, Stephen: 0.04838283290546124
Koehn, Philipp: 0.047850635416502876
Brown, Peter F.: 0.04764261668854351
Mercer, Robert L.: 0.04764261668854351
Ng, Andrew: 0.03374140967564652
Jurafsky, Dan: 0.03367294755442474
Snow, Rion: 0.033669945626025884


In [7]:
# One frame per ranking, built from the current (author, score) lists.
df_citations = pd.DataFrame(sorted_page_rank_citations, columns=['Author', 'PageRank_Citations'])
df_polarity = pd.DataFrame(sorted_page_rank_polarity, columns=['Author', 'PageRank_Polarity'])

# Outer-join on 'Author' so an author present in only one ranking is kept.
# An outer merge does not promise to keep the left frame's row order
# (pandas >= 2.2 returns the join keys sorted lexicographically), so the
# ranking order is restored explicitly: highest citation PageRank first,
# stable for ties, authors without a citation score last.
result_df = (
    pd.merge(df_citations, df_polarity, on='Author', how='outer')
    .sort_values('PageRank_Citations', ascending=False, kind='stable')
    .reset_index(drop=True)
)

# In-degree of every node of graph_citations, as (author, in_degree) pairs.
incoming_edges_count = graph_citations.in_degree()

# Flag per author: 1 when nobody cites them (in-degree 0), otherwise 0.
zero_incoming_edges_dict = {
    author: int(in_degree == 0) for author, in_degree in incoming_edges_count
}

# Convert the dictionary to a two-column DataFrame.
zero_incoming_edges_df = pd.DataFrame(
    list(zero_incoming_edges_dict.items()),
    columns=['Author', 'Zero Incoming Edges'],
)

# Left merge keeps result_df's rows and their order while attaching the flag.
result_df = pd.merge(result_df, zero_incoming_edges_df, on='Author', how='left')

# Every node of graph_citations has a flag, so a missing value can only
# belong to an author absent from that graph; such rows get 0. The cast
# keeps the column integer-typed even if the merge introduced NaN.
result_df['Zero Incoming Edges'] = result_df['Zero Incoming Edges'].fillna(0).astype('int64')

# Print the combined result with the 'Zero Incoming Edges' column
print("Combined PageRank Results with 'Zero Incoming Edges' column:")
print(result_df.head(10))

Combined PageRank Results with 'Zero Incoming Edges' column:
                     Author  PageRank_Citations  PageRank_Polarity  \
0  Della Pietra, Vincent J.            0.042207           0.058285   
1     Della Pietra, Stephen            0.035050           0.048383   
2           Brown, Peter F.            0.028588           0.047643   
3         Mercer, Robert L.            0.028588           0.047643   
4             Marcus, Mitch            0.027055           0.024456   
5          Collins, Michael            0.025197           0.017187   
6       Santorini, Beatrice            0.020618           0.016612   
7   Marcinkiewicz, Mary Ann            0.020618           0.016612   
8          Och, Franz Josef            0.018306           0.050235   
9           Yarowsky, David            0.017078           0.033617   

   Zero Incoming Edges  
0                    0  
1                    0  
2                    0  
3                    0  
4                    0  
5                 

In [8]:
# Write the combined table to Excel without the positional index column.
# This saves whatever `result_df` holds at the moment the cell runs, so it
# must follow the cell that rebuilds the table from the alpha = 0.85 ranking.
output_path_85 = 'C:/Users/HP/Desktop/Author_PageRank_85.xlsx'
result_df.to_excel(output_path_85, index=False)