# SI 608 

### Build graph

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
from fuzzywuzzy import process

In [2]:
df = pd.read_csv("dblp_graph_conference_v12.csv")

In [4]:

exploded_author_ids = df['author_id'].str.split(';').explode()


unique_author_ids = exploded_author_ids.unique()

print("Unique Author IDs:", len(unique_author_ids))


Unique Author IDs: 2333


In [5]:
from itertools import combinations
unique_pairs = set()

for _, group in df.groupby('id'):
    all_authors = set(';'.join(group['author_id']).split(';'))
    
    if len(all_authors) < 2:
        continue

    unique_pairs.update(combinations(sorted(all_authors), 2))
print("Number of unique collaboration pairs:", len(unique_pairs))

Number of unique collaboration pairs: 1362


In [38]:
df.isnull().sum()

id                     0
title                  0
year                   0
author_name            0
author_org          7893
author_id              0
n_citation             0
doc_type               0
reference_count     6716
references          6716
venue_id               0
venue_name             0
venue_type             0
doi                10368
keyword                4
volume             49830
issue              54128
publisher           5901
dtype: int64

In [39]:
test = df[df["references"].isna() & df["reference_count"].isna()]
test.shape

(6716, 18)

In [40]:
df["author_org"] = df["author_org"].fillna("unknown")


df["reference_count"] = df["reference_count"].fillna(0)

df["keyword"] = df["keyword"].fillna("unknown")

df["publisher"] = df["publisher"].fillna("unknown")


Missing doi's do not affect the data and are not filled

Volume and issue are also not filled since there are too many vacancies

#### Title df to Author df

In [41]:
journal_cols = df.columns.to_list()[:3] + df.columns.to_list()[6:]
print(journal_cols)

['id', 'title', 'year', 'n_citation', 'doc_type', 'reference_count', 'references', 'venue_id', 'venue_name', 'venue_type', 'doi', 'keyword', 'volume', 'issue', 'publisher']


In [42]:
rows = []


for i in range(len(df)):
    author_names = df["author_name"][i].split(";")
    author_ids = df["author_id"][i].split(";")
    author_orgs = str(df["author_org"][i]).split(";")
    journal_values = {col: df[col][i] for col in journal_cols}

    for j, author_id in enumerate(author_ids):
        row = {
            "index": i,
            "author_id": author_id,
            "author_name": author_names[j] if j < len(author_names) else np.nan,
            "author_org": author_orgs[j] if j < len(author_orgs) else np.nan,
        }
        row.update(journal_values)
        rows.append(row)


author_df = pd.DataFrame(rows)


In [43]:
author_df["index"] = author_df["index"] + 1
author_df["author_id"] = author_df["author_id"].astype(int)
author_df['reference_count'] = author_df['reference_count'].astype(int)

#### Clean org

In [44]:
def clean_org(org):
    if "email" in org:
        org = org.rsplit(",", 1)[0]
    return org


author_df.loc[author_df["author_org"] == "nan", "author_org"] = "unknown"
author_df["author_org"] = author_df["author_org"].apply(
    lambda x: x.strip().strip('"').replace("#TAB#", "").strip()
)

author_df["author_org"] = author_df["author_org"].apply(clean_org)

In [45]:
print(author_df["author_org"].value_counts())

author_org
unknown                                                                                                                            8978
Carnegie - Mellon University                                                                                                        342
Carnegie-Mellon Univ., Pittsburgh, PA, USA                                                                                          222
Carnegie Mellon University, Pittsburgh Pa                                                                                           190
Rice University                                                                                                                     153
                                                                                                                                   ... 
Application Development Technology Institute, IBM Software Solutions Division, 555 Bailey Avenue, San Jose, California                1
Department of Mathematics and Compute

In [46]:
author_df["author_org"].isnull().sum()

0

In [47]:
organization_names = author_df["author_org"].unique()

standardized_dict = {}
threshold = 85

for org_name in organization_names:
    if org_name in standardized_dict:
        continue
    if org_name == "unknown":
        standardized_dict[org_name] = org_name
        continue
    if not org_name.strip():
        continue

    matched_name = None
    for std_name in standardized_dict.keys():
        if process.extractOne(org_name, [std_name])[1] >= threshold:
            matched_name = std_name
            break

    if matched_name:
        standardized_dict[org_name] = matched_name
    else:
        standardized_dict[org_name] = org_name

author_df["standard_org"] = author_df["author_org"].map(standardized_dict)

In [48]:
author_df["standard_org"].isnull().sum()

65

In [49]:
print(author_df.loc[author_df["standard_org"].isna(), "author_org"])

1079      
1266      
1806      
4037      
4836      
        ..
58499     
58599     
60516     
60887     
61003     
Name: author_org, Length: 65, dtype: object


In [50]:
author_df["standard_org"].value_counts()

standard_org
University of Massachusetts-Amherst                                                    22460
unknown                                                                                 8978
Carnegie Mellon University, Pittsburgh, PA 15213 USA                                    5072
Dipartimento di Informatica, University of Salerno, Italy                               3583
Department of Exact Sciences, Technical Institute of Kavala, Kavala, Greece             3154
                                                                                       ...  
CNRS, IRIT, France                                                                         1
CRIN-CNRS & Inria-Lorraine                                                                 1
Laboratoire LGCGM, UPRES, Institut National des Sciences Appliquées, Rennes, France        1
CUHK, HK                                                                                   1
CITS                                                     

In [51]:
author=author_df.drop(columns=["author_org"])
author["standard_org"].fillna("unknown", inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  author["standard_org"].fillna("unknown", inplace=True)


#### Other paper related attributes

In [52]:
author["publisher"].value_counts()

publisher
Springer, Berlin, Heidelberg                         13436
IEEE                                                 13020
ACM                                                   9620
unknown                                               6273
Springer, Cham                                        3577
                                                     ...  
Max Planck Institute  for Software Systems               1
Academia Praha                                           1
European Council on Modelling and Simulation ECMS        1
IOP Publishing                                           1
Springer Publishing Company, Incorporated                1
Name: count, Length: 734, dtype: int64

In [54]:
author.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61173 entries, 0 to 61172
Data columns (total 19 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   index            61173 non-null  int64  
 1   author_id        61173 non-null  int64  
 2   author_name      61173 non-null  object 
 3   id               61173 non-null  int64  
 4   title            61173 non-null  object 
 5   year             61173 non-null  int64  
 6   n_citation       61173 non-null  int64  
 7   doc_type         61173 non-null  object 
 8   reference_count  61173 non-null  int64  
 9   references       53980 non-null  object 
 10  venue_id         61173 non-null  int64  
 11  venue_name       61173 non-null  object 
 12  venue_type       61173 non-null  object 
 13  doi              50091 non-null  object 
 14  keyword          61173 non-null  object 
 15  volume           8532 non-null   float64
 16  issue            3947 non-null   float64
 17  publisher   

In [87]:
author['venue_name'].value_counts()

venue_name
Symposium on Discrete Algorithms                                   1433
Symposium on the Theory of Computing                               1201
Foundations of Computer Science                                    1083
National Conference on Artificial Intelligence                     1077
International Colloquium on Automata, Languages and Programming    1071
                                                                   ... 
Grid Economics and Business Models                                    1
Visual Computing for Biomedicine                                      1
Security and Artificial Intelligence                                  1
Digital Interactive Media in Entertainment and Arts                   1
Robotics and Applications                                             1
Name: count, Length: 2158, dtype: int64

In [55]:
author.to_csv("dblp_graph_conference_v12_author.csv", index=False)

In [3]:
author=pd.read_csv("dblp_graph_conference_v12_author.csv")

### Split train and validation

In [4]:
most_recent_indices = author.groupby("author_id")["year"].idxmax()

valid = author.loc[most_recent_indices]

train = author.drop(most_recent_indices)

In [5]:
valid.shape

(2333, 19)

In [6]:
train.shape

(58840, 19)

#### Build the graph

In [7]:
train.columns

Index(['index', 'author_id', 'author_name', 'id', 'title', 'year',
       'n_citation', 'doc_type', 'reference_count', 'references', 'venue_id',
       'venue_name', 'venue_type', 'doi', 'keyword', 'volume', 'issue',
       'publisher', 'standard_org'],
      dtype='object')

In [19]:
train['author_id'].value_counts()   

author_id
2121939561    630
2130941901    443
2047018924    436
2198983026    425
2168907694    385
             ... 
2129290901      1
2104783657      1
2303862483      1
2499330029      1
2604013832      1
Name: count, Length: 1767, dtype: int64

In [8]:
citation_grouped= train.groupby("author_id")['n_citation'].sum().reset_index(name="total_citation")
train=train.merge(citation_grouped, on="author_id", how="left")

paper_grouped= train.groupby("author_id")['index'].count().reset_index(name="total_paper")
train=train.merge(paper_grouped, on="author_id", how="left")

reference_grouped= train.groupby("author_id")['reference_count'].mean().reset_index(name="avg_reference")
train=train.merge(reference_grouped, on="author_id", how="left")



In [9]:
def merge_keywords(keywords):
    all_keywords = set()
    for kw in keywords:
        all_keywords.update(kw.split(';'))
    return all_keywords


author_keywords = train.groupby("author_id")["keyword"].apply(merge_keywords).reset_index(name='total_keyword')
train=train.merge(author_keywords, on="author_id", how="left")

In [10]:
train.head()

Unnamed: 0,index,author_id,author_name,id,title,year,n_citation,doc_type,reference_count,references,...,doi,keyword,volume,issue,publisher,standard_org,total_citation,total_paper,avg_reference,total_keyword
0,1,2099571765,Andrew McGregor,57330,Spectral Sparsification in Dynamic Graph Streams,2013,19,Conference,17,1592346261;1983193888;1984361668;1997010704;20...,...,https://doi.org/10.1007/978-3-642-40328-6_1,Spectral properties;Graph;Discrete mathematics...,,,"Springer, Berlin, Heidelberg",University of Massachusetts-Amherst,3112,76,20.368421,"{Hash function, Hausdorff distance, Probabilis..."
1,2,2069465661,Nicola Santoro,129505,Improving the optimal bounds for black hole se...,2011,10,Conference,23,1494344355;1495764901;1500058967;1500361666;15...,...,https://doi.org/10.1007/978-3-642-22212-2_18,Asynchronous communication;Combinatorics;Upper...,,,"Springer, Berlin, Heidelberg","Carleton University, Ottawa, Canada",2608,137,14.715328,"{Intelligent robots, Homogeneous space, Search..."
2,3,2068190112,Rada Mihalcea,149980,UNT: A Supervised Synergistic Approach to Sema...,2012,25,Conference,24,1503071992;1566018662;1567365482;1593045043;16...,...,,Semantic similarity;Training set;SemEval;Ranki...,1.0,,Association for Computational Linguistics,"University of North Texas, Denton, TX",11681,190,12.457895,"{Musical, PageRank, Sense and reference, WordN..."
3,4,2159253281,Yiming Yang,162256,Von Mises-Fisher Clustering Models,2014,37,Conference,15,1532325895;1880262756;1956559956;1996764654;20...,...,,Cluster (physics);Data mining;Latent Dirichlet...,32.0,,JMLR.org,"Carnegie Mellon University, Pittsburgh, PA 152...",11471,93,10.150538,"{PageRank, Kriging, Neural coding, Gaussian fu..."
4,5,2030694586,Ivan Visconti,185553,Simultaneously resettable arguments of knowledge,2012,20,Conference,22,120427013;1495266644;1502708181;1516766811;156...,...,https://doi.org/10.1007/978-3-642-28914-9_30,Identification scheme;Computer science;Commitm...,,,"Springer, Berlin, Heidelberg",University of Massachusetts-Amherst,876,66,26.242424,"{Hash function, Asynchronous communication, Cr..."


In [11]:
G_train = nx.Graph()

In [12]:
for index, row in train.iterrows():
    author_id = row['author_id']
    if not G_train.has_node(author_id):
        G_train.add_node(
            author_id,
            name=row['author_name'],
            org=row['standard_org'],
            total_citation=row["total_citation"], 
            total_paper=row["total_paper"],
            avg_reference=row["avg_reference"],     
            keywords=row["total_keyword"],   
        )

In [13]:
print(G_train.number_of_nodes())

1767


In [14]:
# 等待修改
for paper_id, group in train.groupby('id'):

    authors = group['author_id'].tolist()
    id = group['id'].iloc[0]
    title = group['title'].iloc[0]
    year = group['year'].iloc[0]
    venue_id = group['venue_id'].iloc[0]
    venue_name = group['venue_name'].iloc[0]
    publisher = group['publisher'].iloc[0]
    

    for i in range(len(authors)):
        for j in range(i + 1, len(authors)):
            author1 = authors[i]
            author2 = authors[j]
            
            if G_train.has_edge(author1, author2):
                G_train[author1][author2]['weight'] += 1
                G_train[author1][author2]['papers'].append({'id': id, 'title': title, 'year': year,'venue_id':venue_id,'venue_name':venue_name,'publisher':publisher})
            else:
                G_train.add_edge(
                    author1,
                    author2,
                    weight=1,
                    papers=[{'id': id, 'title': title, 'year': year,'venue_id':venue_id,'venue_name':venue_name,'publisher':publisher}],
                )


In [15]:
print("Number of edges:", G_train.number_of_edges())
print("Sample edges:", list(G_train.edges(data=True))[:5])


Number of edges: 1307
Sample edges: [(2099571765, 2106497157, {'weight': 1, 'papers': [{'id': 1755536430, 'title': 'Verifiable stream computation and arthur-merlin communication', 'year': 2015, 'venue_id': 1162013546, 'venue_name': 'Conference on Computational Complexity', 'publisher': 'Schloss Dagstuhl--Leibniz-Zentrum fuer Informatik'}]}), (2099571765, 2142501412, {'weight': 2, 'papers': [{'id': 2099941470, 'title': 'Space-efficient estimation of statistics over sub-sampled streams', 'year': 2012, 'venue_id': 1184151122, 'venue_name': 'Symposium on Principles of Database Systems', 'publisher': 'ACM'}, {'id': 2576239031, 'title': 'Stochastic Streams: Sample Complexity vs. Space Complexity', 'year': 2016, 'venue_id': 1154039276, 'venue_name': 'European Symposium on Algorithms', 'publisher': 'Schloss Dagstuhl - Leibniz-Zentrum fuer Informatik'}]}), (2099571765, 2097030689, {'weight': 1, 'papers': [{'id': 2119377856, 'title': 'Fast query expansion using approximations of relevance models