# Analyze Partial Affiliation Data

This notebook analyzes the affiliation data scraped so far, even if the scrape is incomplete.
It ranks affiliations by their number of unique authors.

In [1]:
import pickle
import os
from collections import defaultdict
import pandas as pd

In [None]:
# Configuration
scholar_id = 'HNw5OdcAAAAJ'
cache_folder = '../cache'  # Updated path since notebook is in utils/
# Denominator shown in the partial-progress message. Hard-coded for this
# scrape; presumably the number of authors being processed -- TODO confirm
# and update when the author list changes.
expected_author_total = 351

# Candidate cache files: the finished scrape and the in-progress checkpoint
progress_file = os.path.join(cache_folder, scholar_id, 'affiliation_progress.pkl')
complete_file = os.path.join(cache_folder, scholar_id, 'author_paper_affiliation_tuple_list.pkl')

# Both stay None when neither cache file exists; later cells check
# `affiliation_data` before doing any work.
affiliation_data = None
data_source = None

# NOTE: pickle.load can execute arbitrary code while unpickling. Only load
# cache files that were produced by a trusted run of the scraper.

# Prefer the complete file; fall back to the progress checkpoint
if os.path.exists(complete_file):
    with open(complete_file, 'rb') as f:
        affiliation_data = pickle.load(f)
    data_source = 'Complete cache'
elif os.path.exists(progress_file):
    with open(progress_file, 'rb') as f:
        progress = pickle.load(f)
    # The checkpoint is a dict; this cell reads its 'affiliations' and
    # 'last_index' entries.
    affiliation_data = progress['affiliations']
    data_source = (
        f'Progress cache (partial - stopped at author '
        f'{progress["last_index"] + 1}/{expected_author_total})'
    )
else:
    print("No affiliation data found!")
    print("Looked for:")
    print(f"  - {complete_file}")
    print(f"  - {progress_file}")

if affiliation_data:
    print(f"Data source: {data_source}")
    print(f"Total affiliation records loaded: {len(affiliation_data)}")

In [3]:
# Peek at the first loaded record to confirm the tuple layout
if affiliation_data:
    print("Sample record:")
    if len(affiliation_data) > 0:
        first_record = affiliation_data[0]
        title_preview_len = 80  # paper titles are cut to this many characters

        print(f"  Type: {type(first_record)}")
        print("  Structure: (author_name, citing_paper_title, cited_paper_title, affiliation)")
        print("\n  Example:")
        # Positions follow the structure line printed above
        print(f"    Author: {first_record[0]}")
        print(f"    Citing paper: {first_record[1][:title_preview_len]}...")
        print(f"    Your paper (cited): {first_record[2][:title_preview_len]}...")
        print(f"    Affiliation: {first_record[3]}")

Sample record:
  Type: <class 'tuple'>
  Structure: (author_name, citing_paper_title, cited_paper_title, affiliation)

  Example:
    Author: Hyeongjun Park
    Citing paper: Intelligent path planning with cross-entropy method and recurrent neural network...
    Your paper (cited): Learning-based warm-starting for fast sequential convex programming and trajecto...
    Affiliation: Assistant Professor, Seoul National University, Republic of Korea


In [4]:
# Group records by affiliation and count distinct authors
if affiliation_data:
    affiliation_authors = defaultdict(set)     # affiliation -> set of author names
    affiliation_citations = defaultdict(list)  # affiliation -> list of citation dicts

    for author_name, citing_paper, cited_paper, affiliation in affiliation_data:
        # Skip placeholder entries where no author was resolved
        if author_name == 'No_author_found':
            continue

        affiliation_authors[affiliation].add(author_name)
        affiliation_citations[affiliation].append({
            'author': author_name,
            'citing_paper': citing_paper,
            'cited_paper': cited_paper
        })

    # An author can be recorded under more than one affiliation string, so
    # summing the per-affiliation set sizes would count that author several
    # times. Take the union of all sets to get the number of distinct names.
    unique_authors = set().union(*affiliation_authors.values())

    print(f"Total unique affiliations found: {len(affiliation_authors)}")
    print(f"Total unique authors: {len(unique_authors)}")

Total unique affiliations found: 191
Total unique authors: 235


In [5]:
# Create summary DataFrame sorted by author count
if affiliation_data:
    # One row per affiliation: its name, how many distinct authors it has,
    # and those authors as an alphabetised, comma-separated string.
    summary_data = [
        {
            'Affiliation': affiliation,
            'Author Count': len(authors),
            'Authors': ', '.join(sorted(authors))
        }
        for affiliation, authors in affiliation_authors.items()
    ]

    # Pass the columns explicitly so the frame still has them when
    # summary_data is empty (e.g. every record was skipped); otherwise
    # sort_values('Author Count') raises KeyError on a column-less frame.
    df = pd.DataFrame(summary_data, columns=['Affiliation', 'Author Count', 'Authors'])
    df = df.sort_values('Author Count', ascending=False).reset_index(drop=True)

    print(f"Created DataFrame with {len(df)} affiliations")

Created DataFrame with 191 affiliations


## Top Affiliations by Author Count

In [6]:
# Show the 50 affiliations with the most authors
if affiliation_data:
    print("TOP 50 AFFILIATIONS BY AUTHOR COUNT")
    print("="*100)

    # Leave out the long 'Authors' column to keep the table compact;
    # df is already sorted by 'Author Count' descending.
    top_affiliations = df[['Affiliation', 'Author Count']].head(50)
    display(top_affiliations)

TOP 50 AFFILIATIONS BY AUTHOR COUNT


Unnamed: 0,Affiliation,Author Count
0,Stanford University,17
1,Shanghai Jiao Tong University,6
2,Harbin Institute of Technology,4
3,UC Berkeley,3
4,Technical University of Munich,3
5,University of Washington,3
6,Massachusetts Institute of Technology,3
7,Tsinghua University,2
8,MIT Lincoln Laboratory,2
9,Carnegie Mellon University,2


## NASA Affiliations

In [7]:
# Keep only rows whose affiliation text mentions NASA (case-insensitive)
if affiliation_data:
    # na=False makes missing affiliation values count as non-matches
    mentions_nasa = df['Affiliation'].str.contains('NASA', case=False, na=False)
    nasa_df = df[mentions_nasa]
    nasa_match_count = len(nasa_df)

    if nasa_match_count == 0:
        print("No NASA affiliations found in the scraped data so far.")
    else:
        print(f"Found {nasa_match_count} NASA affiliations:")
        print("="*100)
        display(nasa_df[['Affiliation', 'Author Count', 'Authors']])

Found 7 NASA affiliations:


Unnamed: 0,Affiliation,Author Count,Authors
11,"Robotics Technologist, NASA Jet Propulsion Lab",2,"Abhishek Cauligi, Erica Tevere"
41,"Chief Roboticist, Intelligent Robotics Group, ...",1,Terry (Terrence) Fong
98,"Intelligent Robotics Group, NASA Ames",1,Brian Coltin
109,Openmind Research Institute | NASA-JPL | Caltech,1,Elena Sorina Lupu
157,NASA's Jet Propulsion Laboratory,1,Jean-Pierre de la Croix
164,NASA Ames Research Center,1,Caleb Ashmore Adams
178,"Caltech, NASA Jet Propulsion Laboratory",1,Thomas Touma


## All Affiliations (Full List)

In [8]:
# Render the entire summary table, including the author lists
if affiliation_data:
    print(f"Complete list of all {len(df)} affiliations:")
    print("="*100)

    # Lift pandas' row and column-width display limits so nothing is
    # truncated. set_option changes global state, so these settings stay in
    # effect for every cell executed after this one.
    for display_option in ('display.max_rows', 'display.max_colwidth'):
        pd.set_option(display_option, None)
    display(df)

Complete list of all 191 affiliations:


Unnamed: 0,Affiliation,Author Count,Authors
0,Stanford University,17,"Amine Elhafsi, Amirhossein Afsharrad, Daniele Gammelli, Edward Schmerling, Ethan Foss, Jonathan Kuck, Justin Kruger, Luigi Di Lillo, Milan Ganai, Nicolas Lee, Rachel Luo, Sigrid Elschot, Simone D'Amico, Stephen Tian, Tommaso Guffanti, Yixiao (Alvin) Sun, Yuji Takubo"
1,Shanghai Jiao Tong University,6,"Cailian Chen, Chuan Wen, Jiangchao Yao, Jianping He, Jun Lv, Wenye Yu"
2,Harbin Institute of Technology,4,"G. T. Ran, Haitao Yu, Shanwu Li, Shuai Yuan"
3,UC Berkeley,3,"Claire Tomlin, Giuseppe Loianno, Isabella Huang"
4,Technical University of Munich,3,"Adrian Kobras, Ralf Römer, Simon Schaefer"
5,University of Washington,3,"Karen Leung, Kazuki Mizuta, Michael (Miki) Szmuk"
6,Massachusetts Institute of Technology,3,"Di Wu, Enrico M Zucchelli, Pasquale Antonante"
7,Tsinghua University,2,"Guangtong Xu, Ziyang Meng"
8,MIT Lincoln Laboratory,2,"Matthew Cleaveland, Mitchell K. Black Jr."
9,Carnegie Mellon University,2,"Junwon Seo, Margaret Hansen"


## Search for Specific Affiliation

In [9]:
# Search for affiliations containing a keyword
search_term = "University"  # Change this to search for different terms

if affiliation_data:
    # regex=False treats search_term as literal text. By default
    # str.contains interprets the pattern as a regular expression, so
    # keywords containing characters such as '(', '+' or '.' would raise
    # an error or match unintended rows.
    search_results = df[
        df['Affiliation'].str.contains(search_term, case=False, na=False, regex=False)
    ]

    # The count covers every match; the table below shows only the first 20
    print(f"Affiliations containing '{search_term}': {len(search_results)}")
    print("="*100)
    display(search_results[['Affiliation', 'Author Count', 'Authors']].head(20))

Affiliations containing 'University': 97


Unnamed: 0,Affiliation,Author Count,Authors
0,Stanford University,17,"Amine Elhafsi, Amirhossein Afsharrad, Daniele Gammelli, Edward Schmerling, Ethan Foss, Jonathan Kuck, Justin Kruger, Luigi Di Lillo, Milan Ganai, Nicolas Lee, Rachel Luo, Sigrid Elschot, Simone D'Amico, Stephen Tian, Tommaso Guffanti, Yixiao (Alvin) Sun, Yuji Takubo"
1,Shanghai Jiao Tong University,6,"Cailian Chen, Chuan Wen, Jiangchao Yao, Jianping He, Jun Lv, Wenye Yu"
4,Technical University of Munich,3,"Adrian Kobras, Ralf Römer, Simon Schaefer"
5,University of Washington,3,"Karen Leung, Kazuki Mizuta, Michael (Miki) Szmuk"
7,Tsinghua University,2,"Guangtong Xu, Ziyang Meng"
9,Carnegie Mellon University,2,"Junwon Seo, Margaret Hansen"
12,"PhD Candidate, Stanford University",2,"Matthew Foutter, Rohan Sinha"
14,Beihang University,2,"Jianglong Yu, Tao Wang"
15,Johns Hopkins University,2,"Anqi (Angie) Liu, Xing Han (Aaron)"
16,University of Southern California,2,"Kaustav Chakraborty, Keenan Albee"


## Export to CSV

In [None]:
# Write the affiliation summary table to a CSV file
if affiliation_data:
    output_file = '../results/partial_affiliation_summary.csv'

    # Create the destination folder first; exist_ok avoids an error on re-runs
    results_dir = os.path.dirname(output_file)
    os.makedirs(results_dir, exist_ok=True)

    # index=False leaves the positional row index out of the file
    df.to_csv(output_file, index=False)
    print(f"Summary exported to: {output_file}")