# Title
 

### Imports

In [42]:
import requests, time, csv
from bs4 import BeautifulSoup

import pandas as pd
import re

import itertools

### Scraper

In [None]:
BATCH = 100         # docs per "page"
START = 1          # first record index (1-based on DARE)

count, start = 0, START
with open("uva_dare_v2.csv", "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f)
    w.writerow(["year","authors"])

    while True:
        url = (
            "https://dare.uva.nl/search"
            f"?sort=year;browse-all=yes;docsPerPage={BATCH};startDoc={start}"
        )
        r = requests.get(url, headers={"User-Agent":"uva-scraper-demo/1.0"}, timeout=30)
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html.parser")

        items = soup.select("div.doctitle")
        if not items:
            print("No more results — stopping.")
            break

        for dt in items:
            text = dt.get_text(" ", strip=True)
            # authors before '(', year inside '(...)'
            if "(" in text and ")" in text:
                authors = text.split("(", 1)[0].strip()
                year = text.split("(", 1)[1].split(")", 1)[0].strip()
            else:
                authors, year = "", ""

            w.writerow([year, authors])
            count += 1

        print(f"Fetched {len(items)} at startDoc={start} — total {count}")
        start += BATCH
        time.sleep(1.2)

print(f"Done. Collected {count} records.")


Fetched 100 at startDoc=1 — total 100
Fetched 100 at startDoc=101 — total 200
Fetched 100 at startDoc=101 — total 200
Fetched 100 at startDoc=201 — total 300
Fetched 100 at startDoc=201 — total 300
Fetched 100 at startDoc=301 — total 400
Fetched 100 at startDoc=301 — total 400
Fetched 100 at startDoc=401 — total 500
Fetched 100 at startDoc=401 — total 500


KeyboardInterrupt: 

### Cleaning

In [49]:
def standardise_data(df):
    """
    Standardise year format and clean author names
    """
    df_clean = df.copy()
    
    # clean year column 
    ## extract just the year part from dates like "1-10-2025"
    def extract_year(year_str):
        year_str = str(year_str)
        year_match = re.search(r'(\d{4})', year_str)
        if year_match:
            return year_match.group(1)
        
        return year_str
    
    df_clean['year'] = df_clean['year'].apply(extract_year)

    ## drop rows with invalid years
    df_clean = df_clean[df_clean['year'].apply(lambda x: bool(re.match(r'^\d{4}$', str(x))))]
    
    # clean author names
    ## remove extra whitespace
    df_clean['authors'] = df_clean['authors'].apply(
        lambda x: re.sub(r'\s+', ' ', str(x).strip()) if pd.notna(x) else x
    )
    
    return df_clean


def remove_near_duplicates(df):
    """
    Remove near-duplicates that have same authors but might have different year formats
    """
    df_clean = df.copy()
    
    df_clean['authors_clean'] = df_clean['authors'].apply(
        lambda x: re.sub(r'[^a-zA-Z]', '', str(x).lower()) if pd.notna(x) else ''
    )
    
    # find duplicates based on cleaned authors (ignoring year differences)
    author_duplicates = df_clean[df_clean.duplicated(['authors_clean'], keep=False)]
    
    if len(author_duplicates) > 0:
        
        author_groups = author_duplicates.groupby('authors_clean')
        
        removed_count = 0
        indices_to_remove = []
        
        for authors_clean, group in author_groups:
            if len(group) > 1:                
                keep_index = group.index[0]
                remove_indices = group.index[1:]
                indices_to_remove.extend(remove_indices)
                removed_count += len(remove_indices)
        
        if indices_to_remove:
            df_clean = df_clean.drop(indices_to_remove)
    
    df_clean = df_clean.drop('authors_clean', axis=1)
    
    return df_clean



In [50]:
def format_authors(authors_string):
    """
    Reformats author names to SurnameN format where N is the first letter(s) of the first name.
    Also removes white spaces.
    """

    # removes anything before the last author separator
    authors_string = authors_string.replace(' & ', ', ')
    authors_string = authors_string.replace(' ...', ', ')
    parts = [part.strip() for part in authors_string.split(',')]
    
    formatted_authors = []
    i = 0
    
    while i < len(parts):
        part = parts[i]
        
        # skips empty parts
        if not part:
            i += 1
            continue
            
        # check if this part looks like a surname 
        ## no dots, or contains prefixes
        has_prefix = any(prefix in part.lower() for prefix in ['van', 'de', 'der'])
        has_dot = '.' in part
        
        if not has_dot or has_prefix:
            # this is probably a surname
            surname = part
            
            # looks for first name next to it
            initials = ""
            j = i + 1
            while j < len(parts) and ('.' in parts[j] or '-' in parts[j]):
                # extracts everything from first name part
                first_name_part = parts[j]
                for char in first_name_part:
                    if char.isupper() or char in '-.':
                        initials += char
                j += 1
            
            # cleaning and formatting
            initials = initials.replace('.', '').replace(' ', '')
            initials = initials.replace('-', '')
            
            formatted_author = surname.replace(' ', '') + initials
            formatted_authors.append(formatted_author)
            
            i = j  # skips what we've done
        else:
            # for organisations
            initials = ""
            for char in part:
                if char.isupper() or char in '-.':
                    initials += char
            initials = initials.replace('.', '').replace(' ', '').replace('-', '')
            
            if initials:
                formatted_authors.append(initials)
            i += 1
    
    return ', '.join(formatted_authors)



In [39]:
def clean(input_file, output_file):
    """
    Clean the CSV file by removing duplicate entries and standardising the data
    """
    df = pd.read_csv(input_file)
    print(f"Initial dataset: {len(df)} rows")
        
    duplicates = df[df.duplicated(keep=False)]
    
    # remove exact duplicates (same year and same authors)
    df_clean = df.drop_duplicates()
    
    # standardise year format and author names
    df_clean = standardise_data(df_clean)
    
    # remove near-duplicates
    df_clean = remove_near_duplicates(df_clean)

    # drop na
    df_clean = df_clean.dropna(subset=['year', 'authors'])

    # format authors
    df_clean['authors'] = df_clean['authors'].apply(format_authors)

    # save
    df_clean.to_csv(output_file, index=False)
    print(f"\nCleaned data saved to: {output_file}")
    print(f"Final dataset: {len(df_clean)} rows")
    
    
    return df_clean



In [51]:
input_file = "uva_dare.csv"
output_file = "uva_dare_cleaned.csv"
    
cleaned_df = clean(input_file, output_file)

Initial dataset: 210079 rows

Cleaned data saved to: uva_dare_cleaned.csv
Final dataset: 110984 rows


### Convert to gephi

In [46]:
df = pd.read_csv('uva_dare_cleaned.csv')

# author list
authors_set = set()
for authors in df['authors']:
    author_list = [author.strip() for author in authors.split(',')]
    authors_set.update(author_list)

# nodes
nodes_df = pd.DataFrame(list(authors_set), columns=['Id'])
nodes_df['Label'] = nodes_df['Id']

# pairwise edges
edges = []
for _, row in df.iterrows():
    if pd.notna(row['authors']):
        authors = [author.strip() for author in row['authors'].split(',')]
        year = row['year']
        
        for author1, author2 in itertools.combinations(authors, 2):
            edges.append({
                'Source': author1,
                'Target': author2,
                'Year': year,
                'Weight': 1,  
                'Type': 'Undirected'
            })

edges_df = pd.DataFrame(edges)

nodes_df.to_csv('nodes.csv', index=False)
edges_df.to_csv('edges.csv', index=False)

print(f"Created {len(nodes_df)} nodes and {len(edges_df)} edges")

Created 144948 nodes and 1369071 edges


In [47]:
df['year'].unique()

array(['2026', '2025', 'TISP', 'Eds.', 'Ed.', '2024', 'PHAIR',
       'Guest ed.', 'JWST', 'and\n                        H-TEAM',
       'EuGMS', 'Emotion', 'SKON', '2015', 'FORRT', 'TRANS.', '2023',
       'H-TEAM', 'Zormpa', 'Author', '2022', 'ECCC', 'PSA', 'T2B!',
       'KaSP', '2021', 'H-Team', 'GROUP', 'PopART', '2020', '2019', 'ROC',
       '2018', 'ECLE', '1998', '2017', 'GRAWITA', 'PROBE', 'VMC', '2016',
       'ASKAP', '0297', 'jr.', 'The Lauder Institute-Wharton School',
       'BIOHAZ', 'SCHER', 'GEAS', '2014', '2013', '2012', '2011', '2010',
       'MDP', '2009', 'EDGE', '2008', '2007', '2006', '2005', 'et al.',
       '2004', '2003', '2002', 'ECJ 25/10/01', '2001', '2000', '1999',
       'FATIMA', '1997', '1996', '1995', 'et al', 'editors', '1994',
       '1993', '1992', '1991', '1990', '1989', '1988', '1987', '1986',
       '1985', '1984', '1983', '1982', '1981', '1980', '1979', '1978',
       '1977', '1976', '1975', '1974', '1973', '1972', '1971', '1970',
       '1969',

In [48]:
df_og = pd.read_csv('uva_dare.csv')
df_og['year'].unique()

array(['2026', 'Eds.', '2025', '1-10-2025', '17-8-2025', '24-4-2025',
       'TISP', '27-2-2025', '7-2-2025', '25-1-2025', '20-1-2025',
       '9-1-2025', '6-1-2025', 'Ed.', '2024', '23-12-2024', '20-12-2024',
       '13-12-2024', '29-11-2024', 'PHAIR', '20-11-2024', '19-11-2024',
       '16-11-2024', '15-11-2024', '14-11-2024', '12-11-2024',
       '7-11-2024', '5-11-2024', '4-11-2024', '1-11-2024', 'Guest ed.',
       '29-10-2024', '28-10-2024', '22-10-2024', '21-10-2024',
       '18-10-2024', '16-10-2024', '9-10-2024', '3-10-2024', '2-10-2024',
       '1-10-2024', '30-9-2024', '28-9-2024', '27-9-2024', '25-9-2024',
       '24-9-2024', '17-9-2024', '13-9-2024', '12-9-2024', '11-9-2024',
       '9-9-2024', '5-9-2024', '4-9-2024', '3-9-2024', '2-9-2024',
       '30-8-2024', '29-8-2024', '26-8-2024', '25-8-2024', '23-8-2024',
       '21-8-2024', '19-8-2024', '8-8-2024', '6-8-2024', '2-8-2024',
       'JWST', '31-7-2024', '30-7-2024', '29-7-2024', '26-7-2024',
       '25-7-2024', '23-7-2