# We start by importing the CSV files and previewing their contents

### Additionally, we check whether there are any missing data values (a common data science step)

In [6]:
import pandas as pd

# Read each source table from its CSV file in the working directory.
df_authors = pd.read_csv('authors.csv')
df_papers = pd.read_csv('papers.csv')
df_scientists = pd.read_csv('scientists.csv')

# Preview the first rows of every table, in load order, to confirm it was read.
for loaded_frame in (df_authors, df_papers, df_scientists):
    display(loaded_frame.head())

Unnamed: 0,paper_id,scientist_id
0,p1,1
1,p2,1
2,p3,5
3,p4,6
4,p1,2


Unnamed: 0,id,title,year
0,p1,paperOne,2012
1,p2,paperTwo,2013
2,p3,paperThree,2014
3,p4,paperFour,2015


Unnamed: 0,id,name
0,1,slysken
1,2,victor
2,3,helena
3,4,twapandula
4,5,liam


#### The check below shows that there are no missing values in the dataset

In [8]:
# Print, for every loaded table, a heading followed by the per-column count
# of null entries and a blank-line separator.
missing_value_reports = (
    ("df_authors for missing values:", df_authors),
    ("df_papers missing values:", df_papers),
    ("df_scientists missing values:", df_scientists),
)
for heading, frame in missing_value_reports:
    print(heading)
    print(frame.isnull().sum())
    print("\n")

df_authors for missing values:
paper_id        0
scientist_id    0
dtype: int64


df_papers missing values:
id       0
title    0
year     0
dtype: int64


df_scientists missing values:
id      0
name    0
dtype: int64




### We merge the DataFrames into one to help build the co-authorship network later on

In [10]:
# Join authorship rows with paper details, then with scientist details, so
# every row carries the paper's id/title and the author's name.
merged_df = pd.merge(df_authors, df_papers, left_on='paper_id', right_on='id', how='inner')
merged_df = pd.merge(merged_df, df_scientists, left_on='scientist_id', right_on='id', how='inner')

# Collect the author names of each paper once, instead of re-filtering the
# whole merged frame for every row (which made the build quadratic).
# groupby keeps the original row order inside each group, so the co-author
# order matches what a per-row boolean filter would produce.
authors_by_paper = merged_df.groupby('paper_id', sort=False)['name'].agg(list).to_dict()

# co_authorship_network[scientist][co_author] -> list of titles they share.
co_authorship_network = {}
for paper_id, scientist_name, paper_title in zip(
    merged_df['paper_id'], merged_df['name'], merged_df['title']
):
    # Register the scientist even when the paper has no other author.
    shared_papers = co_authorship_network.setdefault(scientist_name, {})
    for co_author in authors_by_paper.get(paper_id, []):
        # Authors are matched by name, so skip the scientist's own entry.
        if co_author != scientist_name:
            shared_papers.setdefault(co_author, []).append(paper_title)

def get_neighbors(scientist, co_authorship_network):
    """
    List the (co_author, paper_title) pairs reachable from a scientist.

    One pair is produced per shared paper, so a co-author appears once for
    every title recorded under them. A scientist that is not a key of the
    network yields an empty list.
    """
    # Guard clause: unknown scientists have no neighbours.
    if scientist not in co_authorship_network:
        return []

    shared_papers = co_authorship_network[scientist]
    return [
        (peer, title)
        for peer, titles in shared_papers.items()
        for title in titles
    ]


def is_goal_state(current_scientist, target_scientist):
    """
    Tell whether the search has arrived at the scientist it is looking for.

    Returns the result of comparing the two arguments for equality.
    """
    reached_target = current_scientist == target_scientist
    return reached_target


### Breadth-First Search is then implemented

In [11]:
from collections import deque

def bfs_shortest_path(co_authorship_network, start_scientist, target_scientist):
    """
    Finds the shortest path between two scientists using breadth-first search.

    Args:
        co_authorship_network: mapping scientist -> {co_author: [paper titles]}.
        start_scientist: scientist the search starts from.
        target_scientist: scientist the search tries to reach.

    Returns:
        A list of (scientist, paper, co_author) steps leading from the start
        to the target. The list is empty when either scientist is not a key
        of the network, when no path exists, or when start and target are the
        same scientist (zero steps are needed).
    """
    if start_scientist not in co_authorship_network or target_scientist not in co_authorship_network:
        return []

    queue = deque([start_scientist])
    # parent[s] = (scientist s was discovered from, paper linking them).
    # It doubles as the visited set; the start has no predecessor.
    parent = {start_scientist: None}

    while queue:
        current_scientist = queue.popleft()

        if is_goal_state(current_scientist, target_scientist):
            # Walk the predecessor links back to the start, then reverse,
            # rather than carrying a copied path list on every queue entry.
            path = []
            node = current_scientist
            while parent[node] is not None:
                previous, paper = parent[node]
                path.append((previous, paper, node))
                node = previous
            path.reverse()
            return path

        for neighbor, paper in get_neighbors(current_scientist, co_authorship_network):
            # Only the first discovery of a scientist is kept, which is what
            # makes the reconstructed path a shortest one.
            if neighbor not in parent:
                parent[neighbor] = (current_scientist, paper)
                queue.append(neighbor)

    return []

### Now let's test whether we can find a co-authorship path between two scientists

In [12]:
# Query one known pair of scientists and print every hop of the result.
start_scientist = 'slysken'
target_scientist = 'victor'
shortest_path = bfs_shortest_path(co_authorship_network, start_scientist, target_scientist)

if not shortest_path:
    print(f"No path found between {start_scientist} and {target_scientist}")
else:
    print(f"Shortest path between {start_scientist} and {target_scientist}:")
    # Each step is (scientist, paper, co_author); number the hops from 1.
    for step_number, (author, paper, co_author) in enumerate(shortest_path, start=1):
        print(f"{step_number}: {author} and {co_author} co-authored \"{paper}\"")

Shortest path between slysken and victor:
1: slysken and victor co-authored "paperOne"


### Since the co-authorship search succeeded, let us test overall accuracy (with scientists both in and not in the dataset)

In [13]:
# Each case names the two endpoints plus the number of steps and the exact
# path the search is expected to return for them.
test_cases = [
    {
        'start': 'slysken',
        'target': 'victor',
        'expected_degrees': 1,
        'expected_path': [('slysken', 'paperOne', 'victor')],
    },
    {
        'start': 'slysken',
        'target': 'Balázs Győrffy',
        'expected_degrees': 0,
        'expected_path': [],
    },
    {
        'start': 'NonExistentScientist',
        'target': 'victor',
        'expected_degrees': 0,
        'expected_path': [],
    },
]

# Run every case, keep a record of expected vs. actual, and print it.
results = []
for i, case in enumerate(test_cases):
    start_scientist, target_scientist = case['start'], case['target']
    actual_path = bfs_shortest_path(co_authorship_network, start_scientist, target_scientist)
    actual_degrees = len(actual_path)

    outcome = {
        'case': i + 1,
        'start': start_scientist,
        'target': target_scientist,
        'expected_degrees': case['expected_degrees'],
        'actual_degrees': actual_degrees,
        'expected_path': case['expected_path'],
        'actual_path': actual_path,
    }
    results.append(outcome)

    print(f"Test case {i+1}:")
    print(f"  Start: {start_scientist}, Target: {target_scientist}")
    print(f"  Expected degrees: {case['expected_degrees']}, Actual degrees: {actual_degrees}")
    print(f"  Expected path: {case['expected_path']}, Actual path: {actual_path}")
    print("-" * 20)

# A case counts as correct only when both the step count and the path match.
correct_count = 0
for result in results:
    degrees_match = result['expected_degrees'] == result['actual_degrees']
    path_match = result['expected_path'] == result['actual_path']
    if degrees_match and path_match:
        correct_count += 1

accuracy = (correct_count / len(test_cases)) * 100
print("\nSummary:")
print(f"  Total test cases: {len(test_cases)}")
print(f"  Correctly predicted cases: {correct_count}")
print(f"  Accuracy: {accuracy:.2f}%")

Test case 1:
  Start: slysken, Target: victor
  Expected degrees: 1, Actual degrees: 1
  Expected path: [('slysken', 'paperOne', 'victor')], Actual path: [('slysken', 'paperOne', 'victor')]
--------------------
Test case 2:
  Start: slysken, Target: Balázs Győrffy
  Expected degrees: 0, Actual degrees: 0
  Expected path: [], Actual path: []
--------------------
Test case 3:
  Start: NonExistentScientist, Target: victor
  Expected degrees: 0, Actual degrees: 0
  Expected path: [], Actual path: []
--------------------

Summary:
  Total test cases: 3
  Correctly predicted cases: 3
  Accuracy: 100.00%
