In [38]:
import pandas as pd
from copy import deepcopy
from fuzzywuzzy import process, fuzz
from itertools import combinations
from collections import Counter

In [3]:
!pip install fuzzywuzzy



# Collaborations 

This notebook details the preparation of data for the Neo4j database.

This notebook contains **four** key components:

1. Creating methods that help gather the unique collaborations between a DCU researcher and another researcher (DCU or external), together with the total number of papers that pair collaborated on and the total citation count that pair achieved.

2. Performing the parsing and creating the dataframe.

3. Creating a tool that will help me gather unique collaborations between external researchers. Obviously, the total number of papers collaborated on by that pair and the total citation count achieved by that pair are not available, as we do not have their Google Scholar profiles.

4. Creating a CSV file containing unique authors and their job titles.


In [4]:
def add_capitals(l):
    """Return the author name ``l`` in title case.

    Normalising the capitalisation this way cuts down the number of aliases a
    single author can appear under.
    """
    # A fully upper-case name is lower-cased first; any other name is
    # title-cased exactly as given.
    base = l.lower() if l.isupper() else l
    return base.title()
    

def remove_duplicates_stage_1(myList):
    """Drop case-insensitive duplicate names, keeping the first occurrence.

    Returns a tuple ``(result, index)``: ``result`` holds the surviving names
    in title case (original order preserved) and ``index`` holds the position
    each survivor had in ``myList``.
    """
    # lower-cased name -> (position, original spelling) of its first mention;
    # dict insertion order keeps the names in their original order.
    first_seen = {}
    for position, raw_name in enumerate(myList):
        first_seen.setdefault(raw_name.lower(), (position, raw_name))

    result = []
    index = []
    for position, raw_name in first_seen.values():
        base = raw_name.lower() if raw_name.isupper() else raw_name
        result.append(base.title())
        index.append(position)
    return result, index


def add_capital_mc(myList):
    """Restore the capital letter after a leading "Mc" in each surname.

    ``add_capitals`` title-cases names, which lower-cases every letter that is
    not at the start of a word, so "McCarren" comes out as "Mccarren". This
    function upper-cases the third letter of the last word of each name when
    that word starts with "mc" (any case), e.g. "Andrew Mccarren" becomes
    "Andrew McCarren".

    Parameters
    ----------
    myList : iterable of str
        Author names.

    Returns
    -------
    list of str
        The names in the same order. Names whose last word does not start
        with "mc", or is just "Mc", are returned unchanged.
    """
    result = []
    for full_name in myList:
        # Split on the final space only. Removing the surname with
        # str.replace would also delete it wherever else it occurs in the
        # name (e.g. "Mcgeehan Mcgee" would lose the start of "Mcgeehan").
        given_names, separator, last_name = full_name.rpartition(" ")
        # The length check covers a surname that is exactly "Mc", which has
        # no third letter to capitalise.
        if last_name[:2].lower() == "mc" and len(last_name) > 2:
            last_name = last_name[:2] + last_name[2].upper() + last_name[3:]
            full_name = (given_names + separator + last_name).strip()
        result.append(full_name)
    return result

def remove_double_author_occurance(flattened_df):
    """Remove rows where Name1 and Name2 refer to the same person.

    Two names are treated as the same person when the fuzzy ratio of their
    lower-cased forms is 90 or more.

    Parameters
    ----------
    flattened_df : pandas.DataFrame
        Frame with string columns "Name1" and "Name2". It is modified in
        place.

    Returns
    -------
    pandas.DataFrame
        The same ``flattened_df`` object, with the matching rows dropped and
        its index reset to 0..n-1.
    """
    # Collect the labels first and drop them in a single call, rather than
    # dropping from the frame row by row while it is still being iterated.
    self_pairs = [
        index
        for index, name1, name2 in zip(
            flattened_df.index, flattened_df["Name1"], flattened_df["Name2"]
        )
        if fuzz.ratio(name1.lower(), name2.lower()) >= 90
    ]
    flattened_df.drop(self_pairs, inplace=True)
    flattened_df.reset_index(drop=True, inplace=True)
    return flattened_df

def check_if_researcher_already_iterated(name, mylist):
    """Return True when ``name`` does not fuzzy-match any entry of ``mylist``.

    Used to track the researchers already visited, so the output does not
    contain both "Name1: Michael Scriney, Name2: Andrew McCarren" and
    "Name1: Andrew McCarren, Name2: Michael Scriney".

    Parameters
    ----------
    name : str
        Author name to look up; surrounding whitespace is ignored.
    mylist : list of str
        Names to compare against.

    Returns
    -------
    bool
        False when the best fuzzy match in ``mylist`` scores 90 or more,
        True otherwise (including when ``mylist`` is empty).
    """
    # Only an empty list is skipped. A single-entry list must still be
    # checked, otherwise the first researcher visited is never recognised.
    if len(mylist) == 0:
        return True
    best_match = process.extractOne(str(name.strip()), mylist)
    if best_match is None:
        # No candidate was returned, so nothing matched.
        return True
    score = best_match[-1]  # the ratio score
    return score < 90
    

    
  

In [5]:
# The alias table is stored as an Excel workbook: saving it as CSV kept
# corrupting special characters.
alias_authors = pd.read_excel("../data/Neo4j/alias_author_excel.xlsx")
# Strip surrounding whitespace from both name columns.
for column in ("Alias", "Author"):
    alias_authors[column] = alias_authors[column].apply(lambda value: value.strip())

In [6]:
# Lookup table: each key is an alias spelling, each value is the canonical
# author name it stands for.
alias_dict = {
    alias: canonical
    for alias, canonical in zip(
        alias_authors["Alias"].tolist(), alias_authors["Author"].tolist()
    )
}


In [7]:
# Build the collaboration table for DCU researchers: one row per
# (DCU researcher, co-author) pair found on that researcher's Google Scholar
# export, with the number of shared papers and their summed citation count.
list_name1 = []  # Name1 column: the DCU researcher whose export was parsed
list_name2 = []  # Name2 column: a co-author found on that export
list_count = []  # total citation count achieved by each pair
total_paper_count = []  # total number of papers each pair collaborated on
researchers_already_iterated = []  # DCU researchers that have already been visited
researchers = pd.read_csv("../data/SOC_Researchers.csv")
for name in researchers.Researcher:
    name = name.strip()  # name of a DCU researcher
    filename = "_".join(name.split(" "))
    try:
        scholar_df = pd.read_csv("../data/Google Scholar Publications/{}.csv".format(filename))
    except FileNotFoundError:
        # This researcher has no Google Scholar export; nothing to parse.
        continue

    collaborators = []  # every co-author mention on this export (with repeats)
    for index, authors in scholar_df["Author List"].items():
        list_authors = []  # the authors kept for this publication
        for author in authors.split(", "):
            # Use a single kind of apostrophe so spellings compare equal.
            author = author.replace("'","’").strip()
            author = add_capitals(author).strip()
            if author in alias_dict:
                author = alias_dict[author]  # switch to the canonical name
            # Skip DCU researchers that were already visited, so a pair is
            # not recorded a second time with Name1 and Name2 swapped.
            if check_if_researcher_already_iterated(author, researchers_already_iterated):
                collaborators.append(author)
                list_authors.append(author)
        # Store the normalised names back so the look-ups below match them.
        scholar_df.loc[index, "Author List"] = ", ".join(list_authors)

    # De-duplicate first, then count once per unique collaborator. index_list
    # points at the first mention of each one, whose spelling is exactly what
    # was written into "Author List" above.
    unique_collaborators, index_list = remove_duplicates_stage_1(collaborators)
    researcher_total_paper = []
    researcher_citation_count = []
    for i in index_list:
        # regex=False: a name is literal text, not a pattern ("." or "(" in
        # a name must not be interpreted as regex syntax).
        filter_df = scholar_df[scholar_df["Author List"].str.contains(collaborators[i], regex=False)]
        researcher_total_paper.append(filter_df.shape[0])  # papers shared by the pair
        researcher_citation_count.append(filter_df['Citation count'].sum())  # citations of those papers

    collaborators = add_capital_mc(unique_collaborators)
    list_researcher = [name] * len(collaborators)
    list_name2.extend(collaborators)
    list_name1.extend(list_researcher)
    list_count.extend(researcher_citation_count)
    total_paper_count.extend(researcher_total_paper)
    researchers_already_iterated.append(name)

d = {"Name1":list_name1, "Name2": list_name2, "Total_Paper_Count": total_paper_count, "Total_Citations_Achieved":list_count}
flattened_df = pd.DataFrame(data=d)
# A researcher is listed as an author of their own papers; remove the rows
# where Name1 and Name2 refer to the same person.
flattened_df = remove_double_author_occurance(flattened_df)

## The resulting dataframe will have these columns:

| Column name | description |
|---:|:---|
| Name1 | The first name of a pair that collaborated on at least one paper together. E.g. Michael Scriney |
| Name2 | The second name of a pair that collaborated on at least one paper together. E.g. Andrew McCarren |
| Total_Paper_Count | Total number of papers a pair has  collaborated on. |
| Total_Citations_Achieved  | Total citation count achieved by a pair.  |

# Collaborations Between External Researchers
Created a tool that will help me gather unique collaborations between external researchers. Obviously, the total number of papers collaborated on and the total citation count achieved by that pair are not available, as we do not have their Google Scholar profiles.

In [None]:
# Gather collaborations between external (non-DCU) researchers from the DCU
# researchers' Google Scholar exports.
collaborators = []  # a list of the format [[name1, name2], [name1, name2]] etc
researchers = pd.read_csv("../data/SOC_Researchers.csv")
total_paper_count = []  # not filled in this cell
total_citation_count = []  # not filled in this cell
dcu_researchers = researchers.Researcher.tolist()  # a list of DCU researchers
total_collaborations_count = []  # rows of [name1, name2, paper count, citation sum]
for name in researchers.Researcher:
    name = name.strip()  # name of a DCU researcher
    filename = "_".join(name.split(" "))

    try:
        scholar_df = pd.read_csv("../data/Google Scholar Publications/{}.csv".format(filename))
    except FileNotFoundError:
        # This researcher has no Google Scholar export; nothing to parse.
        continue

    for index, authors in scholar_df["Author List"].items():
        list_authors = []  # the external authors of this publication
        for author in authors.split(", "):
            # Use a single kind of apostrophe so spellings compare equal.
            author = author.replace("'","’").strip()
            author = add_capitals(author).strip()
            if author in alias_dict:
                author = alias_dict[author].strip()  # switch to the canonical name
            # Keep only authors that do not match a DCU researcher.
            if check_if_researcher_already_iterated(author, dcu_researchers):
                list_authors.append(author)
        # Store the normalised names back: the pairs below use these
        # spellings, so the raw text would not match them when an alias or
        # apostrophe was rewritten.
        scholar_df.loc[index, "Author List"] = ", ".join(list_authors)
        if len(list_authors) > 1:
            # every unordered pair of external authors on this publication
            collaborators.extend(list(pair) for pair in combinations(list_authors, 2))

    # NOTE(review): `collaborators` is not reset between researchers, so the
    # pairs found on earlier exports are counted again against this export.
    # Kept as-is because the de-duplication that follows sees those rows;
    # confirm whether this is intended.
    author_masks = {}  # author name -> boolean mask over this export's rows
    for collab in collaborators:
        for author in collab:
            if author not in author_masks:
                # Literal, case-insensitive containment: a name must not be
                # interpreted as a regular expression.
                author_masks[author] = scholar_df["Author List"].str.contains(
                    author, case=False, regex=False
                )
        # Rows listing both authors. Intersecting the masks counts every
        # publication row exactly once, even if the export repeats a row.
        filter_df = scholar_df[author_masks[collab[0]] & author_masks[collab[1]]]
        total_collaborations_count.append([collab[0], collab[1],filter_df.shape[0], filter_df['Citation count'].sum()])


In [None]:
# Remove duplicates from the 2D list regardless of element order: two rows
# are duplicates when they hold the same values (names and counts), in any
# order.
seen = set()
result = []
for lst in total_collaborations_count:
    current = frozenset(Counter(lst).items())
    if current not in seen:
        result.append(lst)
        seen.add(current)

collaborators = result

# Split the surviving rows into the four columns of the external dataframe.
name1_ext, name2_ext, total_paper_count, total_citation_count = [], [], [], []
for i in collaborators:
    if i[0] == i[1]:
        # sometimes the same author name appeared twice in a publication author list
        continue
    # Restore the capital after "Mc" so these names are spelled the same way
    # as add_capital_mc spells them elsewhere in the notebook.
    names = add_capital_mc(i[:2])
    name1_ext.append(names[0])
    name2_ext.append(names[1])
    total_paper_count.append(i[2])
    total_citation_count.append(i[3])
    

In [23]:
# Assemble the external-researcher collaborations into a dataframe with the
# same four columns as the DCU collaboration dataframe.
d = dict(
    Name1=name1_ext,
    Name2=name2_ext,
    Total_Paper_Count=total_paper_count,
    Total_Citations_Achieved=total_citation_count,
)
extrernal_df = pd.DataFrame(d)

# Concatenating the two dataframes

In [24]:
# Stack the DCU rows on top of the external rows, then keep only the first
# row for every (Name1, Name2) combination and renumber the index.
total_df = (
    pd.concat([flattened_df, extrernal_df])
    .drop_duplicates(subset=["Name1", "Name2"], keep="first", ignore_index=True)
)

# Unique Author Dataframe 
## The resulting dataframe will have these columns:

| Column name | description |
|---:|:---|
| Unique Author | A column containing unique author names, from all the publications found. |
| Job Title | If the author works for DCU, this column displays their job title. Otherwise, it is marked as unknown. |

In [25]:
# Start the list of author names with everyone in the Name1 column.
name = total_df["Name1"].to_list()

In [26]:
# Append everyone in the Name2 column to the same list (in place).
name += total_df["Name2"].to_list()

In [27]:
# One row per distinct author name, sorted alphabetically. Every author
# starts with the job title "Unknown"; the titles of DCU staff are filled in
# afterwards.
d = {"Unique Author": sorted(set(name)), "Job Title": "Unknown"}
unique_df = pd.DataFrame(data=d)

In [28]:
# Job titles of the DCU staff, keyed by the exact spelling used in the
# "Unique Author" column. Authors not listed here keep their default title.
# NOTE(review): the parsing cells convert "'" to "’" in co-author names;
# confirm that "Darragh O'Brien" matches the spelling stored in unique_df.
job_titles = {
    "Michael Scriney": "Assistant Professor",
    "Malika Bendechache": "Assistant Professor",
    "Marija Bezbradica": "Assistant Professor",
    "Stephen Blott": "Associate Professor",
    "Rob Brennan": "Assistant Professor",
    "Annalina Caputo": "Assistant Professor",
    "Long Cheng": "Assistant Professor",
    "Paul M. Clarke": "Assistant Professor",
    "Martin Crane": "Associate Professor",
    "Charlie Daly": "Lecturer",
    "Brian Davis": "Assistant Professor",
    "Dónal Fitzpatrick": "Lecturer",
    "Jennifer Foster": "Lecturer",
    "Yvette Graham": "Assistant Professor",
    "Cathal Gurrin": "Associate Professor",
    "Geoff Hamilton": "Associate Professor",
    "Graham Healy": "Assistant Professor",
    "Mark Humphrys": "Assistant Professor",
    "Musfira Jilani": "Assistant Professor",
    "Gareth Jones": "Professor",
    "Jane Kernan": "Lecturer",
    "Suzanne Little": "Associate Professor",
    "Silvana Togneri Mac Mahon": "Assistant Professor",
    "Andrew McCarren": "Assistant Professor",
    "John McKenna": "Lecturer",
    "Alessandra Mileo": "Assistant Professor",
    "Nivedha Nagarajan": "Teaching Assistant",
    "Dongyun Nie": "Teaching Assistant",
    "Mark Roantree": "Professor",
    "Darragh O'Brien": "Lecturer",
    "Heather J. Ruskin": "Professor",
    "Dimitar Shterionov": "Assistant Professor",
    "David Sinclair": "Associate Professor",
    "Alan F. Smeaton": "Professor",
    "Alistair Sutherland": "Lecturer",
    "Brian Stone": "Lecturer",
    "Irina Tal": "Assistant Professor",
    "Jagadeeswaran Thangaraj": "Teaching Assistant",
    "Renaat Verbruggen": "Lecturer",
    "Ray Walshe": "Assistant Professor",
    "Monica Ward": "Lecturer",
    "Tomas Ward": "Professor",
    "Andy Way": "Professor",
    "Murat Yilmaz": "Assistant Professor",
}
for staff_name, staff_title in job_titles.items():
    unique_df.loc[unique_df["Unique Author"] == staff_name, "Job Title"] = staff_title

In [29]:
# Save the author / job-title table into the Neo4j data folder, with a
# header row and without the dataframe index.
unique_df.to_csv(
    "../data/Neo4j/unique_authors.csv",
    header=True,
    index=None,
)

In [30]:
# Move the current row index into a regular "index" column; it is used as
# the numeric id of each author from here on.
unique_df = unique_df.reset_index()


In [44]:
# Lookup table from author name to that author's row index in unique_df.
author_id_dict = {
    author: row_id
    for author, row_id in zip(unique_df["Unique Author"], unique_df.index)
}


In [50]:
# Swap the author names in both name columns for their numeric ids; values
# without an entry in author_id_dict are left unchanged.
total_df = total_df.replace({"Name1": author_id_dict, "Name2": author_id_dict})

In [53]:
# Save the id-based collaboration table to the working directory, with a
# header row and without the dataframe index.
total_df.to_csv(
    "collaborations.csv",
    header=True,
    index=None,
)

In [54]:
# Save the author table to the working directory, with a header row and
# without the dataframe index.
unique_df.to_csv(
    "unique_authors.csv",
    header=True,
    index=None,
)