In [1]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

## Initialise spark session

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
  .appName('Scholars Data Preparation')\
  .getOrCreate()

## Let's create our dataframes

In [3]:
df_dcu = spark.read.csv("data/Scholar/dcu-scholars.csv", header=True)

In [4]:
df_dcu

DataFrame[_c0: string, Researcher: string, Position: string]

In [5]:
df_dcu.limit(10).toPandas()

Unnamed: 0,_c0,Researcher,Position
0,0,Michael Scriney,Insight Centre for Data Analytics
1,1,Malika Bendechache,"School of Computing, Dublin City University (DCU)"
2,2,Marija Bezbradica,"Lecturer, School of Computing, Dublin City Uni..."
3,3,Rob Brennan,"Assistant Professor, School of Computing, Dubl..."
4,4,Annalina Caputo,"Assistant Professor, Dublin City University, A..."
5,5,Long Cheng,"Professor, North China Electric Power University"
6,6,Paul Clarke,Associate Professor at Dublin City University ...
7,7,Martin Crane,"School of Computing, Dublin City University"
8,8,Charlie Daly,Lecturer in Computer Science
9,9,Brian Davis,"Assistant Professor, School of Computing, Dubl..."


In [6]:
df_dcu.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- Researcher: string (nullable = true)
 |-- Position: string (nullable = true)



In [7]:
df_dcu = df_dcu.drop("_c0")

In [8]:
df_dcu.limit(10).toPandas()

Unnamed: 0,Researcher,Position
0,Michael Scriney,Insight Centre for Data Analytics
1,Malika Bendechache,"School of Computing, Dublin City University (DCU)"
2,Marija Bezbradica,"Lecturer, School of Computing, Dublin City Uni..."
3,Rob Brennan,"Assistant Professor, School of Computing, Dubl..."
4,Annalina Caputo,"Assistant Professor, Dublin City University, A..."
5,Long Cheng,"Professor, North China Electric Power University"
6,Paul Clarke,Associate Professor at Dublin City University ...
7,Martin Crane,"School of Computing, Dublin City University"
8,Charlie Daly,Lecturer in Computer Science
9,Brian Davis,"Assistant Professor, School of Computing, Dubl..."


In [9]:
df_dcu.printSchema()

root
 |-- Researcher: string (nullable = true)
 |-- Position: string (nullable = true)



In [10]:
df_art = spark.read.csv("data/Scholar/final.csv", header=True)

In [11]:
df_art.limit(10).toPandas()

Unnamed: 0,Title,Citation,Conference,Publisher,Journal,Date,Authors,AuthorDCU
0,Efficient cube construction for smart city data,10,Unknown,CEUR-WS. org,Unknown,2016,"Michael Scriney, Mark Roantree",Michael Scriney
1,Generating cubes from smart city web data,7,Proceedings of the Australasian Computer Scien...,ACM,Unknown,2017,"Michael Scriney, Martin F O'Connor, Mark Roantree",Michael Scriney
2,Automating Data Mart Construction from Semi-st...,6,Unknown,Oxford University Press,The Computer Journal,2018,"Michael Scriney, Suzanne McCarthy, Andrew McCa...",Michael Scriney
3,Predicting customer churn for insurance data,4,International Conference on Big Data Analytics...,"Springer, Cham",Unknown,2020,"Michael Scriney, Dongyun Nie, Mark Roantree",Michael Scriney
4,Attention Based Video Summaries of Live Online...,3,"""AAAI-2021 Workshop on AI Education: """"Imagini...",Unknown,Unknown,2021,"Hyowon Lee, Mingming Liu, Hamza Riaz, Navaneet...",Michael Scriney
5,Constructing data marts from web sources using...,2,Unknown,Unknown,Unknown,2018,Michael Scriney,Michael Scriney
6,Integrating online data for smart city data marts,2,British International Conference on Databases,"Springer, Cham",Unknown,2017,"Michael Scriney, Martin F O'Connor, Mark Roantree",Michael Scriney
7,Identification of movement categories and asso...,1,Unknown,Routledge,International Journal of Performance Analysis ...,2021,"Aidan J Brady, Michael Scriney, Niall M Moyna,...",Michael Scriney
8,Representative Sample Extraction from Web Data...,1,International Conference on Database and Exper...,"Springer, Cham",Unknown,2019,"Michael Scriney, Congcong Xing, Andrew McCarre...",Michael Scriney
9,Using Artificial Intelligence to Automate Meat...,Unknown,Unknown,Unknown,Journal of Animal Science,2021,"Satya Prakash, Donagh P Berry, Mark Roantree, ...",Michael Scriney


In [12]:
df_art.toPandas()

Unnamed: 0,Title,Citation,Conference,Publisher,Journal,Date,Authors,AuthorDCU
0,Efficient cube construction for smart city data,10,Unknown,CEUR-WS. org,Unknown,2016,"Michael Scriney, Mark Roantree",Michael Scriney
1,Generating cubes from smart city web data,7,Proceedings of the Australasian Computer Scien...,ACM,Unknown,2017,"Michael Scriney, Martin F O'Connor, Mark Roantree",Michael Scriney
2,Automating Data Mart Construction from Semi-st...,6,Unknown,Oxford University Press,The Computer Journal,2018,"Michael Scriney, Suzanne McCarthy, Andrew McCa...",Michael Scriney
3,Predicting customer churn for insurance data,4,International Conference on Big Data Analytics...,"Springer, Cham",Unknown,2020,"Michael Scriney, Dongyun Nie, Mark Roantree",Michael Scriney
4,Attention Based Video Summaries of Live Online...,3,"""AAAI-2021 Workshop on AI Education: """"Imagini...",Unknown,Unknown,2021,"Hyowon Lee, Mingming Liu, Hamza Riaz, Navaneet...",Michael Scriney
...,...,...,...,...,...,...,...,...
4838,Evaluating Automatic-Structure Annotation for ...,Unknown,Unknown,Unknown,Unknown,Unknown,"Aoife Cahill, Mairead McCarthy, Josef Van Gena...",Andy Way
4839,Contextual Bitext-Derived Paraphrases in Autom...,Unknown,Unknown,Unknown,Unknown,Unknown,"Karolina Owczarzak, Declan Groves, Josef Van G...",Andy Way
4840,A Suite of Linguistic Tools for Use with the P...,Unknown,Unknown,Unknown,Unknown,Unknown,"Aoife Cahill, Mairead McCarthy, Ruth O'Donovan...",Andy Way
4841,The Creation and Evaluation of an Irish Langua...,Unknown,Unknown,Unknown,Unknown,Unknown,"Bríd de Lóndra, Andy Way",Andy Way


In [13]:
df_art.printSchema()

root
 |-- Title: string (nullable = true)
 |-- Citation: string (nullable = true)
 |-- Conference: string (nullable = true)
 |-- Publisher: string (nullable = true)
 |-- Journal: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Authors: string (nullable = true)
 |-- AuthorDCU: string (nullable = true)



In [14]:
df_art.count()

4843

## Let's check for the presence of nulls and duplicate values

In [15]:
cols = df_art.columns
for i in range(len(cols)):
    print(df_art.filter(df_art[cols[i]].isNull()).count())

0
0
0
0
0
0
0
0


Thankfully, we have no nulls present in any of our dataframe columns. This allows us to continue working through our dataframe.

In [16]:
if df_art.count() > df_art.dropDuplicates(['Title']).count():
    print('NOTE: Data has duplicates!')

NOTE: Data has duplicates!


There is duplicates to be found in our dataframe. This is not surprising to the two of us. The reason we filtered on the column 'title' was that if a paper appeared on a scholars list of published articles and a fellow dcu staff member also contributed to this paper we presumed this paper would also appear on their list of published works. There would be differences in all columns bar the last column "AuthorDCU" as this was the unique identifier as to which page this paper was scraped off.

## Now we have done some initial background checks on our data we can begin some simple analysis using PySpark

In [17]:
# let's check the author with the most published pieces on their scholar profile
df_art.groupBy('AuthorDCU').count().collect()

[Row(AuthorDCU='Monica Ward', count=45),
 Row(AuthorDCU='Alan F Smeaton', count=796),
 Row(AuthorDCU='Suzanne Little', count=102),
 Row(AuthorDCU='Geoff Hamilton', count=84),
 Row(AuthorDCU='Long Cheng', count=63),
 Row(AuthorDCU='David Sinclair', count=41),
 Row(AuthorDCU='Paul M.Clarke', count=116),
 Row(AuthorDCU='Andy Way', count=512),
 Row(AuthorDCU='Jennifer Foster', count=100),
 Row(AuthorDCU='Yvette Graham', count=78),
 Row(AuthorDCU='Brian Davis', count=93),
 Row(AuthorDCU=' UK “Cambridge …"', count=1),
 Row(AuthorDCU='Rob Brennan', count=140),
 Row(AuthorDCU='Musfira Jilani', count=24),
 Row(AuthorDCU='John McKenna', count=14),
 Row(AuthorDCU='Michael Scriney', count=18),
 Row(AuthorDCU='Silvana Togneri Mac Mahon', count=34),
 Row(AuthorDCU='Marija Bezbradica', count=46),
 Row(AuthorDCU='Andrew McCarren', count=48),
 Row(AuthorDCU='Malika Bendechache', count=32),
 Row(AuthorDCU='Heather J. Ruskin', count=198),
 Row(AuthorDCU='Donal Fitzpatrick', count=44),
 Row(AuthorDCU='Ray

We can see that Alan Smeaton has the largest number of published pieces with 798 and Gareth Jones is in second with 610. There seems to be a couple of faulty values here not originally in the data: "Padraig O'leary, Noel Carroll, Paul Clarke, Ita Richardson" and "UK â€œCambridge â€¦". We will replace the first value mentioned with "Paul M.Clarke" as we see this name is featured in the list of authors shown.

In [18]:
from pyspark.sql.functions import when
df_art = df_art.withColumn("AuthorDCU", when(df_art.AuthorDCU == "Padraig O'leary, Noel Carroll, Paul M Clarke, Ita Richardson","Paul M.Clarke") \
      .otherwise(df_art.AuthorDCU))

In [19]:
df_art.groupBy('AuthorDCU').count().collect()

[Row(AuthorDCU='Monica Ward', count=45),
 Row(AuthorDCU='Alan F Smeaton', count=796),
 Row(AuthorDCU='Suzanne Little', count=102),
 Row(AuthorDCU='Geoff Hamilton', count=84),
 Row(AuthorDCU='Long Cheng', count=63),
 Row(AuthorDCU='David Sinclair', count=41),
 Row(AuthorDCU='Paul M.Clarke', count=117),
 Row(AuthorDCU='Andy Way', count=512),
 Row(AuthorDCU='Jennifer Foster', count=100),
 Row(AuthorDCU='Yvette Graham', count=78),
 Row(AuthorDCU='Brian Davis', count=93),
 Row(AuthorDCU=' UK “Cambridge …"', count=1),
 Row(AuthorDCU='Rob Brennan', count=140),
 Row(AuthorDCU='Musfira Jilani', count=24),
 Row(AuthorDCU='John McKenna', count=14),
 Row(AuthorDCU='Michael Scriney', count=18),
 Row(AuthorDCU='Silvana Togneri Mac Mahon', count=34),
 Row(AuthorDCU='Marija Bezbradica', count=46),
 Row(AuthorDCU='Andrew McCarren', count=48),
 Row(AuthorDCU='Malika Bendechache', count=32),
 Row(AuthorDCU='Heather J. Ruskin', count=198),
 Row(AuthorDCU='Donal Fitzpatrick', count=44),
 Row(AuthorDCU='Ray

In [20]:
df_art = df_art.filter(df_art.AuthorDCU != ' UK “Cambridge …"')

In [21]:
df_art.groupBy('AuthorDCU').count().collect()

[Row(AuthorDCU='Monica Ward', count=45),
 Row(AuthorDCU='Alan F Smeaton', count=796),
 Row(AuthorDCU='Suzanne Little', count=102),
 Row(AuthorDCU='Geoff Hamilton', count=84),
 Row(AuthorDCU='Long Cheng', count=63),
 Row(AuthorDCU='David Sinclair', count=41),
 Row(AuthorDCU='Paul M.Clarke', count=117),
 Row(AuthorDCU='Andy Way', count=512),
 Row(AuthorDCU='Jennifer Foster', count=100),
 Row(AuthorDCU='Yvette Graham', count=78),
 Row(AuthorDCU='Brian Davis', count=93),
 Row(AuthorDCU='Rob Brennan', count=140),
 Row(AuthorDCU='Musfira Jilani', count=24),
 Row(AuthorDCU='John McKenna', count=14),
 Row(AuthorDCU='Michael Scriney', count=18),
 Row(AuthorDCU='Silvana Togneri Mac Mahon', count=34),
 Row(AuthorDCU='Marija Bezbradica', count=46),
 Row(AuthorDCU='Andrew McCarren', count=48),
 Row(AuthorDCU='Malika Bendechache', count=32),
 Row(AuthorDCU='Heather J. Ruskin', count=198),
 Row(AuthorDCU='Donal Fitzpatrick', count=44),
 Row(AuthorDCU='Ray Walshe', count=73),
 Row(AuthorDCU='Tomas War

Now, our faulty values are either replaced or removed.

## Let's do the same for year

In [22]:
df_art.groupBy('Date').count().collect()

[Row(Date='1987', count=4),
 Row(Date='2016', count=274),
 Row(Date='2020', count=272),
 Row(Date='2012', count=234),
 Row(Date='1988', count=4),
 Row(Date='2019', count=272),
 Row(Date='2017', count=269),
 Row(Date='2014', count=239),
 Row(Date='1984', count=2),
 Row(Date='2013', count=234),
 Row(Date='1982', count=3),
 Row(Date='85', count=1),
 Row(Date='2005', count=142),
 Row(Date='2000', count=47),
 Row(Date='1981', count=2),
 Row(Date='1978', count=1),
 Row(Date='2002', count=72),
 Row(Date='2018', count=247),
 Row(Date='2009', count=223),
 Row(Date='1995', count=31),
 Row(Date='2006', count=211),
 Row(Date='Unknown', count=240),
 Row(Date='1976', count=1),
 Row(Date='2004', count=148),
 Row(Date='2011', count=205),
 Row(Date='1989', count=2),
 Row(Date='1992', count=19),
 Row(Date='2022', count=2),
 Row(Date='2008', count=191),
 Row(Date='1999', count=51),
 Row(Date='1963', count=1),
 Row(Date='1994', count=24),
 Row(Date='1997', count=34),
 Row(Date='2007', count=170),
 Row(Dat

Again, we have some slightly unreasonable / incorrect figures. These include 1963 and '85 which is presumably a slag term for 1985. It is highly unlikely that a member of the DCU staff has papers going back as far as 1963 as the next year to this is 1976 which seems slightly more plausible. 2022 also seems a particularly unusual value to have considering it is still 2021. We will filter our dataframe further and remove rows with these values.

In [23]:
df_art = df_art.filter(df_art.Date != '85')

In [24]:
df_art = df_art.filter(df_art.Date != '1963')

In [25]:
df_art = df_art.filter(df_art.Date != '2022')

In [26]:
df_art = df_art.filter(df_art.Date != '1976')
df_art = df_art.filter(df_art.Date != '1978')

In [27]:
df_art.groupBy('Date').count().collect()

[Row(Date='1987', count=4),
 Row(Date='2016', count=274),
 Row(Date='2020', count=272),
 Row(Date='2012', count=234),
 Row(Date='1988', count=4),
 Row(Date='2019', count=272),
 Row(Date='2017', count=269),
 Row(Date='2014', count=239),
 Row(Date='1984', count=2),
 Row(Date='2013', count=234),
 Row(Date='1982', count=3),
 Row(Date='2005', count=142),
 Row(Date='2000', count=47),
 Row(Date='1981', count=2),
 Row(Date='2002', count=72),
 Row(Date='2018', count=247),
 Row(Date='2009', count=223),
 Row(Date='1995', count=31),
 Row(Date='2006', count=211),
 Row(Date='Unknown', count=240),
 Row(Date='2004', count=148),
 Row(Date='2011', count=205),
 Row(Date='1989', count=2),
 Row(Date='1992', count=19),
 Row(Date='2008', count=191),
 Row(Date='1999', count=51),
 Row(Date='1994', count=24),
 Row(Date='1997', count=34),
 Row(Date='2007', count=170),
 Row(Date='1996', count=26),
 Row(Date='1983', count=1),
 Row(Date='1980', count=1),
 Row(Date='2021', count=246),
 Row(Date='1986', count=3),
 Ro

This seems more realistic. We are happy to continue processing our data as a result.

In [28]:
df_art.count()

4836

In [29]:
df_art.groupBy('Citation').count().collect()

[Row(Citation='7', count=138),
 Row(Citation='51', count=10),
 Row(Citation='718', count=2),
 Row(Citation='169', count=1),
 Row(Citation='205', count=1),
 Row(Citation='15', count=56),
 Row(Citation='383', count=1),
 Row(Citation='54', count=11),
 Row(Citation='200', count=1),
 Row(Citation='11', count=91),
 Row(Citation='101', count=1),
 Row(Citation='433', count=1),
 Row(Citation='138', count=2),
 Row(Citation='29', count=9),
 Row(Citation='42', count=11),
 Row(Citation='112', count=2),
 Row(Citation='73', count=2),
 Row(Citation='64', count=5),
 Row(Citation='3', count=280),
 Row(Citation='30', count=12),
 Row(Citation='113', count=2),
 Row(Citation='34', count=18),
 Row(Citation='133', count=1),
 Row(Citation='425', count=1),
 Row(Citation='59', count=5),
 Row(Citation='139', count=1),
 Row(Citation='146', count=1),
 Row(Citation='8', count=122),
 Row(Citation='160', count=1),
 Row(Citation='22', count=35),
 Row(Citation='28', count=17),
 Row(Citation='203', count=1),
 Row(Citatio

In [30]:
df_art = df_art.filter(df_art.Citation != ' may be modeled by means of functional dependencies (FD). They have been extended to â€¦"')

In [31]:
df_art.groupBy('Journal').count().collect()

[Row(Journal='International Journal of Performance Analysis in Sport', count=4),
 Row(Journal='The Journal of Logic and Algebraic Programming', count=1),
 Row(Journal='Artificial intelligence in medicine', count=1),
 Row(Journal='Proceedings of AICS 2018 -Irish Conference on Artificial Intelligence and Cognitive Science', count=1),
 Row(Journal='IEEE transactions on Education', count=1),
 Row(Journal='NTCIR-14 Conference', count=1),
 Row(Journal='Information technology: research and development', count=1),
 Row(Journal='Int. J. Asian Lang. Process.', count=1),
 Row(Journal='Automation in Construction', count=1),
 Row(Journal='arXiv preprint arXiv:2104.13473', count=2),
 Row(Journal='IConference 2016 Proceedings', count=1),
 Row(Journal='IEEE multimedia', count=1),
 Row(Journal='ICVS, Bielefeld', count=1),
 Row(Journal='Journal of Computational Science', count=5),
 Row(Journal='arXiv preprint arXiv:1906.09833', count=1),
 Row(Journal='Proceedings of the 2019 International Society of Beh

In [32]:
df_art.groupBy('Publisher').count().collect()

[Row(Publisher='ACM Press', count=2),
 Row(Publisher='European Languages Resources Association (ELRA)', count=1),
 Row(Publisher='MMM', count=1),
 Row(Publisher='Springer.', count=1),
 Row(Publisher='NLP Association of India (NLPAI)', count=1),
 Row(Publisher='In Press', count=1),
 Row(Publisher='IJSER', count=1),
 Row(Publisher='Cyan Publishing Projects', count=1),
 Row(Publisher='ACM IEEE', count=1),
 Row(Publisher='Elsevier Science BV', count=1),
 Row(Publisher='Lecture Notes in Computer Science (LNCS), Springer', count=1),
 Row(Publisher="Le Centre de Hautes Etudes Internationales D'Informatique Documentaire", count=1),
 Row(Publisher='http://ceur-ws.org/Vol-2670/', count=1),
 Row(Publisher='Institute of Information Scientists', count=1),
 Row(Publisher='SciTePress', count=3),
 Row(Publisher='AAAI Press', count=1),
 Row(Publisher='Taylor Graham', count=2),
 Row(Publisher='The Institution of Engineering and Technology', count=1),
 Row(Publisher='Universitat Politècnica de Catalunya'

In [33]:
df_art = df_art.filter((df_art.Publisher != 'ä¸€èˆ¬ç¤¾å›£æ³•äºº æ—¥æœ¬ç‰©ç\x90†å\xad¦ä¼š') & (df_art.Publisher != 'æ—¥çµŒ BP ç¤¾'))

# Let's try find the author with the most amount of citations attached to their work

In [34]:
df_grp = df_art.filter(df_art.Citation != 'Unknown')

In [35]:
df_grp.printSchema()

root
 |-- Title: string (nullable = true)
 |-- Citation: string (nullable = true)
 |-- Conference: string (nullable = true)
 |-- Publisher: string (nullable = true)
 |-- Journal: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Authors: string (nullable = true)
 |-- AuthorDCU: string (nullable = true)



In [36]:
from pyspark.sql.functions import col
from pyspark.sql.types import *
df_grp1 = df_grp.withColumn("Citation", col("Citation").cast(IntegerType()))

In [37]:
df_grp1.printSchema()

root
 |-- Title: string (nullable = true)
 |-- Citation: integer (nullable = true)
 |-- Conference: string (nullable = true)
 |-- Publisher: string (nullable = true)
 |-- Journal: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Authors: string (nullable = true)
 |-- AuthorDCU: string (nullable = true)



In [38]:
df_grp1.toPandas()

Unnamed: 0,Title,Citation,Conference,Publisher,Journal,Date,Authors,AuthorDCU
0,Efficient cube construction for smart city data,10.0,Unknown,CEUR-WS. org,Unknown,2016,"Michael Scriney, Mark Roantree",Michael Scriney
1,Generating cubes from smart city web data,7.0,Proceedings of the Australasian Computer Scien...,ACM,Unknown,2017,"Michael Scriney, Martin F O'Connor, Mark Roantree",Michael Scriney
2,Automating Data Mart Construction from Semi-st...,6.0,Unknown,Oxford University Press,The Computer Journal,2018,"Michael Scriney, Suzanne McCarthy, Andrew McCa...",Michael Scriney
3,Predicting customer churn for insurance data,4.0,International Conference on Big Data Analytics...,"Springer, Cham",Unknown,2020,"Michael Scriney, Dongyun Nie, Mark Roantree",Michael Scriney
4,Attention Based Video Summaries of Live Online...,3.0,"""AAAI-2021 Workshop on AI Education: """"Imagini...",Unknown,Unknown,2021,"Hyowon Lee, Mingming Liu, Hamza Riaz, Navaneet...",Michael Scriney
...,...,...,...,...,...,...,...,...
3584,Translating DVD Subtitles Using Example-Based ...,1.0,Unknown,proceedings from MuTra-Audiovisual Translation...,Unknown,2006,"Stephen Armstrong, Colm Caffrey, Marian Flanag...",Andy Way
3585,Classroom of the Future,1.0,Unknown,Unknown,Unknown,2006,"Gerard Cleary, Andy Way",Andy Way
3586,GF-DOP: Grammatical Feature Data-Oriented Parsing,1.0,Unknown,CSLI Publications,Unknown,2006,"Riona Finn, Mary Hearne, Andy Way, Josef van G...",Andy Way
3587,Lexicalisation of Long Distance Dependencies i...,1.0,Unknown,Unknown,Proceedings of the Eighth International Confer...,2003,"Aoife Cahill, Mairead McCarthy, Ruth O'Donovan...",Andy Way


In [39]:
x = df_grp1.groupBy("AuthorDCU").sum("Citation").orderBy("sum(Citation)", ascending=False)

In [40]:
x.show()

+-------------------+-------------+
|          AuthorDCU|sum(Citation)|
+-------------------+-------------+
|     Alan F Smeaton|        20641|
|       Gareth Jones|         8823|
|           Andy Way|         7495|
|         Tomas Ward|         5552|
|      Cathal Gurrin|         5033|
|  Heather J. Ruskin|         4021|
|      Yvette Graham|         3214|
|    Jennifer Foster|         2567|
|Alistair Sutherland|         2372|
|      Paul M.Clarke|         1797|
|       Martin Crane|         1710|
|   Alessandra Mileo|         1656|
|     Suzanne Little|         1317|
|        Rob Brennan|         1187|
|        Brian Davis|         1122|
|      Mark Roantree|         1073|
|       Murat Yilmaz|         1032|
|     Geoff Hamilton|          616|
|    Annalina Caputo|          591|
|         Long Cheng|          569|
+-------------------+-------------+
only showing top 20 rows



Alan Smeaton has the largest number of citations by some distance. Although he was involved in the most papers (797) there was still a lot of authors who had similarly large number of works with their name attached to them published but did not have near the same amount of publications. We will now try to look at the average number of citations per paper for each author.

In [41]:
from pyspark.sql import functions as F
y = df_grp1.groupBy("AuthorDCU").agg(F.mean('Citation'))

In [42]:
y.orderBy("avg(Citation)", ascending=False).show()

+-------------------+------------------+
|          AuthorDCU|     avg(Citation)|
+-------------------+------------------+
|      Yvette Graham| 55.41379310344828|
|Alistair Sutherland| 35.40298507462686|
|     Alan F Smeaton| 34.63255033557047|
|    Jennifer Foster|29.848837209302324|
|  Heather J. Ruskin| 27.35374149659864|
|         Tomas Ward|26.312796208530806|
|       Charlie Daly| 25.58823529411765|
|   Alessandra Mileo|22.684931506849313|
|      Paul M.Clarke|           22.4625|
|       Martin Crane|20.853658536585368|
| Dimitar Shterionov| 19.14814814814815|
|       Gareth Jones|18.974193548387095|
|           Andy Way|18.370098039215687|
|     Suzanne Little| 17.32894736842105|
|      Cathal Gurrin|16.889261744966444|
|          Irina Tal|16.428571428571427|
|       Murat Yilmaz|14.535211267605634|
|        Brian Davis|13.851851851851851|
|        Rob Brennan|12.112244897959183|
|       John McKenna|              12.0|
+-------------------+------------------+
only showing top

This is certainly an interesting result and completely changes how we see our results from earlier regarding number of citations related to an author. It is interesting to note that Alan Smeaton and Gareth Jones appear much lower in relative terms of average citations than they did in terms of total citations. Yvette Graham for example appeared in less than a tenth of the total of papers published related to Alan Smeaton but has on average roughly 20 more citations.

# Doras PySpark Analysis

In [109]:
df_doras = spark.read.csv("data/doras/combined_csv.csv", header=True)

In [110]:
df_doras.toPandas().shape

(1998, 29)

Let's run similar analysis on our doras data. Let's check for missing/unusual values and let's look at authors citations also.

In [111]:
cols = df_doras.columns
for i in range(len(cols)):
    print(str(cols[i]) + " has this number of null values: " + str(df_doras.filter(df_doras[cols[i]].isNull()).count()))

Research name has this number of null values: 0
Publication Title has this number of null values: 129
Author List has this number of null values: 321
Conf/Journal Details has this number of null values: 355
Year has this number of null values: 339
Full name has this number of null values: 343
Authors and Orcid has this number of null values: 482
Authors without a orcid has this number of null values: 801
ISBN has this number of null values: 1443
ISSN has this number of null values: 1546
Item Type has this number of null values: 408
Event Type has this number of null values: 730
Refereed has this number of null values: 576
Date of Award has this number of null values: 1986
Supervisor(s) has this number of null values: 1893
Uncontrolled Keywords has this number of null values: 851
Subject has this number of null values: 508
DCU Faculties and Centres has this number of null values: 528
Use License has this number of null values: 668
ID Code has this number of null values: 504
Deposited On

This certainly appears slightly more messy than before. We will try filter our data down a bit more. We will filter out papers with a missing title as this is too important of information to be missing before we analyse our data.

In [112]:
df_doras = df_doras.filter(df_doras['Publication Title'].isNotNull())

In [113]:
df_doras.groupBy('Research Name').count().collect()

[Row(Research Name='using neural networks."', count=1),
 Row(Research Name='exploration in virtual reality."', count=1),
 Row(Research Name='Algorithm; Multi-Criteria Decision Analysis."', count=1),
 Row(Research Name='Monica Ward', count=3),
 Row(Research Name='The original publication is available at www.springerlink.com"', count=31),
 Row(Research Name='user-generated spoken content retrieval."', count=1),
 Row(Research Name='retrieval targeting patient information needs."', count=1),
 Row(Research Name='to digital libraries."', count=1),
 Row(Research Name='development."', count=2),
 Row(Research Name='supervised relation extraction."', count=1),
 Row(Research Name='of user-generated content."', count=1),
 Row(Research Name='using data selection methods."', count=1),
 Row(Research Name='conditional context in interactive neural machine translation."', count=1),
 Row(Research Name='a case study on Hindi texts."', count=1),
 Row(Research Name='knowledge from slack chat channels."', c

There is a lot of incorrect values in this column in our PySpark dataframe. To get a more conclusive analysis we need to remove these values such as "helpful or harmful?" from the data. We will do this with our earlier data.

In [114]:
lst = df_dcu.toPandas()

In [115]:
lst = lst['Researcher']
lst = lst.tolist()

In [116]:
df_doras = df_doras.filter(F.col("Research Name").isin(lst))

In [117]:
df_doras.toPandas().shape

(1243, 29)

Let's check for the author who occurs the most now.

In [124]:
author_count = df_doras.groupby('Research Name').count()
author_count.orderBy('count', ascending=False).show()

+-------------------+-----+
|      Research Name|count|
+-------------------+-----+
|    Alan F. Smeaton|  319|
|      Cathal Gurrin|  221|
|           Andy Way|  215|
|    Jennifer Foster|   48|
|        Rob Brennan|   44|
|      Mark Roantree|   43|
|     Suzanne Little|   42|
|       Martin Crane|   40|
|       Graham Healy|   37|
|  Marija Bezbradica|   34|
|    Andrew McCarren|   31|
|      Yvette Graham|   25|
| Malika Bendechache|   24|
|         Tomas Ward|   23|
|Alistair Sutherland|   16|
|    Michael Scriney|   13|
| Dimitar Shterionov|   11|
|   Alessandra Mileo|   11|
|         Long Cheng|   10|
|          Irina Tal|   10|
+-------------------+-----+
only showing top 20 rows



These are very similar results to our google scholar analysis earlier. Alan F.Smeaton, Cathal Gurrin and Andy Way all have a large number of papers 

In [125]:
df_dcu

DataFrame[Researcher: string, Position: string]

In [126]:
df_dcu.toPandas()

Unnamed: 0,Researcher,Position
0,Michael Scriney,Insight Centre for Data Analytics
1,Malika Bendechache,"School of Computing, Dublin City University (DCU)"
2,Marija Bezbradica,"Lecturer, School of Computing, Dublin City Uni..."
3,Rob Brennan,"Assistant Professor, School of Computing, Dubl..."
4,Annalina Caputo,"Assistant Professor, Dublin City University, A..."
5,Long Cheng,"Professor, North China Electric Power University"
6,Paul Clarke,Associate Professor at Dublin City University ...
7,Martin Crane,"School of Computing, Dublin City University"
8,Charlie Daly,Lecturer in Computer Science
9,Brian Davis,"Assistant Professor, School of Computing, Dubl..."
