**What dataset are we working with?**

https://www.kaggle.com/datasets/mathurinache/citation-network-dataset/data?select=dblp.v12.json

How was the dataset collected?

For what purpose was the dataset collected?

In [None]:
import pandas as pd
import numpy as np
import json
import csv
import matplotlib.pyplot as plt


from kaggle.api.kaggle_api_extended import KaggleApi
from zipfile import ZipFile 


### RUN clean_data.py OR GET data.csv FROM GOOGLE DRIVE BEFORE RUNNING THIS NOTEBOOK!

In [None]:
# Load the indexed citation records from the local CSV into a DataFrame.
citations_df = pd.read_csv(filepath_or_buffer='indexed_data_rory.csv')

# Preview the first five rows (rendered as the cell's output).
citations_df.head(n=5)

In [None]:
# Summarise the frame: column labels first, then its (rows, columns) shape.
for summary in (citations_df.columns, citations_df.shape):
    print(summary)

# Count the rows whose ID is missing.
missing_id_count = citations_df["ID"].isna().sum()
print(missing_id_count)

There are 3,538,030 rows of data. Of these rows of data, all of them have an ID that is not NA.

In [None]:
# Show the column labels available in the dataset.
column_labels = citations_df.columns
print(column_labels)

In [None]:
# Print the number of missing values in each column, one count per line,
# in the same column order as the dataset.
columns_to_check = [
    "ID",
    "Title",
    "Year",
    "Citations",
    "Document Type",
    "Authors",
    "Venue",
    "Field of Study",
]
for column_name in columns_to_check:
    print(citations_df[column_name].isna().sum())

While looking at this we saw that "Document Type" had 245,676 rows containing NA elements; "Venue" had 33,499 rows with NA, and "Field of Study" had 3,128 rows with NA. We will be dropping these so that we are working with complete data.

In [None]:
# Keep only the rows in which every column has a value
# (axis=0 / how="any" are the dropna defaults, spelled out for clarity).
citations_df = citations_df.dropna(axis=0, how="any")

# Report the (rows, columns) shape that remains.
remaining_shape = citations_df.shape
print(remaining_shape)

There are now 3,277,181 rows of data that have all their columns filled out.

Check if all IDs are unique.

In [None]:
# Check if all IDs are unique: compare the row count with the number of
# distinct IDs, then list every ID occurrence that repeats an earlier one.
ids = set(citations_df["ID"])
print(citations_df.shape[0])
print(len(ids))

# duplicated(keep="first") flags each occurrence of an ID after its first,
# so an ID seen three times is listed twice. This avoids a Python-level loop
# over every row and a loop variable named `id` that would shadow the builtin.
id_column = citations_df["ID"]
repeatedIds = id_column[id_column.duplicated(keep="first")].tolist()

print("This is num of repeated IDs: " + str(len(repeatedIds)))
print("These are the repeated IDs:")
print(repeatedIds)

There are no repeated IDs.

In [None]:
# Each column of the frame is a Series, so min()/max() apply directly.
year_column = citations_df["Year"]
earliest_year = year_column.min()
latest_year = year_column.max()
print(earliest_year)
print(latest_year)

There are no papers whose Year is 0.

In [None]:
# Papers written between 1800 (inclusive) and 1899 (inclusive)
publication_year = citations_df["Year"]
in_1800s = publication_year.ge(1800) & publication_year.lt(1900)
print(citations_df[in_1800s])

In [None]:
# Papers written between 1900 (inclusive) and 1999 (inclusive).
# The upper bound is exclusive, so it must be 2000 for 1999 papers to be kept.
print(citations_df[(citations_df["Year"] >= 1900) & (citations_df["Year"] < 2000)])

In [None]:
# Papers written in 2000 or later
from_2000_onward = citations_df["Year"].ge(2000)
print(citations_df[from_2000_onward])

In [None]:
# Number of distinct titles, computed once and reused below.
unique_title_count = len(citations_df["Title"].unique())
print(unique_title_count)

# Rows beyond the distinct-title count carry a title that already appeared.
surplus_title_count = citations_df.shape[0] - unique_title_count
print(f"There are {surplus_title_count} papers that share the same title")

Of the 3,277,181 papers in the dataset, there are only 3,232,994 unique titles. This means that 44,187 papers have a title that already belongs to another paper in the dataset.

In [None]:
# Display the first row whose citation count equals the dataset maximum.
highest_citation_count = citations_df["Citations"].max()
most_cited_labels = citations_df.index[citations_df["Citations"] >= highest_citation_count]
citations_df.loc[most_cited_labels[0]]


In [None]:
# Smallest and largest citation counts in the dataset.
citation_counts = citations_df["Citations"]
fewest_citations = citation_counts.min()
most_citations = citation_counts.max()
print(fewest_citations)
print(most_citations)

# Every paper that reaches the maximum citation count.
print(citations_df[citation_counts >= most_citations])

The most cited paper is "Distinctive Image Features from Scale-Invariant Keypoints", with 35,541 citations.

WORKING ON ISSUE #1

In [None]:
from collections import Counter

# Tally how many papers mention each field of study. Every "Field of Study"
# cell is a single string of field names separated by ", ".
# The Counter is converted back to a plain dict so that it prints, and
# iterates, in first-seen order exactly like a hand-built dict.
fosDict = dict(
    Counter(
        field
        for fields in citations_df["Field of Study"]
        for field in fields.split(", ")
    )
)

print(fosDict)


fieldsKeys = list(fosDict.keys())
fieldsValues = list(fosDict.values())

# Frequency table of fields, most common first, with a fresh 0..n-1 index so
# that head()/tail() give the most and least common fields.
fos_df = (
    pd.DataFrame({
        "Field": fieldsKeys,
        "Frequency": fieldsValues,
    })
    .sort_values("Frequency", ascending=False)
    .reset_index(drop=True)
)

In [None]:
# Show the five most frequent fields (fos_df is sorted by descending Frequency).
most_common_fields = fos_df.iloc[:5]
print(most_common_fields)

In [None]:
# Print out 5 least common fields (fos_df is sorted by descending Frequency,
# so they are the last five rows).
print(fos_df.tail(5))

In [None]:
# Count the fields of study that occur exactly once in the whole dataset.
num_lowest_freq_field = sum(1 for frequency in fosDict.values() if frequency == 1)

print(num_lowest_freq_field)

24,390 fields of study appear only once in the entire dataset.

TASK 2:


In [None]:
# Build one row per (paper, field) pair. Each "Field of Study" cell is a single
# ", "-separated string, and DataFrame.explode leaves plain strings unchanged,
# so the string has to be split into a list before exploding. Only the columns
# needed below are kept, and citations_df itself is left untouched so that
# other cells still see one row per paper.
field_rows_df = (
    citations_df[["Field of Study", "Authors", "Citations"]]
    .assign(**{"Field of Study": citations_df["Field of Study"].str.split(", ")})
    .explode("Field of Study")
)

# Group the per-field rows by the individual field name
grouped = field_rows_df.groupby("Field of Study")

# Maps each field to a list of up to 5 (author, cumulative citations) tuples,
# ordered from most to least cited
top_influential_figures = {}

# Iterate over each field's papers
for field, group in grouped:
    # Cumulative citations for each author within this field
    author_citations = {}

    # Walk the two needed columns directly instead of using iterrows(), which
    # builds a Series per row; tolist() yields native Python numbers.
    for author_names, paper_citations in zip(group["Authors"], group["Citations"].tolist()):
        # "Authors" is a ", "-separated string; every listed author is
        # credited with the paper's full citation count
        for author in author_names.split(", "):
            author_citations[author] = author_citations.get(author, 0) + paper_citations

    # Sort authors by their cumulative citations in descending order
    # (sorted() is stable, so tied authors keep first-seen order)
    sorted_authors = sorted(author_citations.items(), key=lambda x: x[1], reverse=True)

    # Select the top 5 authors with the highest cumulative citations
    top_influential_figures[field] = sorted_authors[:5]

In [None]:
# Print each field followed by its list of (author, cumulative citations) pairs.
for field_name, leading_authors in top_influential_figures.items():
    print(field_name, leading_authors)

TASK 3: 
Find the five fields with the biggest growth in published papers. Growth here is simply the increase in the number of papers published over one year, or over five years.

In [None]:
# Group the papers by publication year. Despite its name, grouped_df is a
# pandas GroupBy object, not a DataFrame.
grouped_df = citations_df.groupby(by="Year")

In [None]:
# Build {year: {field of study: number of mentions in that year}}.
year_fos_dict = {}
for year in grouped_df.indices:
    papers_in_year = grouped_df.get_group(year)

    # Each "Field of Study" cell is a ", "-separated string; tally every
    # field it names.
    mentions_by_field = {}
    for fields_text in papers_in_year["Field of Study"]:
        for field_name in fields_text.split(", "):
            mentions_by_field[field_name] = mentions_by_field.get(field_name, 0) + 1

    year_fos_dict[year] = mentions_by_field

In [None]:
# Inspect year_fos_dict: one line per year with that year's field counts.
for publication_year, field_counts in year_fos_dict.items():
    print(publication_year, field_counts)

In [None]:
# The five most frequent fields overall (fos_df is sorted by descending Frequency).
topFields = fos_df.head(5)["Field"].tolist()

# The span of years is the same for every field, so compute it once.
first_year = citations_df.Year.min()
last_year = citations_df.Year.max()

# For each top field, build a list of yearly mention counts covering every
# year from first_year to last_year inclusive, so index 0 corresponds to
# first_year. A year with no papers at all, or with no mention of the field,
# contributes 0.
fos_freq_year_dict = {
    fos: [
        year_fos_dict.get(year, {}).get(fos, 0)
        for year in range(first_year, last_year + 1)
    ]
    for fos in topFields
}

print(fos_freq_year_dict)


In [None]:
# Check if outputs look good. Iterating the dict (instead of hard-coding the
# field names) keeps this cell working if the top five fields change.

# Every list should have the same length: one entry per year in the range.
for yearly_counts in fos_freq_year_dict.values():
    print(len(yearly_counts))

# Each total should match that field's Frequency in the table printed below.
for yearly_counts in fos_freq_year_dict.values():
    print(sum(yearly_counts))

print(fos_df.head(5))

In [None]:
# Create a line chart starting from the year 1950 and going to the maximum
# year in citations_df minus 2 years (the last two years are dropped because
# the curves dip sharply there).
first_year = int(citations_df.Year.min())
last_year = int(citations_df.Year.max())
trailing_years_dropped = 2

# Index 0 of every list in fos_freq_year_dict corresponds to first_year, so
# the slice offset is derived from the data rather than assuming the dataset
# starts in 1800. If the data starts after 1950, plot from its first year.
plot_start_year = max(1950, first_year)
start_index = plot_start_year - first_year

years = list(range(plot_start_year, last_year + 1 - trailing_years_dropped))

for key in fos_freq_year_dict:
    yearly_counts = fos_freq_year_dict[key]
    end_index = len(yearly_counts) - trailing_years_dropped
    plt.plot(years, yearly_counts[start_index:end_index], label=key)

plt.xlabel("Year")
plt.ylabel("Frequency")
plt.title('Num Mentions of a Field Per Year')
plt.legend(loc="upper left")
plt.show()

The top 5 most popular fields all appear to have started growing rapidly around 1990.