# Notebook 1: Data Collection and Inspection
The data is taken from the [journal ranking dataset](https://www.kaggle.com/datasets/xabirhasan/journal-ranking-dataset) on Kaggle. Details about the scraping process are in [this GitHub repo](https://github.com/abir0/SJR-Journal-Ranking).


In [None]:
# import all required libraries
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
# use a larger default font for every matplotlib figure in this notebook
plt.rcParams.update({"font.size": "20"})

In [None]:
# read in the dataset using pandas
# low_memory=False makes pandas infer each column's dtype from the whole file
# at once instead of chunk by chunk; this avoids the mixed-type DtypeWarning
# that the default chunked parsing raised (reportedly for the "Publisher"
# column, which is not needed in this notebook anyway)
raw_dataset = pd.read_csv("../data/journal_ranking_data.csv", low_memory=False)

In [None]:
# preview the first five rows to check that the file was parsed as expected
raw_dataset.head(n=5)

In [None]:
# summary statistics of the raw dataset (pandas' default describe() output)
raw_dataset.describe()

In [None]:
# pairwise Pearson correlations, restricted to the numeric columns
corr = raw_dataset.corr(method="pearson", numeric_only=True)
# display the matrix as a table whose cells are shaded with the coolwarm colormap
corr.style.background_gradient(cmap="coolwarm")

In [None]:
# plotly histogram of CiteScore in the raw data, to inspect how unbalanced it is
fig = px.histogram(
    data_frame=raw_dataset,
    x="CiteScore",
    nbins=400,
)
fig.show()

In [None]:
# remove all rows with a very high CiteScore (strictly greater than 100)
# the index of the offending rows is computed once and reused; rows with a
# missing CiteScore compare as False and are therefore kept
high_citescore_index = raw_dataset.loc[raw_dataset["CiteScore"].gt(100)].index
clean_dataset = raw_dataset.drop(index=high_citescore_index)

In [None]:
# plotly histogram of CiteScore after dropping the values above 100
fig = px.histogram(
    data_frame=clean_dataset,
    x="CiteScore",
    nbins=400,
)
fig.show()

In [None]:
# box plot of CiteScore to make the remaining outliers visible
fig = px.box(
    data_frame=clean_dataset,
    x="CiteScore",
)
fig.show()

In [None]:
# plotly histogram of Cites/Doc. 2y in the raw data, to inspect how unbalanced it is
fig = px.histogram(
    data_frame=raw_dataset,
    x="Cites/Doc. 2y",
    nbins=400,
)
fig.show()

In [None]:
# drop every row whose Cites/Doc. 2y is strictly greater than 43
clean_dataset = clean_dataset.drop(
    index=clean_dataset.loc[clean_dataset["Cites/Doc. 2y"] > 43].index
)

In [None]:
# plotly histogram of Cites/Doc. 2y after dropping the values above 43
fig = px.histogram(
    data_frame=clean_dataset,
    x="Cites/Doc. 2y",
    nbins=400,
)
fig.show()

In [None]:
# box plot of Cites/Doc. 2y to make the remaining outliers visible
fig = px.box(
    data_frame=clean_dataset,
    x="Cites/Doc. 2y",
)
fig.show()

In [None]:
# de-duplicate the cleaned dataset: rows count as duplicates when they share
# the same (CiteScore, Cites/Doc. 2y) pair; only the first occurrence is kept
no_duplicates = clean_dataset.drop_duplicates(
    subset=["CiteScore", "Cites/Doc. 2y"],
    keep="first",
)

In [None]:
# plotly histogram of CiteScore after removing duplicates
fig = px.histogram(
    data_frame=no_duplicates,
    x="CiteScore",
    nbins=400,
)
fig.show()

In [None]:
# plotly histogram of Cites/Doc. 2y after removing duplicates
fig = px.histogram(
    data_frame=no_duplicates,
    x="Cites/Doc. 2y",
    nbins=400,
)
fig.show()