In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Load the open-access journals dataset from the relative ../input directory.
df = pd.read_csv('../input/journals-impact-factor/MainOpenAccessJournalsData.csv')

### View Data

In [None]:
# Preview the first rows of the raw frame.
df.head()

In [None]:
# Preview the last rows of the raw frame.
df.tail()

### Data Description and Information

In [None]:
# Summarise the columns: dtype and non-null count for each.
df.info()

In [None]:
# Descriptive statistics (count, mean, spread, quartiles) for the numeric columns.
df.describe()

### Check for NaNs

In [None]:
# Count the missing values in each column.
df.isnull().sum() # NaN Values by Column

In [None]:
# Sum the per-column counts to get the number of missing cells in the whole frame.
df.isnull().sum().sum() # Total NaN Values

In [None]:
# Number of rows that contain at least one missing value.
int(df.isna().any(axis="columns").sum())

### Drop NaNs

In [None]:
# (rows, columns) of the raw frame, as a baseline before any rows are dropped.
df.shape

In [None]:
# Keep only fully populated rows; dropna returns a new frame, so df is left untouched.
subset_df = df.dropna()

In [None]:
# Compare with df.shape to see how many rows survive dropping every incomplete row.
subset_df.shape # too much loss

### Select specific columns for analysis

In [None]:
# Narrow the frame to the journal name, the three DoI counts and the impact factor.
# NOTE(review): this cell selects "JournalName" while later cells select "Title" — confirm both columns exist in the CSV.
subset_df = df[["JournalName","BackFileDoIs","CurrentDoIs","TotalDoIs","ImpactFactor"]]
subset_df.shape

In [None]:
# Rows that are complete across the selected columns (a new frame; subset_df is unchanged).
num_df = subset_df.dropna()
num_df.shape

In [None]:
# List every column name available in the raw frame.
df.columns

# Analysis

### Top Impact Factor Journals in 2020

In [None]:
import warnings
# NOTE(review): this silences every warning category for the rest of the session;
# narrow the filter if only specific warnings should be hidden.
warnings.filterwarnings('ignore')

def clean_text_series(ser):
    """Strip leading and trailing double quotes from the strings in a Series.

    Parameters
    ----------
    ser : pandas.Series
        Column to clean; it must support the ``.str`` accessor.

    Returns
    -------
    pandas.Series
        A new Series with the same index. String elements lose any
        surrounding ``"`` characters; missing values and non-string elements
        are returned unchanged. The input Series is not modified.
    """
    stripped = ser.str.strip('"')
    # `.str` methods yield NaN for elements that are not strings; fall back to
    # the original value there so mixed-type columns do not silently lose data.
    return stripped.where(stripped.notna(), ser)

def show_details(df,cols,num):
    """Return the first ``num`` cleaned rows of ``df`` restricted to ``cols``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame, already in the order the rows should be shown in.
    cols : list of str
        Column names to keep.
    num : int
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        A new frame with only ``cols``. Rows in which every selected column is
        missing are dropped, text columns have surrounding double quotes
        stripped, and, when ``"Title"`` is one of ``cols``, rows without a
        title are removed. ``df`` itself is never modified.
    """
    # Work on an explicit copy: assigning columns on a slice of `df` is the
    # chained-assignment pattern that pandas flags with SettingWithCopyWarning.
    small_df = df[cols].dropna(axis=0, how='all').copy()
    for column_name in small_df.columns:
        column = small_df[column_name]
        # Text may be stored as object dtype or as a dedicated string dtype,
        # depending on the pandas version and how the frame was built.
        if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
            small_df[column_name] = clean_text_series(column)
    if "Title" in cols:
        small_df = small_df[small_df["Title"].notna()]
    return small_df.head(num)

# Rank every journal by its 2020 impact factor, highest first, and show the top 25.
top_journals_2020 = df.sort_values(by='ImpactFactor2020', ascending=False)
show_details(
    top_journals_2020,
    cols=['Title', 'Publisher', 'ImpactFactor2020', 'BackFileDoIs', 'CurrentDoIs', 'TotalDoIs'],
    num=25,
)

### Top Impact Factor Journals in 2021

In [None]:
# Rank every journal by the `ImpactFactor` column, highest first, and show the top 25.
# NOTE(review): presumably `ImpactFactor` holds the 2021 value, per the section heading — confirm.
top_journals_2021 = df.sort_values(by='ImpactFactor', ascending=False)
show_details(
    top_journals_2021,
    cols=['Title', 'Publisher', 'ImpactFactor', 'BackFileDoIs', 'CurrentDoIs', 'TotalDoIs'],
    num=25,
)

### Correlation Between Number of DoIs and ImpactFactor?

In [None]:
# Plotting setup: render matplotlib figures inline and give seaborn a large
# default figure size (20 x 12 inches) for every later plot.
import matplotlib.pyplot as plt
%matplotlib inline
# plt.style.use('dark_background')
import seaborn as sns
sns.set(rc={'figure.figsize':(20,12)})

In [None]:
# One 30-bin histogram per numeric column; the trailing `;` hides the axes repr.
df.hist(bins=30,figsize=(20,12),grid=False,color="crimson");

In [None]:
# Pairwise scatter plots (histograms on the diagonal) of the numeric columns.
# No `hue` is passed: a continuous column such as `TotalDoIs` used as hue is
# removed from the grid variables and produces one colour level per distinct
# value, which hides the very relationship this section is about.
sns.pairplot(df);

### Correlation Coefficients

In [None]:
# Pearson correlation (the pandas default; 'kendall' and 'spearman' are also
# available) between the numeric columns, rounded to two decimals.
# Text columns are excluded first: they cannot be correlated, and on
# pandas >= 2.0 `DataFrame.corr()` raises instead of silently skipping them.
correlation_matrix = df.select_dtypes(include="number").corr().round(2)

In [None]:
# Display the rounded correlation table.
correlation_matrix

In [None]:
# Draw the correlation matrix as a heatmap with the coefficient printed in each cell.
plt.figure(figsize=(20,10))             # open a new 20 x 10 inch figure for the heatmap
sns.heatmap(data=correlation_matrix,cmap="inferno", annot=True)  # annot=True writes each value inside its square

In [None]:
# Pair up DoI totals with impact factors, treating zeros as missing and
# discarding every row where either value is missing.
two_vals = (
    df[["TotalDoIs", "ImpactFactor"]]
    .replace(0, np.nan)
    .dropna()
)
two_vals.shape

In [None]:
# Scatter TotalDoIs (x) against ImpactFactor (y), colouring each point by its
# impact factor; the colour bar therefore shows the ImpactFactor scale.
# Note: plt.style.use switches the matplotlib style for all later figures too.
plt.style.use('dark_background')
plt.scatter(two_vals["TotalDoIs"],two_vals["ImpactFactor"],c=two_vals["ImpactFactor"],cmap='viridis')
plt.xlabel("TotalDoIs")
plt.ylabel("ImpactFactor")
plt.colorbar()
plt.grid(False)
plt.show()

### Journals with more DoIs tend to have a higher Impact Factor, which suggests (but does not prove) that they are stronger journals