In [12]:
!pip3 install matplotlib numpy pandas seaborn scikit-learn plotly

Collecting plotly
  Downloading plotly-5.17.0-py2.py3-none-any.whl (15.6 MB)
[K     |████████████████████████████████| 15.6 MB 7.8 MB/s eta 0:00:01    |████████▋                       | 4.2 MB 7.8 MB/s eta 0:00:02
Collecting tenacity>=6.2.0
  Downloading tenacity-8.2.3-py3-none-any.whl (24 kB)
Installing collected packages: tenacity, plotly
Successfully installed plotly-5.17.0 tenacity-8.2.3


In [126]:
import os
import matplotlib.pyplot as plt
import numpy as np
import json
import pandas as pd
import warnings
import seaborn as sns
import plotly.express as px
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
# Ignore every warning category for the rest of the kernel session
# (presumably to keep cell output uncluttered).
# NOTE(review): this also hides deprecation and runtime warnings raised by
# pandas / seaborn / scikit-learn / plotly — confirm that is intended.
warnings.filterwarnings('ignore')

## Authorship trends over the years

In [130]:
author_files = ['144783904.json', '145580839.json', '1982950.json', '2928777.json']
author_files = [f'data/authors/{f}' for f in author_files]

# JSON flag column -> label shown in the legend.
ROLE_LABELS = {
    'first_author': 'First Author',
    'middle_author': 'Middle Author',
    'last_author': 'Last Author',
}
ROLE_COLUMNS = list(ROLE_LABELS)

# customdata[0] is `role_paper_count`, the first enabled hover_data column below.
hover_template = "Year Period: %{x}<br>Paper Count: %{customdata[0]}"


def _role_share_by_bucket(papers, bucket_size):
    """Summarise author roles per year bucket in long format.

    Parameters
    ----------
    papers : DataFrame with a numeric `year` column and the three role flag
        columns listed in ROLE_COLUMNS.
    bucket_size : width of each year bucket, in years; buckets start at the
        earliest year present in `papers`.

    Returns
    -------
    DataFrame with one row per (year_bucket, role) and the columns
    `year_bucket`, `role`, `percentage` and `role_paper_count`.
    """
    first_year = papers['year'].min()
    year_bucket = first_year + ((papers['year'] - first_year) // bucket_size) * bucket_size
    by_bucket = papers.assign(year_bucket=year_bucket).groupby('year_bucket')[ROLE_COLUMNS]

    melt_kwargs = dict(id_vars='year_bucket', var_name='role')
    percentages = (by_bucket.mean() * 100).reset_index().melt(value_name='percentage', **melt_kwargs)
    # Summing the flags gives exactly one count per (bucket, role), even when a
    # paper carries more than one role flag; grouping on the flag combinations
    # instead would yield several partial counts and duplicate rows after the merge.
    counts = by_bucket.sum().reset_index().melt(value_name='role_paper_count', **melt_kwargs)

    summary = percentages.merge(counts, on=['year_bucket', 'role'], how='left')
    summary['role'] = summary['role'].map(ROLE_LABELS)
    return summary.fillna(0)


def _bucket_labels(bucket_starts, bucket_size, last_year):
    """Return "start-end" tick labels, capping the end at `last_year`.

    A bucket that covers a single year is labelled with that year alone.
    """
    last_year = int(last_year)
    labels = []
    for start in bucket_starts:
        start = int(start)
        end = min(start + bucket_size - 1, last_year)
        labels.append(str(start) if start == end else f"{start}-{end}")
    return labels


for author_file in author_files:
    df = pd.read_json(author_file)
    # Author 2928777 is plotted with 2-year buckets, every other author with 6-year buckets.
    yb_size = 2 if '2928777' in author_file else 6
    df_melted = _role_share_by_bucket(df, yb_size)

    fig = px.bar(df_melted, x='year_bucket', y='percentage',
                color='role', barmode='group', hover_name='percentage',
                labels={'year_bucket': 'Year Period', 'percentage': 'Percentage of Papers', 'role': 'Author Role'},
                title=f'Trends in Author Role over {df["author_of_interest"].iloc[0]}\'s Career',
                hover_data={"role_paper_count": True, "percentage": True, "year_bucket": False},
                text='percentage',
                )
    fig.update_traces(texttemplate='%{text:.1f}%', textposition='outside', hovertemplate=hover_template)

    # Label each bucket with the year range it covers, capped at the last year
    # present in the data rather than a hard-coded calendar year.
    bucket_starts = df_melted['year_bucket'].unique()
    x_axis_labels = _bucket_labels(bucket_starts, yb_size, df['year'].max())
    fig.update_xaxes(tickvals=bucket_starts, ticktext=x_axis_labels)

    fig.update_layout(title_x=0.5, title_y=0.95, title_xanchor='center')
    # Headroom above 100% so the text placed outside the bars is not clipped.
    fig.update_yaxes(range=[0, 108])
    fig.show()

## t-SNE viz of papers across the years (color coded)

In [131]:
# Load the per-author JSON exports from data/authors/ and draw one scatter per
# author, using the precomputed `x` / `y` columns and colouring points by year.
author_files = ['144783904.json', '145580839.json', '1982950.json', '2928777.json']
author_files = [f'data/authors/{f}' for f in author_files]

# Hover card: bold title, italic author list, then the year.
# customdata[0] / customdata[1] are the enabled hover_data columns (authors, year).
hover_template = "<b>%{hovertext}</b><br><i>%{customdata[0]}</i><br>Year: %{customdata[1]}"

for author_file in author_files:
    df = pd.read_json(author_file)
    # Positional lookup: does not depend on the frame having a 0-based index.
    author_name = df['author_of_interest'].iloc[0]

    # Plot using plotly express for interactive hover action
    fig = px.scatter(
        df, x='x', y='y', color='year', hover_name='title',
        hover_data={'x': False, 'y': False, 'authors': True, 'year': True, 'title': False, 'tldr': False},
        color_continuous_scale='viridis',
        title=f"t-SNE Viz of {author_name}'s papers",
    )

    # The embedding axes carry no meaning, so hide grid, ticks, zero line and titles.
    fig.update_xaxes(showgrid=False, showticklabels=False, zeroline=False, title_text='')
    fig.update_yaxes(showgrid=False, showticklabels=False, zeroline=False, title_text='')

    if len(df['year'].unique()) < 5:
        # With only a few distinct years, pin the colorbar ticks to whole years.
        min_year = int(df['year'].min())
        max_year = int(df['year'].max())
        tick_values = list(range(min_year, max_year + 1))

        fig.update_layout(coloraxis_colorbar=dict(
            tickvals=tick_values,
            ticktext=tick_values
        ))

    fig.update_traces(hovertemplate=hover_template)
    # Move the title to the center
    fig.update_layout(title_x=0.5, title_y=0.95, title_xanchor='center')
    fig.show()


In [153]:
# All files in data/author/*, load json with pandas
import pandas as pd
import plotly.express as px

# Load the per-author JSON exports from data/authors/ and draw one scatter per
# author: colour encodes the year, marker symbol encodes the `cluster` column.
author_files = ['144783904.json', '145580839.json', '1982950.json', '2928777.json']
author_files = [f'data/authors/{f}' for f in author_files]

# Hover card: bold title, italic author list, then the year.
# customdata[0] / customdata[1] are the enabled hover_data columns (authors, year).
hover_template = "<b>%{hovertext}</b><br><i>%{customdata[0]}</i><br>Year: %{customdata[1]}"

# Marker symbol for each value of the `cluster` column.
symbol_map = {0: 'star', 1: 'circle', 2: 'square',
              3: 'diamond', 4: 'pentagon'}

for author_file in author_files:
    df = pd.read_json(author_file)
    # Positional lookup: does not depend on the frame having a 0-based index.
    author_name = df['author_of_interest'].iloc[0]

    # Plot using plotly express for interactive hover action
    fig = px.scatter(
        df, x='x', y='y', color='year', hover_name='title',
        hover_data={'x': False, 'y': False, 'authors': True, 'year': True, 'title': False, 'tldr': False},
        color_continuous_scale='viridis',
        title=f"t-SNE Viz of {author_name}'s papers - Cluster Relationship",
        symbol='cluster', symbol_map=symbol_map,
    )

    # The embedding axes carry no meaning, so hide grid, ticks, zero line and titles.
    fig.update_xaxes(showgrid=False, showticklabels=False, zeroline=False, title_text='')
    fig.update_yaxes(showgrid=False, showticklabels=False, zeroline=False, title_text='')

    # Lay the symbol legend out horizontally, positioned at x=0.5.
    fig.update_layout(legend=dict(x=0.5, xanchor='auto', orientation='h'))

    if len(df['year'].unique()) < 5:
        # With only a few distinct years, pin the colorbar ticks to whole years.
        min_year = int(df['year'].min())
        max_year = int(df['year'].max())
        tick_values = list(range(min_year, max_year + 1))

        fig.update_layout(coloraxis_colorbar=dict(
            tickvals=tick_values,
            ticktext=tick_values
        ))

    fig.update_traces(hovertemplate=hover_template)
    # Move the title to the center
    fig.update_layout(title_x=0.5, title_y=0.95, title_xanchor='center')
    fig.show()


In [149]:
# All files in data/author/*, load json with pandas
import pandas as pd
import plotly.express as px

# Load the per-author JSON exports from data/authors/ and draw one scatter per
# author: colour encodes the year, marker symbol encodes the author's role.
author_files = ['144783904.json', '145580839.json', '1982950.json', '2928777.json']
author_files = [f'data/authors/{f}' for f in author_files]

# Hover card: bold title, italic author list, then the year.
# customdata[0] / customdata[1] are the enabled hover_data columns (authors, year).
hover_template = "<b>%{hovertext}</b><br><i>%{customdata[0]}</i><br>Year: %{customdata[1]}"

# Legend order and marker symbol for each author role.
AUTHOR_TYPES = ['First Author', 'Middle Author', 'Last Author']
symbol_map = {'First Author': 'star', 'Middle Author': 'circle', 'Last Author': 'square'}

for author_file in author_files:
    df = pd.read_json(author_file)
    # Positional lookup: does not depend on the frame having a 0-based index.
    author_name = df['author_of_interest'].iloc[0]

    # Derive one role per paper: `first_author` wins over `last_author`, and
    # everything else is a middle author. Assigning 'Last Author' before
    # 'First Author' keeps that precedence when both flags are set.
    author_type = pd.Series('Middle Author', index=df.index, dtype=object)
    author_type.loc[df['last_author'].astype(bool)] = 'Last Author'
    author_type.loc[df['first_author'].astype(bool)] = 'First Author'
    df['Author Type'] = pd.Categorical(author_type, categories=AUTHOR_TYPES, ordered=True)

    # Plot using plotly express for interactive hover action.
    # `category_orders` fixes the legend/trace order; without it plotly express
    # orders the roles by first appearance in the data, which differs per author.
    fig = px.scatter(
        df, x='x', y='y', color='year', hover_name='title',
        hover_data={'x': False, 'y': False, 'authors': True, 'year': True, 'title': False, 'tldr': False},
        color_continuous_scale='viridis',
        title=f"t-SNE Viz of {author_name}'s papers - Author Type",
        symbol='Author Type', symbol_map=symbol_map,
        category_orders={'Author Type': AUTHOR_TYPES},
    )

    # The embedding axes carry no meaning, so hide grid, ticks, zero line and titles.
    fig.update_xaxes(showgrid=False, showticklabels=False, zeroline=False, title_text='')
    fig.update_yaxes(showgrid=False, showticklabels=False, zeroline=False, title_text='')

    # Lay the symbol legend out horizontally, positioned at x=0.5.
    fig.update_layout(legend=dict(x=0.5, xanchor='auto', orientation='h'))

    if len(df['year'].unique()) < 5:
        # With only a few distinct years, pin the colorbar ticks to whole years.
        min_year = int(df['year'].min())
        max_year = int(df['year'].max())
        tick_values = list(range(min_year, max_year + 1))

        fig.update_layout(coloraxis_colorbar=dict(
            tickvals=tick_values,
            ticktext=tick_values
        ))

    fig.update_traces(hovertemplate=hover_template)
    # Move the title to the center
    fig.update_layout(title_x=0.5, title_y=0.95, title_xanchor='center')
    fig.show()


## Citation-scaled t-SNE viz of papers across the years (color coded)

In [132]:
# All files in data/author/*, load json with pandas

import os

# Load the per-author JSON exports from data/authors/ and draw one scatter per
# author: colour encodes the year, marker size encodes the citation count.
author_files = ['144783904.json', '145580839.json', '1982950.json', '2928777.json']
author_files = [f'data/authors/{f}' for f in author_files]

# Hover card: bold title, italic author list, year, then the citation count.
# customdata[0..2] are the enabled hover_data columns (authors, year, citation_count).
hover_template = "<b>%{hovertext}</b><br><i>%{customdata[0]}</i><br>Year: %{customdata[1]}<br>Citations: %{customdata[2]}"

# Smallest rendered marker, in pixels. Marker size is driven by the citation
# count, so without a floor a paper with zero citations would be drawn with
# size 0 and could be neither seen nor hovered.
MIN_MARKER_PX = 3

for author_file in author_files:
    df = pd.read_json(author_file)
    # Positional lookup: does not depend on the frame having a 0-based index.
    author_name = df['author_of_interest'].iloc[0]

    # Column used for marker size. Missing counts are treated as 0 here only;
    # the hover card still shows the original `citation_count` value.
    df['citation_count_scaled'] = df['citation_count'].fillna(0)

    # Plot using plotly express for interactive hover action
    fig = px.scatter(
        df, x='x', y='y', color='year', hover_name='title',
        hover_data={'x': False, 'y': False, 'authors': True, 'year': True, 'citation_count': True,
                    'citation_count_scaled': False, 'title': False, 'tldr': False},
        size='citation_count_scaled',
        color_continuous_scale='viridis',
        title=f"Citation-Scaled t-SNE Viz of {author_name}'s papers",
    )

    # The embedding axes carry no meaning, so hide grid, ticks, zero line and titles.
    fig.update_xaxes(showgrid=False, showticklabels=False, zeroline=False, title_text='')
    fig.update_yaxes(showgrid=False, showticklabels=False, zeroline=False, title_text='')
    fig.update_traces(hovertemplate=hover_template, marker=dict(sizemin=MIN_MARKER_PX))

    if len(df['year'].unique()) < 5:
        # With only a few distinct years, pin the colorbar ticks to whole years.
        min_year = int(df['year'].min())
        max_year = int(df['year'].max())
        tick_values = list(range(min_year, max_year + 1))

        fig.update_layout(coloraxis_colorbar=dict(
            tickvals=tick_values,
            ticktext=tick_values
        ))

    # Move the title to the center
    fig.update_layout(title_x=0.5, title_y=0.95, title_xanchor='center')
    fig.show()
