# OVERLAPPING COEFFICIENT

In [1]:
%load_ext autoreload 
%autoreload 2

In [2]:
import pandas as pd 
from pathlib import Path
import seaborn as sns
import matplotlib.pyplot as plt
from rich.progress import track
import ast
from tqdm.auto import tqdm
import ujson as json
import networkx as nx
import numpy as np 
import requests 
from scipy.stats import entropy

tqdm.pandas()
plt.rcParams.update({'font.size': 22})
sns.set(style="ticks", context="talk")
# plt.style.use("dark_background")

import plotly.express as px
import plotly.io as pio
from plotly.subplots import make_subplots
import plotly.graph_objects as go
pd.options.plotting.backend = 'plotly'
pio.templates.default = "plotly_dark"
pio.templates.default = 'presentation'

import rich
from itertools import combinations
import sys 
from statistics import mean, stdev
import struct, io, string
import os
import collections
from collections import Counter
import pickle
from scipy.stats import chisquare,kstest
from scipy import stats 
import random
import math
import random
from math import sqrt

from scipy import stats

In [3]:
def read_parquet(name, **args):
    """Read one parquet table of the current slice into a DataFrame.

    The file is looked up as ``basepath / name``; ``basepath`` is a
    module-level ``Path`` that must be set before calling.

    Parameters
    ----------
    name : str
        File (or directory) name of the table inside ``basepath``.
    **args
        Accepted for call compatibility; currently ignored.

    Returns
    -------
    pandas.DataFrame
        The table. When it has a ``publication_year`` column, that column is
        converted to a numeric dtype and rows whose year equals 0 are dropped.
    """
    path = basepath / f'{name}'
    df = pd.read_parquet(path, engine='pyarrow')

    if 'publication_year' in df.columns:
        # Plain column assignment replaces the column, so the numeric dtype
        # actually sticks. ``df.loc[:, col] = ...`` writes into the existing
        # array in pandas >= 2.0 and can leave an object-dtype column behind.
        df['publication_year'] = pd.to_numeric(df['publication_year'])
        df = df[df['publication_year'] != 0]  # discard works with missing years

    print(f'Read {len(df):,} rows from {path.stem!r}')
    return df

## LOAD FIELDS

### Physics

In [4]:
# Discipline label for this run; later cells use it as the root folder for
# the pickled inputs ('<discipline>/Info', ...) and for the CSV outputs.
discipline = 'Physics'

In [None]:
# Load the Physics slice: works plus their authorship rows.
basepath = Path('/N/project/openalex/slices/Physics/feb-2023')

works = read_parquet('works')
works_authors = read_parquet('works_authorships')
#works_concepts = read_parquet('works_concepts')
#works_referenced_works = read_parquet('works_referenced_works')

# Number of co-authors of a paper = number of authors minus one.
works['num_authors'] = works['num_authors'].astype('int64')
works['n_coauthors'] = works['num_authors'] - 1

# Attach each work's publication date to its authorship rows, then keep a
# single row per (work, author) pair.
works_authors = works_authors.merge(works['publication_date'], on="work_id")
works_authors = works_authors.drop_duplicates(subset=['work_id','author_id'])
# works_concepts = pd.merge(works_concepts, works['publication_date'], on="work_id")
# works_concepts = works_concepts.query('score > 0.3', engine='python')

In [5]:
# Physics topics analysed by the cells below.
topic_list = [
    'Gravitational wave',
    'Dark matter',
    'Fluid dynamics',
    'Soliton',
    'Supersymmetry',
    'Statistical physics',
    'Superconductivity',
]

### Computer Science

In [4]:
# Discipline label for this run; later cells use it as the root folder for
# the pickled inputs ('<discipline>/Info', ...) and for the CSV outputs.
discipline = 'CS'

In [134]:
# Load the Computer Science slice: works plus their authorship rows.
basepath = Path('/N/project/openalex/slices/CS/feb-2023')

works = read_parquet('works')
works_authors = read_parquet('works_authorships')
# works_concepts = read_parquet('works_concepts')
# works_referenced_works = read_parquet('works_referenced_works')

# Number of co-authors of a paper = number of authors minus one.
works['num_authors'] = works['num_authors'].astype('int64')
works['n_coauthors'] = works['num_authors'] - 1

# Attach each work's publication date to its authorship rows, then keep a
# single row per (work, author) pair.
works_authors = works_authors.merge(works['publication_date'], on="work_id")
works_authors = works_authors.drop_duplicates(subset=['work_id','author_id'])
# works_concepts = pd.merge(works_concepts, works['publication_date'], on="work_id")
# works_concepts = works_concepts.query('score > 0.3', engine='python')

Read 27,680,033 rows from 'works'
Read 83,048,887 rows from 'works_authorships'


In [5]:
# Computer Science topics analysed by the cells below.
topic_list = [
    'Compiler',
    'Mobile computing',
    'Cryptography',
    'Cluster analysis',
    'Image processing',
    'Parallel computing',
]

### BioMed

In [4]:
# Discipline label for this run; later cells use it as the root folder for
# the pickled inputs ('<discipline>/Info', ...) and for the CSV outputs.
discipline = 'BioMed'

In [None]:
# Load the BioMed slice: works plus their authorship rows.
basepath = Path('/N/project/openalex/slices/BioMed/feb-2023')

works = read_parquet('works')
works_authors = read_parquet('works_authorships')
# works_concepts = read_parquet('works_concepts')
# works_referenced_works = read_parquet('works_referenced_works')

# Number of co-authors of a paper = number of authors minus one.
works['num_authors'] = works['num_authors'].astype('int64')
works['n_coauthors'] = works['num_authors'] - 1

# Attach each work's publication date to its authorship rows, then keep a
# single row per (work, author) pair.
works_authors = works_authors.merge(works['publication_date'], on="work_id")
works_authors = works_authors.drop_duplicates(subset=['work_id','author_id'])
# works_concepts = pd.merge(works_concepts, works['publication_date'], on="work_id")
# works_concepts = works_concepts.query('score > 0.3', engine='python')

Read 43,528,045 rows from 'works'


In [5]:
# BioMed topics analysed by the cells below.
topic_list = [
    'Protein structure',
    'Genome',
    'Peptide sequence',
    "Alzheimer's disease",
    'Neurology',
    'Radiation therapy',
    'Chemotherapy',
]

## FUNCTIONS DEFINITIONS

### Overlapping

In [6]:
#overlapping sets
def sets_overlap(A, B):
    """Return the overlap coefficient of two sets.

    The coefficient is ``|A & B| / min(|A|, |B|)``: the share of the smaller
    set that also belongs to the other one. It is 1.0 when one set is
    contained in the other and 0.0 when they are disjoint. Note that this is
    not the Jaccard index, which divides by the size of the union.

    Parameters
    ----------
    A, B : set
        Collections exposing ``intersection`` and ``len``.

    Returns
    -------
    float
        The overlap coefficient, or NaN when either set is empty (the ratio
        is undefined there; NaN keeps such cases out of later averages
        instead of raising ZeroDivisionError).
    """
    smaller_size = min(len(A), len(B))
    if smaller_size == 0:
        return float('nan')

    shared = A.intersection(B)
    return len(shared) / smaller_size

### Calculations

#### Productivity - impact mean

In [7]:
def Sets_overlap_calc(discipline,topic,my_path):
    """Overlap of productivity-ranked vs. mean-impact-ranked author classes.

    For every window flagged in the pickled ``windows_cond`` list, the
    'top 10%' and 'bottom 10%' author samples obtained with the productivity
    ranking are compared, through ``sets_overlap``, with the same samples
    obtained with each of the three mean-impact rankings.

    Parameters
    ----------
    discipline : str
        Root folder; every input is read from ``<discipline>/Info``.
    topic : str
        Topic name; suffix of the pickle file names and value of the
        'topic' column of the result.
    my_path : str
        Accepted for call compatibility; not used (the per-topic CSV export
        is disabled).

    Returns
    -------
    pandas.DataFrame
        One row per retained window. Columns: 'topic', 'T_0' (1995 + window
        index) and 0..5, i.e. the top-10% overlaps with impact 1, 2, 3
        followed by the bottom-10% overlaps with impact 1, 2, 3.
    """
    info_dir = os.path.join(discipline, 'Info')

    # Per-window flags; per the original note a window is kept when it has
    # at least 2000 papers in both EW and OW.
    with open(os.path.join(info_dir, 'windows_cond_'+topic),"rb") as fh:
        windows_cond = pickle.load(fh)

    # One pickle per ranking criterion, all stored under the same file name.
    baseline = 'Productivity'
    impact_criteria = ('Impact_mean1', 'Impact_mean2', 'Impact_mean3')
    classes_file = 'active_authors_classes_'+topic
    classes_by_criterion = {}
    for criterion in (baseline,) + impact_criteria:
        with open(os.path.join(info_dir, criterion, classes_file),"rb") as fh:
            classes_by_criterion[criterion] = pickle.load(fh)

    first_year = 1995
    overlap_by_year = {}
    for w in range(23):
        if not windows_cond[w]:
            continue

        # Each entry is a triple; only the middle element, the dict of
        # sampled author classes, is needed here.
        top, bottom = {}, {}
        for criterion, classes in classes_by_criterion.items():
            _, samples, _ = classes[w]
            top[criterion] = samples['top 10%']
            bottom[criterion] = samples['bottom 10%']

        # Overlap coefficient of the productivity class with each impact class.
        high = [sets_overlap(top[baseline], top[c]) for c in impact_criteria]
        low = [sets_overlap(bottom[baseline], bottom[c]) for c in impact_criteria]

        overlap_by_year[first_year + w] = high + low  # key is T_0, the start of OW

    result = pd.DataFrame.from_dict(overlap_by_year, orient='index')
    result = result.rename_axis('T_0').reset_index()
    result.insert(0, 'topic', topic)

    return result

In [8]:
# Per-window overlap (productivity vs. mean impact) for every topic of the
# selected discipline, saved as one CSV.
my_path = os.path.join(discipline, 'Sets_overlap_mean')
# create the output folder; exist_ok avoids the check-then-create race
os.makedirs(my_path, exist_ok=True)

# Collect the per-topic frames and concatenate once, instead of growing a
# frame with pd.concat on every iteration (repeated copies, and a concat
# against an empty seed frame).
per_topic_frames = [
    Sets_overlap_calc(discipline=discipline,topic=topic,my_path=my_path)
    for topic in topic_list
]
Sets_overlap_df = pd.concat(per_topic_frames, axis = 0) if per_topic_frames else pd.DataFrame()

my_file = 'Sets_overlap_windows.csv'
Sets_overlap_df = Sets_overlap_df.rename(columns={0: "prod_impact1_high", 1: "prod_impact2_high", 2: "prod_impact3_high", 3: "prod_impact1_low", 4: "prod_impact2_low", 5: "prod_impact3_low"})
# The (repeated, per-topic) row index is written too; readers use index_col=0.
Sets_overlap_df.to_csv(os.path.join(my_path, my_file), sep=';')

In [1]:
#average across windows for each topic

my_path = os.path.join(discipline, 'Sets_overlap_mean')

my_file = 'Sets_overlap_windows.csv'
Sets_overlap_df = pd.read_csv(os.path.join(my_path, my_file),index_col=0, sep=';')
# keep one row per (topic, window) in case the CSV holds repeated runs
Sets_overlap_df.drop_duplicates(subset=['topic','T_0'], inplace=True)
Sets_overlap_df = Sets_overlap_df.set_index('topic')

overlap_cols = ["prod_impact1_high", "prod_impact2_high", "prod_impact3_high", "prod_impact1_low", "prod_impact2_low", "prod_impact3_low"]

# Build one two-row summary (mean, std) per topic, then concatenate once
# rather than growing a frame with pd.concat inside the loop.
topic_summaries = []
for topic in topic_list:
    df_concept = Sets_overlap_df.loc[Sets_overlap_df.index == topic, overlap_cols]
    df_concept_mean = pd.DataFrame({'mean': df_concept.mean(axis=0), 'std': df_concept.std(axis=0)})
    # rows become 'mean' and 'std', in that order; the row label is kept as column 'stat'
    df_concept_mean = df_concept_mean.transpose().rename_axis('stat').reset_index()
    df_concept_mean.insert(0, 'topic', topic)
    topic_summaries.append(df_concept_mean)

Sets_overlap_df_ = pd.concat(topic_summaries, axis = 0) if topic_summaries else pd.DataFrame()

my_file = 'Sets_overlap_table.csv'
Sets_overlap_df_.to_csv(os.path.join(my_path, my_file), sep=';')

NameError: name 'os' is not defined

In [31]:
#tables paper
# Turn the (mean, std) summary into "mean$\pm$std" strings, one row per topic,
# written with '&' as separator.
my_path = os.path.join(discipline, 'Sets_overlap_mean')

my_file = 'Sets_overlap_table.csv'
Sets_overlap_df = pd.read_csv(os.path.join(my_path, my_file),index_col=0, sep=';')
Sets_overlap_df = Sets_overlap_df.reset_index(drop=True)

# The summary file stores, for each topic, a 'mean' row immediately followed
# by a 'std' row; the pairing below is positional and needs an even row count.
if len(Sets_overlap_df) % 2:
    raise ValueError("expected alternating mean/std rows, got an odd number of rows")

overlap_cols = ["prod_impact1_high", "prod_impact2_high", "prod_impact3_high", "prod_impact1_low", "prod_impact2_low", "prod_impact3_low"]
mean_rows = Sets_overlap_df.iloc[0::2].reset_index(drop=True)
std_rows = Sets_overlap_df.iloc[1::2].reset_index(drop=True)

Sets_overlap_mean_ = pd.DataFrame({"topic": mean_rows["topic"]})
for col in overlap_cols:
    # raw string: "\p" is not a valid escape sequence in a normal literal
    Sets_overlap_mean_[col] = mean_rows[col].round(2).astype(str) + r"$\pm$" + std_rows[col].round(2).astype(str)

my_file = 'Sets_overlap_mean.csv'
Sets_overlap_mean_.to_csv(os.path.join(my_path, my_file), sep='&', index = False)

#### Productivity - impact sum

In [8]:
def Sets_overlap_calc2(discipline,topic,my_path):
    """Overlap of productivity-ranked vs. summed-impact-ranked author classes.

    For every window flagged in the pickled ``windows_cond`` list, the
    'top 10%' and 'bottom 10%' author samples obtained with the productivity
    ranking are compared, through ``sets_overlap``, with the same samples
    obtained with each of the three impact-sum rankings.

    Parameters
    ----------
    discipline : str
        Root folder; every input is read from ``<discipline>/Info``.
    topic : str
        Topic name; suffix of the pickle file names and value of the
        'topic' column of the result.
    my_path : str
        Accepted for call compatibility; not used (the per-topic CSV export
        is disabled).

    Returns
    -------
    pandas.DataFrame
        One row per retained window. Columns: 'topic', 'T_0' (1995 + window
        index) and 0..5, i.e. the top-10% overlaps with impact 1, 2, 3
        followed by the bottom-10% overlaps with impact 1, 2, 3.
    """
    info_dir = os.path.join(discipline, 'Info')

    def unpickle(*parts):
        # Load one pickle located under the Info folder.
        with open(os.path.join(info_dir, *parts),"rb") as fh:
            return pickle.load(fh)

    # Per-window flags; per the original note a window is kept when it has
    # at least 2000 papers in both EW and OW.
    windows_cond = unpickle('windows_cond_'+topic)

    # Same file name in four sibling folders, one per ranking criterion.
    classes_file = 'active_authors_classes_'+topic
    prod_classes = unpickle('Productivity', classes_file)
    impact_classes = [
        unpickle(folder, classes_file)
        for folder in ('Impact_sum1', 'Impact_sum2', 'Impact_sum3')
    ]

    first_year = 1995
    overlap_by_year = {}
    for w, keep_window in enumerate(windows_cond[w] for w in range(23)):
        if not keep_window:
            continue

        # Each entry is a triple; only the middle element, the dict of
        # sampled author classes, is needed here.
        _, prod_samples, _ = prod_classes[w]
        impact_samples = []
        for classes in impact_classes:
            _, samples, _ = classes[w]
            impact_samples.append(samples)

        row = []
        for label in ('top 10%', 'bottom 10%'):
            reference = prod_samples[label]
            # overlap coefficient with impact 1, 2, 3 in turn
            row.extend(sets_overlap(reference, samples[label]) for samples in impact_samples)

        overlap_by_year[first_year + w] = row  # key is T_0, the start of OW

    result = pd.DataFrame.from_dict(overlap_by_year, orient='index')
    result = result.rename_axis('T_0').reset_index()
    result.insert(0, 'topic', topic)

    return result

In [9]:
# Per-window overlap (productivity vs. summed impact) for every topic of the
# selected discipline, saved as one CSV.
my_path = os.path.join(discipline, 'Sets_overlap_sum')
# create the output folder; exist_ok avoids the check-then-create race
os.makedirs(my_path, exist_ok=True)

# Collect the per-topic frames and concatenate once, instead of growing a
# frame with pd.concat on every iteration (repeated copies, and a concat
# against an empty seed frame).
per_topic_frames = [
    Sets_overlap_calc2(discipline=discipline,topic=topic,my_path=my_path)
    for topic in topic_list
]
Sets_overlap_df = pd.concat(per_topic_frames, axis = 0) if per_topic_frames else pd.DataFrame()

my_file = 'Sets_overlap_windows.csv'
Sets_overlap_df = Sets_overlap_df.rename(columns={0: "prod_impact1_high", 1: "prod_impact2_high", 2: "prod_impact3_high", 3: "prod_impact1_low", 4: "prod_impact2_low", 5: "prod_impact3_low"})
# The (repeated, per-topic) row index is written too; readers use index_col=0.
Sets_overlap_df.to_csv(os.path.join(my_path, my_file), sep=';')

In [10]:
#average across windows for each topic

my_path = os.path.join(discipline, 'Sets_overlap_sum')

my_file = 'Sets_overlap_windows.csv'
Sets_overlap_df = pd.read_csv(os.path.join(my_path, my_file),index_col=0, sep=';')
# keep one row per (topic, window) in case the CSV holds repeated runs
Sets_overlap_df.drop_duplicates(subset=['topic','T_0'], inplace=True)
Sets_overlap_df = Sets_overlap_df.set_index('topic')

overlap_cols = ["prod_impact1_high", "prod_impact2_high", "prod_impact3_high", "prod_impact1_low", "prod_impact2_low", "prod_impact3_low"]

# Build one two-row summary (mean, std) per topic, then concatenate once
# rather than growing a frame with pd.concat inside the loop.
topic_summaries = []
for topic in topic_list:
    df_concept = Sets_overlap_df.loc[Sets_overlap_df.index == topic, overlap_cols]
    df_concept_mean = pd.DataFrame({'mean': df_concept.mean(axis=0), 'std': df_concept.std(axis=0)})
    # rows become 'mean' and 'std', in that order; the row label is kept as column 'stat'
    df_concept_mean = df_concept_mean.transpose().rename_axis('stat').reset_index()
    df_concept_mean.insert(0, 'topic', topic)
    topic_summaries.append(df_concept_mean)

Sets_overlap_df_ = pd.concat(topic_summaries, axis = 0) if topic_summaries else pd.DataFrame()

my_file = 'Sets_overlap_table.csv'
Sets_overlap_df_.to_csv(os.path.join(my_path, my_file), sep=';')

In [11]:
#tables paper
# Turn the (mean, std) summary into "mean$\pm$std" strings, one row per topic,
# written with '&' as separator.
my_path = os.path.join(discipline, 'Sets_overlap_sum')

my_file = 'Sets_overlap_table.csv'
Sets_overlap_df = pd.read_csv(os.path.join(my_path, my_file),index_col=0, sep=';')
Sets_overlap_df = Sets_overlap_df.reset_index(drop=True)

# The summary file stores, for each topic, a 'mean' row immediately followed
# by a 'std' row; the pairing below is positional and needs an even row count.
if len(Sets_overlap_df) % 2:
    raise ValueError("expected alternating mean/std rows, got an odd number of rows")

overlap_cols = ["prod_impact1_high", "prod_impact2_high", "prod_impact3_high", "prod_impact1_low", "prod_impact2_low", "prod_impact3_low"]
mean_rows = Sets_overlap_df.iloc[0::2].reset_index(drop=True)
std_rows = Sets_overlap_df.iloc[1::2].reset_index(drop=True)

Sets_overlap_sum_ = pd.DataFrame({"topic": mean_rows["topic"]})
for col in overlap_cols:
    # raw string: "\p" is not a valid escape sequence in a normal literal
    Sets_overlap_sum_[col] = mean_rows[col].round(2).astype(str) + r"$\pm$" + std_rows[col].round(2).astype(str)

my_file = 'Sets_overlap_sum.csv'
Sets_overlap_sum_.to_csv(os.path.join(my_path, my_file), sep='&', index = False)

# TOPIC CONNECTEDNESS

In [12]:
def topic_conn_calc(discipline,topic,my_path):
    """Per-window connectivity statistics of a topic's co-authorship graph.

    For each retained window the function builds a work-author bipartite
    graph, projects it onto authors and records size, component and
    clustering statistics. The per-window table is also written to
    ``<my_path>/topics_conn_windows_<topic>.csv``.

    Relies on the module-level ``works_authors`` DataFrame (columns
    ``work_id`` and ``author_id`` are used) in addition to its arguments.

    Parameters
    ----------
    discipline : str
        Root folder of the pickled inputs (``<discipline>/Info`` and
        ``<discipline>/Productivity/Exp1_ver1``).
    topic : str
        Topic name; suffix of the input/output file names and value of the
        'topic' column of the result.
    my_path : str
        Existing folder where the per-topic CSV is written.

    Returns
    -------
    pandas.DataFrame
        One row per retained window with columns 'topic', 'T_0' and 0..6:
        number of nodes, number of edges, number of active authors, number
        of connected components, size of the largest component, number of
        edges in the largest component, average clustering coefficient.
    """
    
    #load windows
    # per-window flags; only windows whose flag is truthy are analysed
    my_path2 = os.path.join(discipline, 'Info')
    my_file = 'windows_cond_'+topic
    with open(os.path.join(my_path2, my_file),"rb") as fp:
        windows_cond = pickle.load(fp)

    #load active authors
    # per-window triples; the first element (active authors at window start) is used below
    my_file = 'active_authors_classes_'+topic
    my_path3 = os.path.join(my_path2,'Productivity')    
    with open(os.path.join(my_path3, my_file),"rb") as fp:
        active_authors_classes = pickle.load(fp)
        
    # per-window collections of co-authors (used with set intersection below)
    my_path4 = os.path.join(discipline, 'Productivity/Exp1_ver1')
    my_file = 'all_coauthors_list_'+topic
    with open(os.path.join(my_path4, my_file),"rb") as fp:
        all_coauthors_list = pickle.load(fp) 

    # table with at least author_id, work_id and publication_year columns
    my_file = 'works_authors_activation_date_'+topic
    with open(os.path.join(my_path4, my_file),"rb") as fp:
        works_authors_activation_date = pickle.load(fp)
        
    start_year = 1995
    topics_conn_dict = {}
    for w in tqdm(range(0,23)): 
        windows_cond_w = windows_cond[w]   
        if windows_cond_w:
            
            start_year_w = start_year+w
            all_coauthors = all_coauthors_list[w]
            [active_authors_start,samples_dict_1,n_1] = active_authors_classes[w]
            
            # keep the works of the active authors published in the five
            # years before the window start: [start_year_w-5, start_year_w)
            work_id_active = works_authors_activation_date[works_authors_activation_date.author_id.isin(active_authors_start)]
            work_id_active = work_id_active.query('@start_year_w-5 <= publication_year < @start_year_w', engine='python') 
            
            # add the co-authors of those works who are not themselves in
            # active_authors_start ("not infected" in the original note)
            work_id_active_collab = works_authors[works_authors.work_id.isin(work_id_active.work_id)].query('author_id not in @active_authors_start')
            works_authors_collab = pd.concat([work_id_active,work_id_active_collab]).drop_duplicates(subset=['work_id', 'author_id']).reset_index(drop=True)
            
            #bipartite graph work-authors union exposure window
            bip_g = nx.from_pandas_edgelist(
                    works_authors_collab[['work_id', 'author_id']],
                    source='work_id', target='author_id'
                )

            # project onto the authors that are both in all_coauthors and
            # present in this window's works; two authors are linked when
            # they share at least one work
            author_ids_supp =  all_coauthors.intersection(set(works_authors_collab.author_id))
            # NOTE(review): weighted_projected_graph does attach a 'weight'
            # edge attribute (presumably the number of shared works), but
            # none of the statistics below read it
            G = nx.bipartite.weighted_projected_graph(bip_g, nodes=author_ids_supp)  #no weights
            
            
            #analysis
            # components sorted by decreasing size, so index 0 is the largest
            conn_comps = sorted(nx.connected_components(G), key=len, reverse=True)
            conn_comps_len =  [len(c) for c in conn_comps]
            # NOTE(review): raises IndexError when G has no nodes
            largest_cc = conn_comps[0]
            S = G.subgraph(largest_cc).copy()
            
            # order matters: callers rename columns 0..6 positionally
            topics_conn_dict[start_year_w] = [
                G.number_of_nodes(),
                G.number_of_edges(),
                len(active_authors_start),
                #conn_comps_len,
                len(conn_comps_len),
                conn_comps_len[0],
                S.number_of_edges(),
                nx.average_clustering(G)    
            ] 
            
    topics_conn_df = (pd.DataFrame.from_dict(topics_conn_dict, orient='index')).rename_axis('T_0').reset_index()
    # the per-topic CSV is written before the 'topic' column is added
    my_file = 'topics_conn_windows_'+topic+'.csv'
    topics_conn_df.to_csv(os.path.join(my_path, my_file), sep=';')
    
    topics_conn_df.insert(0, 'topic', topic)

    return topics_conn_df   

In [None]:
# Per-window co-authorship connectivity statistics for every topic of the
# selected discipline, saved as one CSV.
my_path = os.path.join(discipline, 'topics_conn_mean')
# create the output folder; exist_ok avoids the check-then-create race
os.makedirs(my_path, exist_ok=True)

# Collect the per-topic frames and concatenate once, instead of growing a
# frame with pd.concat on every iteration (repeated copies, and a concat
# against an empty seed frame).
per_topic_frames = [
    topic_conn_calc(discipline=discipline,topic=topic,my_path=my_path)
    for topic in topic_list
]
topics_conn_df = pd.concat(per_topic_frames, axis = 0) if per_topic_frames else pd.DataFrame()

my_file = 'topics_conn_windows.csv'
topics_conn_df = topics_conn_df.rename(columns={0: 'N',1: 'E',2: '#active_authors',3: '#cc',4: 'lcc_N',5: 'lcc_E',6: 'avg_clust'})
# The (repeated, per-topic) row index is written too; readers use index_col=0.
topics_conn_df.to_csv(os.path.join(my_path, my_file), sep=';')

In [13]:
#average across windows for each topic

my_path = os.path.join(discipline, 'topics_conn_mean')

my_file = 'topics_conn_windows.csv'
topics_conn_df = pd.read_csv(os.path.join(my_path, my_file),index_col=0, sep=';')
# keep one row per (topic, window) in case the CSV holds repeated runs
topics_conn_df.drop_duplicates(subset=['topic','T_0'], inplace=True)
topics_conn_df = topics_conn_df.set_index('topic')

conn_cols = ['N','E','#active_authors','#cc','lcc_N','lcc_E','avg_clust']

# Build one two-row summary (mean, std) per topic, then concatenate once
# rather than growing a frame with pd.concat inside the loop.
topic_summaries = []
for topic in topic_list:
    df_concept = topics_conn_df.loc[topics_conn_df.index == topic, conn_cols]
    df_concept_mean = pd.DataFrame({'mean': df_concept.mean(axis=0), 'std': df_concept.std(axis=0)})
    # rows become 'mean' and 'std', in that order; the row label is kept as column 'stat'
    df_concept_mean = df_concept_mean.transpose().rename_axis('stat').reset_index()
    df_concept_mean.insert(0, 'topic', topic)
    topic_summaries.append(df_concept_mean)

topics_conn_df_ = pd.concat(topic_summaries, axis = 0) if topic_summaries else pd.DataFrame()

my_file = 'topics_conn_table.csv'
topics_conn_df_.to_csv(os.path.join(my_path, my_file), sep=';')

In [14]:
#tables paper
# Turn the (mean, std) summary into "mean$\pm$std" strings, one row per topic,
# written with '&' as separator.
my_path = os.path.join(discipline, 'topics_conn_mean')

my_file = 'topics_conn_table.csv'
topics_conn_df = pd.read_csv(os.path.join(my_path, my_file),index_col=0, sep=';')
topics_conn_df = topics_conn_df.reset_index(drop=True)

# The summary file stores, for each topic, a 'mean' row immediately followed
# by a 'std' row; the pairing below is positional and needs an even row count.
if len(topics_conn_df) % 2:
    raise ValueError("expected alternating mean/std rows, got an odd number of rows")

conn_cols = ["N", "E", "#active_authors", "#cc", "lcc_N", "lcc_E", "avg_clust"]
mean_rows = topics_conn_df.iloc[0::2].reset_index(drop=True)
std_rows = topics_conn_df.iloc[1::2].reset_index(drop=True)

topics_conn_mean_ = pd.DataFrame({"topic": mean_rows["topic"]})
for col in conn_cols:
    # raw string: "\p" is not a valid escape sequence in a normal literal
    topics_conn_mean_[col] = mean_rows[col].round(2).astype(str) + r"$\pm$" + std_rows[col].round(2).astype(str)

my_file = 'topics_conn_mean.csv'
topics_conn_mean_.to_csv(os.path.join(my_path, my_file), sep='&', index = False)