# EXPERIMENT II - INFLUENCE

In [None]:
%load_ext autoreload 
%autoreload 2 

In [None]:
import pandas as pd 
from pathlib import Path
import seaborn as sns
import matplotlib.pyplot as plt
from rich.progress import track
import ast
from tqdm.auto import tqdm
import ujson as json
import networkx as nx
import numpy as np 
import requests 
from scipy.stats import entropy

# Enable tqdm's pandas integration for this session.
tqdm.pandas()
plt.rcParams.update({'font.size': 22})
# NOTE(review): sns.set() applies its own rc settings for the chosen context, so it
# presumably overrides the font.size set on the previous line — confirm the intended order.
sns.set(style="ticks", context="talk")
# plt.style.use("dark_background")

import plotly.express as px
import plotly.io as pio
from plotly.subplots import make_subplots
import plotly.graph_objects as go
# Use plotly as the backend of DataFrame.plot().
pd.options.plotting.backend = 'plotly'
# Only one default template can be active at a time: the dark template used to be
# assigned here and was overwritten on the very next line, so it is kept only as a
# commented-out alternative.
# pio.templates.default = "plotly_dark"
pio.templates.default = 'presentation'

import rich
from itertools import combinations
import sys 
from statistics import mean, stdev
import struct, io, string
import os
import collections
from collections import Counter
import pickle
from scipy.stats import chisquare,kstest
from scipy import stats 
import random
import math
import random
from math import sqrt

In [None]:
def read_parquet(name, **args):
    """Load one parquet table of the current slice and normalise its year column.

    The file is looked up as ``basepath / name``; ``basepath`` is a module-level
    ``Path`` that must be assigned before this function is called.

    Parameters
    ----------
    name : str
        File name of the table, relative to ``basepath``.
    **args
        Extra keyword arguments forwarded to ``pandas.read_parquet``.
        ``engine`` defaults to ``'pyarrow'`` when not given.

    Returns
    -------
    pandas.DataFrame
        The loaded table. When it has a ``publication_year`` column, that column
        is converted with ``pd.to_numeric`` and rows whose year equals 0 are dropped.
    """
    path = basepath / f'{name}'
    args.setdefault('engine', 'pyarrow')
    df = pd.read_parquet(path, **args)
    # df.drop_duplicates(inplace=True)

    if 'publication_year' in df.columns:
        # Plain column assignment replaces the column together with its dtype;
        # ``df.loc[:, col] = ...`` writes into the existing column and can keep
        # the old (non-numeric) dtype on recent pandas versions.
        df['publication_year'] = pd.to_numeric(df['publication_year'])
        df = df[df['publication_year'] != 0]  # discard works with missing years

    print(f'Read {len(df):,} rows from {path.stem!r}')
    return df

## LOAD FIELDS

### Physics

In [None]:
# Label of the field under study; the cells below also use it as the root output folder.
discipline = 'Physics'

In [None]:
# Folder holding the parquet tables of the Physics slice (feb-2023).
basepath = Path('/N/project/openalex/slices/Physics/feb-2023')

# Read the four tables, one after the other, in the listed order.
works, works_authors, works_concepts, works_referenced_works = (
    read_parquet(table_name)
    for table_name in (
        'works',
        'works_authorships',
        'works_concepts',
        'works_referenced_works',
    )
)

In [None]:
# Number of co-authors of a work = number of authors minus one.
works['num_authors'] = works['num_authors'].astype('int64')
works['n_coauthors'] = works['num_authors'] - 1

# Attach the publication date to every authorship row, then keep one row
# per (work, author) pair.
works_authors = (
    works_authors
    .merge(works['publication_date'], on="work_id")
    .drop_duplicates(subset=['work_id', 'author_id'])
)

# Attach the publication date to every concept tag and keep tags with score > 0.3.
works_concepts = (
    works_concepts
    .merge(works['publication_date'], on="work_id")
    .query('score > 0.3', engine='python')
)

In [None]:
# Physics concepts for which the experiment is run.
topic_list = [
    'Gravitational wave',
    'Dark matter',
    'Fluid dynamics',
    'Soliton',
    'Supersymmetry',
    'Statistical physics',
    'Superconductivity',
]

In [None]:
# Create the output folder of this discipline; exist_ok avoids the
# check-then-create race and makes re-running the cell a no-op.
os.makedirs(discipline, exist_ok=True)

### Computer Science

In [None]:
# Label of the field under study; the cells below also use it as the root output folder.
discipline = 'CS'

In [None]:
# Folder holding the parquet tables of the CS slice (feb-2023).
basepath = Path('/N/project/openalex/slices/CS/feb-2023')

# Read the four tables, one after the other, in the listed order.
works, works_authors, works_concepts, works_referenced_works = (
    read_parquet(table_name)
    for table_name in (
        'works',
        'works_authorships',
        'works_concepts',
        'works_referenced_works',
    )
)

In [None]:
# Number of co-authors of a work = number of authors minus one.
works['num_authors'] = works['num_authors'].astype('int64')
works['n_coauthors'] = works['num_authors'] - 1

# Attach the publication date to every authorship row, then keep one row
# per (work, author) pair.
works_authors = (
    works_authors
    .merge(works['publication_date'], on="work_id")
    .drop_duplicates(subset=['work_id', 'author_id'])
)

# Attach the publication date to every concept tag and keep tags with score > 0.3.
works_concepts = (
    works_concepts
    .merge(works['publication_date'], on="work_id")
    .query('score > 0.3', engine='python')
)

In [None]:
# Computer-science concepts for which the experiment is run.
topic_list = [
    'Compiler',
    'Mobile computing',
    'Cryptography',
    'Cluster analysis',
    'Image processing',
    'Parallel computing',
]

In [None]:
# Create the output folder of this discipline; exist_ok avoids the
# check-then-create race and makes re-running the cell a no-op.
os.makedirs(discipline, exist_ok=True)

### BioMed

In [None]:
# Label of the field under study; the cells below also use it as the root output folder.
discipline = 'BioMed'

In [None]:
# Folder holding the parquet tables of the BioMed slice (feb-2023).
basepath = Path('/N/project/openalex/slices/BioMed/feb-2023')

# Read the four tables, one after the other, in the listed order.
works, works_authors, works_concepts, works_referenced_works = (
    read_parquet(table_name)
    for table_name in (
        'works',
        'works_authorships',
        'works_concepts',
        'works_referenced_works',
    )
)

In [None]:
# Number of co-authors of a work = number of authors minus one.
works['num_authors'] = works['num_authors'].astype('int64')
works['n_coauthors'] = works['num_authors'] - 1

# Attach the publication date to every authorship row, then keep one row
# per (work, author) pair.
works_authors = (
    works_authors
    .merge(works['publication_date'], on="work_id")
    .drop_duplicates(subset=['work_id', 'author_id'])
)

# Attach the publication date to every concept tag and keep tags with score > 0.3.
works_concepts = (
    works_concepts
    .merge(works['publication_date'], on="work_id")
    .query('score > 0.3', engine='python')
)

In [None]:
# Biomedical concepts for which the experiment is run.
topic_list = [
    'Protein structure',
    'Genome',
    'Peptide sequence',
    "Alzheimer's disease",
    'Neurology',
    'Radiation therapy',
    'Chemotherapy',
]

In [None]:
# Create the output folder of this discipline; exist_ok avoids the
# check-then-create race and makes re-running the cell a no-op.
os.makedirs(discipline, exist_ok=True)

## FUNCTIONS DEFINITIONS

In [None]:
import warnings
# NOTE(review): with no category given this filter ignores every warning from here
# on, deprecation warnings of the libraries in use included.
warnings.filterwarnings('ignore')

### Collaboration graph

In [None]:
def make_collaboration_graph(works_authors,author_ids, start_year, end_year):
    """Build the co-authorship graph around a set of authors for a time window.

    Works published in ``[start_year, end_year)`` by at least one author in
    ``author_ids`` are selected; a bipartite work-author graph over *all*
    authors of those works is built and projected onto the members of
    ``author_ids`` that appear in it.

    works_authors: DataFrame with work_id, author_id and publication_year columns
    author_ids: set of author ids
    Returns the projected networkx graph.
    """
    in_window = works_authors.query('(@start_year <= publication_year < @end_year)')
    seed_work_ids = set(in_window.query('author_id.isin(@author_ids)').work_id)

    # every authorship row (seed authors and their co-authors) of the selected works
    authorship_edges = works_authors[['work_id', 'author_id']].query('work_id.isin(@seed_work_ids)')

    bipartite_graph = nx.from_pandas_edgelist(
        authorship_edges, source='work_id', target='author_id'
    )

    # only seed authors that actually occur in the window can be projected on
    present_author_ids = author_ids.intersection(set(authorship_edges.author_id))

    return nx.bipartite.projected_graph(bipartite_graph, nodes=present_author_ids)

### Delete common neighbors

In [None]:
def delate_neig_incommon(collab_graph, active_authors):
    """Drop nodes that are neighbours of more than one active author.

    collab_graph: collaboration graph
    active_authors: iterable of active (infected) author ids present in the graph

    Returns a tuple ``(nodes, multiple_exp, sing_exp)``:
    nodes        -- graph nodes that are neither active nor exposed to several active authors
    multiple_exp -- number of non-active nodes adjacent to more than one active author
    sing_exp     -- number of non-active nodes adjacent to exactly one active author
    """
    active_list = list(active_authors)
    active_set = set(active_list)

    # how many active authors each node is adjacent to
    exposure_count = Counter()
    for active_author in active_list:
        exposure_count.update(nx.neighbors(collab_graph, active_author))

    multiply_exposed = {node for node, hits in exposure_count.items() if hits > 1} - active_set
    singly_exposed = {node for node, hits in exposure_count.items() if hits == 1} - active_set

    # keep only nodes without multiple exposures, and never the active authors themselves
    eligible_nodes = set(collab_graph.nodes()) - multiply_exposed - active_set

    return eligible_nodes, len(multiply_exposed), len(singly_exposed)

### Definition of experts

In [None]:
#mean_impact1 - papers:all, cits:topic
def experts_impact_mean_1(works_authors,start_year_i,active_authors_start,works_cit_counts_year_concept):
    """Rank active authors by the mean cumulative citation count of their recent papers.

    Papers: every work (with or without the concept) published in
    ``[start_year_i - 5, start_year_i)`` by an author in ``active_authors_start``.
    Citations: the ``cit_count_cum`` values of ``works_cit_counts_year_concept``
    on the rows where ``work_publication_year == start_year_i - 1``.

    Parameters
    ----------
    works_authors : DataFrame with work_id, author_id and publication_year columns.
    start_year_i : int
    active_authors_start : set of author ids to score.
    works_cit_counts_year_concept : DataFrame with work_id, work_publication_year
        and cit_count_cum columns.

    Returns
    -------
    (impact_df, impact_df_len)
        ``impact_df`` has columns ``author_id`` and ``val`` (mean ``cit_count_cum``),
        sorted by ``val`` in descending order; active authors without any matched
        paper get ``val`` 0. ``impact_df_len`` is its number of rows.
    """
    recent_authorships = (works_authors
                    .query('@start_year_i - 5 <= publication_year < @start_year_i', engine='python')
                    .query('author_id.isin(@active_authors_start)'))

    cit_snapshot = works_cit_counts_year_concept.query('work_publication_year == @start_year_i - 1')

    # only these two columns take part in the ranking
    cited = pd.merge(recent_authorships, cit_snapshot, on="work_id")[['author_id', 'cit_count_cum']]

    # active authors without a matched paper enter the ranking with zero citations;
    # the concat is skipped when nobody is missing so that no empty frame is mixed in
    miss_list = list(active_authors_start.difference(set(cited.author_id)))
    if miss_list:
        zero_rows = pd.DataFrame({'author_id': miss_list, 'cit_count_cum': [0] * len(miss_list)})
        cited = pd.concat([cited, zero_rows])

    impact_df = (cited
                 .groupby(['author_id'])
                 .mean()
                 .sort_values(by=['cit_count_cum'], ascending=False)
                 .reset_index())
    impact_df.columns = ['author_id', 'val']
    impact_df_len = len(impact_df)

    return impact_df,impact_df_len

In [None]:
#mean_impact2 - papers:topic, cits:all
def experts_impact_mean_2(works_authors,start_year_i,prior_works_ids_tot_5yr,active_authors_start,works_cit_counts_year):
    """Rank authors by the mean cumulative citation count of their topic papers.

    Papers: the works whose ids are in ``prior_works_ids_tot_5yr``.
    Citations: the ``cit_count_cum`` values of ``works_cit_counts_year`` on the
    rows where ``work_publication_year == start_year_i - 1``.

    Every author of the selected works is scored, whether or not the author is
    in ``active_authors_start``; members of ``active_authors_start`` without a
    matched paper are added with a value of 0.

    Parameters
    ----------
    works_authors : DataFrame with work_id and author_id columns.
    start_year_i : int
    prior_works_ids_tot_5yr : collection of work ids to restrict to.
    active_authors_start : set of author ids.
    works_cit_counts_year : DataFrame with work_id, work_publication_year and
        cit_count_cum columns.

    Returns
    -------
    (impact_df, impact_df_len)
        ``impact_df`` has columns ``author_id`` and ``val`` (mean ``cit_count_cum``),
        sorted by ``val`` in descending order; ``impact_df_len`` is its number of rows.
    """
    # The query must reference the parameter itself: the previous expression used
    # a name that is not defined in this function and silently depended on a
    # same-named global (or failed when there was none).
    topic_authorships = works_authors.query('work_id.isin(@prior_works_ids_tot_5yr)')

    cit_snapshot = works_cit_counts_year.query('work_publication_year == @start_year_i - 1')

    # only these two columns take part in the ranking
    cited = pd.merge(topic_authorships, cit_snapshot, on="work_id")[['author_id', 'cit_count_cum']]

    # active authors without a matched paper enter the ranking with zero citations;
    # the concat is skipped when nobody is missing so that no empty frame is mixed in
    miss_list = list(active_authors_start.difference(set(cited.author_id)))
    if miss_list:
        zero_rows = pd.DataFrame({'author_id': miss_list, 'cit_count_cum': [0] * len(miss_list)})
        cited = pd.concat([cited, zero_rows])

    impact_df = (cited
                 .groupby(['author_id'])
                 .mean()
                 .sort_values(by=['cit_count_cum'], ascending=False)
                 .reset_index())
    impact_df.columns = ['author_id', 'val']
    impact_df_len = len(impact_df)

    return impact_df,impact_df_len

In [None]:
#mean_impact3 - papers:topic, cits:topic
def experts_impact_mean_3(works_authors,start_year_i,prior_work_ids_5yr,active_authors_start,works_cit_counts_year_concept):
    """Rank authors by the mean cumulative topic citation count of their topic papers.

    Papers: the works whose ids are in ``prior_work_ids_5yr``.
    Citations: the ``cit_count_cum`` values of ``works_cit_counts_year_concept``
    on the rows where ``work_publication_year == start_year_i - 1``.

    Every author of the selected works is scored, whether or not the author is
    in ``active_authors_start``; members of ``active_authors_start`` without a
    matched paper are added with a value of 0.

    Parameters
    ----------
    works_authors : DataFrame with work_id and author_id columns.
    start_year_i : int
    prior_work_ids_5yr : collection of work ids to restrict to.
    active_authors_start : set of author ids.
    works_cit_counts_year_concept : DataFrame with work_id, work_publication_year
        and cit_count_cum columns.

    Returns
    -------
    (impact_df, impact_df_len)
        ``impact_df`` has columns ``author_id`` and ``val`` (mean ``cit_count_cum``),
        sorted by ``val`` in descending order; ``impact_df_len`` is its number of rows.
    """
    topic_authorships = works_authors.query('work_id.isin(@prior_work_ids_5yr)')

    cit_snapshot = works_cit_counts_year_concept.query('work_publication_year == @start_year_i - 1')

    # only these two columns take part in the ranking
    cited = pd.merge(topic_authorships, cit_snapshot, on="work_id")[['author_id', 'cit_count_cum']]

    # active authors without a matched paper enter the ranking with zero citations;
    # the concat is skipped when nobody is missing so that no empty frame is mixed in
    miss_list = list(active_authors_start.difference(set(cited.author_id)))
    if miss_list:
        zero_rows = pd.DataFrame({'author_id': miss_list, 'cit_count_cum': [0] * len(miss_list)})
        cited = pd.concat([cited, zero_rows])

    impact_df = (cited
                 .groupby(['author_id'])
                 .mean()
                 .sort_values(by=['cit_count_cum'], ascending=False)
                 .reset_index())
    impact_df.columns = ['author_id', 'val']
    impact_df_len = len(impact_df)

    return impact_df,impact_df_len

In [None]:
def experts_productivity(works_authors,prior_work_ids_5yr,active_authors_start):
    """Rank active authors by how many of the given works they authored.

    works_authors: DataFrame with work_id and author_id columns
    prior_work_ids_5yr: ids of the works to count
    active_authors_start: ids of the authors to rank

    Returns ``(ranking, n_ranked)`` where ``ranking`` has columns ``author_id``
    and ``val`` (number of works), sorted by ``val`` in descending order, and
    ``n_ranked`` is the number of ranked authors.
    """
    is_counted_work = works_authors['work_id'].isin(prior_work_ids_5yr)
    is_active_author = works_authors['author_id'].isin(active_authors_start)

    works_per_author = (
        works_authors[is_counted_work & is_active_author]
        .groupby('author_id')['work_id']
        .count()
        .sort_values(ascending=False)
    )
    n_ranked = len(works_per_author)

    ranking = works_per_author.reset_index()
    ranking.columns = ['author_id', 'val']

    return ranking, n_ranked

### Get author samples

In [None]:
def get_author_samples(author_stats_df, top_k, debug=False):
    """
    Draw equally sized random samples of bottom-, middle- and top-ranked authors.

    author_stats_df: DataFrame where author_id has active author ids, and val has the productivity/impact values for that author.
        Two columns, rank_pct and rank_cat, are added to (or replaced in) this frame.
    top_k: 10 for the 10% classes; any other value selects the 20% bin edges, so only 10 and 20
        produce samples (for other values no class label matches and the returned dictionary is empty).
    debug: print progress information (uses the notebook's ``display``).

    Each class needs ``max(int(top_k / 100 * number of unique authors), 1)`` authors. A class whose
    own bucket is too small borrows from up to two adjacent buckets: the next higher ones, or the
    next lower ones for the top class.

    Returns (samples_dict, samples_per_class): a dictionary where keys are class labels and values
    are sets of author IDs, and the number of authors per class.
    Raises AssertionError when a class cannot be filled even after borrowing.
    """
    # Note highest scoring authors are ranked LAST
    # Plain column assignment also works when the columns already exist from an earlier call.
    author_stats_df['rank_pct'] = author_stats_df.val.rank(method='min', pct=True)  # rank rows based on val convert to percentiles

    if top_k == 10:
        bins = [0, 0.1, 0.3, 0.45, 0.55, 0.7, 0.9, 1]
        labels = ['bottom 10%', '10-30%', '30-45%', 'middle 10%', '55-70%', '70-90%', 'top 10%']
    else:
        bins = [0, 0.2, 0.3, 0.4, 0.6, 0.7, 0.8, 1]
        labels = ['bottom 20%', '20-30%', '30-40%', 'middle 20%', '60-70%', '70-80%', 'top 20%']

    # assign category labels based on rank percentiles
    author_stats_df['rank_cat'] = pd.cut(author_stats_df.rank_pct, bins=bins, labels=labels)

    samples_per_class = max(int((top_k / 100) * author_stats_df.author_id.nunique()), 1)
    if debug:
        print(f'{top_k=} taking {samples_per_class=:,}')
        display(author_stats_df.head(2))

    def _bucket_author_ids(bucket_label):
        """Return the set of author ids whose rank category is ``bucket_label``."""
        return set(author_stats_df.loc[author_stats_df['rank_cat'] == bucket_label, 'author_id'])

    samples_dict = {}
    last_index = len(labels) - 1
    keep = [f'bottom {top_k}%', f'middle {top_k}%', f'top {top_k}%']  # keep only these classes

    for i, label in enumerate(labels):
        if label not in keep:
            continue

        # Bucket sizes are measured on the candidate sets themselves. A groupby on the
        # categorical column depends on the `observed` setting and would be misaligned
        # with `labels` whenever a bucket is empty; it also counts rows, not unique ids.
        candidates = _bucket_author_ids(label)
        if len(candidates) >= samples_per_class:
            if debug:
                print(f'{label}: Sampling {samples_per_class:,} from {len(candidates):,} candidates')
            samples = set(random.sample(list(candidates), samples_per_class))  # sample here
        else:
            if debug:
                print(f'Insufficient items in {label}. Need {samples_per_class:,} have {len(candidates):,}')
            samples = candidates  # pick everyone

        # Borrow from the two adjacent buckets, nearest first: upwards in general,
        # downwards for the highest bucket.
        direction = -1 if i == last_index else 1
        for hop in (1, 2):
            missing = samples_per_class - len(samples)
            if missing <= 0:
                break
            donor_label = labels[i + direction * hop]
            donors = _bucket_author_ids(donor_label)
            if len(donors) >= missing:
                new_samples = set(random.sample(list(donors), missing))  # sample here
                if debug:
                    print(f'Missing {missing:,} samples for {label}. Expanding the range to {donor_label}, Acquired {len(new_samples):,} new samples.')
            else:
                new_samples = donors  # pick everyone
            samples = samples | new_samples  # add these new samples

        if len(samples) != samples_per_class:
            # raised explicitly so the check is not stripped when Python runs with -O
            raise AssertionError(f'Count mismatch {len(samples)=} {samples_per_class=} for samples {label}')
        samples_dict[label] = samples

    return samples_dict,samples_per_class

### Scores

In [None]:
def get_scores_high(author_ids, collab_graph,first_time_authors,prior_author_ids,nodes,authors_active_start_1paper_id_dict,first_time_authors_1paper_id_dict,high_active_authors_bin1,high_active_authors_bin2):
    """Compute the Exp2 fractions (A, B and the two C bins) for a group of authors.

    For every author in ``author_ids``:
    A -- neighbours in ``collab_graph`` that belong to both ``first_time_authors`` and
         ``nodes``, divided by the neighbours that are in ``nodes`` and not in
         ``prior_author_ids``; NaN when that denominator is 0.
    B -- among the neighbours counted in the numerator of A, the share whose first paper
         (``first_time_authors_1paper_id_dict``) is in the author's own list
         ``authors_active_start_1paper_id_dict[author]``; NaN when A is NaN or its
         numerator is 0.
    C -- the A fractions restricted to ``high_active_authors_bin1`` and to
         ``high_active_authors_bin2``.

    Returns ``[fractions_A, fractions_B, fractions_A_bin1, fractions_A_bin2]``, each a
    dictionary keyed by author id.
    """
    # only first-time authors that are eligible nodes can count as activated
    eligible_first_timers = first_time_authors & nodes

    fractions_A = {}
    fractions_B = {}
    for author_id in author_ids:
        coauthors = set(collab_graph.neighbors(author_id))
        n_eligible = len((coauthors - prior_author_ids) & nodes)

        # Exp2 - A
        if n_eligible == 0:
            # keep an entry for every author so all samples have the same size
            fractions_A[author_id] = np.nan
            fractions_B[author_id] = np.nan
            continue

        activated = coauthors & eligible_first_timers
        n_activated = len(activated)
        fractions_A[author_id] = n_activated / n_eligible

        # Exp2 - B
        if n_activated == 0:
            fractions_B[author_id] = np.nan
            continue

        own_first_papers = authors_active_start_1paper_id_dict[author_id]
        n_shared_first_paper = sum(
            first_time_authors_1paper_id_dict[activated_id] in own_first_papers
            for activated_id in activated
        )
        fractions_B[author_id] = n_shared_first_paper / n_activated

    # Exp2 - C
    fractions_A_bin1 = {key: fractions_A[key] for key in high_active_authors_bin1}
    fractions_A_bin2 = {key: fractions_A[key] for key in high_active_authors_bin2}

    return [fractions_A,fractions_B,fractions_A_bin1,fractions_A_bin2]

In [None]:
def get_scores_low(author_ids, collab_graph,first_time_authors,prior_author_ids,nodes,authors_active_start_1paper_id_dict,first_time_authors_1paper_id_dict):
    """Compute the Exp2 fractions A and B for a group of authors.

    For every author in ``author_ids``:
    A -- neighbours in ``collab_graph`` that belong to both ``first_time_authors`` and
         ``nodes``, divided by the neighbours that are in ``nodes`` and not in
         ``prior_author_ids``; NaN when that denominator is 0.
    B -- among the neighbours counted in the numerator of A, the share whose first paper
         (``first_time_authors_1paper_id_dict``) is in the author's own list
         ``authors_active_start_1paper_id_dict[author]``; NaN when A is NaN or its
         numerator is 0.

    Returns ``[fractions_A, fractions_B]``, each a dictionary keyed by author id.
    """
    # only first-time authors that are eligible nodes can count as activated
    eligible_first_timers = first_time_authors & nodes

    fractions_A = {}
    fractions_B = {}
    for author_id in author_ids:
        coauthors = set(collab_graph.neighbors(author_id))
        n_eligible = len((coauthors - prior_author_ids) & nodes)

        # Exp2 - A
        if n_eligible == 0:
            # keep an entry for every author so all samples have the same size
            fractions_A[author_id] = np.nan
            fractions_B[author_id] = np.nan
            continue

        activated = coauthors & eligible_first_timers
        n_activated = len(activated)
        fractions_A[author_id] = n_activated / n_eligible

        # Exp2 - B
        if n_activated == 0:
            fractions_B[author_id] = np.nan
            continue

        own_first_papers = authors_active_start_1paper_id_dict[author_id]
        n_shared_first_paper = sum(
            first_time_authors_1paper_id_dict[activated_id] in own_first_papers
            for activated_id in activated
        )
        fractions_B[author_id] = n_shared_first_paper / n_activated

    return [fractions_A,fractions_B]

In [None]:
def get_bins_C(works_authors_active,work_id_valid,high_active_authors1):
    """Split highly active authors into two bins by their mean number of co-authors.

    works_authors_active: DataFrame with work_id, author_id and n_coauthors columns
    work_id_valid: ids of the works taken into account
    high_active_authors1: ids of the highly active authors to split

    The mean ``n_coauthors`` per author over the valid works is passed to
    ``get_author_samples`` with ``top_k=20``.

    Returns ``(bin1, bin2)``: the 'top 20%' and the 'bottom 20%' samples.
    """
    mean_coauthors = (
        works_authors_active
        .query('work_id.isin(@work_id_valid)')
        .groupby('author_id')['n_coauthors']
        .mean()
    )

    dilution_df = (
        mean_coauthors
        .to_frame()
        .sort_values(by=['n_coauthors'], ascending=False)
        .reset_index(level=0)
    )
    dilution_df = dilution_df.query('author_id.isin(@high_active_authors1)')
    dilution_df.columns = ['author_id', 'val']

    samples_by_class, _ = get_author_samples(dilution_df, top_k=20, debug=False)

    return samples_by_class['top 20%'], samples_by_class['bottom 20%']

### Create folders

In [None]:
# Create one output folder per expert definition; exist_ok avoids the
# check-then-create race and makes re-running the cell a no-op.
for subfolder in ('Impact_mean1', 'Productivity'):
    my_path = os.path.join(discipline, subfolder)
    os.makedirs(my_path, exist_ok=True)

## EXP2

### Productivity

In [None]:
def Exp2(discipline,topic,my_path):
    """Run experiment 2 for one topic and store the results under ``my_path``.

    The function relies on the module-level DataFrames ``works``, ``works_authors``
    and ``works_concepts``.

    Inputs are pickles read from ``<discipline>/Info``, from
    ``<discipline>/Info/<name of my_path's parent folder>`` and from
    ``<discipline>/Productivity/Exp1_ver1``. Several of the loaded objects
    (``work_ids_list``, ``author_ids_list``, ``all_coauthors_list``,
    ``prior_work_ids_list``, ``not_active_authors_start_list``) are not used
    further in this function.

    For each window ``w`` in 0..22 whose ``windows_cond[w]`` is truthy, with
    ``start_year_w = 1995 + w``, the collaboration graph of the five preceding
    years is built and the fractions of ``get_scores_high`` / ``get_scores_low``
    are computed for the 'top 10%' and 'bottom 10%' author samples.

    Parameters
    ----------
    discipline : str
        Root folder of the discipline.
    topic : str
        Concept name; also the suffix of every file read or written.
    my_path : str
        Output folder; it must already exist.

    Returns
    -------
    (info_df_, frac_vec)
        ``info_df_``: one row per processed window with the columns ``topic``,
        ``T_0``, ``#NODES MULTIPLE EXPOSURES`` and ``#NODES SINGLE EXPOSURE``
        (the CSV written to disk does not contain the ``topic`` column).
        ``frac_vec``: dictionary ``{start_year_w: [frac_vec_high1, frac_vec_low1]}``.
    """
    
    #restrict to topic
    works_concepts_conc = works_concepts.query('concept_name==@topic', engine='python')
      
    #load the per-topic lists stored in <discipline>/Info
    # NOTE(review): pickle.load executes arbitrary code from the file; these files are
    # presumably produced by earlier steps of this project — only load trusted files.
    my_path2 = os.path.join(discipline, 'Info')
    my_file = 'work_ids_list_'+topic
    with open(os.path.join(my_path2, my_file),"rb") as fp:
        work_ids_list = pickle.load(fp)
    my_file = 'author_ids_list_'+topic
    with open(os.path.join(my_path2, my_file),"rb") as fp:
        author_ids_list = pickle.load(fp)
    my_file = 'windows_cond_'+topic
    with open(os.path.join(my_path2, my_file),"rb") as fp:
        windows_cond = pickle.load(fp)
        
    #load the author classes from <discipline>/Info/<name of the parent folder of my_path>
    my_path3 = os.path.join(os.path.join(discipline,'Info'),os.path.split(os.path.split(my_path)[0])[1])
    my_file = 'active_authors_classes_'+topic
    with open(os.path.join(my_path3, my_file),"rb") as fp:
        active_authors_classes = pickle.load(fp)   
    
    #consider consecutive EW and OW (5 years each)
    start_year = 1995     
    
    #load the objects stored in <discipline>/Productivity/Exp1_ver1
    #my_path4 = os.path.join(os.path.split(my_path)[0],'Exp1_ver1')
    my_path4 = os.path.join(discipline, 'Productivity/Exp1_ver1')
    my_file = 'all_coauthors_list_'+topic
    with open(os.path.join(my_path4, my_file),"rb") as fp:
        all_coauthors_list = pickle.load(fp) 
    my_file = 'active_authors_start_union_'+topic
    with open(os.path.join(my_path4, my_file),"rb") as fp:
        active_authors_start_union = pickle.load(fp) 
    active_authors_start_union_list = list(active_authors_start_union)  
                
    my_file = 'prior_work_ids_list_'+topic
    with open(os.path.join(my_path4, my_file),"rb") as fp:
        prior_work_ids_list = pickle.load(fp) 
    my_file = 'prior_author_ids_list_'+topic
    with open(os.path.join(my_path4, my_file),"rb") as fp:
        prior_author_ids_list = pickle.load(fp)  
    my_file = 'first_time_authors_list_'+topic
    with open(os.path.join(my_path4, my_file),"rb") as fp:
        first_time_authors_list = pickle.load(fp) 
    my_file = 'not_active_authors_start_list_'+topic
    with open(os.path.join(my_path4, my_file),"rb") as fp:
        not_active_authors_start_list = pickle.load(fp)
    my_file = 'first_time_authors_union_'+topic
    with open(os.path.join(my_path4, my_file),"rb") as fp:
        first_time_authors_union = pickle.load(fp)
                  
    my_file = 'works_authors_activation_date_'+topic
    with open(os.path.join(my_path4, my_file),"rb") as fp:
        works_authors_activation_date = pickle.load(fp)

    #add the number of co-authors of each work (works is matched on its index) and save
    works_authors_active_union = pd.merge(works_authors_activation_date,works[['n_coauthors']], left_on="work_id", right_index=True)
    works_authors_activation_set = set(works_authors_active_union.work_id)
    my_file = 'works_authors_active_union_'+topic
    with open(os.path.join(my_path, my_file),"wb") as fp:
        pickle.dump(works_authors_active_union,fp)
    #all authorship rows (every author) of those works, saved as well
    works_authors_activation = works_authors.query('work_id.isin(@works_authors_activation_set)')
    my_file = 'works_authors_activation_'+topic
    with open(os.path.join(my_path, my_file),"wb") as fp:
        pickle.dump(works_authors_activation,fp)
    # my_file = 'works_authors_activation_'+topic       
    # with open(os.path.join(my_path3, my_file),"rb") as fp:
    #     works_authors_activation = pickle.load(fp)
    
    #first paper of each first-time author
    works_authors_conc = pd.merge(works_authors, works_concepts_conc[['work_id','concept_name']], on="work_id")
    works_authors_concept_period = works_authors_conc.query('@start_year <= publication_year & concept_name==@topic', engine='python') #papers with concept written from start_year on
    work_ids_authors_new_df = works_authors_concept_period[works_authors_concept_period.author_id.isin(first_time_authors_union)].sort_values(by='publication_date').drop_duplicates('author_id') #chronological order, one (the earliest) row per author
    first_time_authors_work_ids = set(work_ids_authors_new_df.work_id) #works that are the first topic paper of some first-time author
    #dictionary = {first_time_author_id : first_paper_id}
    work_ids_authors_new_df2 = work_ids_authors_new_df[['work_id','author_id']]
    #work_ids_authors_new_df2 = work_ids_authors_new_df.work_id.to_frame().join(work_ids_authors_new_df.author_id)
    first_time_authors_1paper_id_dict = work_ids_authors_new_df2.set_index('author_id').to_dict()['work_id']
    my_file = 'first_time_authors_1paper_id_dict_'+topic
    with open(os.path.join(my_path, my_file),"wb") as fp:
        pickle.dump(first_time_authors_1paper_id_dict,fp)
    
    #for each author in active_authors_start_union: the first papers they co-authored
    work_ids_authors_active_df = works_authors_concept_period[works_authors_concept_period.work_id.isin(first_time_authors_work_ids)] #authorship rows of the first papers
    work_ids_authors_active_df_ = work_ids_authors_active_df[work_ids_authors_active_df.author_id.isin(active_authors_start_union)] #just active authors  
    #dictionary: {authors_active_start_id : list first_paper_id coauthor}
    authors_active_start_1paper_id_dict = (
        work_ids_authors_active_df_
        .groupby('author_id')
        .work_id
        .apply(lambda g: list(g))
        .to_dict()
    )
    for aanc in list(active_authors_start_union - set(authors_active_start_1paper_id_dict.keys())):  #active authors that co-authored no first paper get an empty list
        authors_active_start_1paper_id_dict[aanc] = []
    my_file = 'authors_active_start_1paper_id_dict_'+topic
    with open(os.path.join(my_path, my_file),"wb") as fp:
        pickle.dump(authors_active_start_1paper_id_dict,fp)
    my_file = 'work_ids_authors_active_df_'+topic
    with open(os.path.join(my_path, my_file),"wb") as fp:
        pickle.dump(work_ids_authors_active_df,fp)
    
    info_df_  = pd.DataFrame()
    frac_vec = {} 
    #one iteration per window; windows whose condition is falsy are skipped
    for w in tqdm(range(0,23)): 
        
        windows_cond_w = windows_cond[w]   
        if windows_cond_w:
        
            start_year_w = start_year+w
            all_coauthors = all_coauthors_list[w]
            [active_authors_start,samples_dict_1,n_1] = active_authors_classes[w]
            high_active_authors1 = samples_dict_1['top 10%']
            low_active_authors1 = samples_dict_1['bottom 10%']
            first_time_authors = first_time_authors_list[w]
            prior_author_ids = prior_author_ids_list[w]
              
            #collaboration graph of the 5 years before start_year_w
            collab_graph = make_collaboration_graph(works_authors_activation,active_authors_start,start_year=start_year_w-5, end_year=start_year_w)
            #keep nodes with just single exposures
            nodes,multiple_exp,sing_exp = delate_neig_incommon(collab_graph=collab_graph, active_authors=active_authors_start) 
            #high and low infected authors 
            #papers written by infected authors in exposure window (5 years before)
            works_authors_active = (works_authors_active_union.query('@start_year_w - 5 <= publication_year < @start_year_w ')).query('author_id.isin(@active_authors_start)')
            works_authors_active_set = set(works_authors_active.work_id)
            #just works written with eligible coauthors
            nodes_prior = nodes - prior_author_ids
            work_id_valid = set(((works_authors.query('work_id.isin(@works_authors_active_set)')).query('author_id.isin(@nodes_prior)')).work_id)

            #Exp2 - C #two bins of highly active authors depending on mean of number of coauthors
            high_active_authors1_bin1,high_active_authors1_bin2 = get_bins_C(works_authors_active,work_id_valid,high_active_authors1)

            #Exp2 - A and B
            #list of dictionaries [high1_A,high1_B,high1_bin1_A,high1_bin2_A]      
            frac_vec_high1 = get_scores_high(author_ids=high_active_authors1, collab_graph=collab_graph, first_time_authors=first_time_authors,prior_author_ids=prior_author_ids,nodes=nodes,authors_active_start_1paper_id_dict=authors_active_start_1paper_id_dict,first_time_authors_1paper_id_dict=first_time_authors_1paper_id_dict,high_active_authors_bin1=high_active_authors1_bin1,high_active_authors_bin2=high_active_authors1_bin2)
            frac_vec_low1 = get_scores_low(author_ids=low_active_authors1, collab_graph=collab_graph, first_time_authors=first_time_authors,prior_author_ids=prior_author_ids,nodes=nodes,authors_active_start_1paper_id_dict=authors_active_start_1paper_id_dict,first_time_authors_1paper_id_dict=first_time_authors_1paper_id_dict)
            
            frac_vec[start_year_w] = [frac_vec_high1,frac_vec_low1]

            #exposure statistics of this window
            info_w_dict = { '#NODES MULTIPLE EXPOSURES':multiple_exp, 
                           '#NODES SINGLE EXPOSURE':sing_exp
                          }

            info_w = pd.DataFrame(data=[info_w_dict])
            info_w.insert(0, 'T_0', start_year_w)
            info_df_ = pd.concat([info_df_, info_w], ignore_index = True, axis = 0)

    #save on file : concept - year_start 
    my_file = 'info_df_'+topic+'_windows.csv'
    info_df_.to_csv(os.path.join(my_path, my_file), sep=';')
    my_file = 'frac_vec_'+topic+'_windows'
    with open(os.path.join(my_path, my_file),"wb") as fp:
        pickle.dump(frac_vec,fp)

    #the topic column is added after the CSV was written; the caller merges the frames of all topics
    info_df_.insert(0, 'topic', topic)

    return info_df_,frac_vec

In [None]:
my_path = os.path.join(discipline, 'Productivity/Exp2')
# create the output folder; exist_ok avoids the check-then-create race
os.makedirs(my_path, exist_ok=True)

# Run the experiment for every topic. The per-topic summaries are collected in a
# list and concatenated once, instead of growing a DataFrame inside the loop.
info_frames = []
frac_vec = {}
for topic in topic_list:
    info_df_top,frac_vec_top = Exp2(discipline=discipline,topic=topic,my_path=my_path)
    info_frames.append(info_df_top)
    frac_vec[topic] = frac_vec_top
# pd.concat rejects an empty list, so fall back to an empty frame when there are no topics
info_df = pd.concat(info_frames, ignore_index = True, axis = 0) if info_frames else pd.DataFrame()

# save the summary of all topics and the fractions of all topics
my_file = 'info_windows.csv'
info_df.to_csv(os.path.join(my_path, my_file), sep=';')
my_file = 'frac_vec_windows'
with open(os.path.join(my_path, my_file),"wb") as fp:
    pickle.dump(frac_vec,fp)

### Impact

#### Def. impact 1 - papers:all, cits:topic

In [None]:
def Exp2_1(discipline,topic,my_path):
    """Run Experiment II for one topic with the author classes of impact definition 1.

    The author classes are read from ``<discipline>/Info/Impact_mean1``
    (notebook section "Def. impact 1 - papers:all, cits:topic").

    For every offset ``w`` in ``range(0, 23)`` whose entry in the topic's
    ``windows_cond`` list is truthy, the window start year is ``1995 + w``.
    The collaboration graph helper is called for the five years before that
    start year, and the 'top 10%' and 'bottom 10%' author samples are scored
    with ``get_scores_high`` / ``get_scores_low``.

    Parameters
    ----------
    discipline : str
        Root folder that holds the pre-computed pickles (``Info``,
        ``Productivity/Exp1_ver1`` and ``Productivity/Exp2`` sub-folders).
    topic : str
        Topic name; it is the suffix of every input and output file name.
    my_path : str
        Existing folder that receives ``info_df_<topic>_windows.csv`` and the
        pickle ``frac_vec_<topic>_windows``.

    Returns
    -------
    tuple
        ``(info_df_, frac_vec)``. ``info_df_`` has one row per processed
        window (``T_0`` plus the two exposure counts) with a leading ``topic``
        column; ``frac_vec`` maps the window start year to
        ``[frac_vec_high1, frac_vec_low1]``.

    Notes
    -----
    Depends on the notebook-level ``works_authors`` table and on the helpers
    ``make_collaboration_graph``, ``delate_neig_incommon``, ``get_bins_C``,
    ``get_scores_high`` and ``get_scores_low``. Only the pickles whose content
    is actually consumed are loaded.
    """

    def _load(folder, name):
        # inputs are extension-less pickle files produced by earlier notebook steps
        with open(os.path.join(folder, name),"rb") as fp:
            return pickle.load(fp)

    info_dir = os.path.join(discipline, 'Info')
    exp1_dir = os.path.join(discipline, 'Productivity/Exp1_ver1')
    exp2_dir = os.path.join(discipline, 'Productivity/Exp2')

    #load
    windows_cond = _load(info_dir, 'windows_cond_'+topic)
    active_authors_classes = _load(os.path.join(info_dir,'Impact_mean1'), 'active_authors_classes_'+topic)
    prior_author_ids_list = _load(exp1_dir, 'prior_author_ids_list_'+topic)
    first_time_authors_list = _load(exp1_dir, 'first_time_authors_list_'+topic)
    works_authors_activation = _load(exp2_dir, 'works_authors_activation_'+topic)
    works_authors_active_union = _load(exp2_dir, 'works_authors_active_union_'+topic)
    first_time_authors_1paper_id_dict = _load(exp2_dir, 'first_time_authors_1paper_id_dict_'+topic)
    authors_active_start_1paper_id_dict = _load(exp2_dir, 'authors_active_start_1paper_id_dict_'+topic)

    #consider consecutive EW and OW (5 years each)
    start_year = 1995

    info_df_  = pd.DataFrame()
    frac_vec = {}
    for w in tqdm(range(0,23)):

        # skip the windows that were not selected for this topic
        if not windows_cond[w]:
            continue

        start_year_w = start_year+w
        # third element of the stored triple is not needed here
        active_authors_start,samples_dict_1,_ = active_authors_classes[w]
        high_active_authors1 = samples_dict_1['top 10%']
        low_active_authors1 = samples_dict_1['bottom 10%']
        first_time_authors = first_time_authors_list[w]
        prior_author_ids = prior_author_ids_list[w]

        #collaboration graph (helper called for the 5 years before the window start)
        collab_graph = make_collaboration_graph(works_authors_activation,active_authors_start,start_year=start_year_w-5, end_year=start_year_w)
        #keep nodes with just single exposures
        nodes,multiple_exp,sing_exp = delate_neig_incommon(collab_graph=collab_graph, active_authors=active_authors_start)
        #papers written by active authors in exposure window (5 years before)
        # NOTE: the '@name' references below are resolved from this function's locals,
        # so these variable names must stay in sync with the query strings
        works_authors_active = (works_authors_active_union.query('@start_year_w - 5 <= publication_year < @start_year_w ')).query('author_id.isin(@active_authors_start)')
        works_authors_active_set = set(works_authors_active.work_id)
        #just works written with eligible coauthors
        nodes_prior = nodes - prior_author_ids
        work_id_valid = set(((works_authors.query('work_id.isin(@works_authors_active_set)')).query('author_id.isin(@nodes_prior)')).work_id)

        #Exp2 - C #two bins of highly active authors depending on number of coauthors
        high_active_authors1_bin1,high_active_authors1_bin2 = get_bins_C(works_authors_active,work_id_valid,high_active_authors1)

        #highly active authors
        #list of dictionaries [high1_A,high1_B,high1_bin1_A,high1_bin2_A]
        frac_vec_high1 = get_scores_high(
            author_ids=high_active_authors1,
            collab_graph=collab_graph,
            first_time_authors=first_time_authors,
            prior_author_ids=prior_author_ids,
            nodes=nodes,
            authors_active_start_1paper_id_dict=authors_active_start_1paper_id_dict,
            first_time_authors_1paper_id_dict=first_time_authors_1paper_id_dict,
            high_active_authors_bin1=high_active_authors1_bin1,
            high_active_authors_bin2=high_active_authors1_bin2,
        )
        #low active authors
        frac_vec_low1 = get_scores_low(
            author_ids=low_active_authors1,
            collab_graph=collab_graph,
            first_time_authors=first_time_authors,
            prior_author_ids=prior_author_ids,
            nodes=nodes,
            authors_active_start_1paper_id_dict=authors_active_start_1paper_id_dict,
            first_time_authors_1paper_id_dict=first_time_authors_1paper_id_dict,
        )

        frac_vec[start_year_w] = [frac_vec_high1,frac_vec_low1]

        #save on files #info
        info_w_dict = { '#NODES MULTIPLE EXPOSURE':multiple_exp,
                       '#NODES SINGLE EXPOSURE':sing_exp
                      }

        info_w = pd.DataFrame(data=[info_w_dict])
        info_w.insert(0, 'T_0', start_year_w)
        info_df_ = pd.concat([info_df_, info_w], ignore_index = True, axis = 0)

    #save on file : concept - year_start
    my_file = 'info_df_'+topic+'_windows.csv'
    info_df_.to_csv(os.path.join(my_path, my_file), sep=';')
    my_file = 'frac_vec_'+topic+'_windows'
    with open(os.path.join(my_path, my_file),"wb") as fp:
        pickle.dump(frac_vec,fp)

    #the per-topic csv is written without the topic column; it is added only to the returned frame
    info_df_.insert(0, 'topic', topic)

    return info_df_,frac_vec

In [None]:
# Run Exp2_1 for every topic and store the combined results in <discipline>/Impact_mean1/Exp2_1
my_path = os.path.join(discipline, 'Impact_mean1/Exp2_1')
#create folder (exist_ok makes a separate existence check unnecessary)
os.makedirs(my_path, exist_ok=True)
info_df = pd.DataFrame() 
frac_vec = {}
for topic in topic_list:
    # Exp2_1 also writes the per-topic csv and pickle into my_path
    info_df_top,frac_vec_top = Exp2_1(discipline=discipline,topic=topic,my_path=my_path) 
    info_df = pd.concat([info_df, info_df_top], ignore_index = True, axis = 0)
    frac_vec[topic] = frac_vec_top
# all topics together: one ';'-separated table and one pickle keyed by topic
my_file = 'info_windows.csv'
info_df.to_csv(os.path.join(my_path, my_file), sep=';')
my_file = 'frac_vec_windows'
with open(os.path.join(my_path, my_file),"wb") as fp:
    pickle.dump(frac_vec,fp)

#### Def. impact 2 - papers:topic, cits:all

In [None]:
def Exp2_2(discipline,topic,my_path):
    """Run Experiment II for one topic with the author classes of impact definition 2.

    The author classes are read from ``<discipline>/Info/Impact_mean2``
    (notebook section "Def. impact 2 - papers:topic, cits:all").

    For every offset ``w`` in ``range(0, 23)`` whose entry in the topic's
    ``windows_cond`` list is truthy, the window start year is ``1995 + w``.
    The collaboration graph helper is called for the five years before that
    start year, and the 'top 10%' and 'bottom 10%' author samples are scored
    with ``get_scores_high`` / ``get_scores_low``.

    Parameters
    ----------
    discipline : str
        Root folder that holds the pre-computed pickles (``Info``,
        ``Productivity/Exp1_ver1`` and ``Productivity/Exp2`` sub-folders).
    topic : str
        Topic name; it is the suffix of every input and output file name.
    my_path : str
        Existing folder that receives ``info_df_<topic>_windows.csv`` and the
        pickle ``frac_vec_<topic>_windows``.

    Returns
    -------
    tuple
        ``(info_df_, frac_vec)``. ``info_df_`` has one row per processed
        window (``T_0`` plus the two exposure counts) with a leading ``topic``
        column; ``frac_vec`` maps the window start year to
        ``[frac_vec_high1, frac_vec_low1]``.

    Notes
    -----
    Depends on the notebook-level ``works_authors`` table and on the helpers
    ``make_collaboration_graph``, ``delate_neig_incommon``, ``get_bins_C``,
    ``get_scores_high`` and ``get_scores_low``. Only the pickles whose content
    is actually consumed are loaded.
    """

    def _load(folder, name):
        # inputs are extension-less pickle files produced by earlier notebook steps
        with open(os.path.join(folder, name),"rb") as fp:
            return pickle.load(fp)

    info_dir = os.path.join(discipline, 'Info')
    exp1_dir = os.path.join(discipline, 'Productivity/Exp1_ver1')
    exp2_dir = os.path.join(discipline, 'Productivity/Exp2')

    #load
    windows_cond = _load(info_dir, 'windows_cond_'+topic)
    active_authors_classes = _load(os.path.join(info_dir,'Impact_mean2'), 'active_authors_classes_'+topic)
    prior_author_ids_list = _load(exp1_dir, 'prior_author_ids_list_'+topic)
    first_time_authors_list = _load(exp1_dir, 'first_time_authors_list_'+topic)
    works_authors_activation = _load(exp2_dir, 'works_authors_activation_'+topic)
    works_authors_active_union = _load(exp2_dir, 'works_authors_active_union_'+topic)
    first_time_authors_1paper_id_dict = _load(exp2_dir, 'first_time_authors_1paper_id_dict_'+topic)
    authors_active_start_1paper_id_dict = _load(exp2_dir, 'authors_active_start_1paper_id_dict_'+topic)

    #consider consecutive EW and OW (5 years each)
    start_year = 1995

    info_df_  = pd.DataFrame()
    frac_vec = {}
    for w in tqdm(range(0,23)):

        # skip the windows that were not selected for this topic
        if not windows_cond[w]:
            continue

        start_year_w = start_year+w
        # third element of the stored triple is not needed here
        active_authors_start,samples_dict_1,_ = active_authors_classes[w]
        high_active_authors1 = samples_dict_1['top 10%']
        low_active_authors1 = samples_dict_1['bottom 10%']
        first_time_authors = first_time_authors_list[w]
        prior_author_ids = prior_author_ids_list[w]

        #collaboration graph (helper called for the 5 years before the window start)
        collab_graph = make_collaboration_graph(works_authors_activation,active_authors_start,start_year=start_year_w-5, end_year=start_year_w)
        #keep nodes with just single exposures
        nodes,multiple_exp,sing_exp = delate_neig_incommon(collab_graph=collab_graph, active_authors=active_authors_start)
        #papers written by active authors in exposure window (5 years before)
        # NOTE: the '@name' references below are resolved from this function's locals,
        # so these variable names must stay in sync with the query strings
        works_authors_active = (works_authors_active_union.query('@start_year_w - 5 <= publication_year < @start_year_w ')).query('author_id.isin(@active_authors_start)')
        works_authors_active_set = set(works_authors_active.work_id)
        #just works written with eligible coauthors
        nodes_prior = nodes - prior_author_ids
        work_id_valid = set(((works_authors.query('work_id.isin(@works_authors_active_set)')).query('author_id.isin(@nodes_prior)')).work_id)

        #Exp2 - C #two bins of highly active authors depending on number of coauthors
        high_active_authors1_bin1,high_active_authors1_bin2 = get_bins_C(works_authors_active,work_id_valid,high_active_authors1)

        #highly active authors
        #list of dictionaries [high1_A,high1_B,high1_bin1_A,high1_bin2_A]
        frac_vec_high1 = get_scores_high(
            author_ids=high_active_authors1,
            collab_graph=collab_graph,
            first_time_authors=first_time_authors,
            prior_author_ids=prior_author_ids,
            nodes=nodes,
            authors_active_start_1paper_id_dict=authors_active_start_1paper_id_dict,
            first_time_authors_1paper_id_dict=first_time_authors_1paper_id_dict,
            high_active_authors_bin1=high_active_authors1_bin1,
            high_active_authors_bin2=high_active_authors1_bin2,
        )
        #low active authors
        frac_vec_low1 = get_scores_low(
            author_ids=low_active_authors1,
            collab_graph=collab_graph,
            first_time_authors=first_time_authors,
            prior_author_ids=prior_author_ids,
            nodes=nodes,
            authors_active_start_1paper_id_dict=authors_active_start_1paper_id_dict,
            first_time_authors_1paper_id_dict=first_time_authors_1paper_id_dict,
        )

        frac_vec[start_year_w] = [frac_vec_high1,frac_vec_low1]

        #save on files #info
        info_w_dict = { '#NODES MULTIPLE EXPOSURE':multiple_exp,
                       '#NODES SINGLE EXPOSURE':sing_exp
                      }

        info_w = pd.DataFrame(data=[info_w_dict])
        info_w.insert(0, 'T_0', start_year_w)
        info_df_ = pd.concat([info_df_, info_w], ignore_index = True, axis = 0)

    #save on file : concept - year_start
    my_file = 'info_df_'+topic+'_windows.csv'
    info_df_.to_csv(os.path.join(my_path, my_file), sep=';')
    my_file = 'frac_vec_'+topic+'_windows'
    with open(os.path.join(my_path, my_file),"wb") as fp:
        pickle.dump(frac_vec,fp)

    #the per-topic csv is written without the topic column; it is added only to the returned frame
    info_df_.insert(0, 'topic', topic)

    return info_df_,frac_vec

In [None]:
# Run Exp2_2 for every topic and store the combined results in <discipline>/Impact_mean2/Exp2_2
my_path = os.path.join(discipline, 'Impact_mean2/Exp2_2')
#create folder (exist_ok makes a separate existence check unnecessary)
os.makedirs(my_path, exist_ok=True)
info_df = pd.DataFrame() 
frac_vec = {}
for topic in topic_list:
    # Exp2_2 also writes the per-topic csv and pickle into my_path
    info_df_top,frac_vec_top = Exp2_2(discipline=discipline,topic=topic,my_path=my_path) 
    info_df = pd.concat([info_df, info_df_top], ignore_index = True, axis = 0)
    frac_vec[topic] = frac_vec_top
# all topics together: one ';'-separated table and one pickle keyed by topic
my_file = 'info_windows.csv'
info_df.to_csv(os.path.join(my_path, my_file), sep=';')
my_file = 'frac_vec_windows'
with open(os.path.join(my_path, my_file),"wb") as fp:
    pickle.dump(frac_vec,fp)

#### Def. impact 3 - papers:topic, cits:topic

In [None]:
def Exp2_3(discipline,topic,my_path):
    """Run Experiment II for one topic with the author classes of impact definition 3.

    The author classes are read from ``<discipline>/Info/Impact_mean3``
    (notebook section "Def. impact 3 - papers:topic, cits:topic").

    For every offset ``w`` in ``range(0, 23)`` whose entry in the topic's
    ``windows_cond`` list is truthy, the window start year is ``1995 + w``.
    The collaboration graph helper is called for the five years before that
    start year, and the 'top 10%' and 'bottom 10%' author samples are scored
    with ``get_scores_high`` / ``get_scores_low``.

    Parameters
    ----------
    discipline : str
        Root folder that holds the pre-computed pickles (``Info``,
        ``Productivity/Exp1_ver1`` and ``Productivity/Exp2`` sub-folders).
    topic : str
        Topic name; it is the suffix of every input and output file name.
    my_path : str
        Existing folder that receives ``info_df_<topic>_windows.csv`` and the
        pickle ``frac_vec_<topic>_windows``.

    Returns
    -------
    tuple
        ``(info_df_, frac_vec)``. ``info_df_`` has one row per processed
        window (``T_0`` plus the two exposure counts) with a leading ``topic``
        column; ``frac_vec`` maps the window start year to
        ``[frac_vec_high1, frac_vec_low1]``.

    Notes
    -----
    Depends on the notebook-level ``works_authors`` table and on the helpers
    ``make_collaboration_graph``, ``delate_neig_incommon``, ``get_bins_C``,
    ``get_scores_high`` and ``get_scores_low``. Only the pickles whose content
    is actually consumed are loaded.
    """

    def _load(folder, name):
        # inputs are extension-less pickle files produced by earlier notebook steps
        with open(os.path.join(folder, name),"rb") as fp:
            return pickle.load(fp)

    info_dir = os.path.join(discipline, 'Info')
    exp1_dir = os.path.join(discipline, 'Productivity/Exp1_ver1')
    exp2_dir = os.path.join(discipline, 'Productivity/Exp2')

    #load
    windows_cond = _load(info_dir, 'windows_cond_'+topic)
    active_authors_classes = _load(os.path.join(info_dir,'Impact_mean3'), 'active_authors_classes_'+topic)
    prior_author_ids_list = _load(exp1_dir, 'prior_author_ids_list_'+topic)
    first_time_authors_list = _load(exp1_dir, 'first_time_authors_list_'+topic)
    works_authors_activation = _load(exp2_dir, 'works_authors_activation_'+topic)
    works_authors_active_union = _load(exp2_dir, 'works_authors_active_union_'+topic)
    first_time_authors_1paper_id_dict = _load(exp2_dir, 'first_time_authors_1paper_id_dict_'+topic)
    authors_active_start_1paper_id_dict = _load(exp2_dir, 'authors_active_start_1paper_id_dict_'+topic)

    #consider consecutive EW and OW (5 years each)
    start_year = 1995

    info_df_  = pd.DataFrame()
    frac_vec = {}
    for w in tqdm(range(0,23)):

        # skip the windows that were not selected for this topic
        if not windows_cond[w]:
            continue

        start_year_w = start_year+w
        # third element of the stored triple is not needed here
        active_authors_start,samples_dict_1,_ = active_authors_classes[w]
        high_active_authors1 = samples_dict_1['top 10%']
        low_active_authors1 = samples_dict_1['bottom 10%']
        first_time_authors = first_time_authors_list[w]
        prior_author_ids = prior_author_ids_list[w]

        #collaboration graph (helper called for the 5 years before the window start)
        collab_graph = make_collaboration_graph(works_authors_activation,active_authors_start,start_year=start_year_w-5, end_year=start_year_w)
        #keep nodes with just single exposures
        nodes,multiple_exp,sing_exp = delate_neig_incommon(collab_graph=collab_graph, active_authors=active_authors_start)
        #papers written by active authors in exposure window (5 years before)
        # NOTE: the '@name' references below are resolved from this function's locals,
        # so these variable names must stay in sync with the query strings
        works_authors_active = (works_authors_active_union.query('@start_year_w - 5 <= publication_year < @start_year_w ')).query('author_id.isin(@active_authors_start)')
        works_authors_active_set = set(works_authors_active.work_id)
        #just works written with eligible coauthors
        nodes_prior = nodes - prior_author_ids
        work_id_valid = set(((works_authors.query('work_id.isin(@works_authors_active_set)')).query('author_id.isin(@nodes_prior)')).work_id)

        #Exp2 - C #two bins of highly active authors depending on number of coauthors
        high_active_authors1_bin1,high_active_authors1_bin2 = get_bins_C(works_authors_active,work_id_valid,high_active_authors1)

        #highly active authors
        #list of dictionaries [high1_A,high1_B,high1_bin1_A,high1_bin2_A]
        frac_vec_high1 = get_scores_high(
            author_ids=high_active_authors1,
            collab_graph=collab_graph,
            first_time_authors=first_time_authors,
            prior_author_ids=prior_author_ids,
            nodes=nodes,
            authors_active_start_1paper_id_dict=authors_active_start_1paper_id_dict,
            first_time_authors_1paper_id_dict=first_time_authors_1paper_id_dict,
            high_active_authors_bin1=high_active_authors1_bin1,
            high_active_authors_bin2=high_active_authors1_bin2,
        )
        #low active authors
        frac_vec_low1 = get_scores_low(
            author_ids=low_active_authors1,
            collab_graph=collab_graph,
            first_time_authors=first_time_authors,
            prior_author_ids=prior_author_ids,
            nodes=nodes,
            authors_active_start_1paper_id_dict=authors_active_start_1paper_id_dict,
            first_time_authors_1paper_id_dict=first_time_authors_1paper_id_dict,
        )

        frac_vec[start_year_w] = [frac_vec_high1,frac_vec_low1]

        #save on files #info
        info_w_dict = { '#NODES MULTIPLE EXPOSURE':multiple_exp,
                       '#NODES SINGLE EXPOSURE':sing_exp
                      }

        info_w = pd.DataFrame(data=[info_w_dict])
        info_w.insert(0, 'T_0', start_year_w)
        info_df_ = pd.concat([info_df_, info_w], ignore_index = True, axis = 0)

    #save on file : concept - year_start
    my_file = 'info_df_'+topic+'_windows.csv'
    info_df_.to_csv(os.path.join(my_path, my_file), sep=';')
    my_file = 'frac_vec_'+topic+'_windows'
    with open(os.path.join(my_path, my_file),"wb") as fp:
        pickle.dump(frac_vec,fp)

    #the per-topic csv is written without the topic column; it is added only to the returned frame
    info_df_.insert(0, 'topic', topic)

    return info_df_,frac_vec

In [None]:
# Run Exp2_3 for every topic and store the combined results in <discipline>/Impact_mean3/Exp2_3
my_path = os.path.join(discipline, 'Impact_mean3/Exp2_3')
#create folder (exist_ok makes a separate existence check unnecessary)
os.makedirs(my_path, exist_ok=True)
info_df = pd.DataFrame() 
frac_vec = {}
for topic in topic_list:
    # Exp2_3 also writes the per-topic csv and pickle into my_path
    info_df_top,frac_vec_top = Exp2_3(discipline=discipline,topic=topic,my_path=my_path) 
    info_df = pd.concat([info_df, info_df_top], ignore_index = True, axis = 0)
    frac_vec[topic] = frac_vec_top
# all topics together: one ';'-separated table and one pickle keyed by topic
my_file = 'info_windows.csv'
info_df.to_csv(os.path.join(my_path, my_file), sep=';')
my_file = 'frac_vec_windows'
with open(os.path.join(my_path, my_file),"wb") as fp:
    pickle.dump(frac_vec,fp)

## CALCULATIONS

In [None]:
# Output folders of the four Experiment II variants, all rooted at the discipline folder
my_path_list = [
    os.path.join(discipline, subfolder)
    for subfolder in (
        'Productivity/Exp2',
        'Impact_mean1/Exp2_1',
        'Impact_mean2/Exp2_2',
        'Impact_mean3/Exp2_3',
    )
]

In [None]:
#union results
# For every experiment folder, rebuild the combined files from the per-topic
# files that the Exp2* functions wrote, so the combined outputs can be
# regenerated without re-running the experiments.
for my_path in my_path_list:
    
    # per-topic info tables -> one table with a leading 'topic' column
    topics_df = pd.DataFrame()
    for topic in topic_list:
        my_file = 'info_df_'+topic+'_windows.csv'
        # index_col=0: the first csv column is the row index written by to_csv
        topic_df_top = pd.read_csv(os.path.join(my_path, my_file),index_col=0, sep=';') 
        topic_df_top.insert(0, 'topic', topic)
        topics_df = pd.concat([topics_df, topic_df_top], ignore_index = True, axis = 0)  
    my_file = 'info_windows.csv'    
    topics_df.to_csv(os.path.join(my_path, my_file), sep=';')
    
    # per-topic score pickles -> one dict keyed by topic
    frac_vec = {}
    for topic in topic_list:
        my_file = 'frac_vec_'+topic+'_windows'
        with open(os.path.join(my_path, my_file),"rb") as fp:
            frac_vec_top = pickle.load(fp)
        frac_vec[topic] = frac_vec_top
    my_file = 'frac_vec_windows'
    with open(os.path.join(my_path, my_file),"wb") as fp:
        pickle.dump(frac_vec,fp)

In [None]:
def percentage(frac_vec,c,perc_values):
    """Share and count of the scores in ``frac_vec[c]`` that reach each threshold.

    Parameters
    ----------
    frac_vec : sequence of dict
        ``frac_vec[c]`` must be a mapping whose values are numeric scores;
        NaN values are allowed.
    c : int
        Position of the mapping to evaluate.
    perc_values : iterable of float
        Thresholds to test, in the order the results are returned.

    Returns
    -------
    tuple
        ``(shares, counts, n_valid)``: for each threshold the number of scores
        ``>=`` it (``counts``) and that number divided by the number of
        non-NaN scores (``shares``, NaN when there is no non-NaN score), plus
        the number of non-NaN scores itself.
    """
    scores = np.array(list(frac_vec[c].values()))
    # authors with a defined score, i.e. the denominator of the shares
    n_valid = len(scores[~np.isnan(scores)])

    # NaN never satisfies '>=', so undefined scores are not counted
    counts = [len(scores[scores >= threshold]) for threshold in perc_values]

    if n_valid != 0:
        shares = [count / n_valid for count in counts]
    else:
        shares = [np.nan for _ in counts]

    return shares, counts, n_valid

In [None]:
#calculate percentage
def Exp2_calculation(frac_vec,topic,my_path,info_topics):
    """Tabulate, for every selected window of ``topic``, the threshold statistics of the scores.

    The windows are read from the pickle ``<discipline>/Info/windows_list_<topic>``.
    For each window, ``percentage`` is applied to the four score mappings of the
    highly active authors (A, B and the two bins of C) and to the two mappings of
    the low active authors (A, B), using the notebook-level ``perc_values``.

    Parameters
    ----------
    frac_vec : dict
        Maps topic -> window start year -> ``[high_scores, low_scores]``.
    topic : str
        Topic to evaluate.
    my_path, info_topics
        Not used by this function; kept for the existing call signature.

    Returns
    -------
    tuple
        ``(topic_df_, info_df)``: the first frame has one row per window and
        threshold (shares and counts per series), the second one row per
        window with the number of non-NaN scores per series. Both get a
        leading ``topic`` column.
    """
    scores_by_window = frac_vec[topic]

    # consider just windows selected by condition
    windows_file = os.path.join(os.path.join(discipline, 'Info'), 'windows_list_'+topic)
    with open(windows_file,"rb") as fp:
        selected_windows = pickle.load(fp)

    # column order of the share columns; count columns and info labels follow the same order
    series_names = ['high1_A','low1_A','high1_bin1_A','high1_bin2_A','high1_B','low1_B']
    count_names = ['high1_A_num','low1_A_num','high1_bin1_A_num','high1_bin2_A_num',
                   'high1_B_num','low1_B_num']
    info_labels = ['HIGH ACTIVE - NOT NAN - EXP A - 10%',
                   'LOW ACTIVE - NOT NAN - EXP A - 10%',
                   'HIGH ACTIVE - BIN1 - NOT NAN - EXP C - 10%',
                   'HIGH ACTIVE - BIN2 - NOT NAN - EXP C - 10%',
                   'HIGH ACTIVE - NOT NAN - EXP B - 10%',
                   'LOW ACTIVE - NOT NAN - EXP B - 10%']

    topic_table = pd.DataFrame()
    info_table = pd.DataFrame()
    for t0 in selected_windows:
        window_scores = scores_by_window[t0]

        # name -> (shares, counts, number of non-NaN scores)
        stats = {}

        # high active: entry 0 holds [A, B, A bin1, A bin2]
        high_scores = np.array(list(window_scores[0]))
        for name, position in (('high1_A',0),('high1_B',1),('high1_bin1_A',2),('high1_bin2_A',3)):
            stats[name] = percentage(high_scores,position,perc_values)

        # low active: entry 1 holds [A, B]
        low_scores = np.array(list(window_scores[1]))
        for name, position in (('low1_A',0),('low1_B',1)):
            stats[name] = percentage(low_scores,position,perc_values)

        # one row per threshold: threshold, six shares, six counts
        share_columns = [stats[name][0] for name in series_names]
        count_columns = [stats[name][1] for name in series_names]
        window_table = pd.DataFrame(list(zip(perc_values, *share_columns, *count_columns)),
                                    columns=['Perc'] + series_names + count_names)
        window_table.insert(0, 'T_0', t0)
        topic_table = pd.concat([topic_table, window_table], ignore_index = True, axis = 0)

        # one row per window: how many authors had a defined score in each series
        info_row = {label: stats[name][2] for label, name in zip(info_labels, series_names)}
        window_info = pd.DataFrame(data=[info_row])
        window_info.insert(0, 'T_0', t0)
        info_table = pd.concat([info_table, window_info], ignore_index = True, axis = 0)

    topic_table.insert(0, 'topic', topic)
    info_table.insert(0, 'topic', topic)
    return topic_table,info_table

In [None]:
# thresholds 0.00, 0.05, ..., 1.00; also read as a global by Exp2_calculation
perc_values = np.array(range(0,105,5))/100 #percentages studied
for my_path in my_path_list:
    #load saved results
    my_file_vector = 'frac_vec_windows'
    with open(os.path.join(my_path, my_file_vector),"rb") as fp:
        frac_vec = pickle.load(fp)
        
    topics_df = pd.DataFrame()
    info_df = pd.DataFrame()
    #save info
    my_file = 'info_windows.csv'
    # passed to Exp2_calculation, which does not read it
    info_topics = pd.read_csv(os.path.join(my_path, my_file),index_col=0, sep=';')

    # threshold tables and non-NaN counts of all topics, stacked
    for topic in topic_list:
        topic_df_,info_df_ = Exp2_calculation(frac_vec,topic,my_path,info_topics)
        topics_df = pd.concat([topics_df, topic_df_], ignore_index = True, axis = 0) 
        info_df = pd.concat([info_df, info_df_], ignore_index = True, axis = 0) 

    # NOTE: written with the default ',' separator, unlike the ';'-separated info files
    my_file = 'df_topic_windows.csv'
    topics_df.to_csv(os.path.join(my_path, my_file))
    
    # extend the exposure-count table with the non-NaN counts, one row per (topic, T_0)
    my_file = 'info_windows.csv'
    info_df_old = pd.read_csv(os.path.join(my_path, my_file),index_col=0, sep=';')
    info_df_old.drop_duplicates(subset=['topic','T_0'], inplace=True)

    # merge with pandas' default join type on the two key columns
    info_df_old = info_df_old.merge(info_df, on=['topic','T_0'])
    my_file = 'info_all_windows.csv'
    info_df_old.to_csv(os.path.join(my_path, my_file), sep=';')

In [None]:
#average across windows for each topic 
# For every experiment folder: mean and standard error (std / sqrt(#windows)) of the
# exposure counts per topic, written to 'info_table.csv' with one row per topic.
for my_path in my_path_list:
   
    #info
    my_file = 'info_all_windows.csv'
    df_topics = pd.read_csv(os.path.join(my_path, my_file),index_col=0, sep=';')
    # Exp2 stores the count as '#NODES MULTIPLE EXPOSURES' while the impact variants use
    # '#NODES MULTIPLE EXPOSURE'; normalise so the selection below works for every folder
    # (rename is a no-op when the plural column is absent)
    df_topics = df_topics.rename(columns={'#NODES MULTIPLE EXPOSURES': '#NODES MULTIPLE EXPOSURE'})
    df_topics.drop_duplicates(subset=['topic','T_0'], inplace=True)
    df_topics = df_topics.set_index('topic')
    topic_df_ = pd.DataFrame()
    for topic in topic_list: 
        # 'topic' is the index name here; query resolves it like a column
        df_topic = df_topics.query('topic == @topic')
        num_windows_topic = len(df_topic)
        df_topic_mean = df_topic[['#NODES MULTIPLE EXPOSURE','#NODES SINGLE EXPOSURE']].mean(axis=0).to_frame() 
        df_topic_mean = df_topic_mean.rename(index={'#NODES MULTIPLE EXPOSURE' : '#NODES MULTIPLE EXPOSURE - MEAN','#NODES SINGLE EXPOSURE':'#NODES SINGLE EXPOSURE - MEAN'})       
        df_topic_sterr = (df_topic[['#NODES MULTIPLE EXPOSURE','#NODES SINGLE EXPOSURE'  ]].std(axis=0)/sqrt(num_windows_topic)).to_frame()
        df_topic_sterr = df_topic_sterr.rename(index={'#NODES MULTIPLE EXPOSURE' : '#NODES MULTIPLE EXPOSURE - STERR','#NODES SINGLE EXPOSURE':'#NODES SINGLE EXPOSURE - STERR', })
        # stack mean and sterr, then turn them into a single row labelled by the topic
        df_topic_mean = pd.concat([df_topic_mean, df_topic_sterr], axis = 0)
        df_topic_mean.columns = [topic]
        df_topic_mean = df_topic_mean.T
        topic_df_ = pd.concat([topic_df_, df_topic_mean], axis = 0)
    my_file = 'info_table.csv'
    topic_df_.to_csv(os.path.join(my_path, my_file), sep=';')


In [None]:
# statistics
def Exp2_stat(df_topics,topic,my_path):
    """Summarise one topic's per-window values as mean and standard error per percentage.

    Reads two module-level names: ``discipline`` (folder that holds the
    ``Info/windows_list_<topic>`` pickle) and ``perc_values`` (the percentages
    to iterate over).

    Parameters
    ----------
    df_topics : DataFrame with at least the columns ``topic``, ``T_0``, ``Perc``
        and the twelve value columns listed in ``value_cols`` below.
    topic : topic label used to filter ``df_topics`` and to build the pickle
        file name.
    my_path : accepted for interface compatibility; not used by this function.

    Returns
    -------
    DataFrame with one row per entry of ``perc_values`` and the columns
    ``topic``, ``Perc``, then ``<col>_mean`` / ``<col>_sterr`` for each value
    column, interleaved in the order of ``value_cols``.
    """
    value_cols = [
        'high1_A', 'low1_A', 'high1_bin1_A', 'high1_bin2_A',
        'high1_B', 'low1_B',
        'high1_A_num', 'low1_A_num', 'high1_bin1_A_num', 'high1_bin2_A_num',
        'high1_B_num', 'low1_B_num',
    ]

    rows_for_topic = df_topics.query('topic==@topic')

    # The standard error is normalised by the number of windows stored in the
    # pickled list (windows selected by condition), not by the number of rows
    # present in df_topics for this topic.
    windows_file = os.path.join(os.path.join(discipline, 'Info'), 'windows_list_' + topic)
    with open(windows_file, "rb") as fp:
        selected_windows = pickle.load(fp)
    n_windows = len(selected_windows)

    mean_by_perc = pd.DataFrame(columns=value_cols)
    sterr_by_perc = pd.DataFrame(columns=value_cols)
    for perc in perc_values:
        window_values = rows_for_topic.query('Perc == @perc').drop(columns=['topic', 'T_0', 'Perc'])
        # Assigning a Series to a new .loc label appends a row aligned on the column names.
        mean_by_perc.loc[perc] = window_values.mean()
        sterr_by_perc.loc[perc] = window_values.std() / sqrt(n_windows)

    mean_by_perc = mean_by_perc.rename(columns={col: col + '_mean' for col in value_cols})
    sterr_by_perc = sterr_by_perc.rename(columns={col: col + '_sterr' for col in value_cols})

    # Join the two tables on the percentage index, then put each mean next to its sterr.
    summary = mean_by_perc.merge(sterr_by_perc, left_index=True, right_index=True)
    interleaved = [col + suffix for col in value_cols for suffix in ('_mean', '_sterr')]
    summary = summary[interleaved]

    summary = summary.rename_axis('Perc').reset_index()
    summary.insert(0, 'topic', topic)

    return summary


In [None]:
# Build, for every output folder, one table with the per-percentage statistics of all topics.
perc_values = np.arange(0, 105, 5) / 100  # percentages studied; Exp2_stat reads this global
for my_path in my_path_list:
    my_file = 'df_topic_windows.csv'
    df_topics = pd.read_csv(os.path.join(my_path, my_file), index_col=0)

    # Seed with an empty frame so the result is still a DataFrame when topic_list is empty.
    per_topic_stats = [pd.DataFrame()]
    for topic in topic_list:
        per_topic_stats.append(Exp2_stat(df_topics=df_topics, topic=topic, my_path=my_path))
    topics_df = pd.concat(per_topic_stats, ignore_index=True, axis=0)

    my_file = 'df_topic_stat.csv'
    topics_df.to_csv(os.path.join(my_path, my_file))