# EXPERIMENT I - CLOSURE

In [1]:
%load_ext autoreload 
%autoreload 2

In [None]:
import pandas as pd 
from pathlib import Path
import seaborn as sns
import matplotlib.pyplot as plt
from rich.progress import track
import ast
from tqdm.auto import tqdm
import ujson as json
import networkx as nx
import numpy as np 
import requests 
from scipy.stats import entropy

# Enable tqdm's pandas integration and set the matplotlib / seaborn defaults.
tqdm.pandas()
plt.rcParams.update({'font.size': 22})
sns.set(style="ticks", context="talk")
# plt.style.use("dark_background")

import plotly.express as px
import plotly.io as pio
from plotly.subplots import make_subplots
import plotly.graph_objects as go
# Route DataFrame.plot through plotly.
pd.options.plotting.backend = 'plotly'
# NOTE(review): the second assignment overrides the first, so the effective
# default template is 'presentation'; "plotly_dark" has no effect.
pio.templates.default = "plotly_dark"
pio.templates.default = 'presentation'

import rich
from itertools import combinations
import sys 
from statistics import mean, stdev
import struct, io, string
import os
import collections
from collections import Counter
import pickle
from scipy.stats import chisquare,kstest
from scipy import stats 
import random
import math
import random
from math import sqrt

In [None]:
def read_parquet(name, **args):
    """Load one table of the current slice as a DataFrame.

    Parameters
    ----------
    name : str
        File name of the table; resolved against the module-level ``basepath``,
        which must be set before calling.
    **args
        Accepted for backward compatibility; currently ignored.

    Returns
    -------
    pandas.DataFrame
        The table. When it has a ``publication_year`` column, that column is
        converted with ``pd.to_numeric`` and rows whose year is 0 are dropped.
    """
    path = basepath / f'{name}'
    df = pd.read_parquet(path, engine='pyarrow')

    if 'publication_year' in df.columns:
        # Plain column assignment replaces the column (and its dtype).
        # `df.loc[:, col] = ...` writes into the existing column in place on
        # pandas >= 2.0, which can leave the original dtype untouched.
        df['publication_year'] = pd.to_numeric(df['publication_year'])
        df = df[df['publication_year'] != 0]  # discard works with missing years

    print(f'Read {len(df):,} rows from {path.stem!r}')
    return df

## LOAD FIELDS

### Physics

In [None]:
# Label of the discipline being analysed; later cells use it as the name of the output folder.
discipline = 'Physics'

In [None]:
# Root folder of the Physics slice; read_parquet resolves table names against this global.
basepath = Path('/N/project/openalex/slices/Physics/feb-2023')

# Load the four tables of the slice into module-level DataFrames.
works = read_parquet('works')
works_authors = read_parquet('works_authorships')
works_concepts = read_parquet('works_concepts')
works_referenced_works = read_parquet('works_referenced_works')

In [None]:
# Number of co-authors of a work = number of authors minus one.
works['num_authors']=works['num_authors'].astype('int64')
works['n_coauthors'] = works['num_authors'] - 1
# Attach each work's publication_date to its authorship rows (join key: work_id;
# presumably the index of `works` -- TODO confirm), then keep one row per (work, author).
works_authors = pd.merge(works_authors, works['publication_date'], on="work_id")
works_authors.drop_duplicates(subset=['work_id','author_id'], inplace=True)
# Same date join for the concept tags; only tags with score > 0.3 are kept.
works_concepts = pd.merge(works_concepts, works['publication_date'], on="work_id")
works_concepts = works_concepts.query('score > 0.3', engine='python')

In [None]:
# Yearly and cumulative citation counts per referenced work (used by impact definition 2).
# Count citing rows per (referenced work, year of the citing work).
works_cit_counts_year = works_referenced_works.groupby(['referenced_work_id','work_publication_year']).count()["work_id"].reset_index(name="cit_count")
works_cit_counts_year.set_index(['referenced_work_id', 'work_publication_year'], inplace=True)
# Expand to the full (work x year) grid so that every work has a row for every year.
index = pd.MultiIndex.from_product(works_cit_counts_year.index.levels)
works_cit_counts_year = works_cit_counts_year.reindex(index)
# Move both index levels back into columns.
works_cit_counts_year = works_cit_counts_year.reset_index(level=0).reset_index(level=0)
# Grid cells with no citing rows count as 0.
works_cit_counts_year = works_cit_counts_year.fillna(0)
# Running total per work, accumulated in the row order produced by the reindex above.
works_cit_counts_year['cit_count_cum'] = works_cit_counts_year.groupby(['referenced_work_id'])['cit_count'].cumsum()
# Rename the key so the table can be merged with other tables on work_id.
works_cit_counts_year = works_cit_counts_year.rename(columns = {'referenced_work_id':'work_id'})

In [None]:
# Concept names studied for this discipline
# (presumably matched against works_concepts.concept_name -- usage is outside this cell).
topic_list =[
    'Gravitational wave',
    'Dark matter',
    'Fluid dynamics',
    'Soliton',
    'Supersymmetry',
    'Statistical physics',          
    'Superconductivity' 
        ]

In [None]:
#create the output folder for this discipline
#exist_ok=True makes the call idempotent and avoids the check-then-create race
os.makedirs(discipline, exist_ok=True)

### Computer Science

In [None]:
# Label of the discipline being analysed; later cells use it as the name of the output folder.
discipline = 'CS'

In [None]:
# Root folder of the CS slice; read_parquet resolves table names against this global.
basepath = Path('/N/project/openalex/slices/CS/feb-2023')

# Load the four tables of the slice into module-level DataFrames
# (this replaces any previously loaded discipline).
works = read_parquet('works')
works_authors = read_parquet('works_authorships')
works_concepts = read_parquet('works_concepts')
works_referenced_works = read_parquet('works_referenced_works')

In [None]:
# Number of co-authors of a work = number of authors minus one.
works['num_authors']=works['num_authors'].astype('int64')
works['n_coauthors'] = works['num_authors'] - 1
# Attach each work's publication_date to its authorship rows (join key: work_id;
# presumably the index of `works` -- TODO confirm), then keep one row per (work, author).
works_authors = pd.merge(works_authors, works['publication_date'], on="work_id")
works_authors.drop_duplicates(subset=['work_id','author_id'], inplace=True)
# Same date join for the concept tags; only tags with score > 0.3 are kept.
works_concepts = pd.merge(works_concepts, works['publication_date'], on="work_id")
works_concepts = works_concepts.query('score > 0.3', engine='python')

In [None]:
# Yearly and cumulative citation counts per referenced work (used by impact definition 2).
# Count citing rows per (referenced work, year of the citing work).
works_cit_counts_year = works_referenced_works.groupby(['referenced_work_id','work_publication_year']).count()["work_id"].reset_index(name="cit_count")
works_cit_counts_year.set_index(['referenced_work_id', 'work_publication_year'], inplace=True)
# Expand to the full (work x year) grid so that every work has a row for every year.
index = pd.MultiIndex.from_product(works_cit_counts_year.index.levels)
works_cit_counts_year = works_cit_counts_year.reindex(index)
# Move both index levels back into columns.
works_cit_counts_year = works_cit_counts_year.reset_index(level=0).reset_index(level=0)
# Grid cells with no citing rows count as 0.
works_cit_counts_year = works_cit_counts_year.fillna(0)
# Running total per work, accumulated in the row order produced by the reindex above.
works_cit_counts_year['cit_count_cum'] = works_cit_counts_year.groupby(['referenced_work_id'])['cit_count'].cumsum()
# Rename the key so the table can be merged with other tables on work_id.
works_cit_counts_year = works_cit_counts_year.rename(columns = {'referenced_work_id':'work_id'})

In [None]:
# Concept names studied for this discipline
# (presumably matched against works_concepts.concept_name -- usage is outside this cell).
topic_list =[
    'Compiler',
    'Mobile computing',
    'Cryptography',
    'Cluster analysis', 
    'Image processing',
    'Parallel computing'         
            ]   

In [None]:
#create the output folder for this discipline
#exist_ok=True makes the call idempotent and avoids the check-then-create race
os.makedirs(discipline, exist_ok=True)

### BioMed

In [None]:
# Label of the discipline being analysed; later cells use it as the name of the output folder.
discipline = 'BioMed'

In [None]:
# Root folder of the BioMed slice; read_parquet resolves table names against this global.
basepath = Path('/N/project/openalex/slices/BioMed/feb-2023')

# Load the four tables of the slice into module-level DataFrames
# (this replaces any previously loaded discipline).
works = read_parquet('works')
works_authors = read_parquet('works_authorships')
works_concepts = read_parquet('works_concepts')
works_referenced_works = read_parquet('works_referenced_works')

In [None]:
# Number of co-authors of a work = number of authors minus one.
works['num_authors']=works['num_authors'].astype('int64')
works['n_coauthors'] = works['num_authors'] - 1
# Attach each work's publication_date to its authorship rows (join key: work_id;
# presumably the index of `works` -- TODO confirm), then keep one row per (work, author).
works_authors = pd.merge(works_authors, works['publication_date'], on="work_id")
works_authors.drop_duplicates(subset=['work_id','author_id'], inplace=True)
# Same date join for the concept tags; only tags with score > 0.3 are kept.
works_concepts = pd.merge(works_concepts, works['publication_date'], on="work_id")
works_concepts = works_concepts.query('score > 0.3', engine='python')

In [None]:
# Yearly and cumulative citation counts per referenced work (used by impact definition 2).
# Count citing rows per (referenced work, year of the citing work).
works_cit_counts_year = works_referenced_works.groupby(['referenced_work_id','work_publication_year']).count()["work_id"].reset_index(name="cit_count")
works_cit_counts_year.set_index(['referenced_work_id', 'work_publication_year'], inplace=True)
# Expand to the full (work x year) grid so that every work has a row for every year.
index = pd.MultiIndex.from_product(works_cit_counts_year.index.levels)
works_cit_counts_year = works_cit_counts_year.reindex(index)
# Move both index levels back into columns.
works_cit_counts_year = works_cit_counts_year.reset_index(level=0).reset_index(level=0)
# Grid cells with no citing rows count as 0.
works_cit_counts_year = works_cit_counts_year.fillna(0)
# Running total per work, accumulated in the row order produced by the reindex above.
works_cit_counts_year['cit_count_cum'] = works_cit_counts_year.groupby(['referenced_work_id'])['cit_count'].cumsum()
# Rename the key so the table can be merged with other tables on work_id.
works_cit_counts_year = works_cit_counts_year.rename(columns = {'referenced_work_id':'work_id'})

In [None]:
# Concept names studied for this discipline
# (presumably matched against works_concepts.concept_name -- usage is outside this cell).
topic_list =[
            'Protein structure',
            'Genome', 
            'Peptide sequence',
            "Alzheimer's disease",
            'Neurology',          
            'Radiation therapy',
            'Chemotherapy'
            ]

In [None]:
#create the output folder for this discipline
#exist_ok=True makes the call idempotent and avoids the check-then-create race
os.makedirs(discipline, exist_ok=True)

## FUNCTIONS DEFINITIONS

In [None]:
import warnings
# NOTE(review): this silences every warning category for the rest of the session,
# including pandas/NumPy deprecation warnings raised by the functions below.
warnings.filterwarnings('ignore')

### Definition experts

In [None]:
#mean_impact1 - papers:all, cits:topic
def experts_impact_mean_1(works_authors,start_year_i,active_authors_start,works_cit_counts_year_concept):
    """Rank active authors by the mean ``cit_count_cum`` of their recent papers.

    Papers considered: every row of ``works_authors`` (with or without the
    concept) whose ``publication_year`` lies in
    ``[start_year_i - 5, start_year_i)`` and whose author is in
    ``active_authors_start``.
    Citations considered: the ``cit_count_cum`` recorded in
    ``works_cit_counts_year_concept`` for
    ``work_publication_year == start_year_i - 1`` (per the original notes this
    table holds citations from concept-tagged papers only; that restriction is
    the caller's responsibility).

    Parameters
    ----------
    works_authors : DataFrame with ``work_id``, ``author_id``, ``publication_year``.
    start_year_i : int, first year after the 5-year window.
    active_authors_start : set of author ids.
    works_cit_counts_year_concept : DataFrame with ``work_id``,
        ``work_publication_year`` and ``cit_count_cum``.

    Returns
    -------
    (impact_df, impact_df_len)
        ``impact_df`` has columns ``author_id`` and ``val`` (mean
        ``cit_count_cum``), sorted by ``val`` descending; authors without any
        matched paper get ``val == 0``. ``impact_df_len`` is its row count.
    """
    #all papers (with and without concept) written in the window by active authors
    prior_works = (works_authors
                   .query('@start_year_i - 5 <= publication_year < @start_year_i', engine='python')
                   .query('author_id.isin(@active_authors_start)', engine='python'))

    #citation counts as of the year before start_year_i
    cits_at_start = works_cit_counts_year_concept.query('work_publication_year == @start_year_i - 1')

    #only these two columns are used below
    cited = pd.merge(prior_works, cits_at_start, on="work_id")[['author_id', 'cit_count_cum']]

    #active authors without a matched paper enter with a single zero row
    #(np.nan padding columns are no longer needed; np.NaN was removed in NumPy 2.0)
    uncited_authors = list(active_authors_start.difference(set(cited['author_id'])))
    zero_rows = pd.DataFrame({'author_id': uncited_authors,
                              'cit_count_cum': [0] * len(uncited_authors)})

    #skip empty frames: concatenating empty/all-NA frames is deprecated in pandas
    frames = [frame for frame in (cited, zero_rows) if not frame.empty]
    per_paper = pd.concat(frames) if frames else cited

    mean_cits = per_paper.groupby(['author_id']).mean()

    impact_df = mean_cits.sort_values(by=['cit_count_cum'], ascending=False).reset_index()
    impact_df.columns = ['author_id', 'val']
    impact_df_len = len(impact_df)

    return impact_df,impact_df_len

In [None]:
#mean_impact2 - papers:topic, cits:all
def experts_impact_mean_2(works_authors,start_year_i,prior_works_ids_5yr,active_authors_start,works_cit_counts_year):
    """Rank authors by the mean ``cit_count_cum`` of their concept-tagged papers.

    Papers considered: rows of ``works_authors`` whose ``work_id`` is in
    ``prior_works_ids_5yr`` (per the original notes, the concept-tagged works
    of the 5 years before ``start_year_i``). Rows are NOT filtered by
    ``active_authors_start``; that set is only used to add zero rows.
    Citations considered: the ``cit_count_cum`` recorded in
    ``works_cit_counts_year`` (all citing works) for
    ``work_publication_year == start_year_i - 1``.

    Parameters
    ----------
    works_authors : DataFrame with ``work_id`` and ``author_id``.
    start_year_i : int, first year after the 5-year window.
    prior_works_ids_5yr : collection of work ids.
    active_authors_start : set of author ids.
    works_cit_counts_year : DataFrame with ``work_id``,
        ``work_publication_year`` and ``cit_count_cum``.

    Returns
    -------
    (impact_df, impact_df_len)
        ``impact_df`` has columns ``author_id`` and ``val`` (mean
        ``cit_count_cum``), sorted by ``val`` descending; members of
        ``active_authors_start`` without a matched paper get ``val == 0``.
        ``impact_df_len`` is its row count.
    """
    #just papers tagged with concept written before start date
    prior_works = works_authors.query('work_id.isin(@prior_works_ids_5yr)', engine='python')

    #citations from all papers, counted as of the year before start_year_i
    cits_at_start = works_cit_counts_year.query('work_publication_year == @start_year_i - 1')

    #only these two columns are used below
    cited = pd.merge(prior_works, cits_at_start, on="work_id")[['author_id', 'cit_count_cum']]

    #active authors without a matched paper enter with a single zero row
    #(np.nan padding columns are no longer needed; np.NaN was removed in NumPy 2.0)
    uncited_authors = list(active_authors_start.difference(set(cited['author_id'])))
    zero_rows = pd.DataFrame({'author_id': uncited_authors,
                              'cit_count_cum': [0] * len(uncited_authors)})

    #skip empty frames: concatenating empty/all-NA frames is deprecated in pandas
    frames = [frame for frame in (cited, zero_rows) if not frame.empty]
    per_paper = pd.concat(frames) if frames else cited

    mean_cits = per_paper.groupby(['author_id']).mean()

    impact_df = mean_cits.sort_values(by=['cit_count_cum'], ascending=False).reset_index()
    impact_df.columns = ['author_id', 'val']
    impact_df_len = len(impact_df)

    return impact_df,impact_df_len

In [None]:
#mean_impact3 - papers:topic, cits:topic
def experts_impact_mean_3(works_authors,start_year_i,prior_works_ids_5yr,active_authors_start,works_cit_counts_year_concept):
    """Rank authors by the mean topic ``cit_count_cum`` of their concept-tagged papers.

    Papers considered: rows of ``works_authors`` whose ``work_id`` is in
    ``prior_works_ids_5yr`` (per the original notes, the concept-tagged works
    of the 5 years before ``start_year_i``). Rows are NOT filtered by
    ``active_authors_start``; that set is only used to add zero rows.
    Citations considered: the ``cit_count_cum`` recorded in
    ``works_cit_counts_year_concept`` for
    ``work_publication_year == start_year_i - 1`` (per the original notes this
    table holds citations from concept-tagged papers only; that restriction is
    the caller's responsibility).

    Parameters
    ----------
    works_authors : DataFrame with ``work_id`` and ``author_id``.
    start_year_i : int, first year after the 5-year window.
    prior_works_ids_5yr : collection of work ids.
    active_authors_start : set of author ids.
    works_cit_counts_year_concept : DataFrame with ``work_id``,
        ``work_publication_year`` and ``cit_count_cum``.

    Returns
    -------
    (impact_df, impact_df_len)
        ``impact_df`` has columns ``author_id`` and ``val`` (mean
        ``cit_count_cum``), sorted by ``val`` descending; members of
        ``active_authors_start`` without a matched paper get ``val == 0``.
        ``impact_df_len`` is its row count.
    """
    #just papers tagged with concept written before start date
    prior_works = works_authors.query('work_id.isin(@prior_works_ids_5yr)', engine='python')

    #citation counts as of the year before start_year_i
    cits_at_start = works_cit_counts_year_concept.query('work_publication_year == @start_year_i - 1')

    #only these two columns are used below
    cited = pd.merge(prior_works, cits_at_start, on="work_id")[['author_id', 'cit_count_cum']]

    #active authors without a matched paper enter with a single zero row
    #(np.nan padding columns are no longer needed; np.NaN was removed in NumPy 2.0)
    uncited_authors = list(active_authors_start.difference(set(cited['author_id'])))
    zero_rows = pd.DataFrame({'author_id': uncited_authors,
                              'cit_count_cum': [0] * len(uncited_authors)})

    #skip empty frames: concatenating empty/all-NA frames is deprecated in pandas
    frames = [frame for frame in (cited, zero_rows) if not frame.empty]
    per_paper = pd.concat(frames) if frames else cited

    mean_cits = per_paper.groupby(['author_id']).mean()

    impact_df = mean_cits.sort_values(by=['cit_count_cum'], ascending=False).reset_index()
    impact_df.columns = ['author_id', 'val']
    impact_df_len = len(impact_df)

    return impact_df,impact_df_len

In [None]:
def experts_productivity(works_authors,prior_work_ids_5yr,active_authors_start):
    """Rank active authors by how many of the given works they authored.

    Keeps the rows of ``works_authors`` whose ``work_id`` is in
    ``prior_work_ids_5yr`` and whose ``author_id`` is in
    ``active_authors_start``, counts the rows per author and sorts the counts
    in descending order.

    Returns ``(table, n_authors)``: ``table`` has columns ``author_id`` and
    ``val`` (the count); ``n_authors`` is the number of ranked authors.
    Authors with no matching row do not appear.
    """
    eligible_rows = works_authors.query(
        'work_id.isin(@prior_work_ids_5yr) & author_id.isin(@active_authors_start)'
    )
    works_per_author = eligible_rows.groupby('author_id')['work_id'].count()
    ranked = works_per_author.sort_values(ascending=False)

    n_ranked = len(ranked)

    table = ranked.to_frame().reset_index()
    table.columns = ['author_id', 'val']

    return table, n_ranked

### Get author samples

In [None]:
def get_author_samples(author_stats_df, top_k, debug=False):
    """
    Draw equally sized random author samples from the bottom and the top rank bucket.

    author_stats_df: DataFrame where author_id has active author ids, and val has the
        productivity/impact values for that author. It is modified in place: the columns
        rank_pct and rank_cat are (re)written on every call.
    top_k: 10 selects the 10% bucket layout; any other value selects the 20% layout
    debug: print progress messages and show the first rows (uses the notebook's display)

    Returns (samples_dict, samples_per_class): samples_dict maps the labels
    'bottom {top_k}%' and 'top {top_k}%' to sets of author ids, each of size
    samples_per_class = max(int(top_k% of the unique authors), 1).

    When a bucket holds fewer authors than required, the sample is topped up from the
    adjacent bucket and then from the one after it (moving upwards for the bottom
    bucket, downwards for the top bucket). Raises AssertionError if the quota is still
    not met after those two buckets.
    """
    # Note highest scoring authors are ranked LAST
    # Plain column assignment replaces the columns outright, so the same frame can be
    # reused with a different top_k (different categories) without writing into the
    # old categorical column in place.
    author_stats_df['rank_pct'] = author_stats_df.val.rank(method='min', pct=True)  # rank rows based on val convert to percentiles

    if top_k == 10:
        bins = [0, 0.1, 0.3, 0.45, 0.55, 0.7, 0.9, 1]
        labels=['bottom 10%', '10-30%', '30-45%', 'middle 10%', '55-70%', '70-90%', 'top 10%']
    else:
        bins = [0, 0.2, 0.3, 0.4, 0.6, 0.7, 0.8, 1]
        labels=['bottom 20%', '20-30%', '30-40%', 'middle 20%', '60-70%', '70-80%', 'top 20%']

    # assign category labels based on rank percentiles
    author_stats_df['rank_cat'] = pd.cut(author_stats_df.rank_pct, bins=bins, labels=labels)

    samples_per_class = max(int((top_k / 100) * author_stats_df.author_id.nunique()), 1)
    if debug:
        print(f'{top_k=} taking {samples_per_class=:,}')
        display(author_stats_df.head(2))

    def bucket_authors(bucket_label):
        # Unique author ids of one bucket. Sizes are taken from this set rather than
        # from a groupby over the categorical, which only stays aligned with `labels`
        # while unobserved categories are kept.
        return set(author_stats_df.loc[author_stats_df.rank_cat == bucket_label, 'author_id'])

    keep = (f'bottom {top_k}%', f'top {top_k}%')  # only these classes are sampled
    last = len(labels) - 1
    samples_dict = {}

    for i, label in enumerate(labels):
        if label not in keep:
            continue

        #initial bucket
        candidates = bucket_authors(label)
        if len(candidates) >= samples_per_class:
            if debug:
                print(f'{label}: Sampling {samples_per_class:,} from {len(candidates):,} candidates')
            samples = set(random.sample(list(candidates), samples_per_class))  # sample here
        else:
            if debug:
                print(f'Insufficient items in {label}. Need {samples_per_class:,} have {len(candidates):,}')
            samples = candidates  # pick everyone

        # top up from the two nearest buckets: upwards, except for the highest bucket
        direction = 1 if i != last else -1
        for step in (1, 2):
            missing = samples_per_class - len(samples)
            if missing <= 0:
                break
            neighbour_label = labels[i + direction * step]
            candidates = bucket_authors(neighbour_label)
            if len(candidates) >= missing:
                new_samples = set(random.sample(list(candidates), missing))  # sample here
                samples = samples | new_samples  # add these new samples
                if debug:
                    print(f'Missing {missing:,} samples for {label}. Expanding the range to {neighbour_label}, Acquired {len(new_samples):,} new samples.')
            else:
                samples = samples | candidates  # pick everyone

        assert len(samples) == samples_per_class, f'Count mismatch {len(samples)=} {samples_per_class=} for samples {label}'
        samples_dict[label] = samples

    return samples_dict,samples_per_class

### Scores

In [None]:
def get_support_graph_ver1(bip_g, author_ids_supp):
    """Return networkx's weighted projection of ``bip_g`` onto the nodes ``author_ids_supp``."""
    return nx.bipartite.weighted_projected_graph(bip_g, nodes=author_ids_supp)

def get_scores_A_ver1(anas,n_anas, active_authors_start,support_graph_,dict_final):
    """Record the exposure of author ``anas`` as the summed weight of edges to active neighbours.

    ``n_anas`` is the neighbour set of ``anas``; the neighbours that are also in
    ``active_authors_start`` define the exposure. ``anas`` is appended to
    ``dict_final[exposure]`` (key 0 when there is no active neighbour).

    Returns ``(dict_final, ego_active)`` where ``ego_active`` is a copy of the
    subgraph of ``support_graph_`` induced by ``anas`` and its active neighbours,
    or an empty graph when there are none.
    """
    active_contacts = n_anas & active_authors_start
    if active_contacts:
        #restrict the support graph to anas plus its active neighbours
        active_contacts.add(anas)
        ego_active = support_graph_.subgraph(active_contacts).copy()
        #weighted degree of anas inside that subgraph
        exposure = ego_active.degree(anas, weight='weight')
    else:
        ego_active = nx.empty_graph()
        exposure = 0

    dict_final.setdefault(exposure, []).append(anas)
    return dict_final, ego_active

def get_scores_B_ver1(anas,n_anas, high_active_authors,low_active_authors,ego_active_total,dict_final_high,dict_final_low):
    """Record the exposure of ``anas`` when its active neighbours are all 'high' or all 'low'.

    Authors whose neighbour set ``n_anas`` touches both groups, or neither, are
    not recorded. Otherwise the exposure is the weighted degree of ``anas`` in
    the subgraph of ``ego_active_total`` induced by ``anas`` and those
    neighbours, and ``anas`` is appended under that key in the matching dict.

    Returns ``(dict_final_high, dict_final_low)``.
    """
    high_contacts = n_anas & high_active_authors
    low_contacts = n_anas & low_active_authors

    if high_contacts and not low_contacts:
        #contact with high only
        selected, target = high_contacts, dict_final_high
    elif low_contacts and not high_contacts:
        #contact with low only
        selected, target = low_contacts, dict_final_low
    else:
        return dict_final_high, dict_final_low

    selected.add(anas)
    ego_active = ego_active_total.subgraph(selected).copy()
    exposure = ego_active.degree(anas, weight='weight')
    target.setdefault(exposure, []).append(anas)

    return dict_final_high, dict_final_low


In [None]:
def list_works(G, u, v):
    """Return the set of neighbours that ``u`` and ``v`` share in ``G``.

    Used as an edge-weight function for the bipartite projection, so the weight
    of an author pair is the set of their common neighbours (their joint works).
    """
    return set(G[u]).intersection(set(G[v]))

def get_support_graph_ver2(bip_g, author_ids_supp,list_works):
    """Project ``bip_g`` onto ``author_ids_supp`` using ``list_works`` as the edge-weight function.

    The ``list_works`` parameter shadows the module-level function of the same
    name; whatever callable is passed in is forwarded as ``weight_function``.
    """
    return nx.bipartite.generic_weighted_projected_graph(
        bip_g,
        nodes=author_ids_supp,
        weight_function=list_works,
    )

def get_scores_A_ver2(anas,n_anas, active_authors_start,support_graph_,dict_final):
    """Record the exposure of ``anas`` as the number of distinct works shared with active neighbours.

    Edge weights of ``support_graph_`` are expected to be sets (the edges'
    ``'weight'`` attributes are combined with ``|``). ``anas`` is appended to
    ``dict_final[exposure]`` (key 0 when it has no active neighbour).

    Returns ``(dict_final, ego_active)`` where ``ego_active`` is a copy of the
    subgraph induced by ``anas`` and its active neighbours, or an empty graph
    when there are none.
    """
    active_contacts = n_anas & active_authors_start
    if active_contacts:
        #restrict the support graph to anas plus its active neighbours
        active_contacts.add(anas)
        ego_active = support_graph_.subgraph(active_contacts).copy()
        #union of the work sets on every edge incident to anas
        shared_works = set()
        for contact in ego_active.neighbors(anas):
            shared_works = shared_works | ego_active.edges[(anas, contact)]['weight']
        exposure = len(shared_works)
    else:
        ego_active = nx.empty_graph()
        exposure = 0

    dict_final.setdefault(exposure, []).append(anas)
    return dict_final, ego_active

def get_scores_B_ver2(anas,n_anas,high_active_authors,low_active_authors,ego_active_total,dict_final_high,dict_final_low):
    """Record the exposure of ``anas`` when its active neighbours are all 'high' or all 'low'.

    Authors whose neighbour set ``n_anas`` touches both groups, or neither, are
    not recorded. Otherwise the exposure is the number of distinct works found
    in the ``'weight'`` sets of the edges between ``anas`` and those neighbours
    inside ``ego_active_total``, and ``anas`` is appended under that key in the
    matching dict.

    Returns ``(dict_final_high, dict_final_low)``.
    """
    high_contacts = n_anas & high_active_authors
    low_contacts = n_anas & low_active_authors

    if high_contacts and not low_contacts:
        #contact with high only
        selected, target = high_contacts, dict_final_high
    elif low_contacts and not high_contacts:
        #contact with low only
        selected, target = low_contacts, dict_final_low
    else:
        return dict_final_high, dict_final_low

    selected.add(anas)
    ego_active = ego_active_total.subgraph(selected).copy()
    #union of the work sets on every edge incident to anas
    shared_works = set()
    for contact in ego_active.neighbors(anas):
        shared_works = shared_works | ego_active.edges[(anas, contact)]['weight']
    target.setdefault(len(shared_works), []).append(anas)

    return dict_final_high, dict_final_low


In [None]:
def get_support_graph_ver3(bip_g, author_ids_supp):
    """Return networkx's weighted projection of ``bip_g`` onto ``author_ids_supp``.

    NOTE(review): the original comment said "no weights", but this is the same
    weighted projection as ver1; the ver3 score functions in this file do not
    take the graph as an argument, so the weights appear to go unused.
    """
    return nx.bipartite.weighted_projected_graph(bip_g, nodes=author_ids_supp)

def get_scores_A_ver3(anas,n_anas, active_authors_start,dict_final):
    """Record the exposure of ``anas`` as its number of active neighbours.

    The exposure is the size of ``n_anas & active_authors_start`` (0 when the
    intersection is empty); ``anas`` is appended to ``dict_final[exposure]``.
    Returns the updated ``dict_final``.
    """
    exposure = len(n_anas & active_authors_start)
    dict_final.setdefault(exposure, []).append(anas)
    return dict_final

def get_scores_B_ver3(anas,n_anas,high_active_authors,low_active_authors,dict_final_high,dict_final_low):
    """Record the exposure of ``anas`` when its active neighbours are all 'high' or all 'low'.

    The exposure is the number of neighbours in the single group that ``n_anas``
    touches; ``anas`` is appended under that key in the matching dict. Authors
    in contact with both groups, or with neither, are not recorded.

    Returns ``(dict_final_high, dict_final_low)``.
    """
    high_contacts = n_anas & high_active_authors
    low_contacts = n_anas & low_active_authors

    if high_contacts and not low_contacts:
        #contact with high only
        dict_final_high.setdefault(len(high_contacts), []).append(anas)
    elif low_contacts and not high_contacts:
        #contact with low only
        dict_final_low.setdefault(len(low_contacts), []).append(anas)

    return dict_final_high, dict_final_low

### Calculations

In [None]:
def calculation_A(i,author_ids_tot_list,all_coauthors_list,first_time_authors,first_time_authors_tot,dict_final,dict_final_list,dict_final_num_list,dict_final_den_list,prior_author_ids_list,authors_isolated):
    """Compute, per exposure class k, the fraction of authors that appear in the first-time sets.

    Class 0 is built here: all authors of ``author_ids_tot_list[i:i+5]`` minus
    ``prior_author_ids_list[i]`` and ``all_coauthors_list[i]``, plus
    ``authors_isolated``; its numerator is counted against
    ``first_time_authors_tot``. Every key of ``dict_final`` (exposure -> list of
    authors) gives one further class, counted against ``first_time_authors``.

    NOTE(review): if ``dict_final`` itself contains key 0, that entry replaces
    the class-0 values computed first -- presumably callers move that key into
    ``authors_isolated`` beforehand; TODO confirm.

    One ``OrderedDict`` (sorted by k) of fractions, numerators and denominators
    is appended to ``dict_final_list``, ``dict_final_num_list`` and
    ``dict_final_den_list`` respectively; the three lists are returned.
    The fraction of an empty class is NaN (it used to raise ZeroDivisionError).
    """
    def _fraction(adopters, total):
        # an empty class has no defined fraction
        return adopters / total if total else float('nan')

    dict_k_frac = {}
    dict_k_num = {} #numerator
    dict_k_den = {} #denominator

    #key 0: authors of the window that had no contact at all
    author_ids_ = set().union(*author_ids_tot_list[i:i+5]) #all authors of the 5 windows
    author_ids_ = author_ids_ - prior_author_ids_list[i]
    author_ids_ = author_ids_ - all_coauthors_list[i] #already considered
    authors_k = author_ids_ | authors_isolated
    len_k = len(authors_k) #all authors with zero contacts
    new_auth_k = len(authors_k & first_time_authors_tot) #of which are in first_time_authors_tot
    dict_k_frac[0] = _fraction(new_auth_k, len_k)
    dict_k_num[0] = new_auth_k
    dict_k_den[0] = len_k

    #key != 0
    for k in sorted(dict_final): #for each class k
        authors_k = set(dict_final[k])
        len_k = len(authors_k) #number of authors
        new_auth_k = len(authors_k & first_time_authors) #of which are in first_time_authors
        dict_k_frac[k] = _fraction(new_auth_k, len_k)
        dict_k_num[k] = new_auth_k
        dict_k_den[k] = len_k

    #order dictionaries by key
    dict_final_list.append(collections.OrderedDict(sorted(dict_k_frac.items())))
    dict_final_num_list.append(collections.OrderedDict(sorted(dict_k_num.items())))
    dict_final_den_list.append(collections.OrderedDict(sorted(dict_k_den.items())))

    return dict_final_list,dict_final_num_list,dict_final_den_list

def calculation_B(first_time_authors,dict_final,dict_final_list,dict_final_num_list,dict_final_den_list):
    """Compute, per exposure class k, the fraction of authors that are in ``first_time_authors``.

    ``dict_final`` maps an exposure value k to a list of authors. For each k
    (in ascending order) the distinct authors are counted (denominator), as are
    those also present in ``first_time_authors`` (numerator). One
    ``OrderedDict`` of fractions, numerators and denominators is appended to
    ``dict_final_list``, ``dict_final_num_list`` and ``dict_final_den_list``
    respectively; the three lists are returned.
    """
    fractions = collections.OrderedDict()
    numerators = collections.OrderedDict()
    denominators = collections.OrderedDict()

    #keys are visited in ascending order, so the dicts end up ordered by key
    for exposure in sorted(dict_final.keys()):
        group = set(dict_final[exposure])
        group_size = len(group)
        newly_active = len(group & first_time_authors)
        fractions[exposure] = newly_active / group_size
        numerators[exposure] = newly_active
        denominators[exposure] = group_size

    dict_final_list.append(fractions)
    dict_final_num_list.append(numerators)
    dict_final_den_list.append(denominators)

    return dict_final_list, dict_final_num_list, dict_final_den_list

### Create folders

In [None]:
#create one output folder per expert definition inside the discipline folder
#exist_ok=True makes the calls idempotent and avoids the check-then-create race
for subfolder in ('Impact_mean1', 'Productivity'):
    my_path = os.path.join(discipline, subfolder)
    os.makedirs(my_path, exist_ok=True)
#my_path is left pointing at the 'Productivity' folder, as before

## INFO

In [None]:
def info(topic,my_path):
    """Collect yearly and per-window statistics of one topic and store them in ``my_path``.

    Reads the notebook-level tables ``works_concepts``, ``works_authors`` and
    ``works``. Written into ``my_path`` (suffix ``<topic>``):

    * ``work_ids_list_`` / ``author_ids_list_``: topic works and their
      authors, one set per year 1990..2021;
    * ``work_ids_tot_list_`` / ``author_ids_tot_list_``: all works (index
      labels of ``works``) and their authors, one entry per year 1990..2017;
    * ``info_<topic>_windows.csv``: window sizes, without the topic column;
    * ``windows_cond_``: the window flags.

    Args:
        topic: concept name matched against ``concept_name``.
        my_path: directory receiving the files (it is not created here).

    Returns:
        ``(info_df, windows_cond)``: the per-window table with a leading
        ``topic`` column, and a list of 23 booleans that are True when both
        the exposure window (EW) and the observation window (OW) hold at
        least 3000 topic papers.
    """

    def _pickle_to(file_name, payload):
        #serialise payload into my_path/file_name
        with open(os.path.join(my_path, file_name),"wb") as fp:
            pickle.dump(payload,fp)

    #rows of works_concepts tagged with the topic
    topic_rows = works_concepts.query('concept_name==@topic', engine='python')

    #topic works and the authors who signed them, year by year
    #(start_year_w / work_ids must stay locals: the query strings refer to them)
    first_year = 1990
    work_ids_list = []
    author_ids_list = []
    for start_year_w in range(first_year, first_year+32):
        work_ids = set(
            topic_rows
            .query('publication_year == @start_year_w', engine='python')
            .work_id
        )
        author_ids = set(
            works_authors
            .query('work_id.isin(@work_ids)', engine='python')
            .author_id
        )
        work_ids_list.append(work_ids)
        author_ids_list.append(author_ids)
    _pickle_to('work_ids_list_'+topic, work_ids_list)
    _pickle_to('author_ids_list_'+topic, author_ids_list)

    #same per-year collection over all works, with or without the topic
    work_ids_tot_list = []
    author_ids_tot_list = []
    for start_year_w in range(first_year, first_year+28):
        work_ids = (
            works
            .query('publication_year == @start_year_w', engine='python')
            .index
        )
        author_ids = set(
            works_authors
            .query('work_id.isin(@work_ids)', engine='python')
            .author_id
        )
        work_ids_tot_list.append(work_ids)
        author_ids_tot_list.append(author_ids)
    _pickle_to('work_ids_tot_list_'+topic, work_ids_tot_list)
    _pickle_to('author_ids_tot_list_'+topic, author_ids_tot_list)

    #consecutive 5-year EW and OW; T_0 (start of the OW) runs over 1995..2017
    first_t0 = 1995
    window_frames = []
    windows_cond = []
    for w in range(23):
        #EW: the five yearly sets before T_0, OW: the five starting at T_0
        ew_works = set().union(*work_ids_list[w:w+5])
        ew_authors = set().union(*author_ids_list[w:w+5])
        ow_works = set().union(*work_ids_list[w+5:w+10])
        ow_authors = set().union(*author_ids_list[w+5:w+10])

        window_row = {
            'T_0': first_t0+w,
            'EW-papers topic': len(ew_works),
            'EW-authors topic - active authors': len(ew_authors),
            'OW-papers topic': len(ow_works),
            'OW-authors topic': len(ow_authors),
        }
        window_frames.append(pd.DataFrame(data=[window_row]))

        #a window is usable only when EW and OW both reach 3000 papers
        windows_cond.append(min(len(ew_works), len(ow_works)) >= 3000)

    info_df = pd.concat([pd.DataFrame(), *window_frames], ignore_index = True, axis = 0)

    #the per-topic csv is written before the topic column is added
    info_df.to_csv(os.path.join(my_path, 'info_'+topic+'_windows.csv'), sep=';', index=False)
    info_df.insert(0, 'topic', topic)

    _pickle_to('windows_cond_'+topic, windows_cond)

    return info_df,windows_cond

In [None]:
#run `info` for every topic; keep the stacked window table and the window flags per topic
info_df = pd.DataFrame()  
windows_cond = {}
my_path = os.path.join(discipline, 'Info')
#exist_ok=True replaces the exists()/makedirs() pair and keeps the cell re-runnable
os.makedirs(my_path, exist_ok=True)
for topic in topic_list:
    info_df_top,windows_cond_top = info(topic=topic,my_path=my_path) 
    info_df = pd.concat([info_df, info_df_top], ignore_index = True, axis = 0)
    windows_cond[topic] = windows_cond_top
#save the table of all topics and the {topic: flags} dictionary
my_file = 'info_windows.csv'
info_df.to_csv(os.path.join(my_path, my_file), sep=';', index=False)  
my_file = 'windows_cond'
with open(os.path.join(my_path, my_file),"wb") as fp:
    pickle.dump(windows_cond,fp)

### Productivity

In [None]:
def info_productivity(discipline,topic,my_path):
    """Split the active authors of every usable window into productivity classes.

    Loads ``work_ids_list_<topic>``, ``author_ids_list_<topic>`` and
    ``windows_cond_<topic>`` from ``<discipline>/Info`` and reads the
    notebook-level table ``works_authors``. For each of the 23 windows whose
    flag in ``windows_cond`` is true, the authors of the exposure window are
    ranked through ``experts_productivity`` and the top/bottom 10% samples
    are taken with ``get_author_samples``.

    Writes ``info_classes_<topic>_windows.csv`` (without the topic column)
    and the pickle ``active_authors_classes_<topic>`` into ``my_path``.

    Args:
        discipline: folder that contains the ``Info`` directory.
        topic: concept name, used as suffix of every file name.
        my_path: directory receiving the outputs (it is not created here).

    Returns:
        ``(info_df, active_authors_classes)``. ``info_df`` holds one row per
        usable window plus a leading ``topic`` column;
        ``active_authors_classes`` has one entry per window, either
        ``[active_authors_start, samples_dict_1, n_1]`` or ``np.nan``.
    """
    info_dir = os.path.join(discipline, 'Info')

    def _unpickle(file_name):
        #read one pickle stored under <discipline>/Info
        with open(os.path.join(info_dir, file_name),"rb") as fp:
            return pickle.load(fp)

    work_ids_list = _unpickle('work_ids_list_'+topic)
    author_ids_list = _unpickle('author_ids_list_'+topic)
    windows_cond = _unpickle('windows_cond_'+topic)

    #consecutive 5-year EW and OW; T_0 (start of the OW) begins at 1995
    first_t0 = 1995
    window_frames = []
    active_authors_classes = []
    for w in range(23):

        #windows rejected by windows_cond only leave a placeholder
        if not windows_cond[w]:
            active_authors_classes.append(np.nan)
            continue

        #topic works and authors of the EW; those authors are the active
        #ones at the start of the observation window
        prior_work_ids_5yr = set().union(*work_ids_list[w:w+5])
        active_authors_start = set().union(*author_ids_list[w:w+5])

        #only the ranked table of the returned pair is used
        ranked_authors, _ = experts_productivity(works_authors,prior_work_ids_5yr,active_authors_start)

        #10% classes (high/low_active_authors1 are referenced by the query strings)
        samples_dict_1,n_1 = get_author_samples(ranked_authors, top_k=10, debug=True)
        high_active_authors1 = samples_dict_1['top 10%']
        high_vals = ranked_authors.query('author_id.isin(@high_active_authors1)').val
        low_active_authors1 = samples_dict_1['bottom 10%']
        low_vals = ranked_authors.query('author_id.isin(@low_active_authors1)').val

        active_authors_classes.append([active_authors_start,samples_dict_1,n_1])

        window_row = {
            'T_0': first_t0+w,
            'Size classes - 10%': n_1,
            'HIGH 10% - MAX': max(high_vals),
            'HIGH 10% - MEAN': mean(high_vals),
            'HIGH 10% - MIN': min(high_vals),
            'LOW 10% - MAX': max(low_vals),
            'LOW 10% - MEAN': mean(low_vals),
            'LOW 10% - MIN': min(low_vals),
        }
        window_frames.append(pd.DataFrame(data=[window_row]))

    info_df = pd.DataFrame()
    if window_frames:
        info_df = pd.concat([info_df, *window_frames], ignore_index = True, axis = 0)

    #the per-topic csv is written before the topic column is added
    info_df.to_csv(os.path.join(my_path, 'info_classes_'+topic+'_windows.csv'), sep=';', index=False)
    info_df.insert(0, 'topic', topic)

    with open(os.path.join(my_path, 'active_authors_classes_'+topic),"wb") as fp:
        pickle.dump(active_authors_classes,fp)

    return info_df,active_authors_classes

In [None]:
#run `info_productivity` for every topic; keep the stacked class table and the classes per topic
info_df = pd.DataFrame()  
active_authors_classes = {}
#path built component-wise (no hard-coded '/'), the same way it is rebuilt when the classes are loaded
my_path = os.path.join(discipline, 'Info', 'Productivity')
#exist_ok=True replaces the exists()/makedirs() pair and keeps the cell re-runnable
os.makedirs(my_path, exist_ok=True)
for topic in topic_list:
    info_df_top,active_authors_classes_top = info_productivity(discipline=discipline,topic=topic,my_path=my_path) 
    info_df = pd.concat([info_df, info_df_top], ignore_index = True, axis = 0)
    active_authors_classes[topic] = active_authors_classes_top
#save the table of all topics and the {topic: classes} dictionary
my_file = 'info_classes_windows.csv'
info_df.to_csv(os.path.join(my_path, my_file), sep=';', index=False)  
my_file = 'active_authors_classes'
with open(os.path.join(my_path, my_file),"wb") as fp:
    pickle.dump(active_authors_classes,fp)

### Impact (mean)

#### Def. impact 1 - papers:all, cits:topic

In [None]:
 def info_impact1(discipline,topic,my_path):
    """Build the high/low impact author classes of one topic for every usable window.

    Impact definition 1 of the section heading ("papers:all, cits:topic"):
    the citation table handed to ``experts_impact_mean_1`` only counts
    references made by works tagged with ``topic``.

    Reads the notebook-level tables ``works_concepts``,
    ``works_referenced_works`` and ``works_authors`` and the pickles
    ``work_ids_list_<topic>``, ``author_ids_list_<topic>`` and
    ``windows_cond_<topic>`` stored under ``<discipline>/Info``.

    Writes ``info_classes_<topic>_windows.csv`` (without the topic column)
    and the pickle ``active_authors_classes_<topic>`` into ``my_path``.

    Args:
        discipline: folder that contains the ``Info`` directory.
        topic: concept name, used as suffix of every file read or written.
        my_path: directory receiving the outputs (it is not created here).

    Returns:
        ``(info_df, active_authors_classes)``. ``info_df`` holds one row per
        usable window plus a leading ``topic`` column;
        ``active_authors_classes`` has one entry per window, either
        ``[active_authors_start, samples_dict_1, n_1]`` or ``np.nan``.
    """
    # NOTE(review): the `def` line carries one stray leading space; a plain
    # Python parser rejects that at top level - confirm and remove it.
    
    #rows of works_concepts tagged with the topic
    works_concepts_conc_tot = works_concepts.query('concept_name==@topic', engine='python') 
    
    #references made by the topic works
    work_ids_concept = set(works_concepts_conc_tot.work_id)
    works_referenced_works_concept = works_referenced_works.query('work_id.isin(@work_ids_concept)', engine='python')
    #number of reference rows per (cited work, work_publication_year)
    #presumably work_publication_year is the year of the citing work - TODO confirm
    works_cit_counts_year_concept = works_referenced_works_concept.groupby(['referenced_work_id','work_publication_year']).count()["work_id"].reset_index(name="cit_count")
    #expand to every (cited work, year) combination of the two index levels
    works_cit_counts_year_concept.set_index(['referenced_work_id', 'work_publication_year'], inplace=True)
    index = pd.MultiIndex.from_product(works_cit_counts_year_concept.index.levels)
    works_cit_counts_year_concept = works_cit_counts_year_concept.reindex(index)
    #move both index levels back to ordinary columns
    works_cit_counts_year_concept = works_cit_counts_year_concept.reset_index(level=0).reset_index(level=0)
    #combinations that had no citation get a count of 0
    works_cit_counts_year_concept = works_cit_counts_year_concept.fillna(0)
    #running total of cit_count within each cited work, following the current row order
    works_cit_counts_year_concept['cit_count_cum'] = works_cit_counts_year_concept.groupby(['referenced_work_id'])['cit_count'].cumsum()
    works_cit_counts_year_concept = works_cit_counts_year_concept.rename(columns = {'referenced_work_id':'work_id'})
       
    #load the per-year sets and the window flags stored under <discipline>/Info
    my_path2 = os.path.join(discipline, 'Info')
    my_file = 'work_ids_list_'+topic
    with open(os.path.join(my_path2, my_file),"rb") as fp:
        work_ids_list = pickle.load(fp)
    my_file = 'author_ids_list_'+topic
    with open(os.path.join(my_path2, my_file),"rb") as fp:
        author_ids_list = pickle.load(fp)
    my_file = 'windows_cond_'+topic
    with open(os.path.join(my_path2, my_file),"rb") as fp:
        windows_cond = pickle.load(fp)
    
    #consider consecutive EW and OW (5 years each)
    start_year = 1995 
    info_df  = pd.DataFrame()
    active_authors_classes = []
    for w in range(0,23):
        #consider just the windows flagged in windows_cond (the paper threshold was applied when the flags were built)
        windows_cond_w = windows_cond[w]   
        if windows_cond_w:
            
            start_year_w = start_year+w #T_0 #start OW

            # work and authors topic in EW
            prior_work_ids_5yr = set().union(*work_ids_list[w:w+5])
            prior_author_ids_5yr = set().union(*author_ids_list[w:w+5]) 

            #active authors start observation window
            active_authors_start = prior_author_ids_5yr

            #authors classes: only the first element of the returned pair is used below
            sorted_author_works_count,sorted_author_works_count_len = experts_impact_mean_1(works_authors,start_year_w,active_authors_start,works_cit_counts_year_concept) 

            #10%: `val` of the authors in the top and in the bottom sample
            samples_dict_1,n_1 = get_author_samples(sorted_author_works_count, top_k=10, debug=True)   
            high_active_authors1 = samples_dict_1['top 10%']
            high_active_authors1_val = sorted_author_works_count.query('author_id.isin(@high_active_authors1)').val
            # mid_active_authors1 = samples_dict_1['middle 10%']
            # mid_active_authors1_val = sorted_author_works_count.query('author_id.isin(@mid_active_authors1)').val
            low_active_authors1 = samples_dict_1['bottom 10%']
            low_active_authors1_val = sorted_author_works_count.query('author_id.isin(@low_active_authors1)').val

            #save
            active_authors_classes_w = [active_authors_start,samples_dict_1,n_1] # [[samples_dict_1,n_1],[samples_dict_2,n_2]]
            active_authors_classes.append(active_authors_classes_w)

            #summary row of the window: class size and max/mean/min of `val` per class
            info_i_dict = {
                'T_0':start_year_w, 
                'Size classes - 10%':n_1, 
                'HIGH 10% - MAX': max(high_active_authors1_val),
                'HIGH 10% - MEAN': mean(high_active_authors1_val),
                'HIGH 10% - MIN': min(high_active_authors1_val),
                'LOW 10% - MAX': max(low_active_authors1_val),
                'LOW 10% - MEAN': mean(low_active_authors1_val),
                'LOW 10% - MIN': min(low_active_authors1_val),
                  }
            info_i = pd.DataFrame(data=[info_i_dict])
            info_df = pd.concat([info_df, info_i], ignore_index = True, axis = 0)   
        else:
            #placeholder so that the list stays aligned with the window index w
            active_authors_classes.append(np.nan)

    #the per-topic csv is written before the topic column is added
    my_file = 'info_classes_'+topic+'_windows.csv'
    info_df.to_csv(os.path.join(my_path, my_file), sep=';', index=False)
    info_df.insert(0, 'topic', topic)
    
    my_file = 'active_authors_classes_'+topic
    with open(os.path.join(my_path, my_file),"wb") as fp:
        pickle.dump(active_authors_classes,fp)

    return info_df,active_authors_classes

In [None]:
#run `info_impact1` for every topic; keep the stacked class table and the classes per topic
info_df = pd.DataFrame()  
active_authors_classes = {}
#path built component-wise instead of embedding a hard-coded '/' separator
my_path = os.path.join(discipline, 'Info', 'Impact_mean1')
#exist_ok=True replaces the exists()/makedirs() pair and keeps the cell re-runnable
os.makedirs(my_path, exist_ok=True)
for topic in topic_list:
    info_df_top,active_authors_classes_top = info_impact1(discipline=discipline,topic=topic,my_path=my_path) 
    info_df = pd.concat([info_df, info_df_top], ignore_index = True, axis = 0)
    active_authors_classes[topic] = active_authors_classes_top
#save the table of all topics and the {topic: classes} dictionary
my_file = 'info_classes_windows.csv'
info_df.to_csv(os.path.join(my_path, my_file), sep=';', index=False)  
my_file = 'active_authors_classes'
with open(os.path.join(my_path, my_file),"wb") as fp:
    pickle.dump(active_authors_classes,fp)

#### Def. impact 2 - papers:topic, cits:all

In [None]:
 def info_impact2(discipline,topic,my_path):
    """Split the active authors of every usable window into impact classes (definition 2).

    Definition 2 of the section heading ("papers:topic, cits:all"): the
    ranking is delegated to ``experts_impact_mean_2``, which receives the
    topic works of the exposure window and the notebook-level citation table
    ``works_cit_counts_year``.

    Loads ``work_ids_list_<topic>``, ``author_ids_list_<topic>`` and
    ``windows_cond_<topic>`` from ``<discipline>/Info`` and reads the
    notebook-level table ``works_authors``. Writes
    ``info_classes_<topic>_windows.csv`` (without the topic column) and the
    pickle ``active_authors_classes_<topic>`` into ``my_path``.

    Args:
        discipline: folder that contains the ``Info`` directory.
        topic: concept name, used as suffix of every file name.
        my_path: directory receiving the outputs (it is not created here).

    Returns:
        ``(info_df, active_authors_classes)``. ``info_df`` holds one row per
        usable window plus a leading ``topic`` column;
        ``active_authors_classes`` has one entry per window, either
        ``[active_authors_start, samples_dict_1, n_1]`` or ``np.nan``.
    """
    info_dir = os.path.join(discipline, 'Info')

    def _unpickle(file_name):
        #read one pickle stored under <discipline>/Info
        with open(os.path.join(info_dir, file_name),"rb") as fp:
            return pickle.load(fp)

    work_ids_list = _unpickle('work_ids_list_'+topic)
    author_ids_list = _unpickle('author_ids_list_'+topic)
    windows_cond = _unpickle('windows_cond_'+topic)

    #consecutive 5-year EW and OW; T_0 (start of the OW) begins at 1995
    first_t0 = 1995
    window_frames = []
    active_authors_classes = []
    for w in range(23):

        #windows rejected by windows_cond only leave a placeholder
        if not windows_cond[w]:
            active_authors_classes.append(np.nan)
            continue

        start_year_w = first_t0+w

        #topic works and authors of the EW; those authors are the active
        #ones at the start of the observation window
        prior_work_ids_5yr = set().union(*work_ids_list[w:w+5])
        active_authors_start = set().union(*author_ids_list[w:w+5])

        #only the ranked table of the returned pair is used
        ranked_authors, _ = experts_impact_mean_2(works_authors,start_year_w,prior_work_ids_5yr,active_authors_start,works_cit_counts_year)

        #10% classes (high/low_active_authors1 are referenced by the query strings)
        samples_dict_1,n_1 = get_author_samples(ranked_authors, top_k=10, debug=True)
        high_active_authors1 = samples_dict_1['top 10%']
        high_vals = ranked_authors.query('author_id.isin(@high_active_authors1)').val
        low_active_authors1 = samples_dict_1['bottom 10%']
        low_vals = ranked_authors.query('author_id.isin(@low_active_authors1)').val

        active_authors_classes.append([active_authors_start,samples_dict_1,n_1])

        window_row = {
            'T_0': start_year_w,
            'Size classes - 10%': n_1,
            'HIGH 10% - MAX': max(high_vals),
            'HIGH 10% - MEAN': mean(high_vals),
            'HIGH 10% - MIN': min(high_vals),
            'LOW 10% - MAX': max(low_vals),
            'LOW 10% - MEAN': mean(low_vals),
            'LOW 10% - MIN': min(low_vals),
        }
        window_frames.append(pd.DataFrame(data=[window_row]))

    info_df = pd.DataFrame()
    if window_frames:
        info_df = pd.concat([info_df, *window_frames], ignore_index = True, axis = 0)

    #the per-topic csv is written before the topic column is added
    info_df.to_csv(os.path.join(my_path, 'info_classes_'+topic+'_windows.csv'), sep=';', index=False)
    info_df.insert(0, 'topic', topic)

    with open(os.path.join(my_path, 'active_authors_classes_'+topic),"wb") as fp:
        pickle.dump(active_authors_classes,fp)

    return info_df,active_authors_classes

In [None]:
#run `info_impact2` for every topic; keep the stacked class table and the classes per topic
info_df = pd.DataFrame()  
active_authors_classes = {}
#path built component-wise instead of embedding a hard-coded '/' separator
my_path = os.path.join(discipline, 'Info', 'Impact_mean2')
#exist_ok=True replaces the exists()/makedirs() pair and keeps the cell re-runnable
os.makedirs(my_path, exist_ok=True)
for topic in topic_list:
    info_df_top,active_authors_classes_top = info_impact2(discipline=discipline,topic=topic,my_path=my_path) 
    info_df = pd.concat([info_df, info_df_top], ignore_index = True, axis = 0)
    active_authors_classes[topic] = active_authors_classes_top
#save the table of all topics and the {topic: classes} dictionary
my_file = 'info_classes_windows.csv'
info_df.to_csv(os.path.join(my_path, my_file), sep=';', index=False)  
my_file = 'active_authors_classes'
with open(os.path.join(my_path, my_file),"wb") as fp:
    pickle.dump(active_authors_classes,fp)

#### Def. impact 3 - papers:topic, cits:topic

In [None]:
def info_impact3(discipline,topic,my_path):
    """Build the high/low impact author classes of one topic for every usable window.

    Impact definition 3 of the section heading ("papers:topic, cits:topic"):
    ``experts_impact_mean_3`` receives the topic works of the exposure
    window and a citation table that only counts references made by works
    tagged with ``topic``.

    Reads the notebook-level tables ``works_concepts``,
    ``works_referenced_works`` and ``works_authors`` and the pickles
    ``work_ids_list_<topic>``, ``author_ids_list_<topic>`` and
    ``windows_cond_<topic>`` stored under ``<discipline>/Info``.

    Writes ``info_classes_<topic>_windows.csv`` (without the topic column)
    and the pickle ``active_authors_classes_<topic>`` into ``my_path``.

    Args:
        discipline: folder that contains the ``Info`` directory.
        topic: concept name, used as suffix of every file read or written.
        my_path: directory receiving the outputs (it is not created here).

    Returns:
        ``(info_df, active_authors_classes)``. ``info_df`` holds one row per
        usable window plus a leading ``topic`` column;
        ``active_authors_classes`` has one entry per window, either
        ``[active_authors_start, samples_dict_1, n_1]`` or ``np.nan``.
    """
    
    #rows of works_concepts tagged with the topic
    works_concepts_conc_tot = works_concepts.query('concept_name==@topic', engine='python')
    
    #references made by the topic works
    work_ids_concept = set(works_concepts_conc_tot.work_id)
    works_referenced_works_concept = works_referenced_works.query('work_id.isin(@work_ids_concept)', engine='python')
    #number of reference rows per (cited work, work_publication_year)
    #presumably work_publication_year is the year of the citing work - TODO confirm
    works_cit_counts_year_concept = works_referenced_works_concept.groupby(['referenced_work_id','work_publication_year']).count()["work_id"].reset_index(name="cit_count")
    #expand to every (cited work, year) combination of the two index levels
    works_cit_counts_year_concept.set_index(['referenced_work_id', 'work_publication_year'], inplace=True)
    index = pd.MultiIndex.from_product(works_cit_counts_year_concept.index.levels)
    works_cit_counts_year_concept = works_cit_counts_year_concept.reindex(index)
    #move both index levels back to ordinary columns
    works_cit_counts_year_concept = works_cit_counts_year_concept.reset_index(level=0).reset_index(level=0)
    #combinations that had no citation get a count of 0
    works_cit_counts_year_concept = works_cit_counts_year_concept.fillna(0)
    #running total of cit_count within each cited work, following the current row order
    works_cit_counts_year_concept['cit_count_cum'] = works_cit_counts_year_concept.groupby(['referenced_work_id'])['cit_count'].cumsum()
    works_cit_counts_year_concept = works_cit_counts_year_concept.rename(columns = {'referenced_work_id':'work_id'})

    #load the per-year sets and the window flags stored under <discipline>/Info
    my_path2 = os.path.join(discipline, 'Info')
    my_file = 'work_ids_list_'+topic
    with open(os.path.join(my_path2, my_file),"rb") as fp:
        work_ids_list = pickle.load(fp)
    my_file = 'author_ids_list_'+topic
    with open(os.path.join(my_path2, my_file),"rb") as fp:
        author_ids_list = pickle.load(fp)
    my_file = 'windows_cond_'+topic
    with open(os.path.join(my_path2, my_file),"rb") as fp:
        windows_cond = pickle.load(fp)
    
    #consider consecutive EW and OW (5 years each)
    start_year = 1995 
    info_df  = pd.DataFrame()
    active_authors_classes = []
    for w in range(0,23):
        #consider just the windows flagged in windows_cond (the paper threshold was applied when the flags were built)
        windows_cond_w = windows_cond[w]   
        if windows_cond_w:
            
            start_year_w = start_year+w #T_0 #start OW

            # work and authors topic in EW
            prior_work_ids_5yr = set().union(*work_ids_list[w:w+5])
            prior_author_ids_5yr = set().union(*author_ids_list[w:w+5]) 

            #active authors start observation window
            active_authors_start = prior_author_ids_5yr

            #authors classes: only the first element of the returned pair is used below
            sorted_author_works_count,sorted_author_works_count_len = experts_impact_mean_3(works_authors,start_year_w,prior_work_ids_5yr,active_authors_start,works_cit_counts_year_concept)

            #10%: `val` of the authors in the top and in the bottom sample
            samples_dict_1,n_1 = get_author_samples(sorted_author_works_count, top_k=10, debug=True)   
            high_active_authors1 = samples_dict_1['top 10%']
            high_active_authors1_val = sorted_author_works_count.query('author_id.isin(@high_active_authors1)').val
            low_active_authors1 = samples_dict_1['bottom 10%']
            low_active_authors1_val = sorted_author_works_count.query('author_id.isin(@low_active_authors1)').val

            #save
            active_authors_classes_w = [active_authors_start,samples_dict_1,n_1] # [[samples_dict_1,n_1],[samples_dict_2,n_2]]
            active_authors_classes.append(active_authors_classes_w)

            #summary row of the window: class size and max/mean/min of `val` per class
            info_i_dict = {
                'T_0':start_year_w, 
                'Size classes - 10%':n_1, 
                'HIGH 10% - MAX': max(high_active_authors1_val),
                'HIGH 10% - MEAN': mean(high_active_authors1_val),
                'HIGH 10% - MIN': min(high_active_authors1_val),
                'LOW 10% - MAX': max(low_active_authors1_val),
                'LOW 10% - MEAN': mean(low_active_authors1_val),
                'LOW 10% - MIN': min(low_active_authors1_val),
                  }
            info_i = pd.DataFrame(data=[info_i_dict])
            info_df = pd.concat([info_df, info_i], ignore_index = True, axis = 0)   
        else:
            #placeholder so that the list stays aligned with the window index w
            active_authors_classes.append(np.nan)

    #the per-topic csv is written before the topic column is added
    my_file = 'info_classes_'+topic+'_windows.csv'
    info_df.to_csv(os.path.join(my_path, my_file), sep=';', index=False)
    info_df.insert(0, 'topic', topic)
    
    my_file = 'active_authors_classes_'+topic
    with open(os.path.join(my_path, my_file),"wb") as fp:
        pickle.dump(active_authors_classes,fp)

    return info_df,active_authors_classes

In [None]:
#run `info_impact3` for every topic; keep the stacked class table and the classes per topic
info_df = pd.DataFrame()  
active_authors_classes = {}
#path built component-wise instead of embedding a hard-coded '/' separator
my_path = os.path.join(discipline, 'Info', 'Impact_mean3')
#exist_ok=True replaces the exists()/makedirs() pair and keeps the cell re-runnable
os.makedirs(my_path, exist_ok=True)
for topic in topic_list:
    info_df_top,active_authors_classes_top = info_impact3(discipline=discipline,topic=topic,my_path=my_path) 
    info_df = pd.concat([info_df, info_df_top], ignore_index = True, axis = 0)
    active_authors_classes[topic] = active_authors_classes_top
#save the table of all topics and the {topic: classes} dictionary
my_file = 'info_classes_windows.csv'
info_df.to_csv(os.path.join(my_path, my_file), sep=';', index=False)  
my_file = 'active_authors_classes'
with open(os.path.join(my_path, my_file),"wb") as fp:
    pickle.dump(active_authors_classes,fp)

## EXP1

### Productivity

#### Def. contact 1 - #contacts

In [None]:
def Exp1_ver1(discipline,topic,my_path):
    
    #restrict to topic
    works_concepts_conc = works_concepts.query('concept_name==@topic', engine='python')   
      
    #load
    my_path2 = os.path.join(discipline, 'Info')
    my_file = 'work_ids_list_'+topic
    with open(os.path.join(my_path2, my_file),"rb") as fp:
        work_ids_list = pickle.load(fp)
    my_file = 'author_ids_list_'+topic
    with open(os.path.join(my_path2, my_file),"rb") as fp:
        author_ids_list = pickle.load(fp)
    my_file = 'work_ids_tot_list_'+topic
    with open(os.path.join(my_path2, my_file),"rb") as fp:
        work_ids_tot_list = pickle.load(fp)
    my_file = 'author_ids_tot_list_'+topic
    with open(os.path.join(my_path2, my_file),"rb") as fp:
        author_ids_tot_list = pickle.load(fp)
    my_file = 'windows_cond_'+topic
    with open(os.path.join(my_path2, my_file),"rb") as fp:
        windows_cond = pickle.load(fp)
        
    #load
    my_path3 = os.path.join(my_path2, 'Productivity')
    my_file = 'active_authors_classes_'+topic
    with open(os.path.join(my_path3, my_file),"rb") as fp:
        active_authors_classes = pickle.load(fp)   
    
    #consider consecutive EW and OW (5 years each)
    start_year = 1995 
    all_coauthors_list = [] #coauthors collaboration graph
    active_authors_start_union = set() #union active authors all windows
    for w in range(0,23):
        
        #consider just windows with at least 2000 papers in EW and OW
        windows_cond_w = windows_cond[w]   
        if windows_cond_w:
            start_year_w = start_year+w #T_0 #start OW

            # work and authors topic in EW (5 years before)
            prior_work_ids_5yr = set().union(*work_ids_list[w:w+5])
            prior_author_ids_5yr = set().union(*author_ids_list[w:w+5]) 
   
            # all coauthors
            all_coauthors_w = set(
                works_authors
                [
                    works_authors.work_id.isin(
                        works_authors
                        .query('(author_id.isin(@prior_author_ids_5yr)) & (@start_year_w - 5 <= publication_year < @start_year_w)', engine='python')
                        .work_id)
                ]
                .author_id
            )
            #save
            all_coauthors_list.append(all_coauthors_w)

            #union active authors
            [active_authors_start,samples_dict_1,n_1] = active_authors_classes[w]
            active_authors_start_union = active_authors_start_union.union(active_authors_start)
        else:
            all_coauthors_list.append(np.nan) 
            
    active_authors_start_union_list = list(active_authors_start_union)     
    
    my_file = 'all_coauthors_list_'+topic
    with open(os.path.join(my_path, my_file),"wb") as fp:
        pickle.dump(all_coauthors_list,fp)
    my_file = 'active_authors_start_union_'+topic
    with open(os.path.join(my_path, my_file),"wb") as fp:
        pickle.dump(active_authors_start_union,fp)
            
    prior_work_ids_list =  [] #paper written with topic before start year
    prior_author_ids_list =  []
    first_time_authors_list = [] #authors write first paper during observation window
    first_time_authors_tot_list = [] 
    not_active_authors_start_list = [] #authors not already active at the beginning 
    first_time_authors_union = set() #first time authors all windows 
    for w in range(0,23):
            start_year_w = start_year+w
            #authors written at least one paper with concept before start_date --> already active nodes at the beginning
            if w==0:
                prior_work_ids_df = works_concepts_conc.query('publication_year < @start_year_w', engine='python')
                prior_work_ids = set(
                prior_work_ids_df
                .work_id
                )
                prior_work_ids_list.append(prior_work_ids)
                prior_author_ids = set(
                    works_authors
                    .query('work_id.isin(@prior_work_ids)')
                    .author_id
                )
                prior_author_ids_list.append(prior_author_ids)

            else: 
                prior_work_ids = (prior_work_ids_list[w-1]).union(work_ids_list[w+5-1])
                prior_work_ids_list.append(prior_work_ids)
                prior_author_ids = (prior_author_ids_list[w-1]).union(author_ids_list[w+5-1])
                prior_author_ids_list.append(prior_author_ids) 
            
            windows_cond_w = windows_cond[w]   
            if windows_cond_w:
            
                all_coauthors = all_coauthors_list[w] # all coauthors         
                author_ids = set().union(*author_ids_list[w+5:w+5+5]) # work and authors topic in OW
                first_time_authors = (all_coauthors & author_ids) - prior_author_ids #authors write first paper during observation window
                first_time_authors_list.append(first_time_authors)               
                first_time_authors_tot = author_ids - prior_author_ids #authors write first paper during observation window
                first_time_authors_tot_list.append(first_time_authors)
                not_active_authors_start = all_coauthors - prior_author_ids
                not_active_authors_start_list.append(not_active_authors_start)                
                first_time_authors_union = first_time_authors_union.union(first_time_authors)  
            else:
                first_time_authors_list.append(np.nan) 
                not_active_authors_start_list.append(np.nan) 
   
    my_file = 'prior_work_ids_list_'+topic
    with open(os.path.join(my_path, my_file),"wb") as fp:
        pickle.dump(prior_work_ids_list,fp)
    my_file = 'prior_author_ids_list_'+topic
    with open(os.path.join(my_path, my_file),"wb") as fp:
        pickle.dump(prior_author_ids_list,fp)
    my_file = 'first_time_authors_list_'+topic
    with open(os.path.join(my_path, my_file),"wb") as fp:
        pickle.dump(first_time_authors_list,fp)
    my_file = 'first_time_authors_tot_list_'+topic
    with open(os.path.join(my_path, my_file),"wb") as fp:
        pickle.dump(first_time_authors_tot_list,fp)
    my_file = 'not_active_authors_start_list_'+topic
    with open(os.path.join(my_path, my_file),"wb") as fp:
        pickle.dump(not_active_authors_start_list,fp)
    my_file = 'first_time_authors_union_'+topic
    with open(os.path.join(my_path, my_file),"wb") as fp:
        pickle.dump(first_time_authors_union,fp)
        
    #for authors infected at the beginning: dictionary {author : date_infection/date first paper with concept} 
    date_activation_df = (
                        works_authors
                        [works_authors.work_id.isin(
                         works_concepts_conc
                        .work_id
                        )]
                        .query('author_id.isin(@active_authors_start_union)')
                        .sort_values(by='publication_date')
                        .drop_duplicates('author_id')     
            )
    dict_date_act_start = pd.Series(date_activation_df.publication_date.values,index=date_activation_df.author_id).to_dict()  
    
    #for each infected author keep just papers written after their infection date
    works_authors_active = works_authors[works_authors.author_id.isin(active_authors_start_union)] #restrict to active authors
    works_authors_aa_list = []
    for aa in tqdm(active_authors_start_union):
        works_authors_aa = works_authors_active[ (works_authors_active.author_id == aa) & (works_authors_active.publication_date >= dict_date_act_start[aa])] #select just works before activation year
        works_authors_aa_list.append(works_authors_aa) 
    works_authors_activation_date = pd.concat(works_authors_aa_list)  
    my_file = 'works_authors_activation_date_'+topic
    with open(os.path.join(my_path, my_file),"wb") as fp:
        pickle.dump(works_authors_activation_date,fp)
           
    dict_final_list = []
    dict_final_den_list = []
    dict_final_num_list = []
    dict_final_list_high1 = []
    dict_final_den_list_high1 = []
    dict_final_num_list_high1 = []
    dict_final_list_low1 = []
    dict_final_den_list_low1 = []
    dict_final_num_list_low1 = []
    for w in tqdm(range(0,23)): 

        windows_cond_w = windows_cond[w]   
        if windows_cond_w:

            start_year_w = start_year+w
            all_coauthors = all_coauthors_list[w]
            first_time_authors = first_time_authors_list[w]
            first_time_authors_tot = first_time_authors_tot_list[w]
            [active_authors_start,samples_dict_1,n_1] = active_authors_classes[w]
            high_active_authors1 = samples_dict_1['top 10%']
            low_active_authors1 = samples_dict_1['bottom 10%']
            
            #keep just works active_authors_start in this period and written in the period
            work_id_active = works_authors_activation_date[works_authors_activation_date.author_id.isin(active_authors_start)]
            work_id_active = work_id_active.query('@start_year_w-5 <= publication_year < @start_year_w', engine='python') 
            #add coauthors but not infected
            work_id_active_collab = works_authors[works_authors.work_id.isin(work_id_active.work_id)].query('author_id not in @active_authors_start')
            works_authors_collab = pd.concat([work_id_active,work_id_active_collab]).reset_index(drop=True)    

            #bipartite graph work-authors union exposure window
            bip_g = nx.from_pandas_edgelist(
                    works_authors_collab[['work_id', 'author_id']],
                    source='work_id', target='author_id'
                )

            #graph weight number papers written together
            author_ids_supp =  set(works_authors_collab.author_id)
            support_graph_ = get_support_graph_ver1(bip_g, author_ids_supp)
            
            #dictionary {number exposure start year : list of authors that number}
            not_active_authors_start = not_active_authors_start_list[w]
            authors_isolated = not_active_authors_start - author_ids_supp
                       
            dict_final = {}
            dict_final_high1 = {}
            dict_final_low1 = {}
            for anas in tqdm(author_ids_supp & not_active_authors_start): #for each author not active at the beginning 
                n_anas = set(support_graph_.neighbors(anas))

                #A
                dict_final,ego_active_total = get_scores_A_ver1(anas,n_anas, active_authors_start,support_graph_,dict_final)
                #B 
                dict_final_high1,dict_final_low1 = get_scores_B_ver1(anas,n_anas,high_active_authors1,low_active_authors1,ego_active_total,dict_final_high1,dict_final_low1)

            #(iii) Define T(k) to be the fraction of these authors that have become active by the time of the second snapshot.
            #dictionary {k : fraction}

            #A 
            dict_final_list,dict_final_num_list,dict_final_den_list = calculation_A(w,author_ids_tot_list,all_coauthors_list,first_time_authors,first_time_authors_tot,dict_final,dict_final_list,dict_final_num_list,dict_final_den_list,prior_author_ids_list,authors_isolated)   
            #B  
            dict_final_list_high1,dict_final_num_list_high1,dict_final_den_list_high1 = calculation_B(first_time_authors,dict_final_high1,dict_final_list_high1,dict_final_num_list_high1,dict_final_den_list_high1)
            dict_final_list_low1,dict_final_num_list_low1,dict_final_den_list_low1 = calculation_B(first_time_authors,dict_final_low1,dict_final_list_low1,dict_final_num_list_low1,dict_final_den_list_low1)

        else:
            dict_final_list.append(np.nan)
            dict_final_den_list.append(np.nan)
            dict_final_num_list.append(np.nan)
            dict_final_list_high1.append(np.nan)
            dict_final_den_list_high1.append(np.nan)
            dict_final_num_list_high1.append(np.nan)
            dict_final_list_low1.append(np.nan)
            dict_final_den_list_low1.append(np.nan)
            dict_final_num_list_low1.append(np.nan)
            
    #save on file dictionary each window: concept - year_start 
    my_file = 'df_'+topic+'_windows.csv'
    
    topic_df_  = pd.DataFrame()
    for w in range(0,23): 

        windows_cond_w = windows_cond[w]   
        if windows_cond_w:
            start_year_w = start_year + w
                
            dict_final_df=pd.DataFrame(dict_final_list[w].items(), columns=['k', 'prob'])
            dict_final_den_df=pd.DataFrame(dict_final_den_list[w].items(), columns=['k', 'den'])
            dict_final_num_df=pd.DataFrame(dict_final_num_list[w].items(), columns=['k', 'num'])
            dict_final_high1_df=pd.DataFrame(dict_final_list_high1[w].items(), columns=['k', 'prob_high1'])
            dict_final_den_high1_df=pd.DataFrame(dict_final_den_list_high1[w].items(), columns=['k', 'den_high1'])
            dict_final_num_high1_df=pd.DataFrame(dict_final_num_list_high1[w].items(), columns=['k', 'num_high1'])
            dict_final_low1_df=pd.DataFrame(dict_final_list_low1[w].items(), columns=['k', 'prob_low1'])
            dict_final_den_low1_df=pd.DataFrame(dict_final_den_list_low1[w].items(), columns=['k', 'den_low1'])
            dict_final_num_low1_df=pd.DataFrame(dict_final_num_list_low1[w].items(), columns=['k', 'num_low1']) 

            topic_df = dict_final_df.merge(dict_final_den_df.merge(dict_final_num_df))
            topic_high1_df = dict_final_high1_df.merge(dict_final_den_high1_df.merge(dict_final_num_high1_df))
            topic_low1_df = dict_final_low1_df.merge(dict_final_den_low1_df.merge(dict_final_num_low1_df))

            topic_df_w  = (topic_df.merge(topic_high1_df, how='outer')).merge(topic_low1_df, how='outer')
            topic_df_w.insert(0, 'T_0', start_year_w)
            topic_df_ = pd.concat([topic_df_, topic_df_w], ignore_index = True, axis = 0)
              
    topic_df_.to_csv(os.path.join(my_path, my_file))
 
    topic_df_.insert(0, 'topic', topic)

    return topic_df_

In [None]:
# Run Exp1_ver1 (productivity classes, contact definition 1) for every topic
# and store the stacked per-topic tables in a single CSV.
my_path = os.path.join(discipline, 'Productivity/Exp1_ver1')
# create the output folder; exist_ok avoids the separate existence check
os.makedirs(my_path, exist_ok=True)
topic_frames = []
for topic in topic_list:
    topic_df_top = Exp1_ver1(discipline=discipline,topic=topic,my_path=my_path)
    topic_frames.append(topic_df_top)
# concatenate once at the end instead of re-copying the accumulated frame per topic
topics_df = pd.concat(topic_frames, ignore_index = True, axis = 0) if topic_frames else pd.DataFrame()
my_file = 'df_topic_windows.csv'
topics_df.to_csv(os.path.join(my_path, my_file))

#### Def. contact 2 - #papers

In [None]:
def Exp1_ver2(discipline,topic,my_path):
    """Build the per-window T(k) tables for one topic (contact definition 2, #papers).

    Inputs are unpickled from ``<discipline>/Info``, ``<discipline>/Info/Productivity``
    and ``<discipline>/Productivity/Exp1_ver1``; every file is named
    ``<artefact>_<topic>``. For each window ``w`` in ``range(23)`` with a truthy
    ``windows_cond[w]`` the function:

    1. keeps the rows of ``works_authors_activation_date`` whose author is in the
       window's ``active_authors_start`` and whose ``publication_year`` lies in
       ``[1995 + w - 5, 1995 + w)``;
    2. adds, from the notebook-level ``works_authors`` table, the rows of those
       same works whose author is not in ``active_authors_start``;
    3. builds the work-author bipartite graph and passes it to
       ``get_support_graph_ver2``;
    4. scores every author that is both in that graph and in
       ``not_active_authors_start_list[w]`` with ``get_scores_A_ver2`` and
       ``get_scores_B_ver2``;
    5. hands the scores to ``calculation_A`` / ``calculation_B``.

    Args:
        discipline: Root folder of the pickled inputs.
        topic: Topic name; used as file-name suffix and as the value of the
            ``topic`` column of the result.
        my_path: Existing folder that receives ``df_<topic>_windows.csv``.

    Returns:
        DataFrame with one row per (window, k): ``topic``, ``T_0`` (window start
        year), ``k`` and the ``prob``/``den``/``num`` columns, plus their
        ``_high1`` ('top 10%' class) and ``_low1`` ('bottom 10%' class) variants.
        The CSV written to ``my_path`` holds the same rows without ``topic``.
    """

    def _load(folder, prefix):
        # Unpickle '<prefix><topic>' from `folder`.
        # NOTE(review): presumably written by earlier cells of this notebook;
        # pickle must never be pointed at files from an untrusted source.
        with open(os.path.join(folder, prefix + topic),"rb") as fp:
            return pickle.load(fp)

    def _frame(mapping, value_col):
        # Turn a {k: value} dict into a two-column frame keyed on 'k'.
        return pd.DataFrame(mapping.items(), columns=['k', value_col])

    #consider consecutive EW and OW (5 years each)
    start_year = 1995
    n_windows = 23

    #load only the artefacts that are actually used below
    info_dir = os.path.join(discipline, 'Info')
    author_ids_tot_list = _load(info_dir, 'author_ids_tot_list_')
    windows_cond = _load(info_dir, 'windows_cond_')
    active_authors_classes = _load(os.path.join(info_dir, 'Productivity'), 'active_authors_classes_')

    ver1_dir = os.path.join(discipline, 'Productivity/Exp1_ver1')
    all_coauthors_list = _load(ver1_dir, 'all_coauthors_list_')
    prior_author_ids_list = _load(ver1_dir, 'prior_author_ids_list_')
    first_time_authors_list = _load(ver1_dir, 'first_time_authors_list_')
    first_time_authors_tot_list = _load(ver1_dir, 'first_time_authors_tot_list_')
    not_active_authors_start_list = _load(ver1_dir, 'not_active_authors_start_list_')
    works_authors_activation_date = _load(ver1_dir, 'works_authors_activation_date_')

    dict_final_list, dict_final_den_list, dict_final_num_list = [], [], []
    dict_final_list_high1, dict_final_den_list_high1, dict_final_num_list_high1 = [], [], []
    dict_final_list_low1, dict_final_den_list_low1, dict_final_num_list_low1 = [], [], []
    all_accumulators = (
        dict_final_list, dict_final_den_list, dict_final_num_list,
        dict_final_list_high1, dict_final_den_list_high1, dict_final_num_list_high1,
        dict_final_list_low1, dict_final_den_list_low1, dict_final_num_list_low1,
    )

    for w in tqdm(range(n_windows)):

        if not windows_cond[w]:
            # placeholder so that position w of every list still refers to window w
            for acc in all_accumulators:
                acc.append(np.nan)
            continue

        # start_year_w and active_authors_start are referenced by name ('@...')
        # inside the query strings below, so they must stay locals of this frame
        start_year_w = start_year+w
        first_time_authors = first_time_authors_list[w]
        first_time_authors_tot = first_time_authors_tot_list[w]
        active_authors_start, samples_dict_1, _ = active_authors_classes[w]
        high_active_authors1 = samples_dict_1['top 10%']
        low_active_authors1 = samples_dict_1['bottom 10%']

        #keep just works of active_authors_start written in the 5 years before start_year_w
        work_id_active = works_authors_activation_date[works_authors_activation_date.author_id.isin(active_authors_start)]
        work_id_active = work_id_active.query('@start_year_w-5 <= publication_year < @start_year_w', engine='python')
        #add the coauthors of those works that are not active themselves
        work_id_active_collab = works_authors[works_authors.work_id.isin(work_id_active.work_id)].query('author_id not in @active_authors_start')
        works_authors_collab = pd.concat([work_id_active,work_id_active_collab]).reset_index(drop=True)

        #bipartite graph work-authors over the exposure window
        bip_g = nx.from_pandas_edgelist(
                works_authors_collab[['work_id', 'author_id']],
                source='work_id', target='author_id'
            )

        author_ids_supp =  set(works_authors_collab.author_id)
        # NOTE(review): `list_works` is not defined in this function; it has to
        # exist as a notebook-level global when this runs - confirm.
        support_graph_ = get_support_graph_ver2(bip_g, author_ids_supp,list_works)

        not_active_authors_start = not_active_authors_start_list[w]
        # not-yet-active authors that do not appear in the support graph at all
        authors_isolated = not_active_authors_start - author_ids_supp

        dict_final = {}
        dict_final_high1 = {}
        dict_final_low1 = {}
        for anas in tqdm(author_ids_supp & not_active_authors_start): #for each author not active at the beginning
            n_anas = set(support_graph_.neighbors(anas))

            #A
            dict_final,ego_active_total = get_scores_A_ver2(anas,n_anas, active_authors_start,support_graph_,dict_final)
            #B
            dict_final_high1,dict_final_low1 = get_scores_B_ver2(anas,n_anas,high_active_authors1,low_active_authors1,ego_active_total,dict_final_high1,dict_final_low1)

        #(iii) Define T(k) to be the fraction of these authors that have become active by the time of the second snapshot.
        #dictionary {k : fraction}

        #A
        dict_final_list,dict_final_num_list,dict_final_den_list = calculation_A(w,author_ids_tot_list,all_coauthors_list,first_time_authors,first_time_authors_tot,dict_final,dict_final_list,dict_final_num_list,dict_final_den_list,prior_author_ids_list,authors_isolated)
        #B
        dict_final_list_high1,dict_final_num_list_high1,dict_final_den_list_high1 = calculation_B(first_time_authors,dict_final_high1,dict_final_list_high1,dict_final_num_list_high1,dict_final_den_list_high1)
        dict_final_list_low1,dict_final_num_list_low1,dict_final_den_list_low1 = calculation_B(first_time_authors,dict_final_low1,dict_final_list_low1,dict_final_num_list_low1,dict_final_den_list_low1)

    #one table per processed window, keyed on k and tagged with the window start year
    window_frames = []
    for w in range(n_windows):
        if not windows_cond[w]:
            continue

        topic_df = _frame(dict_final_list[w], 'prob').merge(
            _frame(dict_final_den_list[w], 'den').merge(_frame(dict_final_num_list[w], 'num')))
        topic_high1_df = _frame(dict_final_list_high1[w], 'prob_high1').merge(
            _frame(dict_final_den_list_high1[w], 'den_high1').merge(_frame(dict_final_num_list_high1[w], 'num_high1')))
        topic_low1_df = _frame(dict_final_list_low1[w], 'prob_low1').merge(
            _frame(dict_final_den_list_low1[w], 'den_low1').merge(_frame(dict_final_num_list_low1[w], 'num_low1')))

        # outer merges keep k values that appear in only one of the three tables
        topic_df_w = topic_df.merge(topic_high1_df, how='outer').merge(topic_low1_df, how='outer')
        topic_df_w.insert(0, 'T_0', start_year + w)
        window_frames.append(topic_df_w)

    # single concat instead of growing a frame inside the loop
    topic_df_ = pd.concat(window_frames, ignore_index = True, axis = 0) if window_frames else pd.DataFrame()

    #save the per-topic table; the 'topic' column is added only to the returned frame
    my_file = 'df_'+topic+'_windows.csv'
    topic_df_.to_csv(os.path.join(my_path, my_file))

    topic_df_.insert(0, 'topic', topic)

    return topic_df_

In [None]:
# Run Exp1_ver2 (productivity classes, contact definition 2) for every topic
# and store the stacked per-topic tables in a single CSV.
my_path = os.path.join(discipline, 'Productivity/Exp1_ver2')
# create the output folder; exist_ok avoids the separate existence check
os.makedirs(my_path, exist_ok=True)
topic_frames = []
for topic in topic_list:
    topic_df_top = Exp1_ver2(discipline=discipline,topic=topic,my_path=my_path)
    topic_frames.append(topic_df_top)
# concatenate once at the end instead of re-copying the accumulated frame per topic
topics_df = pd.concat(topic_frames, ignore_index = True, axis = 0) if topic_frames else pd.DataFrame()
my_file = 'df_topic_windows.csv'
topics_df.to_csv(os.path.join(my_path, my_file))

#### Def. contact 3 - #coauthors

In [None]:
def Exp1_ver3(discipline,topic,my_path):
    """Build the per-window T(k) tables for one topic (contact definition 3, #coauthors).

    Inputs are unpickled from ``<discipline>/Info``, ``<discipline>/Info/Productivity``
    and ``<discipline>/Productivity/Exp1_ver1``; every file is named
    ``<artefact>_<topic>``. For each window ``w`` in ``range(23)`` with a truthy
    ``windows_cond[w]`` the function:

    1. keeps the rows of ``works_authors_activation_date`` whose author is in the
       window's ``active_authors_start`` and whose ``publication_year`` lies in
       ``[1995 + w - 5, 1995 + w)``;
    2. adds, from the notebook-level ``works_authors`` table, the rows of those
       same works whose author is not in ``active_authors_start``;
    3. builds the work-author bipartite graph and passes it to
       ``get_support_graph_ver3``;
    4. scores every author that is both in that graph and in
       ``not_active_authors_start_list[w]`` with ``get_scores_A_ver3`` and
       ``get_scores_B_ver3``;
    5. hands the scores to ``calculation_A`` / ``calculation_B``.

    Args:
        discipline: Root folder of the pickled inputs.
        topic: Topic name; used as file-name suffix and as the value of the
            ``topic`` column of the result.
        my_path: Existing folder that receives ``df_<topic>_windows.csv``.

    Returns:
        DataFrame with one row per (window, k): ``topic``, ``T_0`` (window start
        year), ``k`` and the ``prob``/``den``/``num`` columns, plus their
        ``_high1`` ('top 10%' class) and ``_low1`` ('bottom 10%' class) variants.
        The CSV written to ``my_path`` holds the same rows without ``topic``.
    """

    def _load(folder, prefix):
        # Unpickle '<prefix><topic>' from `folder`.
        # NOTE(review): presumably written by earlier cells of this notebook;
        # pickle must never be pointed at files from an untrusted source.
        with open(os.path.join(folder, prefix + topic),"rb") as fp:
            return pickle.load(fp)

    def _frame(mapping, value_col):
        # Turn a {k: value} dict into a two-column frame keyed on 'k'.
        return pd.DataFrame(mapping.items(), columns=['k', value_col])

    #consider consecutive EW and OW (5 years each)
    start_year = 1995
    n_windows = 23

    #load only the artefacts that are actually used below
    info_dir = os.path.join(discipline, 'Info')
    author_ids_tot_list = _load(info_dir, 'author_ids_tot_list_')
    windows_cond = _load(info_dir, 'windows_cond_')
    active_authors_classes = _load(os.path.join(info_dir, 'Productivity'), 'active_authors_classes_')

    ver1_dir = os.path.join(discipline, 'Productivity/Exp1_ver1')
    all_coauthors_list = _load(ver1_dir, 'all_coauthors_list_')
    prior_author_ids_list = _load(ver1_dir, 'prior_author_ids_list_')
    first_time_authors_list = _load(ver1_dir, 'first_time_authors_list_')
    first_time_authors_tot_list = _load(ver1_dir, 'first_time_authors_tot_list_')
    not_active_authors_start_list = _load(ver1_dir, 'not_active_authors_start_list_')
    works_authors_activation_date = _load(ver1_dir, 'works_authors_activation_date_')

    dict_final_list, dict_final_den_list, dict_final_num_list = [], [], []
    dict_final_list_high1, dict_final_den_list_high1, dict_final_num_list_high1 = [], [], []
    dict_final_list_low1, dict_final_den_list_low1, dict_final_num_list_low1 = [], [], []
    all_accumulators = (
        dict_final_list, dict_final_den_list, dict_final_num_list,
        dict_final_list_high1, dict_final_den_list_high1, dict_final_num_list_high1,
        dict_final_list_low1, dict_final_den_list_low1, dict_final_num_list_low1,
    )

    for w in tqdm(range(n_windows)):

        if not windows_cond[w]:
            # placeholder so that position w of every list still refers to window w
            for acc in all_accumulators:
                acc.append(np.nan)
            continue

        # start_year_w and active_authors_start are referenced by name ('@...')
        # inside the query strings below, so they must stay locals of this frame
        start_year_w = start_year+w
        first_time_authors = first_time_authors_list[w]
        first_time_authors_tot = first_time_authors_tot_list[w]
        active_authors_start, samples_dict_1, _ = active_authors_classes[w]
        high_active_authors1 = samples_dict_1['top 10%']
        low_active_authors1 = samples_dict_1['bottom 10%']

        #keep just works of active_authors_start written in the 5 years before start_year_w
        work_id_active = works_authors_activation_date[works_authors_activation_date.author_id.isin(active_authors_start)]
        work_id_active = work_id_active.query('@start_year_w-5 <= publication_year < @start_year_w', engine='python')
        #add the coauthors of those works that are not active themselves
        work_id_active_collab = works_authors[works_authors.work_id.isin(work_id_active.work_id)].query('author_id not in @active_authors_start')
        works_authors_collab = pd.concat([work_id_active,work_id_active_collab]).reset_index(drop=True)

        #bipartite graph work-authors over the exposure window
        bip_g = nx.from_pandas_edgelist(
                works_authors_collab[['work_id', 'author_id']],
                source='work_id', target='author_id'
            )

        author_ids_supp =  set(works_authors_collab.author_id)
        support_graph_ = get_support_graph_ver3(bip_g, author_ids_supp)

        not_active_authors_start = not_active_authors_start_list[w]
        # not-yet-active authors that do not appear in the support graph at all
        authors_isolated = not_active_authors_start - author_ids_supp

        dict_final = {}
        dict_final_high1 = {}
        dict_final_low1 = {}
        for anas in tqdm(author_ids_supp & not_active_authors_start): #for each author not active at the beginning
            n_anas = set(support_graph_.neighbors(anas))

            #A
            # Rebind dict_final to the helper's result (this used to be assigned to
            # the misspelled name `dict_finall` and silently discarded).
            # NOTE(review): assumes get_scores_A_ver3 returns the updated dict only,
            # mirroring how get_scores_B_ver3 returns its dicts - confirm.
            dict_final = get_scores_A_ver3(anas,n_anas,active_authors_start,dict_final)
            #B
            dict_final_high1,dict_final_low1 = get_scores_B_ver3(anas,n_anas,high_active_authors1,low_active_authors1,dict_final_high1,dict_final_low1)

        #(iii) Define T(k) to be the fraction of these authors that have become active by the time of the second snapshot.
        #dictionary {k : fraction}

        #A
        dict_final_list,dict_final_num_list,dict_final_den_list = calculation_A(w,author_ids_tot_list,all_coauthors_list,first_time_authors,first_time_authors_tot,dict_final,dict_final_list,dict_final_num_list,dict_final_den_list,prior_author_ids_list,authors_isolated)
        #B
        dict_final_list_high1,dict_final_num_list_high1,dict_final_den_list_high1 = calculation_B(first_time_authors,dict_final_high1,dict_final_list_high1,dict_final_num_list_high1,dict_final_den_list_high1)
        dict_final_list_low1,dict_final_num_list_low1,dict_final_den_list_low1 = calculation_B(first_time_authors,dict_final_low1,dict_final_list_low1,dict_final_num_list_low1,dict_final_den_list_low1)

    #one table per processed window, keyed on k and tagged with the window start year
    window_frames = []
    for w in range(n_windows):
        if not windows_cond[w]:
            continue

        topic_df = _frame(dict_final_list[w], 'prob').merge(
            _frame(dict_final_den_list[w], 'den').merge(_frame(dict_final_num_list[w], 'num')))
        topic_high1_df = _frame(dict_final_list_high1[w], 'prob_high1').merge(
            _frame(dict_final_den_list_high1[w], 'den_high1').merge(_frame(dict_final_num_list_high1[w], 'num_high1')))
        topic_low1_df = _frame(dict_final_list_low1[w], 'prob_low1').merge(
            _frame(dict_final_den_list_low1[w], 'den_low1').merge(_frame(dict_final_num_list_low1[w], 'num_low1')))

        # outer merges keep k values that appear in only one of the three tables
        topic_df_w = topic_df.merge(topic_high1_df, how='outer').merge(topic_low1_df, how='outer')
        topic_df_w.insert(0, 'T_0', start_year + w)
        window_frames.append(topic_df_w)

    # single concat instead of growing a frame inside the loop
    topic_df_ = pd.concat(window_frames, ignore_index = True, axis = 0) if window_frames else pd.DataFrame()

    #save the per-topic table; the 'topic' column is added only to the returned frame
    my_file = 'df_'+topic+'_windows.csv'
    topic_df_.to_csv(os.path.join(my_path, my_file))

    topic_df_.insert(0, 'topic', topic)

    return topic_df_

In [None]:
# Run Exp1_ver3 (productivity classes, contact definition 3) for every topic
# and store the stacked per-topic tables in a single CSV.
my_path = os.path.join(discipline, 'Productivity/Exp1_ver3')
# create the output folder; exist_ok avoids the separate existence check
os.makedirs(my_path, exist_ok=True)
topic_frames = []
for topic in topic_list:
    topic_df_top = Exp1_ver3(discipline=discipline,topic=topic,my_path=my_path)
    topic_frames.append(topic_df_top)
# concatenate once at the end instead of re-copying the accumulated frame per topic
topics_df = pd.concat(topic_frames, ignore_index = True, axis = 0) if topic_frames else pd.DataFrame()
my_file = 'df_topic_windows.csv'
topics_df.to_csv(os.path.join(my_path, my_file))

### Impact 

#### Def. impact 1 - papers:all, cits:topic

##### Def. contact 1 - #contacts

In [None]:
def Exp1_1_ver1(discipline,topic,my_path):
    """Build the per-window T(k) tables for one topic (impact definition 1, contact definition 1).

    Same pipeline as the productivity experiment, except that the author classes
    are unpickled from ``<discipline>/Info/Impact_mean1``. The remaining inputs
    come from ``<discipline>/Info`` and ``<discipline>/Productivity/Exp1_ver1``;
    every file is named ``<artefact>_<topic>``. For each window ``w`` in
    ``range(23)`` with a truthy ``windows_cond[w]`` the function:

    1. keeps the rows of ``works_authors_activation_date`` whose author is in the
       window's ``active_authors_start`` and whose ``publication_year`` lies in
       ``[1995 + w - 5, 1995 + w)``;
    2. adds, from the notebook-level ``works_authors`` table, the rows of those
       same works whose author is not in ``active_authors_start``;
    3. builds the work-author bipartite graph and passes it to
       ``get_support_graph_ver1``;
    4. scores every author that is both in that graph and in
       ``not_active_authors_start_list[w]`` with ``get_scores_A_ver1`` and
       ``get_scores_B_ver1``;
    5. hands the scores to ``calculation_A`` / ``calculation_B``.

    Args:
        discipline: Root folder of the pickled inputs.
        topic: Topic name; used as file-name suffix and as the value of the
            ``topic`` column of the result.
        my_path: Existing folder that receives ``df_<topic>_windows.csv``.

    Returns:
        DataFrame with one row per (window, k): ``topic``, ``T_0`` (window start
        year), ``k`` and the ``prob``/``den``/``num`` columns, plus their
        ``_high1`` ('top 10%' class) and ``_low1`` ('bottom 10%' class) variants.
        The CSV written to ``my_path`` holds the same rows without ``topic``.
    """

    def _load(folder, prefix):
        # Unpickle '<prefix><topic>' from `folder`.
        # NOTE(review): presumably written by earlier cells of this notebook;
        # pickle must never be pointed at files from an untrusted source.
        with open(os.path.join(folder, prefix + topic),"rb") as fp:
            return pickle.load(fp)

    def _frame(mapping, value_col):
        # Turn a {k: value} dict into a two-column frame keyed on 'k'.
        return pd.DataFrame(mapping.items(), columns=['k', value_col])

    #consider consecutive EW and OW (5 years each)
    start_year = 1995
    n_windows = 23

    #load only the artefacts that are actually used below
    info_dir = os.path.join(discipline, 'Info')
    author_ids_tot_list = _load(info_dir, 'author_ids_tot_list_')
    windows_cond = _load(info_dir, 'windows_cond_')
    active_authors_classes = _load(os.path.join(info_dir, 'Impact_mean1'), 'active_authors_classes_')

    ver1_dir = os.path.join(discipline, 'Productivity/Exp1_ver1')
    all_coauthors_list = _load(ver1_dir, 'all_coauthors_list_')
    prior_author_ids_list = _load(ver1_dir, 'prior_author_ids_list_')
    first_time_authors_list = _load(ver1_dir, 'first_time_authors_list_')
    first_time_authors_tot_list = _load(ver1_dir, 'first_time_authors_tot_list_')
    not_active_authors_start_list = _load(ver1_dir, 'not_active_authors_start_list_')
    works_authors_activation_date = _load(ver1_dir, 'works_authors_activation_date_')

    dict_final_list, dict_final_den_list, dict_final_num_list = [], [], []
    dict_final_list_high1, dict_final_den_list_high1, dict_final_num_list_high1 = [], [], []
    dict_final_list_low1, dict_final_den_list_low1, dict_final_num_list_low1 = [], [], []
    all_accumulators = (
        dict_final_list, dict_final_den_list, dict_final_num_list,
        dict_final_list_high1, dict_final_den_list_high1, dict_final_num_list_high1,
        dict_final_list_low1, dict_final_den_list_low1, dict_final_num_list_low1,
    )

    for w in tqdm(range(n_windows)):

        if not windows_cond[w]:
            # placeholder so that position w of every list still refers to window w
            for acc in all_accumulators:
                acc.append(np.nan)
            continue

        # start_year_w and active_authors_start are referenced by name ('@...')
        # inside the query strings below, so they must stay locals of this frame
        start_year_w = start_year+w
        first_time_authors = first_time_authors_list[w]
        first_time_authors_tot = first_time_authors_tot_list[w]
        active_authors_start, samples_dict_1, _ = active_authors_classes[w]
        high_active_authors1 = samples_dict_1['top 10%']
        low_active_authors1 = samples_dict_1['bottom 10%']

        #keep just works of active_authors_start written in the 5 years before start_year_w
        work_id_active = works_authors_activation_date[works_authors_activation_date.author_id.isin(active_authors_start)]
        work_id_active = work_id_active.query('@start_year_w-5 <= publication_year < @start_year_w', engine='python')
        #add the coauthors of those works that are not active themselves
        work_id_active_collab = works_authors[works_authors.work_id.isin(work_id_active.work_id)].query('author_id not in @active_authors_start')
        works_authors_collab = pd.concat([work_id_active,work_id_active_collab]).reset_index(drop=True)

        #bipartite graph work-authors over the exposure window
        bip_g = nx.from_pandas_edgelist(
                works_authors_collab[['work_id', 'author_id']],
                source='work_id', target='author_id'
            )

        author_ids_supp =  set(works_authors_collab.author_id)
        support_graph_ = get_support_graph_ver1(bip_g, author_ids_supp)

        not_active_authors_start = not_active_authors_start_list[w]
        # not-yet-active authors that do not appear in the support graph at all
        authors_isolated = not_active_authors_start - author_ids_supp

        dict_final = {}
        dict_final_high1 = {}
        dict_final_low1 = {}
        for anas in tqdm(author_ids_supp & not_active_authors_start): #for each author not active at the beginning
            n_anas = set(support_graph_.neighbors(anas))

            #A
            dict_final,ego_active_total = get_scores_A_ver1(anas,n_anas, active_authors_start,support_graph_,dict_final)
            #B
            dict_final_high1,dict_final_low1 = get_scores_B_ver1(anas,n_anas,high_active_authors1,low_active_authors1,ego_active_total,dict_final_high1,dict_final_low1)

        #(iii) Define T(k) to be the fraction of these authors that have become active by the time of the second snapshot.
        #dictionary {k : fraction}

        #A
        dict_final_list,dict_final_num_list,dict_final_den_list = calculation_A(w,author_ids_tot_list,all_coauthors_list,first_time_authors,first_time_authors_tot,dict_final,dict_final_list,dict_final_num_list,dict_final_den_list,prior_author_ids_list,authors_isolated)
        #B
        dict_final_list_high1,dict_final_num_list_high1,dict_final_den_list_high1 = calculation_B(first_time_authors,dict_final_high1,dict_final_list_high1,dict_final_num_list_high1,dict_final_den_list_high1)
        dict_final_list_low1,dict_final_num_list_low1,dict_final_den_list_low1 = calculation_B(first_time_authors,dict_final_low1,dict_final_list_low1,dict_final_num_list_low1,dict_final_den_list_low1)

    #one table per processed window, keyed on k and tagged with the window start year
    window_frames = []
    for w in range(n_windows):
        if not windows_cond[w]:
            continue

        topic_df = _frame(dict_final_list[w], 'prob').merge(
            _frame(dict_final_den_list[w], 'den').merge(_frame(dict_final_num_list[w], 'num')))
        topic_high1_df = _frame(dict_final_list_high1[w], 'prob_high1').merge(
            _frame(dict_final_den_list_high1[w], 'den_high1').merge(_frame(dict_final_num_list_high1[w], 'num_high1')))
        topic_low1_df = _frame(dict_final_list_low1[w], 'prob_low1').merge(
            _frame(dict_final_den_list_low1[w], 'den_low1').merge(_frame(dict_final_num_list_low1[w], 'num_low1')))

        # outer merges keep k values that appear in only one of the three tables
        topic_df_w = topic_df.merge(topic_high1_df, how='outer').merge(topic_low1_df, how='outer')
        topic_df_w.insert(0, 'T_0', start_year + w)
        window_frames.append(topic_df_w)

    # single concat instead of growing a frame inside the loop
    topic_df_ = pd.concat(window_frames, ignore_index = True, axis = 0) if window_frames else pd.DataFrame()

    #save the per-topic table; the 'topic' column is added only to the returned frame
    my_file = 'df_'+topic+'_windows.csv'
    topic_df_.to_csv(os.path.join(my_path, my_file))

    topic_df_.insert(0, 'topic', topic)

    return topic_df_

In [None]:
# Run Exp1_1_ver1 (impact definition 1, contact definition 1) and store the
# stacked per-topic tables in a single CSV.
my_path = os.path.join(discipline, 'Impact_mean1/Exp1_ver1')
# create the output folder; exist_ok avoids the separate existence check
os.makedirs(my_path, exist_ok=True)
topic_frames = []
# NOTE(review): only the first topic is processed here (topic_list[:1]), unlike
# the productivity runs which iterate over the whole topic_list - confirm this
# is intentional and not a leftover from debugging.
for topic in topic_list[:1]:
    topic_df_top = Exp1_1_ver1(discipline=discipline,topic=topic,my_path=my_path)
    topic_frames.append(topic_df_top)
# concatenate once at the end instead of re-copying the accumulated frame per topic
topics_df = pd.concat(topic_frames, ignore_index = True, axis = 0) if topic_frames else pd.DataFrame()
my_file = 'df_topic_windows.csv'
topics_df.to_csv(os.path.join(my_path, my_file))

##### Def. contact 2 - #papers

In [None]:
def Exp1_1_ver2(discipline,topic,my_path):
    """Compute the per-window curves T(k) for one topic (impact def. 1, contact def. 2).

    For each of the 23 windows (start years 1995, 1996, ...) that is flagged
    in the pickled ``windows_cond`` list, the function:

    1. selects the works published in the 5 years before the window start by
       authors that are active at the window start, plus their co-authors;
    2. builds the work-author bipartite graph and derives the author graph
       with ``get_support_graph_ver2``;
    3. scores every not-yet-active author found in that graph with
       ``get_scores_A_ver2`` / ``get_scores_B_ver2`` (the latter for the
       'top 10%' and 'bottom 10%' classes of active authors);
    4. turns the scores into ``{k: fraction}`` dictionaries through
       ``calculation_A`` / ``calculation_B``.

    The per-window results are stacked into one DataFrame, written to
    ``<my_path>/df_<topic>_windows.csv`` and returned.

    Parameters
    ----------
    discipline : str
        Root folder holding the ``Info`` and ``Productivity/Exp1_ver1``
        sub-folders with the pickled intermediate results.
    topic : str
        Topic name; it is the suffix of every pickle file name and part of
        the CSV file name.
    my_path : str
        Output folder for the CSV file (must already exist).

    Returns
    -------
    pandas.DataFrame
        Columns ``topic``, ``T_0`` (window start year), ``k`` and the
        ``prob``/``den``/``num`` triples for all active authors and for the
        ``_high1`` / ``_low1`` classes. The CSV is written *before* the
        ``topic`` column is added, so the file does not contain it.

    Notes
    -----
    Relies on the notebook-level names ``works_authors``, ``list_works`` and
    the helper functions listed above.
    """
    n_windows = 23       # number of windows analysed
    start_year = 1995    # start year of window 0

    def _load(folder, prefix):
        # Intermediate results are stored as pickles named '<prefix><topic>'.
        # NOTE: pickle executes code on load -- only read files you produced.
        with open(os.path.join(folder, prefix + topic),"rb") as fp:
            return pickle.load(fp)

    info_dir = os.path.join(discipline, 'Info')
    impact_dir = os.path.join(info_dir, 'Impact_mean1')
    prod_dir = os.path.join(discipline, 'Productivity/Exp1_ver1')

    # Only the inputs actually used below are loaded.
    author_ids_tot_list = _load(info_dir, 'author_ids_tot_list_')
    windows_cond = _load(info_dir, 'windows_cond_')
    active_authors_classes = _load(impact_dir, 'active_authors_classes_')

    #consider consecutive EW and OW (5 years each)
    all_coauthors_list = _load(prod_dir, 'all_coauthors_list_')
    prior_author_ids_list = _load(prod_dir, 'prior_author_ids_list_')
    first_time_authors_list = _load(prod_dir, 'first_time_authors_list_')
    first_time_authors_tot_list = _load(prod_dir, 'first_time_authors_tot_list_')
    not_active_authors_start_list = _load(prod_dir, 'not_active_authors_start_list_')
    works_authors_activation_date = _load(prod_dir, 'works_authors_activation_date_')

    # One entry per window in each list: a dict {k: value}, or NaN for skipped windows.
    dict_final_list, dict_final_den_list, dict_final_num_list = [], [], []
    dict_final_list_high1, dict_final_den_list_high1, dict_final_num_list_high1 = [], [], []
    dict_final_list_low1, dict_final_den_list_low1, dict_final_num_list_low1 = [], [], []

    for w in tqdm(range(n_windows)):

        if not windows_cond[w]:
            # keep the result lists aligned with the window index
            for results in (dict_final_list, dict_final_den_list, dict_final_num_list,
                            dict_final_list_high1, dict_final_den_list_high1, dict_final_num_list_high1,
                            dict_final_list_low1, dict_final_den_list_low1, dict_final_num_list_low1):
                results.append(np.nan)
            continue

        start_year_w = start_year+w
        first_time_authors = first_time_authors_list[w]
        first_time_authors_tot = first_time_authors_tot_list[w]
        active_authors_start, samples_dict_1, _ = active_authors_classes[w]
        high_active_authors1 = samples_dict_1['top 10%']
        low_active_authors1 = samples_dict_1['bottom 10%']

        #keep just works active_authors_start in this period and written in the period
        work_id_active = works_authors_activation_date[works_authors_activation_date.author_id.isin(active_authors_start)]
        work_id_active = work_id_active.query('@start_year_w-5 <= publication_year < @start_year_w', engine='python')
        #add coauthors but not infected
        work_id_active_collab = works_authors[works_authors.work_id.isin(work_id_active.work_id)].query('author_id not in @active_authors_start')
        works_authors_collab = pd.concat([work_id_active,work_id_active_collab]).reset_index(drop=True)

        #bipartite graph work-authors union exposure window
        bip_g = nx.from_pandas_edgelist(
                works_authors_collab[['work_id', 'author_id']],
                source='work_id', target='author_id'
            )

        #graph weight number papers written together
        author_ids_supp = set(works_authors_collab.author_id)
        # NOTE(review): `list_works` is not defined in this function; it must
        # exist as a notebook-level variable when this runs -- confirm.
        support_graph_ = get_support_graph_ver2(bip_g, author_ids_supp,list_works)
        #dictionary {number exposure start year : list of authors that number}
        not_active_authors_start = not_active_authors_start_list[w]
        authors_isolated = not_active_authors_start - author_ids_supp

        dict_final = {}
        dict_final_high1 = {}
        dict_final_low1 = {}
        for anas in tqdm(author_ids_supp & not_active_authors_start): #for each author not active at the beginning
            n_anas = set(support_graph_.neighbors(anas))

            #A
            dict_final,ego_active_total = get_scores_A_ver2(anas,n_anas, active_authors_start,support_graph_,dict_final)
            #B
            dict_final_high1,dict_final_low1 = get_scores_B_ver2(anas,n_anas,high_active_authors1,low_active_authors1,ego_active_total,dict_final_high1,dict_final_low1)

        #(iii) Define T(k) to be the fraction of these authors that have become active by the time of the second snapshot.
        #dictionary {k : fraction}

        #A
        dict_final_list,dict_final_num_list,dict_final_den_list = calculation_A(w,author_ids_tot_list,all_coauthors_list,first_time_authors,first_time_authors_tot,dict_final,dict_final_list,dict_final_num_list,dict_final_den_list,prior_author_ids_list,authors_isolated)
        #B
        dict_final_list_high1,dict_final_num_list_high1,dict_final_den_list_high1 = calculation_B(first_time_authors,dict_final_high1,dict_final_list_high1,dict_final_num_list_high1,dict_final_den_list_high1)
        dict_final_list_low1,dict_final_num_list_low1,dict_final_den_list_low1 = calculation_B(first_time_authors,dict_final_low1,dict_final_list_low1,dict_final_num_list_low1,dict_final_den_list_low1)

    def _curve_frame(w, prob_list, den_list, num_list, suffix):
        # Join the three {k: value} dicts of window w into one frame keyed by 'k'.
        prob_df = pd.DataFrame(prob_list[w].items(), columns=['k', 'prob'+suffix])
        den_df = pd.DataFrame(den_list[w].items(), columns=['k', 'den'+suffix])
        num_df = pd.DataFrame(num_list[w].items(), columns=['k', 'num'+suffix])
        return prob_df.merge(den_df.merge(num_df))

    #save on file dictionary each window: concept - year_start
    my_file = 'df_'+topic+'_windows.csv'

    window_frames = []
    for w in range(n_windows):
        if not windows_cond[w]:
            continue

        topic_df = _curve_frame(w, dict_final_list, dict_final_den_list, dict_final_num_list, '')
        topic_high1_df = _curve_frame(w, dict_final_list_high1, dict_final_den_list_high1, dict_final_num_list_high1, '_high1')
        topic_low1_df = _curve_frame(w, dict_final_list_low1, dict_final_den_list_low1, dict_final_num_list_low1, '_low1')

        # outer joins: a value of k may be missing for one of the classes
        topic_df_w = (topic_df.merge(topic_high1_df, how='outer')).merge(topic_low1_df, how='outer')
        topic_df_w.insert(0, 'T_0', start_year + w)
        window_frames.append(topic_df_w)

    # single concatenation; empty frame when no window qualified
    if window_frames:
        topic_df_ = pd.concat(window_frames, ignore_index = True, axis = 0)
    else:
        topic_df_ = pd.DataFrame()

    topic_df_.to_csv(os.path.join(my_path, my_file))

    # added after saving: the per-topic file name already identifies the topic
    topic_df_.insert(0, 'topic', topic)

    return topic_df_

In [None]:
# Driver cell: run Exp1_1_ver2 for every topic and store the stacked result.
my_path = os.path.join(discipline, 'Impact_mean1/Exp1_ver2')
# create the output folder (no error if it is already there)
os.makedirs(my_path, exist_ok=True)

per_topic_frames = []
for topic in topic_list:
    topic_df_top = Exp1_1_ver2(discipline=discipline,topic=topic,my_path=my_path)
    per_topic_frames.append(topic_df_top)

# Concatenate once instead of growing a DataFrame inside the loop.
if per_topic_frames:
    topics_df = pd.concat(per_topic_frames, ignore_index = True, axis = 0)
else:
    topics_df = pd.DataFrame()

my_file = 'df_topic_windows.csv'
topics_df.to_csv(os.path.join(my_path, my_file))

##### Def. contact 3 - #coauthors

In [None]:
def Exp1_1_ver3(discipline,topic,my_path):
    """Compute the per-window curves T(k) for one topic (impact def. 1, contact def. 3).

    For each of the 23 windows (start years 1995, 1996, ...) that is flagged
    in the pickled ``windows_cond`` list, the function:

    1. selects the works published in the 5 years before the window start by
       authors that are active at the window start, plus their co-authors;
    2. builds the work-author bipartite graph and derives the author graph
       with ``get_support_graph_ver3``;
    3. scores every not-yet-active author found in that graph with
       ``get_scores_A_ver3`` / ``get_scores_B_ver3`` (the latter for the
       'top 10%' and 'bottom 10%' classes of active authors);
    4. turns the scores into ``{k: fraction}`` dictionaries through
       ``calculation_A`` / ``calculation_B``.

    The per-window results are stacked into one DataFrame, written to
    ``<my_path>/df_<topic>_windows.csv`` and returned.

    Parameters
    ----------
    discipline : str
        Root folder holding the ``Info`` and ``Productivity/Exp1_ver1``
        sub-folders with the pickled intermediate results.
    topic : str
        Topic name; it is the suffix of every pickle file name and part of
        the CSV file name.
    my_path : str
        Output folder for the CSV file (must already exist).

    Returns
    -------
    pandas.DataFrame
        Columns ``topic``, ``T_0`` (window start year), ``k`` and the
        ``prob``/``den``/``num`` triples for all active authors and for the
        ``_high1`` / ``_low1`` classes. The CSV is written *before* the
        ``topic`` column is added, so the file does not contain it.

    Notes
    -----
    Relies on the notebook-level name ``works_authors`` and the helper
    functions listed above.
    """
    n_windows = 23       # number of windows analysed
    start_year = 1995    # start year of window 0

    def _load(folder, prefix):
        # Intermediate results are stored as pickles named '<prefix><topic>'.
        # NOTE: pickle executes code on load -- only read files you produced.
        with open(os.path.join(folder, prefix + topic),"rb") as fp:
            return pickle.load(fp)

    info_dir = os.path.join(discipline, 'Info')
    impact_dir = os.path.join(info_dir, 'Impact_mean1')
    prod_dir = os.path.join(discipline, 'Productivity/Exp1_ver1')

    # Only the inputs actually used below are loaded.
    author_ids_tot_list = _load(info_dir, 'author_ids_tot_list_')
    windows_cond = _load(info_dir, 'windows_cond_')
    active_authors_classes = _load(impact_dir, 'active_authors_classes_')

    #consider consecutive EW and OW (5 years each)
    all_coauthors_list = _load(prod_dir, 'all_coauthors_list_')
    prior_author_ids_list = _load(prod_dir, 'prior_author_ids_list_')
    first_time_authors_list = _load(prod_dir, 'first_time_authors_list_')
    first_time_authors_tot_list = _load(prod_dir, 'first_time_authors_tot_list_')
    not_active_authors_start_list = _load(prod_dir, 'not_active_authors_start_list_')
    works_authors_activation_date = _load(prod_dir, 'works_authors_activation_date_')

    # One entry per window in each list: a dict {k: value}, or NaN for skipped windows.
    dict_final_list, dict_final_den_list, dict_final_num_list = [], [], []
    dict_final_list_high1, dict_final_den_list_high1, dict_final_num_list_high1 = [], [], []
    dict_final_list_low1, dict_final_den_list_low1, dict_final_num_list_low1 = [], [], []

    for w in tqdm(range(n_windows)):

        if not windows_cond[w]:
            # keep the result lists aligned with the window index
            for results in (dict_final_list, dict_final_den_list, dict_final_num_list,
                            dict_final_list_high1, dict_final_den_list_high1, dict_final_num_list_high1,
                            dict_final_list_low1, dict_final_den_list_low1, dict_final_num_list_low1):
                results.append(np.nan)
            continue

        start_year_w = start_year+w
        first_time_authors = first_time_authors_list[w]
        first_time_authors_tot = first_time_authors_tot_list[w]
        active_authors_start, samples_dict_1, _ = active_authors_classes[w]
        high_active_authors1 = samples_dict_1['top 10%']
        low_active_authors1 = samples_dict_1['bottom 10%']

        #keep just works active_authors_start in this period and written in the period
        work_id_active = works_authors_activation_date[works_authors_activation_date.author_id.isin(active_authors_start)]
        work_id_active = work_id_active.query('@start_year_w-5 <= publication_year < @start_year_w', engine='python')
        #add coauthors but not infected
        work_id_active_collab = works_authors[works_authors.work_id.isin(work_id_active.work_id)].query('author_id not in @active_authors_start')
        works_authors_collab = pd.concat([work_id_active,work_id_active_collab]).reset_index(drop=True)

        #bipartite graph work-authors union exposure window
        bip_g = nx.from_pandas_edgelist(
                works_authors_collab[['work_id', 'author_id']],
                source='work_id', target='author_id'
            )

        #graph weight number papers written together
        author_ids_supp = set(works_authors_collab.author_id)
        support_graph_ = get_support_graph_ver3(bip_g, author_ids_supp)
        #dictionary {number exposure start year : list of authors that number}
        not_active_authors_start = not_active_authors_start_list[w]
        authors_isolated = not_active_authors_start - author_ids_supp

        dict_final = {}
        dict_final_high1 = {}
        dict_final_low1 = {}
        for anas in tqdm(author_ids_supp & not_active_authors_start): #for each author not active at the beginning
            n_anas = set(support_graph_.neighbors(anas))

            #A
            # The result was previously bound to the misspelled name
            # `dict_finall`, so `dict_final` was only updated if the helper
            # happened to mutate its argument in place.
            dict_final = get_scores_A_ver3(anas,n_anas,active_authors_start,dict_final)
            #B
            dict_final_high1,dict_final_low1 = get_scores_B_ver3(anas,n_anas,high_active_authors1,low_active_authors1,dict_final_high1,dict_final_low1)

        #(iii) Define T(k) to be the fraction of these authors that have become active by the time of the second snapshot.
        #dictionary {k : fraction}

        #A
        dict_final_list,dict_final_num_list,dict_final_den_list = calculation_A(w,author_ids_tot_list,all_coauthors_list,first_time_authors,first_time_authors_tot,dict_final,dict_final_list,dict_final_num_list,dict_final_den_list,prior_author_ids_list,authors_isolated)
        #B
        dict_final_list_high1,dict_final_num_list_high1,dict_final_den_list_high1 = calculation_B(first_time_authors,dict_final_high1,dict_final_list_high1,dict_final_num_list_high1,dict_final_den_list_high1)
        dict_final_list_low1,dict_final_num_list_low1,dict_final_den_list_low1 = calculation_B(first_time_authors,dict_final_low1,dict_final_list_low1,dict_final_num_list_low1,dict_final_den_list_low1)

    def _curve_frame(w, prob_list, den_list, num_list, suffix):
        # Join the three {k: value} dicts of window w into one frame keyed by 'k'.
        prob_df = pd.DataFrame(prob_list[w].items(), columns=['k', 'prob'+suffix])
        den_df = pd.DataFrame(den_list[w].items(), columns=['k', 'den'+suffix])
        num_df = pd.DataFrame(num_list[w].items(), columns=['k', 'num'+suffix])
        return prob_df.merge(den_df.merge(num_df))

    #save on file dictionary each window: concept - year_start
    my_file = 'df_'+topic+'_windows.csv'

    window_frames = []
    for w in range(n_windows):
        if not windows_cond[w]:
            continue

        topic_df = _curve_frame(w, dict_final_list, dict_final_den_list, dict_final_num_list, '')
        topic_high1_df = _curve_frame(w, dict_final_list_high1, dict_final_den_list_high1, dict_final_num_list_high1, '_high1')
        topic_low1_df = _curve_frame(w, dict_final_list_low1, dict_final_den_list_low1, dict_final_num_list_low1, '_low1')

        # outer joins: a value of k may be missing for one of the classes
        topic_df_w = (topic_df.merge(topic_high1_df, how='outer')).merge(topic_low1_df, how='outer')
        topic_df_w.insert(0, 'T_0', start_year + w)
        window_frames.append(topic_df_w)

    # single concatenation; empty frame when no window qualified
    if window_frames:
        topic_df_ = pd.concat(window_frames, ignore_index = True, axis = 0)
    else:
        topic_df_ = pd.DataFrame()

    topic_df_.to_csv(os.path.join(my_path, my_file))

    # added after saving: the per-topic file name already identifies the topic
    topic_df_.insert(0, 'topic', topic)

    return topic_df_

In [None]:
# Driver cell: run Exp1_1_ver3 for every topic and store the stacked result.
my_path = os.path.join(discipline, 'Impact_mean1/Exp1_ver3')
# create the output folder (no error if it is already there)
os.makedirs(my_path, exist_ok=True)

per_topic_frames = []
for topic in topic_list:
    topic_df_top = Exp1_1_ver3(discipline=discipline,topic=topic,my_path=my_path)
    per_topic_frames.append(topic_df_top)

# Concatenate once instead of growing a DataFrame inside the loop.
if per_topic_frames:
    topics_df = pd.concat(per_topic_frames, ignore_index = True, axis = 0)
else:
    topics_df = pd.DataFrame()

my_file = 'df_topic_windows.csv'
topics_df.to_csv(os.path.join(my_path, my_file))

#### Def. impact 2 - papers:topic, cits:all

##### Def. contact 1 - #contacts

In [None]:
def Exp1_2_ver1(discipline,topic,my_path):
    """Compute the per-window curves T(k) for one topic (impact def. 2, contact def. 1).

    For each of the 23 windows (start years 1995, 1996, ...) that is flagged
    in the pickled ``windows_cond`` list, the function:

    1. selects the works published in the 5 years before the window start by
       authors that are active at the window start, plus their co-authors;
    2. builds the work-author bipartite graph and derives the author graph
       with ``get_support_graph_ver1``;
    3. scores every not-yet-active author found in that graph with
       ``get_scores_A_ver1`` / ``get_scores_B_ver1`` (the latter for the
       'top 10%' and 'bottom 10%' classes of active authors, read here from
       the ``Info/Impact_mean2`` folder);
    4. turns the scores into ``{k: fraction}`` dictionaries through
       ``calculation_A`` / ``calculation_B``.

    The per-window results are stacked into one DataFrame, written to
    ``<my_path>/df_<topic>_windows.csv`` and returned.

    Parameters
    ----------
    discipline : str
        Root folder holding the ``Info`` and ``Productivity/Exp1_ver1``
        sub-folders with the pickled intermediate results.
    topic : str
        Topic name; it is the suffix of every pickle file name and part of
        the CSV file name.
    my_path : str
        Output folder for the CSV file (must already exist).

    Returns
    -------
    pandas.DataFrame
        Columns ``topic``, ``T_0`` (window start year), ``k`` and the
        ``prob``/``den``/``num`` triples for all active authors and for the
        ``_high1`` / ``_low1`` classes. The CSV is written *before* the
        ``topic`` column is added, so the file does not contain it.

    Notes
    -----
    Relies on the notebook-level name ``works_authors`` and the helper
    functions listed above.
    """
    n_windows = 23       # number of windows analysed
    start_year = 1995    # start year of window 0

    def _load(folder, prefix):
        # Intermediate results are stored as pickles named '<prefix><topic>'.
        # NOTE: pickle executes code on load -- only read files you produced.
        with open(os.path.join(folder, prefix + topic),"rb") as fp:
            return pickle.load(fp)

    info_dir = os.path.join(discipline, 'Info')
    impact_dir = os.path.join(info_dir, 'Impact_mean2')
    prod_dir = os.path.join(discipline, 'Productivity/Exp1_ver1')

    # Only the inputs actually used below are loaded.
    author_ids_tot_list = _load(info_dir, 'author_ids_tot_list_')
    windows_cond = _load(info_dir, 'windows_cond_')
    active_authors_classes = _load(impact_dir, 'active_authors_classes_')

    #consider consecutive EW and OW (5 years each)
    all_coauthors_list = _load(prod_dir, 'all_coauthors_list_')
    prior_author_ids_list = _load(prod_dir, 'prior_author_ids_list_')
    first_time_authors_list = _load(prod_dir, 'first_time_authors_list_')
    first_time_authors_tot_list = _load(prod_dir, 'first_time_authors_tot_list_')
    not_active_authors_start_list = _load(prod_dir, 'not_active_authors_start_list_')
    works_authors_activation_date = _load(prod_dir, 'works_authors_activation_date_')

    # One entry per window in each list: a dict {k: value}, or NaN for skipped windows.
    dict_final_list, dict_final_den_list, dict_final_num_list = [], [], []
    dict_final_list_high1, dict_final_den_list_high1, dict_final_num_list_high1 = [], [], []
    dict_final_list_low1, dict_final_den_list_low1, dict_final_num_list_low1 = [], [], []

    for w in tqdm(range(n_windows)):

        if not windows_cond[w]:
            # keep the result lists aligned with the window index
            for results in (dict_final_list, dict_final_den_list, dict_final_num_list,
                            dict_final_list_high1, dict_final_den_list_high1, dict_final_num_list_high1,
                            dict_final_list_low1, dict_final_den_list_low1, dict_final_num_list_low1):
                results.append(np.nan)
            continue

        start_year_w = start_year+w
        first_time_authors = first_time_authors_list[w]
        first_time_authors_tot = first_time_authors_tot_list[w]
        active_authors_start, samples_dict_1, _ = active_authors_classes[w]
        high_active_authors1 = samples_dict_1['top 10%']
        low_active_authors1 = samples_dict_1['bottom 10%']

        #keep just works active_authors_start in this period and written in the period
        work_id_active = works_authors_activation_date[works_authors_activation_date.author_id.isin(active_authors_start)]
        work_id_active = work_id_active.query('@start_year_w-5 <= publication_year < @start_year_w', engine='python')
        #add coauthors but not infected
        work_id_active_collab = works_authors[works_authors.work_id.isin(work_id_active.work_id)].query('author_id not in @active_authors_start')
        works_authors_collab = pd.concat([work_id_active,work_id_active_collab]).reset_index(drop=True)

        #bipartite graph work-authors union exposure window
        bip_g = nx.from_pandas_edgelist(
                works_authors_collab[['work_id', 'author_id']],
                source='work_id', target='author_id'
            )

        #graph weight number papers written together
        author_ids_supp = set(works_authors_collab.author_id)
        support_graph_ = get_support_graph_ver1(bip_g, author_ids_supp)
        #dictionary {number exposure start year : list of authors that number}
        not_active_authors_start = not_active_authors_start_list[w]
        authors_isolated = not_active_authors_start - author_ids_supp

        dict_final = {}
        dict_final_high1 = {}
        dict_final_low1 = {}
        for anas in tqdm(author_ids_supp & not_active_authors_start): #for each author not active at the beginning
            n_anas = set(support_graph_.neighbors(anas))

            #A
            dict_final,ego_active_total = get_scores_A_ver1(anas,n_anas, active_authors_start,support_graph_,dict_final)
            #B
            dict_final_high1,dict_final_low1 = get_scores_B_ver1(anas,n_anas,high_active_authors1,low_active_authors1,ego_active_total,dict_final_high1,dict_final_low1)

        #(iii) Define T(k) to be the fraction of these authors that have become active by the time of the second snapshot.
        #dictionary {k : fraction}

        #A
        dict_final_list,dict_final_num_list,dict_final_den_list = calculation_A(w,author_ids_tot_list,all_coauthors_list,first_time_authors,first_time_authors_tot,dict_final,dict_final_list,dict_final_num_list,dict_final_den_list,prior_author_ids_list,authors_isolated)
        #B
        dict_final_list_high1,dict_final_num_list_high1,dict_final_den_list_high1 = calculation_B(first_time_authors,dict_final_high1,dict_final_list_high1,dict_final_num_list_high1,dict_final_den_list_high1)
        dict_final_list_low1,dict_final_num_list_low1,dict_final_den_list_low1 = calculation_B(first_time_authors,dict_final_low1,dict_final_list_low1,dict_final_num_list_low1,dict_final_den_list_low1)

    def _curve_frame(w, prob_list, den_list, num_list, suffix):
        # Join the three {k: value} dicts of window w into one frame keyed by 'k'.
        prob_df = pd.DataFrame(prob_list[w].items(), columns=['k', 'prob'+suffix])
        den_df = pd.DataFrame(den_list[w].items(), columns=['k', 'den'+suffix])
        num_df = pd.DataFrame(num_list[w].items(), columns=['k', 'num'+suffix])
        return prob_df.merge(den_df.merge(num_df))

    #save on file dictionary each window: concept - year_start
    my_file = 'df_'+topic+'_windows.csv'

    window_frames = []
    for w in range(n_windows):
        if not windows_cond[w]:
            continue

        topic_df = _curve_frame(w, dict_final_list, dict_final_den_list, dict_final_num_list, '')
        topic_high1_df = _curve_frame(w, dict_final_list_high1, dict_final_den_list_high1, dict_final_num_list_high1, '_high1')
        topic_low1_df = _curve_frame(w, dict_final_list_low1, dict_final_den_list_low1, dict_final_num_list_low1, '_low1')

        # outer joins: a value of k may be missing for one of the classes
        topic_df_w = (topic_df.merge(topic_high1_df, how='outer')).merge(topic_low1_df, how='outer')
        topic_df_w.insert(0, 'T_0', start_year + w)
        window_frames.append(topic_df_w)

    # single concatenation; empty frame when no window qualified
    if window_frames:
        topic_df_ = pd.concat(window_frames, ignore_index = True, axis = 0)
    else:
        topic_df_ = pd.DataFrame()

    topic_df_.to_csv(os.path.join(my_path, my_file))

    # added after saving: the per-topic file name already identifies the topic
    topic_df_.insert(0, 'topic', topic)

    return topic_df_

In [None]:
# Driver cell: run Exp1_2_ver1 for every topic and store the stacked result.
my_path = os.path.join(discipline, 'Impact_mean2/Exp1_ver1')
# create the output folder (no error if it is already there)
os.makedirs(my_path, exist_ok=True)

per_topic_frames = []
for topic in topic_list:
    topic_df_top = Exp1_2_ver1(discipline=discipline,topic=topic,my_path=my_path)
    per_topic_frames.append(topic_df_top)

# Concatenate once instead of growing a DataFrame inside the loop.
if per_topic_frames:
    topics_df = pd.concat(per_topic_frames, ignore_index = True, axis = 0)
else:
    topics_df = pd.DataFrame()

my_file = 'df_topic_windows.csv'
topics_df.to_csv(os.path.join(my_path, my_file))

##### Def. contact 2 - #papers

In [None]:
def Exp1_2_ver2(discipline,topic,my_path):
    """Build the per-window activation table of ``topic`` with support graph version 2.

    For each of the 23 window start years ``T_0 = 1995 + w`` (``w = 0 .. 22``)
    flagged in ``windows_cond``, the works published in ``[T_0 - 5, T_0)`` by the
    authors in ``active_authors_start`` are joined with their remaining
    co-authors into a work-author bipartite graph.  ``get_support_graph_ver2``
    derives the author graph from it, every not-yet-active author found there is
    scored by ``get_scores_A_ver2`` / ``get_scores_B_ver2``, and
    ``calculation_A`` / ``calculation_B`` update the per-window result lists
    (all active authors, ``'top 10%'`` class, ``'bottom 10%'`` class).

    Parameters
    ----------
    discipline : str
        Root folder containing ``Info``, ``Info/Impact_mean2`` and
        ``Productivity/Exp1_ver1`` with the pickled inputs.
    topic : str
        Concept name; it is the suffix of every input file name and part of the
        output CSV name.
    my_path : str
        Folder where ``df_<topic>_windows.csv`` is written.

    Returns
    -------
    pandas.DataFrame
        Rows of all valid windows with columns ``topic``, ``T_0``, ``k``,
        ``prob``/``den``/``num`` and their ``_high1`` / ``_low1`` variants.
        The CSV is written before the ``topic`` column is inserted, so the file
        does not contain it.

    Notes
    -----
    Relies on names defined elsewhere in the notebook: ``works_concepts``,
    ``works_authors``, ``list_works``, ``get_support_graph_ver2``,
    ``get_scores_A_ver2``, ``get_scores_B_ver2``, ``calculation_A`` and
    ``calculation_B``.  The locals ``topic``, ``start_year_w`` and
    ``active_authors_start`` are referenced by name (``@...``) inside the
    ``DataFrame.query`` strings and must keep these names in this frame.
    """

    def _load(folder, file_name):
        # Inputs are extension-less pickle files.
        with open(os.path.join(folder, file_name), "rb") as fp:
            return pickle.load(fp)

    def _k_frame(mapping, value_column):
        # {k: value} -> two-column frame keyed by 'k'.
        return pd.DataFrame(mapping.items(), columns=['k', value_column])

    def _merge_on_k(prob_map, den_map, num_map, value_columns):
        # Inner-join the three dictionaries of one class on their shared 'k'.
        prob_col, den_col, num_col = value_columns
        den_and_num = _k_frame(den_map, den_col).merge(_k_frame(num_map, num_col))
        return _k_frame(prob_map, prob_col).merge(den_and_num)

    # restrict to topic (the result is not used further down)
    works_concepts_conc = works_concepts.query('concept_name==@topic', engine='python')

    # general information about the topic
    info_dir = os.path.join(discipline, 'Info')
    work_ids_list = _load(info_dir, 'work_ids_list_'+topic)
    author_ids_list = _load(info_dir, 'author_ids_list_'+topic)
    work_ids_tot_list = _load(info_dir, 'work_ids_tot_list_'+topic)
    author_ids_tot_list = _load(info_dir, 'author_ids_tot_list_'+topic)
    windows_cond = _load(info_dir, 'windows_cond_'+topic)

    # classes of the active authors for this impact definition
    classes_dir = os.path.join(info_dir, 'Impact_mean2')
    active_authors_classes = _load(classes_dir, 'active_authors_classes_'+topic)

    # consider consecutive EW and OW (5 years each)
    start_year = 1995
    shared_dir = os.path.join(discipline, 'Productivity/Exp1_ver1')
    all_coauthors_list = _load(shared_dir, 'all_coauthors_list_'+topic)
    active_authors_start_union = _load(shared_dir, 'active_authors_start_union_'+topic)
    active_authors_start_union_list = list(active_authors_start_union)
    prior_work_ids_list = _load(shared_dir, 'prior_work_ids_list_'+topic)
    prior_author_ids_list = _load(shared_dir, 'prior_author_ids_list_'+topic)
    first_time_authors_list = _load(shared_dir, 'first_time_authors_list_'+topic)
    first_time_authors_tot_list = _load(shared_dir, 'first_time_authors_tot_list_'+topic)
    not_active_authors_start_list = _load(shared_dir, 'not_active_authors_start_list_'+topic)
    first_time_authors_union = _load(shared_dir, 'first_time_authors_union_'+topic)
    works_authors_activation_date = _load(shared_dir, 'works_authors_activation_date_'+topic)

    # One entry per window in each list; skipped windows get np.nan.
    prob_all, den_all, num_all = [], [], []
    prob_high, den_high, num_high = [], [], []
    prob_low, den_low, num_low = [], [], []

    for w in tqdm(range(0,23)):

        if not windows_cond[w]:
            # Keep the lists aligned with the window index.
            for results in (prob_all, den_all, num_all,
                            prob_high, den_high, num_high,
                            prob_low, den_low, num_low):
                results.append(np.nan)
            continue

        start_year_w = start_year+w
        all_coauthors = all_coauthors_list[w]
        first_time_authors = first_time_authors_list[w]
        first_time_authors_tot = first_time_authors_tot_list[w]
        active_authors_start, samples_dict_1, n_1 = active_authors_classes[w]
        high_active_authors1 = samples_dict_1['top 10%']
        low_active_authors1 = samples_dict_1['bottom 10%']

        # works of the active authors published inside the 5 years before T_0
        by_active = works_authors_activation_date.author_id.isin(active_authors_start)
        work_id_active = works_authors_activation_date[by_active].query(
            '@start_year_w-5 <= publication_year < @start_year_w', engine='python')
        # co-authors of those works that are not themselves active
        on_active_works = works_authors.work_id.isin(work_id_active.work_id)
        work_id_active_collab = works_authors[on_active_works].query(
            'author_id not in @active_authors_start')
        works_authors_collab = pd.concat(
            [work_id_active, work_id_active_collab]
        ).reset_index(drop=True)

        # bipartite graph work-authors of the exposure window
        bip_g = nx.from_pandas_edgelist(
            works_authors_collab[['work_id', 'author_id']],
            source='work_id', target='author_id'
        )

        # author graph derived from the bipartite one
        author_ids_supp = set(works_authors_collab.author_id)
        support_graph_ = get_support_graph_ver2(bip_g, author_ids_supp,list_works)
        not_active_authors_start = not_active_authors_start_list[w]
        # not-active authors that do not appear in the graph at all
        authors_isolated = not_active_authors_start - author_ids_supp

        scores_all, scores_high, scores_low = {}, {}, {}
        # every author of the graph that is not active at the beginning
        for candidate in tqdm(author_ids_supp & not_active_authors_start):
            neighbours = set(support_graph_.neighbors(candidate))

            #A
            scores_all, ego_active_total = get_scores_A_ver2(
                candidate, neighbours, active_authors_start, support_graph_, scores_all)
            #B
            scores_high, scores_low = get_scores_B_ver2(
                candidate, neighbours, high_active_authors1, low_active_authors1,
                ego_active_total, scores_high, scores_low)

        #(iii) Define T(k) to be the fraction of these authors that have become active by the time of the second snapshot.
        #A
        prob_all, num_all, den_all = calculation_A(
            w, author_ids_tot_list, all_coauthors_list, first_time_authors,
            first_time_authors_tot, scores_all, prob_all, num_all, den_all,
            prior_author_ids_list, authors_isolated)
        #B
        prob_high, num_high, den_high = calculation_B(
            first_time_authors, scores_high, prob_high, num_high, den_high)
        prob_low, num_low, den_low = calculation_B(
            first_time_authors, scores_low, prob_low, num_low, den_low)

    # one CSV per topic with the rows of every valid window
    my_file = 'df_'+topic+'_windows.csv'

    topic_df_ = pd.DataFrame()
    for w in range(0,23):
        if not windows_cond[w]:
            continue

        topic_df = _merge_on_k(prob_all[w], den_all[w], num_all[w],
                               ('prob', 'den', 'num'))
        topic_high1_df = _merge_on_k(prob_high[w], den_high[w], num_high[w],
                                     ('prob_high1', 'den_high1', 'num_high1'))
        topic_low1_df = _merge_on_k(prob_low[w], den_low[w], num_low[w],
                                    ('prob_low1', 'den_low1', 'num_low1'))

        # outer joins keep the k values that only some of the classes have
        topic_df_w = topic_df.merge(topic_high1_df, how='outer')
        topic_df_w = topic_df_w.merge(topic_low1_df, how='outer')
        topic_df_w.insert(0, 'T_0', start_year + w)
        topic_df_ = pd.concat([topic_df_, topic_df_w], ignore_index = True, axis = 0)

    topic_df_.to_csv(os.path.join(my_path, my_file))

    # the topic column is added after saving, so it is only in the returned frame
    topic_df_.insert(0, 'topic', topic)

    return topic_df_

In [None]:
# Output folder for the Exp1_2_ver2 results of the current discipline.
my_path = os.path.join(discipline, 'Impact_mean2/Exp1_ver2')
# exist_ok=True replaces the check-then-create pair and is a no-op when the
# folder is already there.
os.makedirs(my_path, exist_ok=True)

# Gather the per-topic frames first and concatenate once: calling pd.concat
# inside the loop copies every previously collected row at each iteration.
topic_frames = []
for topic in topic_list:
    topic_df_top = Exp1_2_ver2(discipline=discipline,topic=topic,my_path=my_path)
    topic_frames.append(topic_df_top)
# pd.concat refuses an empty list, so fall back to an empty frame when there
# are no topics.
topics_df = pd.concat(topic_frames, ignore_index=True, axis=0) if topic_frames else pd.DataFrame()

# Single CSV holding the rows of every topic.
my_file = 'df_topic_windows.csv'
topics_df.to_csv(os.path.join(my_path, my_file))

##### Def. contact 3 - #coauthors

In [None]:
def Exp1_2_ver3(discipline,topic,my_path):
    """Build the per-window activation table of ``topic`` with support graph version 3.

    For each of the 23 window start years ``T_0 = 1995 + w`` (``w = 0 .. 22``)
    flagged in ``windows_cond``, the works published in ``[T_0 - 5, T_0)`` by the
    authors in ``active_authors_start`` are joined with their remaining
    co-authors into a work-author bipartite graph.  ``get_support_graph_ver3``
    derives the author graph from it, every not-yet-active author found there is
    scored by ``get_scores_A_ver3`` / ``get_scores_B_ver3``, and
    ``calculation_A`` / ``calculation_B`` update the per-window result lists
    (all active authors, ``'top 10%'`` class, ``'bottom 10%'`` class).

    Parameters
    ----------
    discipline : str
        Root folder containing ``Info``, ``Info/Impact_mean2`` and
        ``Productivity/Exp1_ver1`` with the pickled inputs.
    topic : str
        Concept name; it is the suffix of every input file name and part of the
        output CSV name.
    my_path : str
        Folder where ``df_<topic>_windows.csv`` is written.

    Returns
    -------
    pandas.DataFrame
        Rows of all valid windows with columns ``topic``, ``T_0``, ``k``,
        ``prob``/``den``/``num`` and their ``_high1`` / ``_low1`` variants; an
        empty frame (plus the ``topic`` column) when no window is valid.  The
        CSV is written before the ``topic`` column is inserted, so the file does
        not contain it.

    Notes
    -----
    Relies on names defined elsewhere in the notebook: ``works_authors``,
    ``get_support_graph_ver3``, ``get_scores_A_ver3``, ``get_scores_B_ver3``,
    ``calculation_A`` and ``calculation_B``.  The locals ``start_year_w`` and
    ``active_authors_start`` are referenced by name (``@...``) inside the
    ``DataFrame.query`` strings and must keep these names in this frame.
    """
    n_windows = 23
    # consider consecutive EW and OW (5 years each)
    start_year = 1995

    def _load(folder, file_name):
        # NOTE(review): pickle.load can execute arbitrary code contained in the
        # file; only point this at artefacts written by this project.
        with open(os.path.join(folder, file_name), "rb") as fp:
            return pickle.load(fp)

    def _merge_on_k(prob_map, den_map, num_map, suffix):
        # Turn the three {k: value} dictionaries of one class into frames and
        # inner-join them on their shared 'k' column.
        prob_df, den_df, num_df = (
            pd.DataFrame(mapping.items(), columns=['k', label + suffix])
            for label, mapping in (('prob', prob_map), ('den', den_map), ('num', num_map))
        )
        return prob_df.merge(den_df.merge(num_df))

    # Only the artefacts that are actually consumed below are read.
    info_dir = os.path.join(discipline, 'Info')
    author_ids_tot_list = _load(info_dir, 'author_ids_tot_list_'+topic)
    windows_cond = _load(info_dir, 'windows_cond_'+topic)

    classes_dir = os.path.join(info_dir, 'Impact_mean2')
    active_authors_classes = _load(classes_dir, 'active_authors_classes_'+topic)

    shared_dir = os.path.join(discipline, 'Productivity/Exp1_ver1')
    all_coauthors_list = _load(shared_dir, 'all_coauthors_list_'+topic)
    prior_author_ids_list = _load(shared_dir, 'prior_author_ids_list_'+topic)
    first_time_authors_list = _load(shared_dir, 'first_time_authors_list_'+topic)
    first_time_authors_tot_list = _load(shared_dir, 'first_time_authors_tot_list_'+topic)
    not_active_authors_start_list = _load(shared_dir, 'not_active_authors_start_list_'+topic)
    works_authors_activation_date = _load(shared_dir, 'works_authors_activation_date_'+topic)

    # One entry per window in each list; skipped windows get np.nan.
    dict_final_list, dict_final_den_list, dict_final_num_list = [], [], []
    dict_final_list_high1, dict_final_den_list_high1, dict_final_num_list_high1 = [], [], []
    dict_final_list_low1, dict_final_den_list_low1, dict_final_num_list_low1 = [], [], []

    for w in tqdm(range(n_windows)):

        if not windows_cond[w]:
            # Keep the lists aligned with the window index.
            for results in (dict_final_list, dict_final_den_list, dict_final_num_list,
                            dict_final_list_high1, dict_final_den_list_high1, dict_final_num_list_high1,
                            dict_final_list_low1, dict_final_den_list_low1, dict_final_num_list_low1):
                results.append(np.nan)
            continue

        start_year_w = start_year + w
        first_time_authors = first_time_authors_list[w]
        first_time_authors_tot = first_time_authors_tot_list[w]
        # third element of the triple is not needed here
        active_authors_start, samples_dict_1, _ = active_authors_classes[w]
        high_active_authors1 = samples_dict_1['top 10%']
        low_active_authors1 = samples_dict_1['bottom 10%']

        # works of the active authors published inside the 5 years before T_0
        work_id_active = works_authors_activation_date[works_authors_activation_date.author_id.isin(active_authors_start)]
        work_id_active = work_id_active.query('@start_year_w-5 <= publication_year < @start_year_w', engine='python')
        # co-authors of those works that are not themselves active
        work_id_active_collab = works_authors[works_authors.work_id.isin(work_id_active.work_id)].query('author_id not in @active_authors_start')
        works_authors_collab = pd.concat([work_id_active, work_id_active_collab]).reset_index(drop=True)

        # bipartite graph work-authors of the exposure window
        bip_g = nx.from_pandas_edgelist(
            works_authors_collab[['work_id', 'author_id']],
            source='work_id', target='author_id'
        )

        # author graph derived from the bipartite one
        author_ids_supp = set(works_authors_collab.author_id)
        support_graph_ = get_support_graph_ver3(bip_g, author_ids_supp)
        not_active_authors_start = not_active_authors_start_list[w]
        # not-active authors that do not appear in the graph at all
        authors_isolated = not_active_authors_start - author_ids_supp

        dict_final = {}
        dict_final_high1 = {}
        dict_final_low1 = {}
        # every author of the graph that is not active at the beginning
        for anas in tqdm(author_ids_supp & not_active_authors_start):
            n_anas = set(support_graph_.neighbors(anas))

            #A
            # The result must be bound to dict_final (not a misspelled name),
            # because dict_final is what calculation_A receives below.
            # NOTE(review): assumes get_scores_A_ver3 returns the updated
            # dictionary, like the ver1/ver2 helpers do - confirm.
            dict_final = get_scores_A_ver3(anas, n_anas, active_authors_start, dict_final)
            #B
            dict_final_high1, dict_final_low1 = get_scores_B_ver3(
                anas, n_anas, high_active_authors1, low_active_authors1,
                dict_final_high1, dict_final_low1)

        #(iii) Define T(k) to be the fraction of these authors that have become active by the time of the second snapshot.
        #A
        dict_final_list, dict_final_num_list, dict_final_den_list = calculation_A(
            w, author_ids_tot_list, all_coauthors_list, first_time_authors,
            first_time_authors_tot, dict_final, dict_final_list, dict_final_num_list,
            dict_final_den_list, prior_author_ids_list, authors_isolated)
        #B
        dict_final_list_high1, dict_final_num_list_high1, dict_final_den_list_high1 = calculation_B(
            first_time_authors, dict_final_high1, dict_final_list_high1,
            dict_final_num_list_high1, dict_final_den_list_high1)
        dict_final_list_low1, dict_final_num_list_low1, dict_final_den_list_low1 = calculation_B(
            first_time_authors, dict_final_low1, dict_final_list_low1,
            dict_final_num_list_low1, dict_final_den_list_low1)

    # one CSV per topic with the rows of every valid window
    my_file = 'df_'+topic+'_windows.csv'

    window_frames = []
    for w in range(n_windows):
        if not windows_cond[w]:
            continue

        topic_df = _merge_on_k(dict_final_list[w], dict_final_den_list[w], dict_final_num_list[w], '')
        topic_high1_df = _merge_on_k(dict_final_list_high1[w], dict_final_den_list_high1[w],
                                     dict_final_num_list_high1[w], '_high1')
        topic_low1_df = _merge_on_k(dict_final_list_low1[w], dict_final_den_list_low1[w],
                                    dict_final_num_list_low1[w], '_low1')

        # outer joins keep the k values that only some of the classes have
        topic_df_w = topic_df.merge(topic_high1_df, how='outer').merge(topic_low1_df, how='outer')
        topic_df_w.insert(0, 'T_0', start_year + w)
        window_frames.append(topic_df_w)

    # Concatenate once; pd.concat refuses an empty list, hence the fallback.
    if window_frames:
        topic_df_ = pd.concat(window_frames, ignore_index=True, axis=0)
    else:
        topic_df_ = pd.DataFrame()

    topic_df_.to_csv(os.path.join(my_path, my_file))

    # the topic column is added after saving, so it is only in the returned frame
    topic_df_.insert(0, 'topic', topic)

    return topic_df_

In [None]:
# Output folder for the Exp1_2_ver3 results of the current discipline.
my_path = os.path.join(discipline, 'Impact_mean2/Exp1_ver3')
# exist_ok=True replaces the check-then-create pair and is a no-op when the
# folder is already there.
os.makedirs(my_path, exist_ok=True)

# Gather the per-topic frames first and concatenate once: calling pd.concat
# inside the loop copies every previously collected row at each iteration.
topic_frames = []
for topic in topic_list:
    topic_df_top = Exp1_2_ver3(discipline=discipline,topic=topic,my_path=my_path)
    topic_frames.append(topic_df_top)
# pd.concat refuses an empty list, so fall back to an empty frame when there
# are no topics.
topics_df = pd.concat(topic_frames, ignore_index=True, axis=0) if topic_frames else pd.DataFrame()

# Single CSV holding the rows of every topic.
my_file = 'df_topic_windows.csv'
topics_df.to_csv(os.path.join(my_path, my_file))

#### Def. impact 3 - papers:topic, cits:topic

##### Def. contact 1 - #contacts

In [None]:
def Exp1_3_ver1(discipline,topic,my_path):
    """Build the per-window activation table of ``topic`` with support graph version 1.

    Same pipeline as the other ``Exp1_*`` functions, but the author classes are
    read from ``Info/Impact_mean3``.  For each of the 23 window start years
    ``T_0 = 1995 + w`` (``w = 0 .. 22``) flagged in ``windows_cond``, the works
    published in ``[T_0 - 5, T_0)`` by the authors in ``active_authors_start``
    are joined with their remaining co-authors into a work-author bipartite
    graph.  ``get_support_graph_ver1`` derives the author graph from it, every
    not-yet-active author found there is scored by ``get_scores_A_ver1`` /
    ``get_scores_B_ver1``, and ``calculation_A`` / ``calculation_B`` update the
    per-window result lists (all active authors, ``'top 10%'`` class,
    ``'bottom 10%'`` class).

    Parameters
    ----------
    discipline : str
        Root folder containing ``Info``, ``Info/Impact_mean3`` and
        ``Productivity/Exp1_ver1`` with the pickled inputs.
    topic : str
        Concept name; it is the suffix of every input file name and part of the
        output CSV name.
    my_path : str
        Folder where ``df_<topic>_windows.csv`` is written.

    Returns
    -------
    pandas.DataFrame
        Rows of all valid windows with columns ``topic``, ``T_0``, ``k``,
        ``prob``/``den``/``num`` and their ``_high1`` / ``_low1`` variants.
        The CSV is written before the ``topic`` column is inserted, so the file
        does not contain it.

    Notes
    -----
    Relies on names defined elsewhere in the notebook: ``works_concepts``,
    ``works_authors``, ``get_support_graph_ver1``, ``get_scores_A_ver1``,
    ``get_scores_B_ver1``, ``calculation_A`` and ``calculation_B``.  The locals
    ``topic``, ``start_year_w`` and ``active_authors_start`` are referenced by
    name (``@...``) inside the ``DataFrame.query`` strings and must keep these
    names in this frame.
    """

    def _load(folder, file_name):
        # Inputs are extension-less pickle files.
        with open(os.path.join(folder, file_name), "rb") as fp:
            return pickle.load(fp)

    def _k_frame(mapping, value_column):
        # {k: value} -> two-column frame keyed by 'k'.
        return pd.DataFrame(mapping.items(), columns=['k', value_column])

    def _merge_on_k(prob_map, den_map, num_map, value_columns):
        # Inner-join the three dictionaries of one class on their shared 'k'.
        prob_col, den_col, num_col = value_columns
        den_and_num = _k_frame(den_map, den_col).merge(_k_frame(num_map, num_col))
        return _k_frame(prob_map, prob_col).merge(den_and_num)

    # restrict to topic (the result is not used further down)
    works_concepts_conc = works_concepts.query('concept_name==@topic', engine='python')

    # general information about the topic
    info_dir = os.path.join(discipline, 'Info')
    work_ids_list = _load(info_dir, 'work_ids_list_'+topic)
    author_ids_list = _load(info_dir, 'author_ids_list_'+topic)
    work_ids_tot_list = _load(info_dir, 'work_ids_tot_list_'+topic)
    author_ids_tot_list = _load(info_dir, 'author_ids_tot_list_'+topic)
    windows_cond = _load(info_dir, 'windows_cond_'+topic)

    # classes of the active authors for this impact definition
    classes_dir = os.path.join(info_dir, 'Impact_mean3')
    active_authors_classes = _load(classes_dir, 'active_authors_classes_'+topic)

    # consider consecutive EW and OW (5 years each)
    start_year = 1995
    shared_dir = os.path.join(discipline, 'Productivity/Exp1_ver1')
    all_coauthors_list = _load(shared_dir, 'all_coauthors_list_'+topic)
    active_authors_start_union = _load(shared_dir, 'active_authors_start_union_'+topic)
    active_authors_start_union_list = list(active_authors_start_union)
    prior_work_ids_list = _load(shared_dir, 'prior_work_ids_list_'+topic)
    prior_author_ids_list = _load(shared_dir, 'prior_author_ids_list_'+topic)
    first_time_authors_list = _load(shared_dir, 'first_time_authors_list_'+topic)
    first_time_authors_tot_list = _load(shared_dir, 'first_time_authors_tot_list_'+topic)
    not_active_authors_start_list = _load(shared_dir, 'not_active_authors_start_list_'+topic)
    first_time_authors_union = _load(shared_dir, 'first_time_authors_union_'+topic)
    works_authors_activation_date = _load(shared_dir, 'works_authors_activation_date_'+topic)

    # One entry per window in each list; skipped windows get np.nan.
    prob_all, den_all, num_all = [], [], []
    prob_high, den_high, num_high = [], [], []
    prob_low, den_low, num_low = [], [], []

    for w in tqdm(range(0,23)):

        if not windows_cond[w]:
            # Keep the lists aligned with the window index.
            for results in (prob_all, den_all, num_all,
                            prob_high, den_high, num_high,
                            prob_low, den_low, num_low):
                results.append(np.nan)
            continue

        start_year_w = start_year+w
        all_coauthors = all_coauthors_list[w]
        first_time_authors = first_time_authors_list[w]
        first_time_authors_tot = first_time_authors_tot_list[w]
        active_authors_start, samples_dict_1, n_1 = active_authors_classes[w]
        high_active_authors1 = samples_dict_1['top 10%']
        low_active_authors1 = samples_dict_1['bottom 10%']

        # works of the active authors published inside the 5 years before T_0
        by_active = works_authors_activation_date.author_id.isin(active_authors_start)
        work_id_active = works_authors_activation_date[by_active].query(
            '@start_year_w-5 <= publication_year < @start_year_w', engine='python')
        # co-authors of those works that are not themselves active
        on_active_works = works_authors.work_id.isin(work_id_active.work_id)
        work_id_active_collab = works_authors[on_active_works].query(
            'author_id not in @active_authors_start')
        works_authors_collab = pd.concat(
            [work_id_active, work_id_active_collab]
        ).reset_index(drop=True)

        # bipartite graph work-authors of the exposure window
        bip_g = nx.from_pandas_edgelist(
            works_authors_collab[['work_id', 'author_id']],
            source='work_id', target='author_id'
        )

        # author graph derived from the bipartite one
        author_ids_supp = set(works_authors_collab.author_id)
        support_graph_ = get_support_graph_ver1(bip_g, author_ids_supp)
        not_active_authors_start = not_active_authors_start_list[w]
        # not-active authors that do not appear in the graph at all
        authors_isolated = not_active_authors_start - author_ids_supp

        scores_all, scores_high, scores_low = {}, {}, {}
        # every author of the graph that is not active at the beginning
        for candidate in tqdm(author_ids_supp & not_active_authors_start):
            neighbours = set(support_graph_.neighbors(candidate))

            #A
            scores_all, ego_active_total = get_scores_A_ver1(
                candidate, neighbours, active_authors_start, support_graph_, scores_all)
            #B
            scores_high, scores_low = get_scores_B_ver1(
                candidate, neighbours, high_active_authors1, low_active_authors1,
                ego_active_total, scores_high, scores_low)

        #(iii) Define T(k) to be the fraction of these authors that have become active by the time of the second snapshot.
        #A
        prob_all, num_all, den_all = calculation_A(
            w, author_ids_tot_list, all_coauthors_list, first_time_authors,
            first_time_authors_tot, scores_all, prob_all, num_all, den_all,
            prior_author_ids_list, authors_isolated)
        #B
        prob_high, num_high, den_high = calculation_B(
            first_time_authors, scores_high, prob_high, num_high, den_high)
        prob_low, num_low, den_low = calculation_B(
            first_time_authors, scores_low, prob_low, num_low, den_low)

    # one CSV per topic with the rows of every valid window
    my_file = 'df_'+topic+'_windows.csv'

    topic_df_ = pd.DataFrame()
    for w in range(0,23):
        if not windows_cond[w]:
            continue

        topic_df = _merge_on_k(prob_all[w], den_all[w], num_all[w],
                               ('prob', 'den', 'num'))
        topic_high1_df = _merge_on_k(prob_high[w], den_high[w], num_high[w],
                                     ('prob_high1', 'den_high1', 'num_high1'))
        topic_low1_df = _merge_on_k(prob_low[w], den_low[w], num_low[w],
                                    ('prob_low1', 'den_low1', 'num_low1'))

        # outer joins keep the k values that only some of the classes have
        topic_df_w = topic_df.merge(topic_high1_df, how='outer')
        topic_df_w = topic_df_w.merge(topic_low1_df, how='outer')
        topic_df_w.insert(0, 'T_0', start_year + w)
        topic_df_ = pd.concat([topic_df_, topic_df_w], ignore_index = True, axis = 0)

    topic_df_.to_csv(os.path.join(my_path, my_file))

    # the topic column is added after saving, so it is only in the returned frame
    topic_df_.insert(0, 'topic', topic)

    return topic_df_

In [None]:
# Output folder for the Exp1_3_ver1 results of the current discipline.
my_path = os.path.join(discipline, 'Impact_mean3/Exp1_ver1')
# exist_ok=True replaces the check-then-create pair and is a no-op when the
# folder is already there.
os.makedirs(my_path, exist_ok=True)

# Gather the per-topic frames first and concatenate once: calling pd.concat
# inside the loop copies every previously collected row at each iteration.
topic_frames = []
for topic in topic_list:
    topic_df_top = Exp1_3_ver1(discipline=discipline,topic=topic,my_path=my_path)
    topic_frames.append(topic_df_top)
# pd.concat refuses an empty list, so fall back to an empty frame when there
# are no topics.
topics_df = pd.concat(topic_frames, ignore_index=True, axis=0) if topic_frames else pd.DataFrame()

# Single CSV holding the rows of every topic.
my_file = 'df_topic_windows.csv'
topics_df.to_csv(os.path.join(my_path, my_file))

##### Def. contact 2 - #papers

In [None]:
def Exp1_3_ver2(discipline,topic,my_path):
    """Run experiment 1 (Impact_mean3 classes, contact definition 2) for one topic.

    For every window ``w`` flagged in the pickled ``windows_cond`` list the
    function

    1. collects the works published in ``[T_0 - 5, T_0)`` (``T_0 = 1995 + w``)
       by the authors listed as active at the start of the window, together
       with the co-authors of those works,
    2. builds the work/author bipartite graph and derives the support graph
       with ``get_support_graph_ver2``,
    3. scores every author of that graph who is not active at the start with
       ``get_scores_A_ver2`` / ``get_scores_B_ver2``,
    4. turns the scores into per-``k`` dictionaries with ``calculation_A`` /
       ``calculation_B``.

    The per-window dictionaries are then flattened into one table that is
    written to ``<my_path>/df_<topic>_windows.csv`` (without the ``topic``
    column) and returned (with it).

    The function relies on notebook-level names that are not parameters:
    ``works_authors``, ``list_works`` and the helper functions named above.
    NOTE(review): ``list_works`` is not defined inside this function; it must
    exist as a notebook global when the function is called.

    Parameters
    ----------
    discipline : str
        Root folder of the pickled inputs (``<discipline>/Info``,
        ``<discipline>/Info/Impact_mean3`` and
        ``<discipline>/Productivity/Exp1_ver1``).
    topic : str
        Concept name; used as suffix of every pickle file name and in the
        name of the output CSV.
    my_path : str
        Existing folder that receives ``df_<topic>_windows.csv``.

    Returns
    -------
    pandas.DataFrame
        One row per (window, k) with columns ``topic``, ``T_0``, ``k``,
        ``prob``, ``den``, ``num`` and the ``*_high1`` / ``*_low1`` variants.
        Empty when no window is flagged.
    """
    start_year = 1995  # T_0 of window 0; consecutive EW and OW of 5 years each
    n_windows = 23     # windows w = 0..22

    def load(folder, stem):
        # These pickles are written by earlier cells of this notebook;
        # pickle.load must never be pointed at untrusted files.
        with open(os.path.join(folder, stem + topic), "rb") as fp:
            return pickle.load(fp)

    def window_table(w, prob_list, den_list, num_list, suffix):
        # One row per k with the prob/den/num values of window w (inner join on k).
        prob_df = pd.DataFrame(prob_list[w].items(), columns=['k', 'prob' + suffix])
        den_df = pd.DataFrame(den_list[w].items(), columns=['k', 'den' + suffix])
        num_df = pd.DataFrame(num_list[w].items(), columns=['k', 'num' + suffix])
        return prob_df.merge(den_df.merge(num_df))

    # Only the inputs that are actually consumed below are loaded.
    info_dir = os.path.join(discipline, 'Info')
    author_ids_tot_list = load(info_dir, 'author_ids_tot_list_')
    windows_cond = load(info_dir, 'windows_cond_')
    active_authors_classes = load(os.path.join(info_dir, 'Impact_mean3'), 'active_authors_classes_')

    prod_dir = os.path.join(discipline, 'Productivity/Exp1_ver1')
    all_coauthors_list = load(prod_dir, 'all_coauthors_list_')
    prior_author_ids_list = load(prod_dir, 'prior_author_ids_list_')
    first_time_authors_list = load(prod_dir, 'first_time_authors_list_')
    first_time_authors_tot_list = load(prod_dir, 'first_time_authors_tot_list_')
    not_active_authors_start_list = load(prod_dir, 'not_active_authors_start_list_')
    works_authors_activation_date = load(prod_dir, 'works_authors_activation_date_')

    dict_final_list = []
    dict_final_den_list = []
    dict_final_num_list = []
    dict_final_list_high1 = []
    dict_final_den_list_high1 = []
    dict_final_num_list_high1 = []
    dict_final_list_low1 = []
    dict_final_den_list_low1 = []
    dict_final_num_list_low1 = []
    for w in tqdm(range(n_windows)):

        if not windows_cond[w]:
            # Placeholder for a skipped window so every list still gets one
            # entry for this w (presumably calculation_A / calculation_B
            # append one entry per processed window -- confirm).
            for skipped in (
                dict_final_list, dict_final_den_list, dict_final_num_list,
                dict_final_list_high1, dict_final_den_list_high1, dict_final_num_list_high1,
                dict_final_list_low1, dict_final_den_list_low1, dict_final_num_list_low1,
            ):
                skipped.append(np.nan)
            continue

        # NOTE: start_year_w and active_authors_start are referenced by name
        # (via '@') inside the query strings below, so they must stay locals
        # of this function.
        start_year_w = start_year + w
        first_time_authors = first_time_authors_list[w]
        first_time_authors_tot = first_time_authors_tot_list[w]
        active_authors_start, samples_dict_1, _n_1 = active_authors_classes[w]
        high_active_authors1 = samples_dict_1['top 10%']
        low_active_authors1 = samples_dict_1['bottom 10%']

        # works of the authors active at the start, published in [T_0 - 5, T_0)
        work_id_active = works_authors_activation_date[works_authors_activation_date.author_id.isin(active_authors_start)]
        work_id_active = work_id_active.query('@start_year_w-5 <= publication_year < @start_year_w', engine='python')
        # add the co-authors of those works who are not in active_authors_start
        work_id_active_collab = works_authors[works_authors.work_id.isin(work_id_active.work_id)].query('author_id not in @active_authors_start')
        works_authors_collab = pd.concat([work_id_active, work_id_active_collab]).reset_index(drop=True)

        # bipartite work-author graph of the exposure window
        bip_g = nx.from_pandas_edgelist(
            works_authors_collab[['work_id', 'author_id']],
            source='work_id', target='author_id'
        )

        # author-author support graph derived from the bipartite graph
        author_ids_supp = set(works_authors_collab.author_id)
        support_graph_ = get_support_graph_ver2(bip_g, author_ids_supp, list_works)
        not_active_authors_start = not_active_authors_start_list[w]
        # not-active authors that do not appear in the support graph at all
        authors_isolated = not_active_authors_start - author_ids_supp

        dict_final = {}
        dict_final_high1 = {}
        dict_final_low1 = {}
        for anas in tqdm(author_ids_supp & not_active_authors_start):  # each author not active at the beginning
            n_anas = set(support_graph_.neighbors(anas))

            # A: exposure to all active authors
            dict_final, ego_active_total = get_scores_A_ver2(anas, n_anas, active_authors_start, support_graph_, dict_final)
            # B: exposure to the 'top 10%' / 'bottom 10%' active authors
            dict_final_high1, dict_final_low1 = get_scores_B_ver2(anas, n_anas, high_active_authors1, low_active_authors1, ego_active_total, dict_final_high1, dict_final_low1)

        # (iii) Define T(k) to be the fraction of these authors that have
        # become active by the time of the second snapshot: {k: fraction}.
        # A
        dict_final_list, dict_final_num_list, dict_final_den_list = calculation_A(w, author_ids_tot_list, all_coauthors_list, first_time_authors, first_time_authors_tot, dict_final, dict_final_list, dict_final_num_list, dict_final_den_list, prior_author_ids_list, authors_isolated)
        # B
        dict_final_list_high1, dict_final_num_list_high1, dict_final_den_list_high1 = calculation_B(first_time_authors, dict_final_high1, dict_final_list_high1, dict_final_num_list_high1, dict_final_den_list_high1)
        dict_final_list_low1, dict_final_num_list_low1, dict_final_den_list_low1 = calculation_B(first_time_authors, dict_final_low1, dict_final_list_low1, dict_final_num_list_low1, dict_final_den_list_low1)

    # Flatten the per-window dictionaries into one table (topic - T_0 - k).
    window_frames = []
    for w in range(n_windows):
        if not windows_cond[w]:
            continue

        topic_df = window_table(w, dict_final_list, dict_final_den_list, dict_final_num_list, '')
        topic_high1_df = window_table(w, dict_final_list_high1, dict_final_den_list_high1, dict_final_num_list_high1, '_high1')
        topic_low1_df = window_table(w, dict_final_list_low1, dict_final_den_list_low1, dict_final_num_list_low1, '_low1')

        # outer joins keep every k seen in any of the three tables
        topic_df_w = (topic_df.merge(topic_high1_df, how='outer')).merge(topic_low1_df, how='outer')
        topic_df_w.insert(0, 'T_0', start_year + w)
        window_frames.append(topic_df_w)

    # Single concat instead of growing the frame window by window.
    topic_df_ = pd.concat(window_frames, ignore_index=True, axis=0) if window_frames else pd.DataFrame()

    # The CSV is written before the 'topic' column is added.
    topic_df_.to_csv(os.path.join(my_path, 'df_' + topic + '_windows.csv'))

    topic_df_.insert(0, 'topic', topic)

    return topic_df_

In [None]:
# Run Exp1_3_ver2 for every topic and stack the per-topic tables.
my_path = os.path.join(discipline, 'Impact_mean3/Exp1_ver2')
# Create the output folder; exist_ok avoids the check-then-create race.
os.makedirs(my_path, exist_ok=True)
topics_df = pd.DataFrame()
for topic in topic_list:
    # Each call returns the topic's table with a leading 'topic' column.
    topic_df_top = Exp1_3_ver2(discipline=discipline, topic=topic, my_path=my_path)
    topics_df = pd.concat([topics_df, topic_df_top], ignore_index=True, axis=0)
# One CSV holding the rows of all topics.
my_file = 'df_topic_windows.csv'
topics_df.to_csv(os.path.join(my_path, my_file))

##### Def. contact 3 - #coauthors

In [None]:
def Exp1_3_ver3(discipline,topic,my_path):
    """Run experiment 1 (Impact_mean3 classes, contact definition 3) for one topic.

    For every window ``w`` flagged in the pickled ``windows_cond`` list the
    function

    1. collects the works published in ``[T_0 - 5, T_0)`` (``T_0 = 1995 + w``)
       by the authors listed as active at the start of the window, together
       with the co-authors of those works,
    2. builds the work/author bipartite graph and derives the support graph
       with ``get_support_graph_ver3``,
    3. scores every author of that graph who is not active at the start with
       ``get_scores_A_ver3`` / ``get_scores_B_ver3``,
    4. turns the scores into per-``k`` dictionaries with ``calculation_A`` /
       ``calculation_B``.

    The per-window dictionaries are then flattened into one table that is
    written to ``<my_path>/df_<topic>_windows.csv`` (without the ``topic``
    column) and returned (with it).

    The function relies on the notebook-level ``works_authors`` table and on
    the helper functions named above, none of which are parameters.

    Parameters
    ----------
    discipline : str
        Root folder of the pickled inputs (``<discipline>/Info``,
        ``<discipline>/Info/Impact_mean3`` and
        ``<discipline>/Productivity/Exp1_ver1``).
    topic : str
        Concept name; used as suffix of every pickle file name and in the
        name of the output CSV.
    my_path : str
        Existing folder that receives ``df_<topic>_windows.csv``.

    Returns
    -------
    pandas.DataFrame
        One row per (window, k) with columns ``topic``, ``T_0``, ``k``,
        ``prob``, ``den``, ``num`` and the ``*_high1`` / ``*_low1`` variants.
        Empty when no window is flagged.
    """
    start_year = 1995  # T_0 of window 0; consecutive EW and OW of 5 years each
    n_windows = 23     # windows w = 0..22

    def load(folder, stem):
        # These pickles are written by earlier cells of this notebook;
        # pickle.load must never be pointed at untrusted files.
        with open(os.path.join(folder, stem + topic), "rb") as fp:
            return pickle.load(fp)

    def window_table(w, prob_list, den_list, num_list, suffix):
        # One row per k with the prob/den/num values of window w (inner join on k).
        prob_df = pd.DataFrame(prob_list[w].items(), columns=['k', 'prob' + suffix])
        den_df = pd.DataFrame(den_list[w].items(), columns=['k', 'den' + suffix])
        num_df = pd.DataFrame(num_list[w].items(), columns=['k', 'num' + suffix])
        return prob_df.merge(den_df.merge(num_df))

    # Only the inputs that are actually consumed below are loaded.
    info_dir = os.path.join(discipline, 'Info')
    author_ids_tot_list = load(info_dir, 'author_ids_tot_list_')
    windows_cond = load(info_dir, 'windows_cond_')
    active_authors_classes = load(os.path.join(info_dir, 'Impact_mean3'), 'active_authors_classes_')

    prod_dir = os.path.join(discipline, 'Productivity/Exp1_ver1')
    all_coauthors_list = load(prod_dir, 'all_coauthors_list_')
    prior_author_ids_list = load(prod_dir, 'prior_author_ids_list_')
    first_time_authors_list = load(prod_dir, 'first_time_authors_list_')
    first_time_authors_tot_list = load(prod_dir, 'first_time_authors_tot_list_')
    not_active_authors_start_list = load(prod_dir, 'not_active_authors_start_list_')
    works_authors_activation_date = load(prod_dir, 'works_authors_activation_date_')

    dict_final_list = []
    dict_final_den_list = []
    dict_final_num_list = []
    dict_final_list_high1 = []
    dict_final_den_list_high1 = []
    dict_final_num_list_high1 = []
    dict_final_list_low1 = []
    dict_final_den_list_low1 = []
    dict_final_num_list_low1 = []
    for w in tqdm(range(n_windows)):

        if not windows_cond[w]:
            # Placeholder for a skipped window so every list still gets one
            # entry for this w (presumably calculation_A / calculation_B
            # append one entry per processed window -- confirm).
            for skipped in (
                dict_final_list, dict_final_den_list, dict_final_num_list,
                dict_final_list_high1, dict_final_den_list_high1, dict_final_num_list_high1,
                dict_final_list_low1, dict_final_den_list_low1, dict_final_num_list_low1,
            ):
                skipped.append(np.nan)
            continue

        # NOTE: start_year_w and active_authors_start are referenced by name
        # (via '@') inside the query strings below, so they must stay locals
        # of this function.
        start_year_w = start_year + w
        first_time_authors = first_time_authors_list[w]
        first_time_authors_tot = first_time_authors_tot_list[w]
        active_authors_start, samples_dict_1, _n_1 = active_authors_classes[w]
        high_active_authors1 = samples_dict_1['top 10%']
        low_active_authors1 = samples_dict_1['bottom 10%']

        # works of the authors active at the start, published in [T_0 - 5, T_0)
        work_id_active = works_authors_activation_date[works_authors_activation_date.author_id.isin(active_authors_start)]
        work_id_active = work_id_active.query('@start_year_w-5 <= publication_year < @start_year_w', engine='python')
        # add the co-authors of those works who are not in active_authors_start
        work_id_active_collab = works_authors[works_authors.work_id.isin(work_id_active.work_id)].query('author_id not in @active_authors_start')
        works_authors_collab = pd.concat([work_id_active, work_id_active_collab]).reset_index(drop=True)

        # bipartite work-author graph of the exposure window
        bip_g = nx.from_pandas_edgelist(
            works_authors_collab[['work_id', 'author_id']],
            source='work_id', target='author_id'
        )

        # author-author support graph derived from the bipartite graph
        author_ids_supp = set(works_authors_collab.author_id)
        support_graph_ = get_support_graph_ver3(bip_g, author_ids_supp)
        not_active_authors_start = not_active_authors_start_list[w]
        # not-active authors that do not appear in the support graph at all
        authors_isolated = not_active_authors_start - author_ids_supp

        dict_final = {}
        dict_final_high1 = {}
        dict_final_low1 = {}
        for anas in tqdm(author_ids_supp & not_active_authors_start):  # each author not active at the beginning
            n_anas = set(support_graph_.neighbors(anas))

            # A: exposure to all active authors.  The result is rebound to
            # dict_final, mirroring how the high/low dictionaries are handled
            # (assumes get_scores_A_ver3 returns the updated dictionary --
            # confirm against its definition).
            dict_final = get_scores_A_ver3(anas, n_anas, active_authors_start, dict_final)
            # B: exposure to the 'top 10%' / 'bottom 10%' active authors
            dict_final_high1, dict_final_low1 = get_scores_B_ver3(anas, n_anas, high_active_authors1, low_active_authors1, dict_final_high1, dict_final_low1)

        # (iii) Define T(k) to be the fraction of these authors that have
        # become active by the time of the second snapshot: {k: fraction}.
        # A
        dict_final_list, dict_final_num_list, dict_final_den_list = calculation_A(w, author_ids_tot_list, all_coauthors_list, first_time_authors, first_time_authors_tot, dict_final, dict_final_list, dict_final_num_list, dict_final_den_list, prior_author_ids_list, authors_isolated)
        # B
        dict_final_list_high1, dict_final_num_list_high1, dict_final_den_list_high1 = calculation_B(first_time_authors, dict_final_high1, dict_final_list_high1, dict_final_num_list_high1, dict_final_den_list_high1)
        dict_final_list_low1, dict_final_num_list_low1, dict_final_den_list_low1 = calculation_B(first_time_authors, dict_final_low1, dict_final_list_low1, dict_final_num_list_low1, dict_final_den_list_low1)

    # Flatten the per-window dictionaries into one table (topic - T_0 - k).
    window_frames = []
    for w in range(n_windows):
        if not windows_cond[w]:
            continue

        topic_df = window_table(w, dict_final_list, dict_final_den_list, dict_final_num_list, '')
        topic_high1_df = window_table(w, dict_final_list_high1, dict_final_den_list_high1, dict_final_num_list_high1, '_high1')
        topic_low1_df = window_table(w, dict_final_list_low1, dict_final_den_list_low1, dict_final_num_list_low1, '_low1')

        # outer joins keep every k seen in any of the three tables
        topic_df_w = (topic_df.merge(topic_high1_df, how='outer')).merge(topic_low1_df, how='outer')
        topic_df_w.insert(0, 'T_0', start_year + w)
        window_frames.append(topic_df_w)

    # Single concat instead of growing the frame window by window.
    topic_df_ = pd.concat(window_frames, ignore_index=True, axis=0) if window_frames else pd.DataFrame()

    # The CSV is written before the 'topic' column is added.
    topic_df_.to_csv(os.path.join(my_path, 'df_' + topic + '_windows.csv'))

    topic_df_.insert(0, 'topic', topic)

    return topic_df_

In [None]:
# Run Exp1_3_ver3 for every topic and stack the per-topic tables.
my_path = os.path.join(discipline, 'Impact_mean3/Exp1_ver3')
# Create the output folder; exist_ok avoids the check-then-create race.
os.makedirs(my_path, exist_ok=True)
topics_df = pd.DataFrame()
for topic in topic_list:
    # Each call returns the topic's table with a leading 'topic' column.
    topic_df_top = Exp1_3_ver3(discipline=discipline, topic=topic, my_path=my_path)
    topics_df = pd.concat([topics_df, topic_df_top], ignore_index=True, axis=0)
# One CSV holding the rows of all topics.
my_file = 'df_topic_windows.csv'
topics_df.to_csv(os.path.join(my_path, my_file))

## CALCULATIONS

In [None]:
# Select the windows that satisfy a minimum number of papers with the concept
# in both the exposure window (EW) and the observation window (OW).
from itertools import compress
start_year = 1995
end_year = 2017
years_list = list(range(start_year,end_year+1)) # candidate T_0 values, 1995..2017 inclusive

def windows_selection(topic,my_path,years_list,N):
    """Pickle the entries of *years_list* whose windows hold enough papers.

    Entry ``w`` of *years_list* is paired with two consecutive 5-slot slices
    of the pickled ``work_ids_list_<topic>``: the exposure window (EW)
    ``work_ids_list[w:w+5]`` and the observation window (OW)
    ``work_ids_list[w+5:w+10]``.  The entry is kept when the union of work
    ids in the EW and the union in the OW both contain at least *N* ids.
    Slices that run past the end of ``work_ids_list`` simply contribute
    fewer (possibly zero) ids.

    The number of windows examined follows ``len(years_list)`` rather than a
    fixed count, so a longer or shorter year list is handled consistently.

    Parameters
    ----------
    topic : str
        Concept name; suffix of the input and output pickle file names.
    my_path : str
        Folder containing ``work_ids_list_<topic>``; also receives the
        output ``windows_list_<topic>``.
    years_list : iterable
        Candidate window labels (T_0), one per window index ``w``.
    N : int
        Minimum number of distinct work ids required in each of EW and OW.

    Returns
    -------
    None
        The selected labels are written to ``<my_path>/windows_list_<topic>``.
    """
    # Pickle written by an earlier cell of this notebook; never load
    # untrusted files with pickle.
    with open(os.path.join(my_path, 'work_ids_list_' + topic), "rb") as fp:
        work_ids_list = pickle.load(fp)

    window_len = 5  # consecutive EW and OW, 5 slots each
    windows_list = []
    for w, year in enumerate(years_list):
        ew_work_ids = set().union(*work_ids_list[w:w + window_len])
        ow_work_ids = set().union(*work_ids_list[w + window_len:w + 2 * window_len])
        # keep only windows with at least N papers in both EW and OW
        if len(ew_work_ids) >= N and len(ow_work_ids) >= N:
            windows_list.append(year)

    with open(os.path.join(my_path, 'windows_list_' + topic), "wb") as fp:
        pickle.dump(windows_list, fp)

In [None]:
N = 3000 # threshold: minimum number of papers required in each of EW and OW
my_path = os.path.join(discipline, 'Info')
# run windows_selection for every topic; each call writes windows_list_<topic> into my_path
for topic in topic_list:        
    windows_selection(topic=topic,my_path=my_path,years_list=years_list,N=N) 

In [None]:
# Result folders to post-process: every measure folder crossed with the
# three Exp1 versions, in the order Productivity, Impact_mean1..3 / ver1..3.
my_path_list = [
    os.path.join(discipline, f'{measure}/Exp1_ver{version}')
    for measure in ('Productivity', 'Impact_mean1', 'Impact_mean2', 'Impact_mean3')
    for version in (1, 2, 3)
]

In [None]:
#union results
for my_path in my_path_list:
    topics_df = pd.DataFrame();
    for topic in topic_list: 
        my_file = 'df_'+topic+'_windows.csv'
        topic_df_top = pd.read_csv(os.path.join(my_path, my_file),index_col=0) 
        topic_df_top.insert(0, 'topic', topic)
        topics_df = pd.concat([topics_df, topic_df_top], ignore_index = True, axis = 0)  
    my_file = 'df_topic_windows.csv'    
    topics_df.to_csv(os.path.join(my_path, my_file))

In [None]:
def Exp1_cumulative(dict_final_num_list,dict_final_den_list,i,dict_final_num_list_cum,dict_final_list_cum,dict_final_den_list_cum):
    """Append the reverse-cumulative num/den/prob dictionaries of window *i*.

    The ``{k: num}`` and ``{k: den}`` dictionaries of window *i* are joined on
    ``k``.  For each row, ``num_cum`` / ``den_cum`` is the sum of ``num`` /
    ``den`` over that row and all rows after it (row order of the joined
    table), and ``prob_cum`` is ``num_cum / den_cum``.  The three resulting
    ``{k: value}`` dictionaries are appended to the ``*_cum`` lists passed in.

    Returns
    -------
    tuple
        ``(dict_final_list_cum, dict_final_num_list_cum,
        dict_final_den_list_cum)`` -- the same list objects, each one entry
        longer.
    """
    counts = pd.DataFrame(dict_final_num_list[i].items(), columns=['k', 'num']).merge(
        pd.DataFrame(dict_final_den_list[i].items(), columns=['k', 'den'])
    )

    # Accumulate from the last row upwards; assigning the result back aligns
    # on the index, so every row receives the total of itself and the rows
    # below it.
    bottom_up = counts.iloc[::-1]
    for column in ('num', 'den'):
        counts[column + '_cum'] = bottom_up[column].cumsum()
    counts['prob_cum'] = counts['num_cum'] / counts['den_cum']

    keys = counts['k']
    dict_final_num_list_cum.append(dict(zip(keys, counts['num_cum'])))
    dict_final_den_list_cum.append(dict(zip(keys, counts['den_cum'])))
    dict_final_list_cum.append(dict(zip(keys, counts['prob_cum'])))

    return dict_final_list_cum,dict_final_num_list_cum,dict_final_den_list_cum

In [None]:
def Exp1_calculation(dict_final_list_cum,dict_final_den_list_cum,dict_final_num_list_cum,num_windows_concept,dict_final_list_mean,dict_final_list_std,dict_final_den_list_mean,dict_final_den_list_std,dict_final_num_list_mean,dict_final_num_list_std,j):
    """Store the NaN-aware mean and scaled std of key *j* across windows.

    For each of the three lists of per-window dictionaries (prob, den, num)
    the values stored under key *j* are collected; their ``np.nanmean`` goes
    into the matching ``*_mean`` dictionary and
    ``np.nanstd / sqrt(num_windows_concept)`` into the matching ``*_std``
    dictionary, both under key *j*.

    Returns
    -------
    tuple
        The six output dictionaries, in the order prob mean, prob std,
        den mean, den std, num mean, num std (the objects passed in).
    """
    targets = (
        (dict_final_list_cum, dict_final_list_mean, dict_final_list_std),
        (dict_final_den_list_cum, dict_final_den_list_mean, dict_final_den_list_std),
        (dict_final_num_list_cum, dict_final_num_list_mean, dict_final_num_list_std),
    )
    for per_window, mean_out, std_out in targets:
        values_j = [window[j] for window in per_window]
        mean_out[j] = np.nanmean(values_j)
        # note: the divisor is the window count passed in, not the number of non-NaN values
        std_out[j] = np.nanstd(values_j)/sqrt(num_windows_concept)

    return dict_final_list_mean,dict_final_list_std,dict_final_den_list_mean,dict_final_den_list_std,dict_final_num_list_mean,dict_final_num_list_std

In [None]:
#average over the selected windows
def Exp1_stat(df_topics,topic,my_path):
    """Average the per-window tables of one topic over its selected windows.

    The rows of *df_topics* belonging to *topic* are split by ``T_0`` using
    the window list pickled at ``<discipline>/Info/windows_list_<topic>``
    (``discipline`` is a notebook-level global).  For each of the nine value
    columns (``prob``/``den``/``num`` and their ``_high1`` / ``_low1``
    variants) and each ``k`` in ``0..max k`` the function computes
    ``np.nanmean`` across windows and ``np.nanstd / sqrt(number of selected
    windows)``.  A ``k`` missing from a window counts as NaN.

    Parameters
    ----------
    df_topics : pandas.DataFrame
        Must provide the columns ``topic``, ``T_0``, ``k`` and the nine
        value columns listed above.
    topic : str
        Concept to summarise.
    my_path : str
        Not used by this function.

    Returns
    -------
    pandas.DataFrame
        One row per ``k`` with columns ``topic``, ``k`` and, for every value
        column, ``<measure>_mean<group>`` and ``<measure>_std<group>``.
    """
    groups = ('', '_high1', '_low1')
    measures = ('prob', 'den', 'num')

    topic_rows = df_topics.query('topic==@topic').reset_index()  # rows of this concept

    # Consider just the windows selected by the minimum-papers condition.
    info_dir = os.path.join(discipline, 'Info')
    with open(os.path.join(info_dir, 'windows_list_'+topic),"rb") as fp:
        selected_years = pickle.load(fp)
    n_selected = len(selected_years)

    # per_window[column] -> one {k: value} dictionary per selected window
    per_window = {measure + group: [] for group in groups for measure in measures}
    for year in selected_years:
        year_rows = topic_rows.query('T_0==@year').reset_index()
        for column, window_dicts in per_window.items():
            window_dicts.append(dict(zip(year_rows.k, year_rows[column])))

    # Largest k seen in any window (taken from the overall 'prob' dictionaries).
    max_keys = 0
    for window_probs in per_window['prob']:
        max_keys = max(max_keys, max(window_probs.keys()))

    # Pad every window with NaN for the k values it does not have.
    for w in range(n_selected):
        for j in range(max_keys + 1):
            for group in groups:
                if j not in per_window['prob' + group][w]:
                    for measure in measures:
                        per_window[measure + group][w][j] = np.nan

    # summary[output column] -> {k: statistic}; insertion order fixes the
    # column order of the returned table.
    summary = {}
    for group in groups:
        for measure in measures:
            summary[measure + '_mean' + group] = {}
            summary[measure + '_std' + group] = {}

    for j in range(max_keys + 1):
        for group in groups:
            for measure in measures:
                values_j = [window[j] for window in per_window[measure + group]]
                summary[measure + '_mean' + group][j] = np.nanmean(values_j)
                summary[measure + '_std' + group][j] = np.nanstd(values_j)/sqrt(n_selected)

    # Join the per-column tables on k, left to right.
    topic_df_ = None
    for column, by_k in summary.items():
        column_df = pd.DataFrame(by_k.items(), columns=['k', column])
        topic_df_ = column_df if topic_df_ is None else topic_df_.merge(column_df)

    # Tag the rows so the tables of several concepts can be stacked.
    topic_df_.insert(0, 'topic', topic)
    return topic_df_

In [None]:
#cumulative values for each window, saved for the p-value calculation
def Exp1_stat_cum_windows(df_topics,topic,my_path):
    df_topic = df_topics.query('topic==@topic').reset_index() #concept
    #df_topic = df_topic.drop_duplicates(subset=['T_0','k']) #drop duplicate windows

    #download all dictionaries
    dict_final_list = []
    dict_final_den_list = []
    dict_final_num_list = []
    dict_final_list_high1 = []
    dict_final_den_list_high1 = []
    dict_final_num_list_high1 = []
    dict_final_list_low1 = []
    dict_final_den_list_low1 = []
    dict_final_num_list_low1 = []
    my_path2 = os.path.join(discipline, 'Info')
    my_file = 'windows_list_'+topic
    with open(os.path.join(my_path2, my_file),"rb") as fp:
        windows_list = pickle.load(fp)
    start_year_window_list = windows_list #consider just windows selected by condition
    #start_year_window_list = time_periods[topic]
    #start_year_window_list = list(set(df_topic.T_0))
    num_windows_topic = len(start_year_window_list) #number windows
    for w in range(0,num_windows_topic): #each window
        start_year_window = start_year_window_list[w]
        df_topic_w = df_topic.query('T_0==@start_year_window').reset_index()
        
        dict_final_list.append(dict(zip(df_topic_w.k, df_topic_w.prob))) 
        dict_final_den_list.append(dict(zip(df_topic_w.k, df_topic_w.den)))
        dict_final_num_list.append(dict(zip(df_topic_w.k, df_topic_w.num)))
        dict_final_list_high1.append(dict(zip(df_topic_w.k, df_topic_w.prob_high1)))
        dict_final_den_list_high1.append(dict(zip(df_topic_w.k, df_topic_w.den_high1)))
        dict_final_num_list_high1.append(dict(zip(df_topic_w.k, df_topic_w.num_high1)))
        dict_final_list_low1.append(dict(zip(df_topic_w.k, df_topic_w.prob_low1)))
        dict_final_den_list_low1.append(dict(zip(df_topic_w.k, df_topic_w.den_low1)))
        dict_final_num_list_low1.append(dict(zip(df_topic_w.k, df_topic_w.num_low1)))
        
    #add missing keys
    max_keys=0
    for w in range(0,num_windows_topic):
        max_keys = max(max_keys,max(dict_final_list[w].keys()))
    for w in range(0,num_windows_topic):
        for j in range(0,max_keys+1):
            if j not in dict_final_list[w].keys():
                dict_final_list[w][j] = np.nan 
                dict_final_den_list[w][j] = np.nan
                dict_final_num_list[w][j] = np.nan
            if j not in dict_final_list_high1[w].keys():
                dict_final_list_high1[w][j] = np.nan 
                dict_final_den_list_high1[w][j] = np.nan
                dict_final_num_list_high1[w][j] = np.nan
            if j not in dict_final_list_low1[w].keys():
                dict_final_list_low1[w][j] = np.nan 
                dict_final_den_list_low1[w][j] = np.nan
                dict_final_num_list_low1[w][j] = np.nan
                
        #order dictionary by key
        dict_final_num_list[w] = collections.OrderedDict(sorted(dict_final_num_list[w].items()))
        dict_final_den_list[w] = collections.OrderedDict(sorted(dict_final_den_list[w].items()))
        dict_final_list[w] = collections.OrderedDict(sorted(dict_final_list[w].items()))
        dict_final_num_list_high1[w] = collections.OrderedDict(sorted(dict_final_num_list_high1[w].items()))
        dict_final_den_list_high1[w] = collections.OrderedDict(sorted(dict_final_den_list_high1[w].items()))
        dict_final_list_high1[w] = collections.OrderedDict(sorted(dict_final_list_high1[w].items()))
        dict_final_num_list_low1[w] = collections.OrderedDict(sorted(dict_final_num_list_low1[w].items()))
        dict_final_den_list_low1[w] = collections.OrderedDict(sorted(dict_final_den_list_low1[w].items()))
        dict_final_list_low1[w] = collections.OrderedDict(sorted(dict_final_list_low1[w].items()))
                
                
    #cumulative distributions (at least one)
    dict_final_num_list_cum = []
    dict_final_den_list_cum = []
    dict_final_list_cum = []
    dict_final_num_list_high1_cum = []
    dict_final_den_list_high1_cum = []
    dict_final_list_high1_cum = []
    dict_final_num_list_low1_cum = []
    dict_final_den_list_low1_cum = []
    dict_final_list_low1_cum = []
    for w in range(0,num_windows_topic):
        dataframe_num_w = pd.DataFrame(dict_final_num_list[w].items(), columns=['k', 'num'])
        dataframe_den_w = pd.DataFrame(dict_final_den_list[w].items(), columns=['k', 'den'])
        dataframe_w = dataframe_num_w.merge(dataframe_den_w)
        dataframe_w_rev = dataframe_w.loc[::-1] 
        dataframe_w['num_cum'] = dataframe_w_rev['num'].cumsum().loc[::-1]
        dataframe_w['den_cum'] = dataframe_w_rev['den'].cumsum().loc[::-1]
        dataframe_w['prob_cum'] = (dataframe_w.num_cum / dataframe_w.den_cum).loc[::-1]
        dict_final_num_list_cum.append(dict(zip(dataframe_w.k, dataframe_w.num_cum)))    
        dict_final_den_list_cum.append(dict(zip(dataframe_w.k, dataframe_w.den_cum))) 
        dict_final_list_cum.append(dict(zip(dataframe_w.k, dataframe_w.prob_cum))) 
        
        dataframe_num_w = pd.DataFrame(dict_final_num_list_high1[w].items(), columns=['k', 'num'])
        dataframe_den_w = pd.DataFrame(dict_final_den_list_high1[w].items(), columns=['k', 'den'])
        dataframe_w = dataframe_num_w.merge(dataframe_den_w)
        dataframe_w_rev = dataframe_w.loc[::-1] 
        dataframe_w['num_cum'] = dataframe_w_rev['num'].cumsum().loc[::-1]
        dataframe_w['den_cum'] = dataframe_w_rev['den'].cumsum().loc[::-1]
        dataframe_w['prob_cum'] = (dataframe_w.num_cum / dataframe_w.den_cum).loc[::-1]
        dict_final_num_list_high1_cum.append(dict(zip(dataframe_w.k, dataframe_w.num_cum)))    
        dict_final_den_list_high1_cum.append(dict(zip(dataframe_w.k, dataframe_w.den_cum))) 
        dict_final_list_high1_cum.append(dict(zip(dataframe_w.k, dataframe_w.prob_cum)))
        
        dataframe_num_w = pd.DataFrame(dict_final_num_list_low1[w].items(), columns=['k', 'num'])
        dataframe_den_w = pd.DataFrame(dict_final_den_list_low1[w].items(), columns=['k', 'den'])
        dataframe_w = dataframe_num_w.merge(dataframe_den_w)
        dataframe_w_rev = dataframe_w.loc[::-1] 
        dataframe_w['num_cum'] = dataframe_w_rev['num'].cumsum().loc[::-1]
        dataframe_w['den_cum'] = dataframe_w_rev['den'].cumsum().loc[::-1]
        dataframe_w['prob_cum'] = (dataframe_w.num_cum / dataframe_w.den_cum).loc[::-1]
        dict_final_num_list_low1_cum.append(dict(zip(dataframe_w.k, dataframe_w.num_cum)))    
        dict_final_den_list_low1_cum.append(dict(zip(dataframe_w.k, dataframe_w.den_cum))) 
        dict_final_list_low1_cum.append(dict(zip(dataframe_w.k, dataframe_w.prob_cum)))
            
    #save on file dictionary each window: concept - year_start 
    start_year = 1995
    topic_df_  = pd.DataFrame()
    for w in range(0,num_windows_topic): 
            start_year_window = start_year_window_list[w]    
            dict_final_df=pd.DataFrame(dict_final_list_cum[w].items(), columns=['k', 'prob'])
            dict_final_den_df=pd.DataFrame(dict_final_den_list_cum[w].items(), columns=['k', 'den'])
            dict_final_num_df=pd.DataFrame(dict_final_num_list_cum[w].items(), columns=['k', 'num'])
            dict_final_high1_df=pd.DataFrame(dict_final_list_high1_cum[w].items(), columns=['k', 'prob_high1'])
            dict_final_den_high1_df=pd.DataFrame(dict_final_den_list_high1_cum[w].items(), columns=['k', 'den_high1'])
            dict_final_num_high1_df=pd.DataFrame(dict_final_num_list_high1_cum[w].items(), columns=['k', 'num_high1'])
            dict_final_low1_df=pd.DataFrame(dict_final_list_low1_cum[w].items(), columns=['k', 'prob_low1'])
            dict_final_den_low1_df=pd.DataFrame(dict_final_den_list_low1_cum[w].items(), columns=['k', 'den_low1'])
            dict_final_num_low1_df=pd.DataFrame(dict_final_num_list_low1_cum[w].items(), columns=['k', 'num_low1']) 

            topic_df = dict_final_df.merge(dict_final_den_df.merge(dict_final_num_df))
            topic_high1_df = dict_final_high1_df.merge(dict_final_den_high1_df.merge(dict_final_num_high1_df))
            topic_low1_df = dict_final_low1_df.merge(dict_final_den_low1_df.merge(dict_final_num_low1_df))

            topic_df_w  = (topic_df.merge(topic_high1_df, how='outer')).merge(topic_low1_df, how='outer')
            topic_df_w.insert(0, 'T_0', start_year_window)
            topic_df_ = pd.concat([topic_df_, topic_df_w], ignore_index = True, axis = 0)
            
    my_file = 'df_'+topic+'_windows_cum.csv'          
    topic_df_.to_csv(os.path.join(my_path, my_file))

In [None]:
# For every output directory: load the per-window table and let
# Exp1_stat_cum_windows write one 'df_<topic>_windows_cum.csv' per topic.
for my_path in my_path_list:  
    #Exp1_stat_cum_windows (per-window cumulative tables; returns nothing, only writes files)
    my_file = 'df_topic_windows.csv'
    df_topics = pd.read_csv(os.path.join(my_path, my_file),index_col=0)    
    for topic in topic_list:  
        Exp1_stat_cum_windows(df_topics=df_topics,topic=topic,my_path=my_path) 

In [None]:
#cumulative
_STATCUM_GROUP_SUFFIXES = ('', '_high1', '_low1')  # column families: all / high1 / low1
_STATCUM_MEASURES = ('prob', 'den', 'num')  # output column order inside a family


def _statcum_tail_sum(values_by_k, max_k):
    """Return a (k, value) frame where value[k] is the input summed over all k' >= k.

    Every k in ``0..max_k`` that is absent from ``values_by_k`` is inserted as
    NaN first. NaN rows stay NaN in the result but do not interrupt the running
    sum (pandas ``cumsum`` skips them by default).
    """
    filled = {k: np.nan for k in range(max_k + 1)}
    filled.update(values_by_k)
    frame = pd.DataFrame(sorted(filled.items()), columns=['k', 'value'])
    # Reverse, accumulate, reverse back: a cumulative sum taken from the largest k down.
    frame['value'] = frame['value'].iloc[::-1].cumsum().iloc[::-1]
    return frame


def _statcum_window_curves(window_rows, suffix, max_k):
    """Return ``{'prob'|'den'|'num': {k: cumulative value}}`` for one window and one group."""
    # dict(zip(...)) keeps the last row when a k value is duplicated.
    num = _statcum_tail_sum(dict(zip(window_rows['k'], window_rows['num' + suffix])), max_k)
    den = _statcum_tail_sum(dict(zip(window_rows['k'], window_rows['den' + suffix])), max_k)
    # The cumulative probability is the ratio of the cumulated counts.
    prob = num['value'] / den['value']
    return {
        'prob': dict(zip(num['k'], prob)),
        'den': dict(zip(den['k'], den['value'])),
        'num': dict(zip(num['k'], num['value'])),
    }


def Exp1_stat_cum(df_topics,topic,my_path,windows_list=None):
    """Average the per-window cumulative ("at least k") curves of one topic.

    For each window start year the ``num*`` and ``den*`` columns are summed
    over all k' >= k and the cumulative probability is recomputed as
    ``num / den``. For every k the mean over windows (NaN ignored) and a
    spread value are then computed for prob, den and num in each of the three
    groups (all, ``_high1``, ``_low1``).

    Args:
        df_topics: DataFrame with columns ``topic``, ``T_0``, ``k`` (integers)
            and ``num``/``den`` plus their ``_high1`` and ``_low1`` variants.
            The ``prob*`` input columns are not needed.
        topic: value of the ``topic`` column to process; also used in file names.
        my_path: not used by this function; kept so existing calls keep working.
        windows_list: window start years (values of ``T_0``) to include. When
            None, the list is unpickled from
            ``<discipline>/Info/windows_list_<topic>``, where ``discipline`` is
            the notebook-level variable.

    Returns:
        DataFrame with one row per k in ``0..max k`` and the columns ``topic``,
        ``k``, then ``<measure>_mean<suffix>`` / ``<measure>_std<suffix>`` for
        measure in prob, den, num and suffix in '', '_high1', '_low1'. A listed
        window without any rows counts as all-NaN instead of raising.
    """
    if windows_list is None:
        # NOTE: pickle.load can execute arbitrary code; only load lists written by this project.
        with open(os.path.join(discipline, 'Info', 'windows_list_' + topic), "rb") as fp:
            windows_list = pickle.load(fp)

    df_topic = df_topics[df_topics['topic'] == topic]
    rows_by_window = [df_topic[df_topic['T_0'] == start_year_window]
                      for start_year_window in windows_list]
    num_windows_topic = len(rows_by_window)

    # Largest k over all windows, so that every window covers the same k range.
    max_k = 0
    for window_rows in rows_by_window:
        if len(window_rows):
            max_k = max(max_k, max(window_rows['k']))

    columns = {'k': list(range(max_k + 1))}
    for suffix in _STATCUM_GROUP_SUFFIXES:
        curves = [_statcum_window_curves(window_rows, suffix, max_k)
                  for window_rows in rows_by_window]
        for measure in _STATCUM_MEASURES:
            means, spreads = [], []
            for k in range(max_k + 1):
                values_k = [window_curves[measure][k] for window_curves in curves]
                means.append(np.nanmean(values_k))
                # NOTE(review): the "_std" columns hold nanstd / sqrt(total number of
                # windows), i.e. a standard error whose denominator also counts
                # windows that are NaN at this k, with population std (ddof=0).
                # Confirm this is the intended estimator.
                spreads.append(np.nanstd(values_k) / sqrt(num_windows_topic))
            columns[f'{measure}_mean{suffix}'] = means
            columns[f'{measure}_std{suffix}'] = spreads

    topic_df_ = pd.DataFrame(columns)
    topic_df_.insert(0, 'topic', topic)
    return topic_df_

In [None]:
#simple contagion
def Exp1_stat_sc(df_topics,df_topics_cum,topic,max_k=10):
    """Build the cumulative simple-contagion baseline curve of one topic.

    The per-k baseline probability is ``1 - (1 - p)**k``, where ``p`` is the
    ``prob_mean`` value in the topic's second row of ``df_topics``. It is
    turned into expected counts with the topic's ``den_mean`` values and then
    cumulated over all k' >= k exactly like the empirical curves
    (``sum(num) / sum(den)``).

    Rows are used positionally: row i of the topic is taken to be k == i.
    NOTE(review): this assumes the topic's rows are sorted by k starting at 0.

    Args:
        df_topics: non-cumulative statistics with columns ``topic``,
            ``prob_mean`` and ``den_mean``.
        df_topics_cum: not used by the computation; kept so existing calls
            keep working.
        topic: value of the ``topic`` column to process.
        max_k: largest k of the curve. If the topic has fewer than
            ``max_k + 1`` rows the curve stops at the last available row
            instead of failing on mismatched array lengths.

    Returns:
        DataFrame with columns ``topic``, ``k`` and ``val`` (the cumulative
        baseline probability).

    Raises:
        KeyError: if the topic has fewer than two rows in ``df_topics``.
    """
    baseline_rows = df_topics[df_topics['topic'] == topic].reset_index(drop=True)
    p = baseline_rows.prob_mean[1]
    den = np.array(baseline_rows.den_mean)[0:max_k + 1]
    ks = range(len(den))

    prob_baselines = np.array([(1 - (1 - p)**k) for k in ks])
    counts = pd.DataFrame({'num': prob_baselines * den, 'den': den})

    # Reverse, accumulate, reverse back: sums over all k' >= k (NaN rows are skipped).
    num_cum = counts['num'].iloc[::-1].cumsum().iloc[::-1]
    den_cum = counts['den'].iloc[::-1].cumsum().iloc[::-1]

    baseline_df = pd.DataFrame({'k': list(ks), 'val': list(num_cum / den_cum)})
    baseline_df.insert(0, 'topic', topic)
    return baseline_df

In [None]:
# Statistics pipeline, run once per output directory. Each stage writes a CSV
# into my_path; the last stage reads back the CSVs written by the first two,
# so the stage order matters.
for my_path in my_path_list:
    
    #Exp1_stat: per-topic statistics from the per-window table -> 'df_topic_stat.csv'
    my_file = 'df_topic_windows.csv'
    df_topics = pd.read_csv(os.path.join(my_path, my_file),index_col=0)
    topics_df = pd.DataFrame();
    
    for topic in topic_list:        
        topic_df_top = Exp1_stat(df_topics=df_topics,topic=topic,my_path=my_path) 
        topics_df = pd.concat([topics_df, topic_df_top], ignore_index = True, axis = 0)
    my_file = 'df_topic_stat.csv'
    topics_df.to_csv(os.path.join(my_path, my_file))
    
    #Exp1_stat_cum: same input file, cumulative statistics -> 'df_topic_stat_cumulative.csv'
    # (the read below repeats the one above with identical arguments)
    my_file = 'df_topic_windows.csv'
    df_topics = pd.read_csv(os.path.join(my_path, my_file),index_col=0)
    topics_df = pd.DataFrame()
    
    for topic in topic_list:  
        topic_df_top = Exp1_stat_cum(df_topics=df_topics,topic=topic,my_path=my_path) 
        topics_df = pd.concat([topics_df, topic_df_top], ignore_index = True, axis = 0)
    my_file = 'df_topic_stat_cumulative.csv'
    topics_df.to_csv(os.path.join(my_path, my_file))
    
    #Exp1_stat_sc: simple-contagion baseline built from the two CSVs written above
    # -> 'df_topic_stat_sc.csv'
    my_file = 'df_topic_stat.csv'
    df_topics = pd.read_csv(os.path.join(my_path, my_file),index_col=0)
    
    my_file = 'df_topic_stat_cumulative.csv'
    df_topics_cum = pd.read_csv(os.path.join(my_path, my_file),index_col=0) 

    sc_df = pd.DataFrame()
    
    for topic in topic_list: 
        print(topic)  # progress output: one line per topic
        sc_df_top = Exp1_stat_sc(df_topics=df_topics,df_topics_cum=df_topics_cum,topic=topic)   
        sc_df = pd.concat([sc_df, sc_df_top], ignore_index = True, axis = 0)
     
    my_file = 'df_topic_stat_sc.csv'
    sc_df.to_csv(os.path.join(my_path, my_file))