In [1]:
import pandas as pd
import numpy as np
from datetime import date
from statistics import mean, mode
from pandas.api.types import CategoricalDtype

In [2]:
today = date.today()
print(today)

2023-12-14


# Tables

In [3]:
path_tables = "C:/Users/seoul/Dropbox/00 technical/github/nkelites/data/combined data/combined data - 2 tables/"

In [4]:
# tables
filename_careerorglink = "careerorglink.xlsx"
filename_leadercareerlink = "leadercareerlink.xlsx"
filename_orgtree = "orgtree.xlsx"
filename_elected = "positions_elected.xlsx"

In [5]:
# career-org link
# col = pd.read_excel(path_tables + filename_careerorglink,dtype="str")
# col.shape

In [6]:
# col.columns

In [7]:
# leader-career link
# lcl = pd.read_excel(path_tables + filename_leadercareerlink,dtype="str")
# lcl.shape

In [8]:
# lcl.columns

In [9]:
# orgtree
# org = pd.read_excel(path_tables + filename_orgtree,dtype="str")
# org.shape

In [10]:
# org.columns

In [11]:
# elected = pd.read_excel(path_tables + filename_elected,dtype="str")
# elected.shape

In [12]:
# elected.columns

# Queries

In [13]:
path_queries = "C:/Users/seoul/Dropbox/00 technical/github/nkelites/data/combined data/combined data - 3 queries/"

In [14]:
filename_leaderjob_all = "leaderjob_electUnelect_inOutgov.xlsx"
filename_leaderjob_no_spa = "leaderjob_no_spa.xlsx"
filename_leaderjobtransition_no_spa = "leaderjobtransition_no_spa.xlsx"

In [15]:
# leader jobs
# ljobs_all = pd.read_excel(path_queries + filename_leaderjob_all,dtype="str")
# ljobs_all.shape

In [16]:
# ljobs = pd.read_excel(path_queries + filename_leaderjob_no_spa,dtype="str")
# ljobs.shape

In [17]:
# transitions used for analysis - no SPA - no local-local
trans = pd.read_excel(path_queries + filename_leaderjobtransition_no_spa,dtype="str")
trans.shape

(4306, 31)

# Analysis - Research Note

In [23]:
path_analysis = "C:/Users/seoul/Dropbox/00 technical/github/nkelites/analysis/"

In [24]:
# analysis sub-paths
study0_path = "2023.10.04 Study 0 - research note/"
study1_path = "2023.10.04 Study 1 - political capital/"
study2_path = "2023.10.04 Study 2 - commitment vs control/"
study3_path = "2023.10.04 Study 3 - reds vs experts/"

In [18]:
# ljobs = ljobs.astype({"CareerStartYear":"int","CareerStartDate":"int"})
# ljobs.dtypes

# Functions

In [20]:
def merge_results(m):
    
    print("\nMerge Results...")
    print("")
    print("\tshape     :",m.shape)
    print("\tleft_only :",m[m["_merge"]=="left_only"].shape)
    print("\tboth      :",m[m["_merge"]=="both"].shape)
    print("\tright_only:",m[m["_merge"]=="right_only"].shape)

In [21]:
# using this on (PI,OrgName) will ensure unique & non-null keys
# using this on a larger df will ensure unique rows and non-null keys, but not unique keys

def unique_non_null_rows(olddf):
    
    df = olddf.copy()
    
    
    ### drop duplicates
    df = df.drop_duplicates(keep="first",ignore_index=True)

    ### drop null rows
    df.dropna(how="all",axis=0,inplace=True)
    
    # drop rows with null PI
    df = df[~(df["PrimaryInstitution"].isna())]
        
    ### drop rows where PI contains stop words
    stop_words_lower = ["uncertain","current","deprecated","please_revise"]
    df = df[ ~ df["PrimaryInstitution"].str.lower().isin(stop_words_lower)]
            

    print("\nUnique Non-null Rows...")
    print("")
    print("\tNon-unique rows:",olddf.shape)
    print("\tUnique rows    :",df.shape)

    ### sort
    df = df.sort_values(["PrimaryInstitution","OrgName"])
    
    return df

In [22]:
def create_time_series(series,group_var,count_var):
    
    yeardist = series.groupby(group_var,as_index=False).count().sort_values(group_var)
    
    max_year = max(series[group_var])
    x = pd.DataFrame({"year":pd.Series(range(min(yeardist[group_var].astype(int)),max(yeardist[group_var].astype(int))+1))})
    
    yeardist[group_var] = yeardist[group_var].astype(int,errors="ignore")
    
    ts = x.merge(yeardist,left_on="year",right_on=group_var,how="left")
    ts.loc[ts[count_var].isna(),count_var]=0
    # merge_results(ts)
    
    ts_columns = ["year",count_var]
    ts = ts[ts_columns]
    
    return ts

# Format

### change datatypes

In [19]:
trans = trans.astype({"OrgAdvance":"int","PositionAdvance":"int",
                      "CareerStartYear_1":"int","CareerStartYear_2":"int",
                      "CareerStartDate_1":"int","CareerStartDate_2":"int"})
trans.dtypes

LeaderID                   object
CareerString_1             object
CareerDateString_2022_1    object
CareerStartYear_1           int32
CareerStartMonth_1         object
CareerStartDate_1           int32
CareerSubstring_1          object
InstitutionType_1          object
PrimaryInstitution_1       object
OrgName_1                  object
Local_1                    object
Position_1                 object
IsElected_1                object
OrgRank_1                  object
PositionRank_1             object
CareerString_2             object
CareerDateString_2022_2    object
CareerStartYear_2           int32
CareerStartMonth_2         object
CareerStartDate_2           int32
CareerSubstring_2          object
InstitutionType_2          object
PrimaryInstitution_2       object
OrgName_2                  object
Local_2                    object
Position_2                 object
IsElected_2                object
OrgRank_2                  object
PositionRank_2             object
OrgAdvance    

### double-check exlcudes SPA jobs and local-local trans

In [26]:
trans.columns

Index(['LeaderID', 'CareerString_1', 'CareerDateString_2022_1',
       'CareerStartYear_1', 'CareerStartMonth_1', 'CareerStartDate_1',
       'CareerSubstring_1', 'InstitutionType_1', 'PrimaryInstitution_1',
       'OrgName_1', 'Local_1', 'Position_1', 'IsElected_1', 'OrgRank_1',
       'PositionRank_1', 'CareerString_2', 'CareerDateString_2022_2',
       'CareerStartYear_2', 'CareerStartMonth_2', 'CareerStartDate_2',
       'CareerSubstring_2', 'InstitutionType_2', 'PrimaryInstitution_2',
       'OrgName_2', 'Local_2', 'Position_2', 'IsElected_2', 'OrgRank_2',
       'PositionRank_2', 'OrgAdvance', 'PositionAdvance'],
      dtype='object')

In [27]:
trans.shape

(4306, 31)

In [28]:
trans.Local_1.unique()

array(['False', 'True'], dtype=object)

In [29]:
# remove Local-Local ties
trans2 = trans[~((trans["Local_1"]=="True") & (trans["Local_2"]=="True"))]
trans2.shape

(4222, 31)

# Construct covariates

In [35]:
trans2.columns

Index(['LeaderID', 'CareerString_1', 'CareerDateString_2022_1',
       'CareerStartYear_1', 'CareerStartMonth_1', 'CareerStartDate_1',
       'CareerSubstring_1', 'InstitutionType_1', 'PrimaryInstitution_1',
       'OrgName_1', 'Local_1', 'Position_1', 'IsElected_1', 'OrgRank_1',
       'PositionRank_1', 'CareerString_2', 'CareerDateString_2022_2',
       'CareerStartYear_2', 'CareerStartMonth_2', 'CareerStartDate_2',
       'CareerSubstring_2', 'InstitutionType_2', 'PrimaryInstitution_2',
       'OrgName_2', 'Local_2', 'Position_2', 'IsElected_2', 'OrgRank_2',
       'PositionRank_2', 'OrgAdvance', 'PositionAdvance'],
      dtype='object')

In [38]:
trans2["OrgAdvanceYes"] = trans2["OrgAdvance"]>0

In [71]:
trans2["SameInstitution"] = False

In [72]:
trans2.loc[(trans2["PrimaryInstitution_1"]==trans2["PrimaryInstitution_2"]),"SameInstitution"] = True

In [41]:
trans2["SameOrg"] = False

In [43]:
trans2.loc[(trans2["PrimaryInstitution_1"]==trans2["PrimaryInstitution_2"]) & 
       (trans2["OrgName_1"]==trans2["OrgName_2"]),"SameOrg"] = True

In [74]:
verify_columns = ["SameInstitution","SameOrg","OrgName_1"]
grouping_columns = ["SameInstitution","SameOrg"]
trans2[verify_columns].groupby(grouping_columns).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,OrgName_1
SameInstitution,SameOrg,Unnamed: 2_level_1
False,False,1633
True,False,1055
True,True,342


In [50]:
trans2["PositionAdvanceYes"] = np.nan

In [51]:
trans2.loc[(trans2["SameOrg"]==True),"PositionAdvanceYes"]=False

In [52]:
trans2.loc[(trans2["SameOrg"]==True) & (trans2["PositionAdvance"]>0),"PositionAdvanceYes"]=True

In [78]:
trans2.columns

Index(['LeaderID', 'CareerString_1', 'CareerDateString_2022_1',
       'CareerStartYear_1', 'CareerStartMonth_1', 'CareerStartDate_1',
       'CareerSubstring_1', 'InstitutionType_1', 'PrimaryInstitution_1',
       'OrgName_1', 'Local_1', 'Position_1', 'IsElected_1', 'OrgRank_1',
       'PositionRank_1', 'CareerString_2', 'CareerDateString_2022_2',
       'CareerStartYear_2', 'CareerStartMonth_2', 'CareerStartDate_2',
       'CareerSubstring_2', 'InstitutionType_2', 'PrimaryInstitution_2',
       'OrgName_2', 'Local_2', 'Position_2', 'IsElected_2', 'OrgRank_2',
       'PositionRank_2', 'OrgAdvance', 'PositionAdvance', 'OrgAdvanceYes',
       'SameOrg', 'PositionAdvanceYes', 'SameInstitution'],
      dtype='object')

In [80]:
trans2.OrgAdvance.unique()

array([ 0,  1, -1,  2, -2, -3,  3, -4,  4])

In [85]:
# Succession Period - Broad

# - KIS-->KJI: 1974-1993 - less institutionalized (stronger intra)
# - KJI--->KJU:  2002-2011 - more institutionalized (stronger inter)

trans2["Succession_Broad"] = np.nan
trans2.loc[(trans2["CareerStartYear_2"]>1973) & (trans2["CareerStartYear_2"]<1994),"Succession_Broad"] = 1
trans2.loc[(trans2["CareerStartYear_2"]>2001) & (trans2["CareerStartYear_2"]<2012),"Succession_Broad"] = 2
trans2[["Succession_Broad","OrgName_2"]].groupby("Succession_Broad").count()

Unnamed: 0_level_0,OrgName_2
Succession_Broad,Unnamed: 1_level_1
1.0,693
2.0,655


In [84]:
# Succession Period - Narrow

# - KIS-->KJI: 1987-1994 - less institutionalized (stronger intra)
# - KJI--->KJU:  2009-2011 - more institutionalized (stronger inter)

trans2["Succession_Narrow"] = np.nan
trans2.loc[(trans2["CareerStartYear_2"]>1986) & (trans2["CareerStartYear_2"]<1995),"Succession_Narrow"] = 1
trans2.loc[(trans2["CareerStartYear_2"]>2008) & (trans2["CareerStartYear_2"]<2012),"Succession_Narrow"] = 2
trans2[["Succession_Narrow","OrgName_2"]].groupby("Succession_Narrow").count()

Unnamed: 0_level_0,OrgName_2
Succession_Narrow,Unnamed: 1_level_1
1.0,330
2.0,385


### OrgRank increase - percent by year

In [54]:
trans2.columns

Index(['LeaderID', 'CareerString_1', 'CareerDateString_2022_1',
       'CareerStartYear_1', 'CareerStartMonth_1', 'CareerStartDate_1',
       'CareerSubstring_1', 'InstitutionType_1', 'PrimaryInstitution_1',
       'OrgName_1', 'Local_1', 'Position_1', 'IsElected_1', 'OrgRank_1',
       'PositionRank_1', 'CareerString_2', 'CareerDateString_2022_2',
       'CareerStartYear_2', 'CareerStartMonth_2', 'CareerStartDate_2',
       'CareerSubstring_2', 'InstitutionType_2', 'PrimaryInstitution_2',
       'OrgName_2', 'Local_2', 'Position_2', 'IsElected_2', 'OrgRank_2',
       'PositionRank_2', 'OrgAdvance', 'PositionAdvance', 'OrgAdvanceYes',
       'SameOrg', 'PositionAdvanceYes'],
      dtype='object')

In [60]:
stat1_columns = ["CareerStartYear_2","OrgAdvanceYes"]
stat1_groupby_columns = ["CareerStartYear_2"]
stat1_label_columns = ["Year","Total Transitions","Total Advancements of OrgRank"]

In [69]:
stat1 = trans2[stat1_columns].groupby(stat1_groupby_columns,as_index=False)["OrgAdvanceYes"].agg({"Total Transitions":"count","Total Advancements of OrgRank":"sum"})
stat1.columns = stat1_label_columns
stat1["Percentage Advancements of OrgRank"] = stat1.apply(lambda x: round(x["Total Advancements of OrgRank"] / x["Total Transitions"] * 100,2) if x["Total Transitions"] > 0 else np.nan,axis=1)
stat1

Unnamed: 0,Year,Total Transitions,Total Advancements of OrgRank,Percentage Advancements of OrgRank
0,1946,5,1,20.00
1,1947,1,0,0.00
2,1948,3,2,66.67
3,1949,1,0,0.00
4,1950,10,2,20.00
...,...,...,...,...
71,2017,67,19,28.36
72,2018,59,19,32.20
73,2019,164,48,29.27
74,2020,64,20,31.25


In [None]:
stat1_filename = "orgrank_within_between.xlsx"
stat1.to_excel(path_analysis + study0_path + stat2_filename,index=False)

### OrgRank increase - percent by year, within & between PrimaryInstitution

In [54]:
trans2.columns

Index(['LeaderID', 'CareerString_1', 'CareerDateString_2022_1',
       'CareerStartYear_1', 'CareerStartMonth_1', 'CareerStartDate_1',
       'CareerSubstring_1', 'InstitutionType_1', 'PrimaryInstitution_1',
       'OrgName_1', 'Local_1', 'Position_1', 'IsElected_1', 'OrgRank_1',
       'PositionRank_1', 'CareerString_2', 'CareerDateString_2022_2',
       'CareerStartYear_2', 'CareerStartMonth_2', 'CareerStartDate_2',
       'CareerSubstring_2', 'InstitutionType_2', 'PrimaryInstitution_2',
       'OrgName_2', 'Local_2', 'Position_2', 'IsElected_2', 'OrgRank_2',
       'PositionRank_2', 'OrgAdvance', 'PositionAdvance', 'OrgAdvanceYes',
       'SameOrg', 'PositionAdvanceYes'],
      dtype='object')

In [75]:
stat2_columns = ["CareerStartYear_2","SameInstitution","OrgAdvanceYes"]
stat2_groupby_columns = ["CareerStartYear_2","SameInstitution"]
stat2_label_columns = ["Year","SameInstitution","Total Transitions","Total Advancements of OrgRank"]

In [76]:
stat2 = trans2[stat2_columns].groupby(stat2_groupby_columns,as_index=False)["OrgAdvanceYes"].agg({"Total Transitions":"count","Total Advancements of OrgRank":"sum"})
stat2.columns = stat2_label_columns
stat2["Percentage Advancements of OrgRank"] = stat2.apply(lambda x: round(x["Total Advancements of OrgRank"] / x["Total Transitions"] * 100,2) if x["Total Transitions"] > 0 else np.nan,axis=1)
stat2

Unnamed: 0,Year,SameInstitution,Total Transitions,Total Advancements of OrgRank,Percentage Advancements of OrgRank
0,1946,False,2,0,0.00
1,1946,True,3,1,33.33
2,1947,False,1,0,0.00
3,1948,False,1,0,0.00
4,1948,True,2,2,100.00
...,...,...,...,...,...
142,2019,True,67,12,17.91
143,2020,False,28,13,46.43
144,2020,True,36,7,19.44
145,2021,False,73,15,20.55


In [None]:
# pivot SameInstitution - 

In [77]:
stat2_filename = "orgrank_within_between.xlsx"
stat2.to_excel(path_analysis + study0_path + stat2_filename,index=False)

### OrgRank increase - percent by succesion period (broad)

In [54]:
trans2.columns

Index(['LeaderID', 'CareerString_1', 'CareerDateString_2022_1',
       'CareerStartYear_1', 'CareerStartMonth_1', 'CareerStartDate_1',
       'CareerSubstring_1', 'InstitutionType_1', 'PrimaryInstitution_1',
       'OrgName_1', 'Local_1', 'Position_1', 'IsElected_1', 'OrgRank_1',
       'PositionRank_1', 'CareerString_2', 'CareerDateString_2022_2',
       'CareerStartYear_2', 'CareerStartMonth_2', 'CareerStartDate_2',
       'CareerSubstring_2', 'InstitutionType_2', 'PrimaryInstitution_2',
       'OrgName_2', 'Local_2', 'Position_2', 'IsElected_2', 'OrgRank_2',
       'PositionRank_2', 'OrgAdvance', 'PositionAdvance', 'OrgAdvanceYes',
       'SameOrg', 'PositionAdvanceYes'],
      dtype='object')

In [75]:
stat2_columns = ["CareerStartYear_2","SameInstitution","OrgAdvanceYes"]
stat2_groupby_columns = ["CareerStartYear_2","SameInstitution"]
stat2_label_columns = ["Year","SameInstitution","Total Transitions","Total Advancements of OrgRank"]

In [76]:
stat2 = trans2[stat2_columns].groupby(stat2_groupby_columns,as_index=False)["OrgAdvanceYes"].agg({"Total Transitions":"count","Total Advancements of OrgRank":"sum"})
stat2.columns = stat2_label_columns
stat2["Percentage Advancements of OrgRank"] = stat2.apply(lambda x: round(x["Total Advancements of OrgRank"] / x["Total Transitions"] * 100,2) if x["Total Transitions"] > 0 else np.nan,axis=1)
stat2

Unnamed: 0,Year,SameInstitution,Total Transitions,Total Advancements of OrgRank,Percentage Advancements of OrgRank
0,1946,False,2,0,0.00
1,1946,True,3,1,33.33
2,1947,False,1,0,0.00
3,1948,False,1,0,0.00
4,1948,True,2,2,100.00
...,...,...,...,...,...
142,2019,True,67,12,17.91
143,2020,False,28,13,46.43
144,2020,True,36,7,19.44
145,2021,False,73,15,20.55


In [None]:
# pivot SameInstitution - 

In [77]:
stat2_filename = "orgrank_within_between.xlsx"
stat2.to_excel(path_analysis + study0_path + stat2_filename,index=False)

# Hypothesis 1a - mobility during succession periods, broad

In [87]:
stat3_columns = ["Succession_Broad","OrgAdvanceYes"]
stat3_groupby_columns = ["Succession_Broad"]
stat3_label_columns = ["Succession Period","Total Transitions","Total Advancements of OrgRank"]

In [96]:
select_columns = ["SameInstitution","Succession_Broad","OrgName_2"]
group_columns = ["Succession_Broad","SameInstitution"]
trans2[select_columns].groupby(group_columns).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,OrgName_2
Succession_Broad,SameInstitution,Unnamed: 2_level_1
1.0,False,435
1.0,True,258
2.0,False,366
2.0,True,289


In [99]:
trans2.shape

(4306, 37)

In [98]:
trans3 = trans2[trans2["SameInstitution"]==True]
trans3.shape

(1553, 37)

In [102]:
select_columns = ["SameInstitution","Succession_Broad","OrgName_2"]
group_columns = ["Succession_Broad","SameInstitution"]
trans3[select_columns].groupby(group_columns).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,OrgName_2
Succession_Broad,SameInstitution,Unnamed: 2_level_1
1.0,True,258
2.0,True,289


In [100]:
stat3 = trans3[stat3_columns].groupby(stat3_groupby_columns,as_index=False)["OrgAdvanceYes"].agg({"Total Transitions":"count","Total Advancements of OrgRank":"sum"})
stat3.columns = stat3_label_columns
stat3["Percentage Advancements of OrgRank"] = stat3.apply(lambda x: round(x["Total Advancements of OrgRank"] / x["Total Transitions"] * 100,2) if x["Total Transitions"] > 0 else np.nan,axis=1)
stat3

Unnamed: 0,Succession Period,Total Transitions,Total Advancements of OrgRank,Percentage Advancements of OrgRank
0,1.0,147,26,17.69
1,2.0,168,65,38.69


# Hypothesis 1b - mobility during succession periods, narrow

In [91]:
stat3_columns = ["Succession_Narrow","OrgAdvanceYes"]
stat3_groupby_columns = ["Succession_Narrow"]
stat3_label_columns = ["Succession Period","Total Transitions","Total Advancements of OrgRank"]

In [101]:
stat3 = trans3[stat3_columns].groupby(stat3_groupby_columns,as_index=False)["OrgAdvanceYes"].agg({"Total Transitions":"count","Total Advancements of OrgRank":"sum"})
stat3.columns = stat3_label_columns
stat3["Percentage Advancements of OrgRank"] = stat3.apply(lambda x: round(x["Total Advancements of OrgRank"] / x["Total Transitions"] * 100,2) if x["Total Transitions"] > 0 else np.nan,axis=1)
stat3

Unnamed: 0,Succession Period,Total Transitions,Total Advancements of OrgRank,Percentage Advancements of OrgRank
0,1.0,147,26,17.69
1,2.0,168,65,38.69


# Hypothesis 2b - mobility during succession periods, broad