In [1]:
import pandas as pd
import numpy as np
from datetime import date
from statistics import mean, mode
from pandas.api.types import CategoricalDtype
import math
from scipy.stats import norm

In [2]:
today = date.today()
print(today)

2024-02-10


# Tables

In [3]:
path_tables = "C:/Users/seoul/Dropbox/00 technical/github/nkelites/data/combined data/combined data - 2 tables/"

In [4]:
# tables
filename_careerorglink = "careerorglink.xlsx"
filename_leadercareerlink = "leadercareerlink.xlsx"
filename_orgtree = "orgtree.xlsx"
filename_elected = "positions_elected.xlsx"

In [5]:
# career-org link
# col = pd.read_excel(path_tables + filename_careerorglink,dtype="str")
# col.shape

In [6]:
# col.columns

In [7]:
# leader-career link
# lcl = pd.read_excel(path_tables + filename_leadercareerlink,dtype="str")
# lcl.shape

In [8]:
# lcl.columns

In [9]:
# orgtree
# org = pd.read_excel(path_tables + filename_orgtree,dtype="str")
# org.shape

In [10]:
# org.columns

In [11]:
# elected = pd.read_excel(path_tables + filename_elected,dtype="str")
# elected.shape

In [12]:
# elected.columns

# Queries

In [13]:
path_queries = "C:/Users/seoul/Dropbox/00 technical/github/nkelites/data/combined data/combined data - 3 queries/"

In [14]:
filename_leaderjob_all = "leaderjob_electUnelect_inOutgov.xlsx"
filename_leaderjob_no_spa = "leaderjob_no_spa.xlsx"
filename_leaderjobtransition_no_spa = "leaderjobtransition_no_spa.xlsx"

In [15]:
# leader jobs
# ljobs_all = pd.read_excel(path_queries + filename_leaderjob_all,dtype="str")
# ljobs_all.shape

In [16]:
# ljobs = pd.read_excel(path_queries + filename_leaderjob_no_spa,dtype="str")
# ljobs.shape

In [17]:
# transitions used for analysis - no SPA - no local-local
trans = pd.read_excel(path_queries + filename_leaderjobtransition_no_spa,dtype="str")
trans.shape

(4306, 31)

# Analysis - Research Note

In [18]:
path_analysis = "C:/Users/seoul/Dropbox/00 technical/github/nkelites/analysis/"

In [19]:
# analysis sub-paths
study0_path = "2023.10.04 Study 0 - research note/"
study1_path = "2023.10.04 Study 1 - political capital/"
study2_path = "2023.10.04 Study 2 - commitment vs control/"
study3_path = "2023.10.04 Study 3 - reds vs experts/"

In [20]:
# ljobs = ljobs.astype({"CareerStartYear":"int","CareerStartDate":"int"})
# ljobs.dtypes

# Functions

In [21]:
def merge_results(m):
    """Print the shape of a merged frame and of each `_merge` subset.

    Parameters
    ----------
    m : DataFrame containing a `_merge` column whose values are compared
        against "left_only", "both" and "right_only".

    Returns None; the summary is written to stdout.
    """
    print("\nMerge Results...")
    print("")
    print("\tshape     :",m.shape)

    indicator = m["_merge"]
    # labels are padded so the printed colons line up
    for label,outcome in (("left_only :","left_only"),
                          ("both      :","both"),
                          ("right_only:","right_only")):
        print("\t"+label,m[indicator==outcome].shape)

In [22]:
# using this on (PI,OrgName) will ensure unique & non-null keys
# using this on a larger df will ensure unique rows and non-null keys, but not unique keys

def unique_non_null_rows(olddf):
    """Return a de-duplicated copy of `olddf` with unusable rows removed.

    Steps, in order: drop exact duplicate rows (index is renumbered), drop rows
    that are entirely null, drop rows whose PrimaryInstitution is null, drop rows
    whose PrimaryInstitution (case-insensitively) equals one of the stop words,
    then sort by PrimaryInstitution and OrgName. The input frame is not modified.

    Prints the shape before and after filtering and returns the sorted frame.
    """
    stop_words_lower = ["uncertain","current","deprecated","please_revise"]

    # exact duplicates first; ignore_index renumbers the surviving rows
    cleaned = olddf.copy().drop_duplicates(keep="first",ignore_index=True)

    # rows with no values at all
    cleaned = cleaned.dropna(how="all",axis=0)

    # rows without a primary institution
    has_pi = cleaned["PrimaryInstitution"].notna()
    cleaned = cleaned[has_pi]

    # rows whose primary institution is a placeholder / stop word
    is_stop_word = cleaned["PrimaryInstitution"].str.lower().isin(stop_words_lower)
    cleaned = cleaned[~is_stop_word]

    print("\nUnique Non-null Rows...")
    print("")
    print("\tNon-unique rows:",olddf.shape)
    print("\tUnique rows    :",cleaned.shape)

    return cleaned.sort_values(["PrimaryInstitution","OrgName"])

In [23]:
def create_time_series(series,group_var,count_var):
    """Build a gap-free yearly count series from a frame of observations.

    Parameters
    ----------
    series : DataFrame holding at least the columns `group_var` and `count_var`.
    group_var : name of the column holding the year (ints or int-like strings).
    count_var : name of the column whose non-null values are counted per year.

    Returns
    -------
    DataFrame with columns ["year", count_var]: one row for every year from the
    smallest to the largest observed year, with 0 for years that have no rows.
    An input with no rows yields an empty frame with those two columns.

    Raises
    ------
    ValueError if a `group_var` value cannot be converted to int. Previously
    such values were silently left as strings, which then failed to merge
    against the integer "year" column.
    """
    yeardist = series.groupby(group_var,as_index=False).count()

    # convert the year key explicitly so it matches the integer "year" range below
    yeardist[group_var] = yeardist[group_var].astype(int)
    yeardist = yeardist.sort_values(group_var)

    if yeardist.empty:
        # nothing to count: return the documented shape rather than failing on min()/max()
        return pd.DataFrame({"year":pd.Series(dtype="int64"),count_var:pd.Series(dtype="float64")})

    first_year = int(yeardist[group_var].min())
    last_year = int(yeardist[group_var].max())
    x = pd.DataFrame({"year":pd.Series(range(first_year,last_year+1))})

    # left merge keeps every year in the range; years without observations become NaN -> 0
    ts = x.merge(yeardist,left_on="year",right_on=group_var,how="left")
    ts.loc[ts[count_var].isna(),count_var]=0

    return ts[["year",count_var]]

In [24]:
# create the English InstitutionCategory variable from InstitutionType

# use with apply, e.g.,
# df["InstitutionCategory"] = df["InstitutionType"].apply(define_institution_category)

def define_institution_category(PI):
    """Map an institution-type label to its English category.

    Returns "Government", "Party" or "Military" for the three labels listed
    below, and "Social" for every other value.
    """
    known_categories = (
        ("정권기관","Government"),
        ("노동당","Party"),
        ("인민군","Military"),
    )

    for institution_label,category in known_categories:
        if PI==institution_label:
            return category

    # anything not listed above falls into the catch-all category
    return "Social"

In [25]:
def two_sample_ttest_proportions(x1,x2,n1,n2,alpha):
    """Print a pooled two-sample z-test for the difference between two proportions.

    Despite the name, the statistic is a z statistic evaluated against the
    standard normal distribution, not a t statistic.

    Parameters
    ----------
    x1, x2 : number of "successes" in sample 1 and sample 2.
    n1, n2 : total number of observations in sample 1 and sample 2.
    alpha  : accepted for interface compatibility; not used by the computation.

    Output
    ------
    Prints p1, p2, p1-p2, z and the p-value, followed by significance stars
    (* < .05, ** < .01, *** < .001). Returns None.

    Notes
    -----
    * The p-value is the upper-tail probability of |z|, i.e. a one-sided p-value
      taken in the direction of the observed difference.
      NOTE(review): for a two-sided test this value should be doubled; for a
      pre-specified direction (p1 > p2) the sign of z should be kept — confirm
      which is intended by the callers.
    * When the pooled proportion is exactly 0 or 1 the standard error is zero and
      the statistic is undefined; z and the p-value are then reported as nan
      instead of raising ZeroDivisionError.
    """
    pstar = (x1+x2)/(n1+n2)
    p1 = x1/n1
    p2 = x2/n2

    pooled_variance = pstar*(1-pstar)*((1/n1) + (1/n2))
    if pooled_variance > 0:
        z = (p1-p2) / math.sqrt(pooled_variance)
        # survival function keeps precision in the far tail, where 1-cdf rounds to 0.0
        pvalue = norm.sf(abs(z))
    else:
        z = float("nan")
        pvalue = float("nan")

    # add significance stars to result (strictest threshold first)
    if pvalue<.001:
        stars = "***"
    elif pvalue<.01:
        stars = "**"
    elif pvalue<.05:
        stars = "*"
    else:
        stars = ""
    result = str(pvalue)+stars

    print("p1:\t",p1)
    print("p2:\t",p2)
    print("p1-p2:\t",(p1-p2))
    print("z:\t",z)
    print("pvalue:\t",result)

In [99]:
# apply two_sample_ttest_proportions across categories
# InstitutionCategory_1

def ttest_by_institution_category(ics,stat,between_row,within_row,period1_col,period2_col,category_col="InstitutionCategory_1"):
    """Run two_sample_ttest_proportions once per institution category.

    For each category in `ics`, the rows of `stat` whose `category_col` equals
    that category are selected, and the between-PI share of the first period is
    compared with the between-PI share of the second period.

    Parameters
    ----------
    ics : iterable of category values to test.
    stat : DataFrame of counts containing the column named by `category_col`.
    between_row, within_row : positional COLUMN indices (second argument of
        `.iloc`) of the between-PI and within-PI count columns. The parameter
        names say "row", but they are used as column positions.
    period1_col, period2_col : positional ROW indices (first argument of
        `.iloc`), within the category's subset, of the two periods. The
        parameter names say "col", but they are used as row positions.
    category_col : name of the category column to filter on. Defaults to
        "InstitutionCategory_1"; pass "InstitutionCategory_2" to group by the
        other category column.

    Prints the inputs and the test output for every category; returns None.
    """
    alpha = .05

    for ic in ics:

        print("\n\n",ic,"\n\n")

        stat_ic = stat[stat[category_col]==ic]

        # two-sample test of proportions: p(between PI trans in s1) > p(between PI trans in s2)
        x1 = stat_ic.iloc[period1_col,between_row]
        n1 = x1 + stat_ic.iloc[period1_col,within_row]
        x2 = stat_ic.iloc[period2_col,between_row]
        n2 = x2 + stat_ic.iloc[period2_col,within_row]
        print(x1,x2,n1,n2,alpha)
        two_sample_ttest_proportions(x1,x2,n1,n2,alpha)

In [98]:
# apply two_sample_ttest_proportions across categories
# InstitutionCategory_2

def ttest_by_institution_category_2(ics,stat,between_row,within_row,period1_col,period2_col):
    """Run two_sample_ttest_proportions per category of InstitutionCategory_2.

    `between_row` / `within_row` are used as positional column indices and
    `period1_col` / `period2_col` as positional row indices of `.iloc`
    (the names suggest the opposite). Prints the inputs and the test output
    for each category in `ics`; returns None.
    """
    alpha = .05

    for ic in ics:

        print("\n\n",ic,"\n\n")

        rows_for_ic = stat[stat.InstitutionCategory_2==ic]

        # counts for period 1, then period 2
        between_1 = rows_for_ic.iloc[period1_col,between_row]
        within_1 = rows_for_ic.iloc[period1_col,within_row]
        between_2 = rows_for_ic.iloc[period2_col,between_row]
        within_2 = rows_for_ic.iloc[period2_col,within_row]

        # two-sample test of proportions: p(between PI trans in s1) > p(between PI trans in s2)
        x1,n1 = between_1,between_1 + within_1
        x2,n2 = between_2,between_2 + within_2
        print(x1,x2,n1,n2,alpha)
        two_sample_ttest_proportions(x1,x2,n1,n2,alpha)

# Format & Covariates

### 0. change datatypes

In [27]:
# the query file is read with dtype="str"; these columns are converted to int
int_columns = ["OrgAdvance","PositionAdvance",
               "CareerStartYear_1","CareerStartYear_2",
               "CareerStartDate_1","CareerStartDate_2"]
trans = trans.astype(dict.fromkeys(int_columns,"int"))
trans.dtypes

LeaderID                   object
CareerString_1             object
CareerDateString_2022_1    object
CareerStartYear_1           int32
CareerStartMonth_1         object
CareerStartDate_1           int32
CareerSubstring_1          object
InstitutionType_1          object
PrimaryInstitution_1       object
OrgName_1                  object
Local_1                    object
Position_1                 object
IsElected_1                object
OrgRank_1                  object
PositionRank_1             object
CareerString_2             object
CareerDateString_2022_2    object
CareerStartYear_2           int32
CareerStartMonth_2         object
CareerStartDate_2           int32
CareerSubstring_2          object
InstitutionType_2          object
PrimaryInstitution_2       object
OrgName_2                  object
Local_2                    object
Position_2                 object
IsElected_2                object
OrgRank_2                  object
PositionRank_2             object
OrgAdvance    

### 1. double-check that the data excludes SPA jobs and local-local trans

In [28]:
trans.columns

Index(['LeaderID', 'CareerString_1', 'CareerDateString_2022_1',
       'CareerStartYear_1', 'CareerStartMonth_1', 'CareerStartDate_1',
       'CareerSubstring_1', 'InstitutionType_1', 'PrimaryInstitution_1',
       'OrgName_1', 'Local_1', 'Position_1', 'IsElected_1', 'OrgRank_1',
       'PositionRank_1', 'CareerString_2', 'CareerDateString_2022_2',
       'CareerStartYear_2', 'CareerStartMonth_2', 'CareerStartDate_2',
       'CareerSubstring_2', 'InstitutionType_2', 'PrimaryInstitution_2',
       'OrgName_2', 'Local_2', 'Position_2', 'IsElected_2', 'OrgRank_2',
       'PositionRank_2', 'OrgAdvance', 'PositionAdvance'],
      dtype='object')

In [29]:
trans.shape

(4306, 31)

In [30]:
trans.Local_1.unique()

array(['False', 'True'], dtype=object)

In [31]:
# remove Local-Local ties: drop transitions where both jobs are flagged "True" for Local
both_local = (trans["Local_1"]=="True") & (trans["Local_2"]=="True")
trans = trans[~both_local]
trans.shape

(4222, 31)

### 2. add InstitutionCategory_1, InstitutionCategory_2

In [32]:
trans.InstitutionType_1.unique()

array(['정권기관', '국제친선단체', '노동당', '인민군', '당외곽및사회단체_사회부문(별책)',
       '당외곽및사회단체_사회부문', '당외곽및사회단체_체육부문', '당외곽및사회단체_정치부문', '당외곽및사회단체_대외부문',
       '당외곽및사회단체_근로단체', '당외곽및사회단체_종교부문', '당외곽및사회단체_경제부문(별책)',
       '당외곽및사회단체_경제부문'], dtype=object)

### 3. PISame, OrgSame

In [33]:
trans.columns

Index(['LeaderID', 'CareerString_1', 'CareerDateString_2022_1',
       'CareerStartYear_1', 'CareerStartMonth_1', 'CareerStartDate_1',
       'CareerSubstring_1', 'InstitutionType_1', 'PrimaryInstitution_1',
       'OrgName_1', 'Local_1', 'Position_1', 'IsElected_1', 'OrgRank_1',
       'PositionRank_1', 'CareerString_2', 'CareerDateString_2022_2',
       'CareerStartYear_2', 'CareerStartMonth_2', 'CareerStartDate_2',
       'CareerSubstring_2', 'InstitutionType_2', 'PrimaryInstitution_2',
       'OrgName_2', 'Local_2', 'Position_2', 'IsElected_2', 'OrgRank_2',
       'PositionRank_2', 'OrgAdvance', 'PositionAdvance'],
      dtype='object')

In [34]:
# True when the two jobs of a transition share the same PrimaryInstitution, False otherwise
trans["PISame"] = trans["PrimaryInstitution_1"]==trans["PrimaryInstitution_2"]
trans.PISame.unique()

array([False,  True])

In [35]:
# OrgSame is only defined within the same primary institution:
# NaN where PISame is False, otherwise whether OrgName_1 equals OrgName_2
org_name_matches = (trans["OrgName_1"]==trans["OrgName_2"]).astype(object)
trans["OrgSame"] = org_name_matches.where(trans["PISame"]==True,np.nan)
trans.OrgSame.unique()

array([nan, True, False], dtype=object)

### 4. OrgRankChange, PositionRankChange

In [36]:
trans["OrgRankChange"] = np.nan

In [37]:
# The rank columns were read as strings, and string comparison is lexicographic
# (e.g. "10" < "2"), so the ranks are compared as numbers instead.
org_rank_1 = pd.to_numeric(trans["OrgRank_1"],errors="coerce")
org_rank_2 = pd.to_numeric(trans["OrgRank_2"],errors="coerce")

# OrgRankChange is only set for transitions within the same primary institution;
# the label describes how the OrgRank value of the second job compares with the first
trans.loc[trans["PISame"] & (org_rank_1>org_rank_2),"OrgRankChange"] = "lower"
trans.loc[trans["PISame"] & (org_rank_1==org_rank_2),"OrgRankChange"] = "same"
trans.loc[trans["PISame"] & (org_rank_1<org_rank_2),"OrgRankChange"] = "higher"

In [38]:
# store OrgRankChange as a categorical with a fixed category order
value_order = ["lower","same","higher"]
trans["OrgRankChange"] = trans["OrgRankChange"].astype(CategoricalDtype(categories=value_order))

In [39]:
trans.OrgRankChange.unique()

[NaN, 'same', 'higher', 'lower']
Categories (3, object): ['lower', 'same', 'higher']

In [40]:
trans["PositionRankChange"] = np.nan

In [41]:
# PositionRankChange, narrowly defined, with OrgSame=True
# trans.loc[trans["OrgSame"] & (trans["PositionRank_1"]>trans["PositionRank_2"]),"PositionRankChange"] = "lower"
# trans.loc[trans["OrgSame"] & (trans["PositionRank_1"]==trans["PositionRank_2"]),"PositionRankChange"] = "same"
# trans.loc[trans["OrgSame"] & (trans["PositionRank_1"]<trans["PositionRank_2"]),"PositionRankChange"] = "higher"

In [42]:
# PositionRankChange, broadly defined, with PISame=True & OrgRankChange="same"

# The rank columns were read as strings, and string comparison is lexicographic
# (e.g. "10" < "2"), so the ranks are compared as numbers instead.
position_rank_1 = pd.to_numeric(trans["PositionRank_1"],errors="coerce")
position_rank_2 = pd.to_numeric(trans["PositionRank_2"],errors="coerce")

# only transitions inside one primary institution whose org rank did not change are labelled
same_pi_same_org_rank = trans["PISame"] & trans["OrgRankChange"].isin(["same"])

trans.loc[same_pi_same_org_rank & (position_rank_1>position_rank_2),"PositionRankChange"] = "lower"
trans.loc[same_pi_same_org_rank & (position_rank_1==position_rank_2),"PositionRankChange"] = "same"
trans.loc[same_pi_same_org_rank & (position_rank_1<position_rank_2),"PositionRankChange"] = "higher"

In [43]:
# store PositionRankChange as a categorical with a fixed category order
value_order = ["lower","same","higher"]
trans["PositionRankChange"] = trans["PositionRankChange"].astype(CategoricalDtype(categories=value_order))

In [44]:
trans.PositionRankChange.unique()

[NaN, 'same', 'higher', 'lower']
Categories (3, object): ['lower', 'same', 'higher']

### 5. Succession Period - Broad

* KIS-->KJI: 1974-1993 - less institutionalized (stronger intra)
* KJI--->KJU:  2002-2011 - more institutionalized (stronger inter)

In [45]:
trans["Succession_Broad"] = np.nan

In [46]:
# make Succession_Broad a categorical so both periods exist as categories before labelling
value_order = ["KIS to KJI","KJI to KJU"]
trans["Succession_Broad"] = trans["Succession_Broad"].astype(CategoricalDtype(categories=value_order))

In [47]:
# label each transition by the start year of its second job;
# bounds are exclusive, i.e. 1974-1993 and 2002-2011
broad_windows = {"KIS to KJI":(1973,1994),"KJI to KJU":(2001,2012)}
for period_label,(after_year,before_year) in broad_windows.items():
    in_window = (trans["CareerStartYear_2"]>after_year) & (trans["CareerStartYear_2"]<before_year)
    trans.loc[in_window,"Succession_Broad"] = period_label

In [48]:
trans[["Succession_Broad","OrgName_2"]].groupby("Succession_Broad",as_index=False).count()

Unnamed: 0,Succession_Broad,OrgName_2
0,KIS to KJI,674
1,KJI to KJU,648


In [49]:
trans.loc[trans["Succession_Broad"]=="KIS to KJI","CareerStartYear_2"]

34      1977
35      1977
36      1979
37      1980
38      1982
        ... 
3805    1980
3934    1993
4285    1977
4286    1980
4287    1990
Name: CareerStartYear_2, Length: 1006, dtype: int32

### 6. Succession Period - Narrow

#### Esther's periodization
* KIS-->KJI: 1987-1994 - less institutionalized (stronger intra)
* KJI--->KJU:  2009-2011 - more institutionalized (stronger inter)

In [50]:
trans["Succession_Narrow"] = np.nan

In [51]:
# make Succession_Narrow a categorical so both periods exist as categories before labelling
value_order = ["KIS to KJI","KJI to KJU"]
trans["Succession_Narrow"] = trans["Succession_Narrow"].astype(CategoricalDtype(categories=value_order))

In [52]:
### Esther's periodization

# label each transition by the start year of its second job;
# bounds are exclusive, i.e. 1987-1994 and 2009-2011
narrow_windows = {"KIS to KJI":(1986,1995),"KJI to KJU":(2008,2012)}
for period_label,(after_year,before_year) in narrow_windows.items():
    in_window = (trans["CareerStartYear_2"]>after_year) & (trans["CareerStartYear_2"]<before_year)
    trans.loc[in_window,"Succession_Narrow"] = period_label

In [53]:
trans[["Succession_Narrow","OrgName_2"]].groupby("Succession_Narrow",as_index=False).count()

Unnamed: 0,Succession_Narrow,OrgName_2
0,KIS to KJI,323
1,KJI to KJU,383


### 7. OrgAdvanceYes

In [54]:
trans["OrgAdvanceYes"] = np.nan

In [55]:
# OrgAdvanceYes is only defined within the same primary institution: default False there
same_pi = trans["PISame"]==True
trans.loc[same_pi,"OrgAdvanceYes"] = False

# `&` binds tighter than `==`, so the comparison must be parenthesised (done via same_pi);
# unparenthesised, `PISame==True & cond` means `PISame == cond` and also flags rows where
# PISame is False and the rank did not drop.
# Ranks were read as strings, so compare them numerically rather than lexicographically.
org_rank_drops = pd.to_numeric(trans["OrgRank_1"],errors="coerce") > pd.to_numeric(trans["OrgRank_2"],errors="coerce")
trans.loc[same_pi & org_rank_drops,"OrgAdvanceYes"] = True

### 8. PositionAdvanceYes

In [56]:
trans["PositionAdvanceYes"] = np.nan

In [57]:
# PositionAdvanceYes is only defined where the org stays the same: default False there
same_org = trans["OrgSame"]==True
trans.loc[same_org,"PositionAdvanceYes"] = False

# `&` binds tighter than `==`, so the comparison must be parenthesised (done via same_org);
# unparenthesised, `OrgSame==True & cond` means `OrgSame == cond` and also flags rows where
# OrgSame is False and the rank did not drop.
# Ranks were read as strings, so compare them numerically rather than lexicographically.
position_rank_drops = pd.to_numeric(trans["PositionRank_1"],errors="coerce") > pd.to_numeric(trans["PositionRank_2"],errors="coerce")
trans.loc[same_org & position_rank_drops,"PositionAdvanceYes"] = True

### 9. InstitutionCategory

In [58]:
trans.columns

Index(['LeaderID', 'CareerString_1', 'CareerDateString_2022_1',
       'CareerStartYear_1', 'CareerStartMonth_1', 'CareerStartDate_1',
       'CareerSubstring_1', 'InstitutionType_1', 'PrimaryInstitution_1',
       'OrgName_1', 'Local_1', 'Position_1', 'IsElected_1', 'OrgRank_1',
       'PositionRank_1', 'CareerString_2', 'CareerDateString_2022_2',
       'CareerStartYear_2', 'CareerStartMonth_2', 'CareerStartDate_2',
       'CareerSubstring_2', 'InstitutionType_2', 'PrimaryInstitution_2',
       'OrgName_2', 'Local_2', 'Position_2', 'IsElected_2', 'OrgRank_2',
       'PositionRank_2', 'OrgAdvance', 'PositionAdvance', 'PISame', 'OrgSame',
       'OrgRankChange', 'PositionRankChange', 'Succession_Broad',
       'Succession_Narrow', 'OrgAdvanceYes', 'PositionAdvanceYes'],
      dtype='object')

In [59]:
# derive the English category for the first (_1) and second (_2) job of each transition
for job_suffix in ("1","2"):
    trans["InstitutionCategory_"+job_suffix] = trans["InstitutionType_"+job_suffix].apply(define_institution_category)

### 10. InstitutionCategorySame

In [60]:
# True when both jobs of the transition fall in the same institution category
trans["InstitutionCategorySame"] = trans["InstitutionCategory_1"].eq(trans["InstitutionCategory_2"])

### 11. Succession_Five

#### Jacob's periodization
* KIS-->KJI: 1994-1999 - less institutionalized (stronger intra)
* KJI--->KJU:  2011-2016 - more institutionalized (stronger inter)

In [61]:
trans["Succession_Five"] = np.nan

In [62]:
# make Succession_Five a categorical so both periods exist as categories before labelling
value_order = ["KIS to KJI","KJI to KJU"]
trans["Succession_Five"] = trans["Succession_Five"].astype(CategoricalDtype(categories=value_order))

In [63]:
### Jacob's periodization

# label each transition by the start year of its second job;
# bounds are exclusive, i.e. 1994-1999 and 2011-2015
# NOTE(review): the section text describes the second window as 2011-2016, but this
# upper bound stops at 2015 — confirm which end year is intended.
five_windows = {"KIS to KJI":(1993,2000),"KJI to KJU":(2010,2016)}
for period_label,(after_year,before_year) in five_windows.items():
    in_window = (trans["CareerStartYear_2"]>after_year) & (trans["CareerStartYear_2"]<before_year)
    trans.loc[in_window,"Succession_Five"] = period_label

In [64]:
trans[["Succession_Five","OrgName_2"]].groupby("Succession_Five",as_index=False).count()

Unnamed: 0,Succession_Five,OrgName_2
0,KIS to KJI,229
1,KJI to KJU,229


### 12. Restrict to Top 2 OrgRanks (OrgRank = {0,1})

In [65]:
trans.columns

Index(['LeaderID', 'CareerString_1', 'CareerDateString_2022_1',
       'CareerStartYear_1', 'CareerStartMonth_1', 'CareerStartDate_1',
       'CareerSubstring_1', 'InstitutionType_1', 'PrimaryInstitution_1',
       'OrgName_1', 'Local_1', 'Position_1', 'IsElected_1', 'OrgRank_1',
       'PositionRank_1', 'CareerString_2', 'CareerDateString_2022_2',
       'CareerStartYear_2', 'CareerStartMonth_2', 'CareerStartDate_2',
       'CareerSubstring_2', 'InstitutionType_2', 'PrimaryInstitution_2',
       'OrgName_2', 'Local_2', 'Position_2', 'IsElected_2', 'OrgRank_2',
       'PositionRank_2', 'OrgAdvance', 'PositionAdvance', 'PISame', 'OrgSame',
       'OrgRankChange', 'PositionRankChange', 'Succession_Broad',
       'Succession_Narrow', 'OrgAdvanceYes', 'PositionAdvanceYes',
       'InstitutionCategory_1', 'InstitutionCategory_2',
       'InstitutionCategorySame', 'Succession_Five'],
      dtype='object')

In [66]:
### reset trans
# trans_orig = trans
# trans = trans_orig

In [67]:
trans["OrgRank_1"].astype(int).isin([0,1])

0       True
1       True
2       True
3       True
4       True
        ... 
4300    True
4301    True
4302    True
4303    True
4304    True
Name: OrgRank_1, Length: 4222, dtype: bool

In [68]:
# Top 1 & 2 OrgRanks: keep transitions where both jobs have OrgRank 0 or 1
top_org_ranks = [0,1]
first_job_is_top = trans["OrgRank_1"].astype(int).isin(top_org_ranks)
second_job_is_top = trans["OrgRank_2"].astype(int).isin(top_org_ranks)
trans = trans[first_job_is_top & second_job_is_top]

In [69]:
trans.shape

(2190, 43)

In [70]:
# trans_orig.shape

NameError: name 'trans_orig' is not defined

# Descriptive Analysis of Top 1&2 OrgRanks

In [71]:
# stack the org descriptors of the first (_1) and second (_2) jobs under common column names
org_columns = ["InstitutionType","PrimaryInstitution","OrgName","OrgRank"]

sending = trans[[column+"_1" for column in org_columns]]
sending.columns = org_columns

receiving = trans[[column+"_2" for column in org_columns]]
receiving.columns = org_columns

top2 = pd.concat([sending,receiving])
top2.shape

(4380, 4)

In [72]:
# one row per distinct org descriptor, ordered by institution type, primary institution and org name
top2 = top2.drop_duplicates().sort_values(["InstitutionType","PrimaryInstitution","OrgName"])
top2.shape

(542, 4)

In [73]:
top2.to_excel(path_analysis + study0_path + "top2_piorg.xlsx",index=False)

In [74]:
trans.to_excel(path_analysis + study0_path + "top2_trans.xlsx",index=False)

# Hypotheses Series 1. s1s2, trans count, within & between PI

### 1. s1s2 trans count (within & between PI), succession broad

In [75]:
stat1_columns = ["PISame","Succession_Broad","OrgName_2"]
stat1_groupby_columns = ["Succession_Broad","PISame"]
# stat1_label_columns = ["Year","Total Transitions","Total Advancements of OrgRank"]

In [76]:
# count transitions (non-null OrgName_2) for each Succession_Broad x PISame combination
stat1 = trans[stat1_columns].groupby(stat1_groupby_columns,as_index=False)["OrgName_2"].agg({"Total Transitions":"count"})

# pivot the PISame values from rows into columns: one row per Succession_Broad
stat1 = stat1.pivot(index="Succession_Broad",columns="PISame",values="Total Transitions")
# labels are assigned by position: first pivoted column -> between, second -> within
# NOTE(review): presumes the pivot yields the PISame=False column before PISame=True — confirm
pivot_column_labels = ["Between PI Trans","Within PI Trans"]
stat1.columns = pivot_column_labels
stat1 = stat1.reset_index()

stat1

Unnamed: 0,Succession_Broad,Between PI Trans,Within PI Trans
0,KIS to KJI,190,73
1,KJI to KJU,218,90


In [77]:
def share_between_pi(row):
    """Between-PI share of a row's transitions, rounded to 4 places; NaN when the row has none."""
    between = row["Between PI Trans"]
    total = between + row["Within PI Trans"]
    return round(between / total,4) if total > 0 else np.nan

stat1["Percentage Between PI Trans"] = stat1.apply(share_between_pi,axis=1)
stat1

Unnamed: 0,Succession_Broad,Between PI Trans,Within PI Trans,Percentage Between PI Trans
0,KIS to KJI,190,73,0.7224
1,KJI to KJU,218,90,0.7078


In [78]:
# two-sample test of proportions: p(between PI trans in s1) > p(between PI trans in s2)
# row 0 / row 1 are the two succession periods; column 1 = between-PI count, column 2 = within-PI count
between_s1,within_s1 = stat1.iloc[0,1],stat1.iloc[0,2]
between_s2,within_s2 = stat1.iloc[1,1],stat1.iloc[1,2]

x1,n1 = between_s1,between_s1 + within_s1
x2,n2 = between_s2,between_s2 + within_s2
alpha = .05
print(x1,x2,n1,n2,alpha)
two_sample_ttest_proportions(x1,x2,n1,n2,alpha)

190 218 263 308 0.05
p1:	 0.7224334600760456
p2:	 0.7077922077922078
p1-p2:	 0.014641252283837836
z:	 0.3861229600869101
pvalue:	 0.34970280312360313


### 2. s1s2 trans count (within & between PI), succession narrow

In [79]:
stat2_columns = ["PISame","Succession_Narrow","OrgName_2"]
stat2_groupby_columns = ["Succession_Narrow","PISame"]
# stat1_label_columns = ["Year","Total Transitions","Total Advancements of OrgRank"]

In [80]:
# count transitions (non-null OrgName_2) for each Succession_Narrow x PISame combination
stat2 = trans[stat2_columns].groupby(stat2_groupby_columns,as_index=False)["OrgName_2"].agg({"Total Transitions":"count"})

# pivot the PISame values from rows into columns: one row per Succession_Narrow
stat2 = stat2.pivot(index="Succession_Narrow",columns="PISame",values="Total Transitions")
# labels are assigned by position: first pivoted column -> between, second -> within
# NOTE(review): presumes the pivot yields the PISame=False column before PISame=True — confirm
pivot_column_labels = ["Between PI Trans","Within PI Trans"]
stat2.columns = pivot_column_labels
stat2 = stat2.reset_index()

stat2

Unnamed: 0,Succession_Narrow,Between PI Trans,Within PI Trans
0,KIS to KJI,81,41
1,KJI to KJU,160,43


In [81]:
def share_between_pi(row):
    """Between-PI share of a row's transitions, rounded to 4 places; NaN when the row has none."""
    between = row["Between PI Trans"]
    total = between + row["Within PI Trans"]
    return round(between / total,4) if total > 0 else np.nan

stat2["Percentage Between PI Trans"] = stat2.apply(share_between_pi,axis=1)
stat2

Unnamed: 0,Succession_Narrow,Between PI Trans,Within PI Trans,Percentage Between PI Trans
0,KIS to KJI,81,41,0.6639
1,KJI to KJU,160,43,0.7882


In [82]:
# two-sample test of proportions: p(between PI trans in s1) > p(between PI trans in s2)
# row 0 / row 1 are the two succession periods; column 1 = between-PI count, column 2 = within-PI count
between_s1,within_s1 = stat2.iloc[0,1],stat2.iloc[0,2]
between_s2,within_s2 = stat2.iloc[1,1],stat2.iloc[1,2]

x1,n1 = between_s1,between_s1 + within_s1
x2,n2 = between_s2,between_s2 + within_s2
alpha = .05
print(x1,x2,n1,n2,alpha)
two_sample_ttest_proportions(x1,x2,n1,n2,alpha)

81 160 122 203 0.05
p1:	 0.6639344262295082
p2:	 0.7881773399014779
p1-p2:	 -0.12424291367196971
z:	 -2.4773825722817513
pvalue:	 0.006617497612158685**


### 3. s1s2 trans count (within & between PI), succession broad, by sending PI

In [83]:
stat3_columns = ["InstitutionCategory_1","PISame","Succession_Broad","OrgName_2"]
stat3_groupby_columns = ["InstitutionCategory_1","Succession_Broad","PISame"]

In [84]:
# count transitions (non-null OrgName_2) for each InstitutionCategory_1 x Succession_Broad x PISame combination
stat3 = trans[stat3_columns].groupby(stat3_groupby_columns,as_index=False)["OrgName_2"].agg({"Total Transitions":"count"})

# pivot the PISame values from rows into columns: one row per (InstitutionCategory_1, Succession_Broad)
stat3 = stat3.pivot(index=["InstitutionCategory_1","Succession_Broad",],columns="PISame",values="Total Transitions")
# labels are assigned by position: first pivoted column -> between, second -> within
# NOTE(review): presumes the pivot yields the PISame=False column before PISame=True — confirm
pivot_column_labels = ["Between PI Trans","Within PI Trans"]
stat3.columns = pivot_column_labels
stat3 = stat3.reset_index()

stat3

Unnamed: 0,InstitutionCategory_1,Succession_Broad,Between PI Trans,Within PI Trans
0,Government,KIS to KJI,71,37
1,Government,KJI to KJU,65,75
2,Military,KIS to KJI,18,1
3,Military,KJI to KJU,52,3
4,Party,KIS to KJI,24,26
5,Party,KJI to KJU,9,0
6,Social,KIS to KJI,77,9
7,Social,KJI to KJU,92,12


In [85]:
stat3["Percentage Between PI Trans"] = stat3.apply(lambda x: round(x["Between PI Trans"] / (x["Between PI Trans"] + x["Within PI Trans"]),4) if (x["Between PI Trans"] + x["Within PI Trans"]) > 0 else np.nan,axis=1)
stat3

Unnamed: 0,InstitutionCategory_1,Succession_Broad,Between PI Trans,Within PI Trans,Percentage Between PI Trans
0,Government,KIS to KJI,71,37,0.6574
1,Government,KJI to KJU,65,75,0.4643
2,Military,KIS to KJI,18,1,0.9474
3,Military,KJI to KJU,52,3,0.9455
4,Party,KIS to KJI,24,26,0.48
5,Party,KJI to KJU,9,0,1.0
6,Social,KIS to KJI,77,9,0.8953
7,Social,KJI to KJU,92,12,0.8846


In [86]:
ics = list(trans.InstitutionCategory_2.unique())
ics

['Government', 'Social', 'Party', 'Military']

In [87]:
ttest_by_institution_category(ics,stat3,2,3,0,1)



 Government 


71 65 108 140 0.05
p1:	 0.6574074074074074
p2:	 0.4642857142857143
p1-p2:	 0.19312169312169314
z:	 3.0300816457868387
pvalue:	 0.0012224381847967258**


 Social 


77 92 86 104 0.05
p1:	 0.8953488372093024
p2:	 0.8846153846153846
p1-p2:	 0.01073345259391778
z:	 0.23487086031197224
pvalue:	 0.40715448353707107


 Party 


24 9 50 9 0.05
p1:	 0.48
p2:	 1.0
p1-p2:	 -0.52
z:	 -2.8926238482168274
pvalue:	 0.00191019264628145**


 Military 


18 52 19 55 0.05
p1:	 0.9473684210526315
p2:	 0.9454545454545454
p1-p2:	 0.00191387559808609
z:	 0.031805973636855445
pvalue:	 0.4873133913873938


### 4. s1s2 trans count (within & between PI), succession narrow, by sending PI

In [88]:
stat4_columns = ["InstitutionCategory_1","PISame","Succession_Narrow","OrgName_2"]
stat4_groupby_columns = ["InstitutionCategory_1","Succession_Narrow","PISame"]

In [89]:
# count transitions (non-null OrgName_2) for each InstitutionCategory_1 x Succession_Narrow x PISame combination
stat4 = trans[stat4_columns].groupby(stat4_groupby_columns,as_index=False)["OrgName_2"].agg({"Total Transitions":"count"})

# pivot the PISame values from rows into columns: one row per (InstitutionCategory_1, Succession_Narrow)
stat4 = stat4.pivot(index=["InstitutionCategory_1","Succession_Narrow",],columns="PISame",values="Total Transitions")
# labels are assigned by position: first pivoted column -> between, second -> within
# NOTE(review): presumes the pivot yields the PISame=False column before PISame=True — confirm
pivot_column_labels = ["Between PI Trans","Within PI Trans"]
stat4.columns = pivot_column_labels
stat4 = stat4.reset_index()

stat4

Unnamed: 0,InstitutionCategory_1,Succession_Narrow,Between PI Trans,Within PI Trans
0,Government,KIS to KJI,25,25
1,Government,KJI to KJU,50,39
2,Military,KIS to KJI,3,1
3,Military,KJI to KJU,48,3
4,Party,KIS to KJI,12,13
5,Party,KJI to KJU,9,0
6,Social,KIS to KJI,41,2
7,Social,KJI to KJU,53,1


In [90]:
stat4["Percentage Between PI Trans"] = stat4.apply(lambda x: round(x["Between PI Trans"] / (x["Between PI Trans"] + x["Within PI Trans"]),4) if (x["Between PI Trans"] + x["Within PI Trans"]) > 0 else np.nan,axis=1)
stat4

Unnamed: 0,InstitutionCategory_1,Succession_Narrow,Between PI Trans,Within PI Trans,Percentage Between PI Trans
0,Government,KIS to KJI,25,25,0.5
1,Government,KJI to KJU,50,39,0.5618
2,Military,KIS to KJI,3,1,0.75
3,Military,KJI to KJU,48,3,0.9412
4,Party,KIS to KJI,12,13,0.48
5,Party,KJI to KJU,9,0,1.0
6,Social,KIS to KJI,41,2,0.9535
7,Social,KJI to KJU,53,1,0.9815


In [91]:
ics = list(trans.InstitutionCategory_2.unique())
ics

['Government', 'Social', 'Party', 'Military']

In [92]:
ttest_by_institution_category(ics,stat4,2,3,0,1)



 Government 


25 50 50 89 0.05
p1:	 0.5
p2:	 0.5617977528089888
p1-p2:	 -0.061797752808988804
z:	 -0.7015190513455424
pvalue:	 0.241489574744294


 Social 


41 53 43 54 0.05
p1:	 0.9534883720930233
p2:	 0.9814814814814815
p1-p2:	 -0.027993109388458226
z:	 -0.7911221823627147
pvalue:	 0.21443634794698374


 Party 


12 9 25 9 0.05
p1:	 0.48
p2:	 1.0
p1-p2:	 -0.52
z:	 -2.752661050173606
pvalue:	 0.0029556531624868043**


 Military 


3 48 4 51 0.05
p1:	 0.75
p2:	 0.9411764705882353
p1-p2:	 -0.19117647058823528
z:	 -1.4178026519447586
pvalue:	 0.07812419485736677


### 5. s1s2 trans count (within & between PI), succession broad, by receiving PI

In [93]:
stat5_columns = ["InstitutionCategory_2","PISame","Succession_Broad","OrgName_2"]
stat5_groupby_columns = ["InstitutionCategory_2","Succession_Broad","PISame"]

In [94]:
# count transitions (non-null OrgName_2) for each InstitutionCategory_2 x Succession_Broad x PISame combination
stat5 = trans[stat5_columns].groupby(stat5_groupby_columns,as_index=False)["OrgName_2"].agg({"Total Transitions":"count"})

# pivot the PISame values from rows into columns: one row per (InstitutionCategory_2, Succession_Broad)
stat5 = stat5.pivot(index=["InstitutionCategory_2","Succession_Broad",],columns="PISame",values="Total Transitions")
# labels are assigned by position: first pivoted column -> between, second -> within
# NOTE(review): presumes the pivot yields the PISame=False column before PISame=True — confirm
pivot_column_labels = ["Between PI Trans","Within PI Trans"]
stat5.columns = pivot_column_labels
stat5 = stat5.reset_index()

stat5

Unnamed: 0,InstitutionCategory_2,Succession_Broad,Between PI Trans,Within PI Trans
0,Government,KIS to KJI,72,37
1,Government,KJI to KJU,69,75
2,Military,KIS to KJI,5,1
3,Military,KJI to KJU,15,3
4,Party,KIS to KJI,86,26
5,Party,KJI to KJU,110,0
6,Social,KIS to KJI,27,9
7,Social,KJI to KJU,24,12


In [95]:
stat5["Percentage Between PI Trans"] = stat5.apply(lambda x: round(x["Between PI Trans"] / (x["Between PI Trans"] + x["Within PI Trans"]),4) if (x["Between PI Trans"] + x["Within PI Trans"]) > 0 else np.nan,axis=1)
stat5

Unnamed: 0,InstitutionCategory_2,Succession_Broad,Between PI Trans,Within PI Trans,Percentage Between PI Trans
0,Government,KIS to KJI,72,37,0.6606
1,Government,KJI to KJU,69,75,0.4792
2,Military,KIS to KJI,5,1,0.8333
3,Military,KJI to KJU,15,3,0.8333
4,Party,KIS to KJI,86,26,0.7679
5,Party,KJI to KJU,110,0,1.0
6,Social,KIS to KJI,27,9,0.75
7,Social,KJI to KJU,24,12,0.6667


In [96]:
ics = list(trans.InstitutionCategory_2.unique())
ics

['Government', 'Social', 'Party', 'Military']

In [100]:
ttest_by_institution_category_2(ics,stat5,2,3,0,1)



 Government 


72 69 109 144 0.05
p1:	 0.6605504587155964
p2:	 0.4791666666666667
p1-p2:	 0.18138379204892968
z:	 2.8763032040275616
pvalue:	 0.002011815036797704**


 Social 


27 24 36 36 0.05
p1:	 0.75
p2:	 0.6666666666666666
p1-p2:	 0.08333333333333337
z:	 0.7778444682625976
pvalue:	 0.21833035452449723


 Party 


86 110 112 110 0.05
p1:	 0.7678571428571429
p2:	 1.0
p1-p2:	 -0.2321428571428571
z:	 -5.378020542566587
pvalue:	 3.765461276117321e-08***


 Military 


5 15 6 18 0.05
p1:	 0.8333333333333334
p2:	 0.8333333333333334
p1-p2:	 0.0
z:	 0.0
pvalue:	 0.5


### 6. s1s2 trans count (within & between PI), succession narrow, by receiving PI

In [101]:
stat6_columns = ["InstitutionCategory_2","PISame","Succession_Narrow","OrgName_2"]
stat6_groupby_columns = ["InstitutionCategory_2","Succession_Narrow","PISame"]

In [102]:
# count transitions (non-null OrgName_2) for each InstitutionCategory_2 x Succession_Narrow x PISame combination
stat6 = trans[stat6_columns].groupby(stat6_groupby_columns,as_index=False)["OrgName_2"].agg({"Total Transitions":"count"})

# pivot the PISame values from rows into columns: one row per (InstitutionCategory_2, Succession_Narrow)
stat6 = stat6.pivot(index=["InstitutionCategory_2","Succession_Narrow",],columns="PISame",values="Total Transitions")
# labels are assigned by position: first pivoted column -> between, second -> within
# NOTE(review): presumes the pivot yields the PISame=False column before PISame=True — confirm
pivot_column_labels = ["Between PI Trans","Within PI Trans"]
stat6.columns = pivot_column_labels
stat6 = stat6.reset_index()

stat6

Unnamed: 0,InstitutionCategory_2,Succession_Narrow,Between PI Trans,Within PI Trans
0,Government,KIS to KJI,39,25
1,Government,KJI to KJU,31,39
2,Military,KIS to KJI,6,1
3,Military,KJI to KJU,11,3
4,Party,KIS to KJI,23,13
5,Party,KJI to KJU,110,0
6,Social,KIS to KJI,13,2
7,Social,KJI to KJU,8,1


In [103]:
stat6["Percentage Between PI Trans"] = stat6.apply(lambda x: round(x["Between PI Trans"] / (x["Between PI Trans"] + x["Within PI Trans"]),4) if (x["Between PI Trans"] + x["Within PI Trans"]) > 0 else np.nan,axis=1)
stat6

Unnamed: 0,InstitutionCategory_2,Succession_Narrow,Between PI Trans,Within PI Trans,Percentage Between PI Trans
0,Government,KIS to KJI,39,25,0.6094
1,Government,KJI to KJU,31,39,0.4429
2,Military,KIS to KJI,6,1,0.8571
3,Military,KJI to KJU,11,3,0.7857
4,Party,KIS to KJI,23,13,0.6389
5,Party,KJI to KJU,110,0,1.0
6,Social,KIS to KJI,13,2,0.8667
7,Social,KJI to KJU,8,1,0.8889


In [104]:
ics = list(trans.InstitutionCategory_2.unique())
ics

['Government', 'Social', 'Party', 'Military']

In [105]:
ttest_by_institution_category_2(ics,stat6,2,3,0,1)



 Government 


39 31 64 70 0.05
p1:	 0.609375
p2:	 0.44285714285714284
p1-p2:	 0.16651785714285716
z:	 1.9275835556878367
pvalue:	 0.026953472949736046*


 Social 


13 8 15 9 0.05
p1:	 0.8666666666666667
p2:	 0.8888888888888888
p1-p2:	 -0.022222222222222143
z:	 -0.1593638145779186
pvalue:	 0.43669112315483205


 Party 


23 110 36 110 0.05
p1:	 0.6388888888888888
p2:	 1.0
p1-p2:	 -0.36111111111111116
z:	 -6.603396508384963
pvalue:	 2.0092150165851308e-11***


 Military 


6 11 7 14 0.05
p1:	 0.8571428571428571
p2:	 0.7857142857142857
p1-p2:	 0.0714285714285714
z:	 0.39295262399668773
pvalue:	 0.3471772365152276
