In [139]:
import pandas as pd
import numpy as np
from datetime import date
from statistics import mean, mode
from pandas.api.types import CategoricalDtype
import math
from scipy.stats import norm

In [140]:
today = date.today()
print(today)

2024-01-14


# Tables

In [141]:
path_tables = "C:/Users/seoul/Dropbox/00 technical/github/nkelites/data/combined data/combined data - 2 tables/"

In [142]:
# tables
filename_careerorglink = "careerorglink.xlsx"
filename_leadercareerlink = "leadercareerlink.xlsx"
filename_orgtree = "orgtree.xlsx"
filename_elected = "positions_elected.xlsx"

In [143]:
# career-org link
# col = pd.read_excel(path_tables + filename_careerorglink,dtype="str")
# col.shape

In [144]:
# col.columns

In [145]:
# leader-career link
# lcl = pd.read_excel(path_tables + filename_leadercareerlink,dtype="str")
# lcl.shape

In [146]:
# lcl.columns

In [147]:
# orgtree
# org = pd.read_excel(path_tables + filename_orgtree,dtype="str")
# org.shape

In [148]:
# org.columns

In [149]:
# elected = pd.read_excel(path_tables + filename_elected,dtype="str")
# elected.shape

In [150]:
# elected.columns

# Queries

In [151]:
path_queries = "C:/Users/seoul/Dropbox/00 technical/github/nkelites/data/combined data/combined data - 3 queries/"

In [152]:
filename_leaderjob_all = "leaderjob_electUnelect_inOutgov.xlsx"
filename_leaderjob_no_spa = "leaderjob_no_spa.xlsx"
filename_leaderjobtransition_no_spa = "leaderjobtransition_no_spa.xlsx"

In [153]:
# leader jobs
# ljobs_all = pd.read_excel(path_queries + filename_leaderjob_all,dtype="str")
# ljobs_all.shape

In [154]:
# ljobs = pd.read_excel(path_queries + filename_leaderjob_no_spa,dtype="str")
# ljobs.shape

In [155]:
# transitions used for analysis - no SPA - no local-local
trans = pd.read_excel(path_queries + filename_leaderjobtransition_no_spa,dtype="str")
trans.shape

(4306, 31)

# Analysis - Research Note

In [156]:
path_analysis = "C:/Users/seoul/Dropbox/00 technical/github/nkelites/analysis/"

In [157]:
# analysis sub-paths
study0_path = "2023.10.04 Study 0 - research note/"
study1_path = "2023.10.04 Study 1 - political capital/"
study2_path = "2023.10.04 Study 2 - commitment vs control/"
study3_path = "2023.10.04 Study 3 - reds vs experts/"

In [158]:
# ljobs = ljobs.astype({"CareerStartYear":"int","CareerStartDate":"int"})
# ljobs.dtypes

# Functions

In [159]:
def merge_results(m):
    
    print("\nMerge Results...")
    print("")
    print("\tshape     :",m.shape)
    print("\tleft_only :",m[m["_merge"]=="left_only"].shape)
    print("\tboth      :",m[m["_merge"]=="both"].shape)
    print("\tright_only:",m[m["_merge"]=="right_only"].shape)

In [160]:
# using this on (PI,OrgName) will ensure unique & non-null keys
# using this on a larger df will ensure unique rows and non-null keys, but not unique keys

def unique_non_null_rows(olddf):
    
    df = olddf.copy()
    
    
    ### drop duplicates
    df = df.drop_duplicates(keep="first",ignore_index=True)

    ### drop null rows
    df.dropna(how="all",axis=0,inplace=True)
    
    # drop rows with null PI
    df = df[~(df["PrimaryInstitution"].isna())]
        
    ### drop rows where PI contains stop words
    stop_words_lower = ["uncertain","current","deprecated","please_revise"]
    df = df[ ~ df["PrimaryInstitution"].str.lower().isin(stop_words_lower)]
            

    print("\nUnique Non-null Rows...")
    print("")
    print("\tNon-unique rows:",olddf.shape)
    print("\tUnique rows    :",df.shape)

    ### sort
    df = df.sort_values(["PrimaryInstitution","OrgName"])
    
    return df

In [161]:
def create_time_series(series,group_var,count_var):
    
    yeardist = series.groupby(group_var,as_index=False).count().sort_values(group_var)
    
    max_year = max(series[group_var])
    x = pd.DataFrame({"year":pd.Series(range(min(yeardist[group_var].astype(int)),max(yeardist[group_var].astype(int))+1))})
    
    yeardist[group_var] = yeardist[group_var].astype(int,errors="ignore")
    
    ts = x.merge(yeardist,left_on="year",right_on=group_var,how="left")
    ts.loc[ts[count_var].isna(),count_var]=0
    # merge_results(ts)
    
    ts_columns = ["year",count_var]
    ts = ts[ts_columns]
    
    return ts

In [162]:
# create InstitutionCategory (English) variable InstitutionType

# use with apply. e.g., 
# df["InstitutionCategory] = df["InstitutionType"].apply(define_institution_category)

def define_institution_category(PI):
    
    c = "Social"
    
    if PI=="정권기관":
        c = "Government"
    elif PI=="노동당":
        c = "Party"
    elif PI=="인민군":
        c = "Military"
        
    return c

In [163]:
def two_sample_ttest_proportions(x1,x2,n1,n2,alpha):
    
    pstar = (x1+x2)/(n1+n2)
    p1 = x1/n1
    p2 = x2/n2
    z = (p1-p2) / math.sqrt(pstar*(1-pstar)*((1/n1) + (1/n2)))
    pvalue = 1-norm.cdf(abs(z))
    
    # add significance stars to result
    stars = ""
    if pvalue<.05:
        stars = "*"
    if pvalue<.01:
        stars = "**"
    if pvalue<.001:
        stars = "***"
    result = str(pvalue)+stars
    
    print("p1:\t",p1)
    print("p2:\t",p2)
    print("p1-p2:\t",(p1-p2))
    print("z:\t",z)
    print("pvalue:\t",result)

In [164]:
# apply two_sample_ttest_proportions across categories

def ttest_by_group_variable(group_var,ics,stat,between_row,within_row,period1_col,period2_col):

    print("\ngrouping by: ",group_var)

    for ic in ics:

        print("\n\n",ic,"\n\n")
        

        stat_ic = stat[stat[group_var]==ic]

        # two-sample ttest of proportions: p(between PI trans in s1) > p(between PI trans in s2)
        x1 = stat_ic.iloc[period1_col,between_row]
        n1 = stat_ic.iloc[period1_col,between_row] + stat_ic.iloc[period1_col,within_row]
        x2 = stat_ic.iloc[period2_col,between_row]
        n2 = stat_ic.iloc[period2_col,between_row] + stat_ic.iloc[period2_col,within_row]
        alpha = .05
        print(x1,x2,n1,n2,alpha)
        two_sample_ttest_proportions(x1,x2,n1,n2,alpha)

# Format & Covariates

### 0. change datatypes

In [165]:
trans = trans.astype({"OrgAdvance":"int","PositionAdvance":"int",
                      "CareerStartYear_1":"int","CareerStartYear_2":"int",
                      "CareerStartDate_1":"int","CareerStartDate_2":"int"})
trans.dtypes

LeaderID                   object
CareerString_1             object
CareerDateString_2022_1    object
CareerStartYear_1           int32
CareerStartMonth_1         object
CareerStartDate_1           int32
CareerSubstring_1          object
InstitutionType_1          object
PrimaryInstitution_1       object
OrgName_1                  object
Local_1                    object
Position_1                 object
IsElected_1                object
OrgRank_1                  object
PositionRank_1             object
CareerString_2             object
CareerDateString_2022_2    object
CareerStartYear_2           int32
CareerStartMonth_2         object
CareerStartDate_2           int32
CareerSubstring_2          object
InstitutionType_2          object
PrimaryInstitution_2       object
OrgName_2                  object
Local_2                    object
Position_2                 object
IsElected_2                object
OrgRank_2                  object
PositionRank_2             object
OrgAdvance    

### 1. double-check exlcudes SPA jobs and local-local trans

In [166]:
trans.columns

Index(['LeaderID', 'CareerString_1', 'CareerDateString_2022_1',
       'CareerStartYear_1', 'CareerStartMonth_1', 'CareerStartDate_1',
       'CareerSubstring_1', 'InstitutionType_1', 'PrimaryInstitution_1',
       'OrgName_1', 'Local_1', 'Position_1', 'IsElected_1', 'OrgRank_1',
       'PositionRank_1', 'CareerString_2', 'CareerDateString_2022_2',
       'CareerStartYear_2', 'CareerStartMonth_2', 'CareerStartDate_2',
       'CareerSubstring_2', 'InstitutionType_2', 'PrimaryInstitution_2',
       'OrgName_2', 'Local_2', 'Position_2', 'IsElected_2', 'OrgRank_2',
       'PositionRank_2', 'OrgAdvance', 'PositionAdvance'],
      dtype='object')

In [167]:
trans.shape

(4306, 31)

In [168]:
trans.Local_1.unique()

array(['False', 'True'], dtype=object)

In [169]:
# remove Local-Local ties
trans = trans[~((trans["Local_1"]=="True") & (trans["Local_2"]=="True"))]
trans.shape

(4222, 31)

### 2. add InstitutionCategory_1, InstitutionCategory_2

In [170]:
trans.InstitutionType_1.unique()

array(['정권기관', '국제친선단체', '노동당', '인민군', '당외곽및사회단체_사회부문(별책)',
       '당외곽및사회단체_사회부문', '당외곽및사회단체_체육부문', '당외곽및사회단체_정치부문', '당외곽및사회단체_대외부문',
       '당외곽및사회단체_근로단체', '당외곽및사회단체_종교부문', '당외곽및사회단체_경제부문(별책)',
       '당외곽및사회단체_경제부문'], dtype=object)

### 3. PISame, OrgSame

In [171]:
trans.columns

Index(['LeaderID', 'CareerString_1', 'CareerDateString_2022_1',
       'CareerStartYear_1', 'CareerStartMonth_1', 'CareerStartDate_1',
       'CareerSubstring_1', 'InstitutionType_1', 'PrimaryInstitution_1',
       'OrgName_1', 'Local_1', 'Position_1', 'IsElected_1', 'OrgRank_1',
       'PositionRank_1', 'CareerString_2', 'CareerDateString_2022_2',
       'CareerStartYear_2', 'CareerStartMonth_2', 'CareerStartDate_2',
       'CareerSubstring_2', 'InstitutionType_2', 'PrimaryInstitution_2',
       'OrgName_2', 'Local_2', 'Position_2', 'IsElected_2', 'OrgRank_2',
       'PositionRank_2', 'OrgAdvance', 'PositionAdvance'],
      dtype='object')

In [172]:
trans["PISame"] = False
trans.loc[trans["PrimaryInstitution_1"]==trans["PrimaryInstitution_2"],["PISame"]] = True
trans.PISame.unique()

array([False,  True])

In [173]:
trans["OrgSame"] = np.nan
trans.loc[trans["PISame"]==True,"OrgSame"] = False
trans.loc[(trans["PISame"]==True) & (trans["OrgName_1"]==trans["OrgName_2"]),["OrgSame"]] = True
trans.OrgSame.unique()

array([nan, True, False], dtype=object)

### 4. OrgRankChange, PositionRankChange

In [174]:
trans["OrgRankChange"] = np.nan

In [175]:
trans.loc[trans["PISame"] & (trans["OrgRank_1"]>trans["OrgRank_2"]),"OrgRankChange"] = "lower"
trans.loc[trans["PISame"] & (trans["OrgRank_1"]==trans["OrgRank_2"]),"OrgRankChange"] = "same"
trans.loc[trans["PISame"] & (trans["OrgRank_1"]<trans["OrgRank_2"]),"OrgRankChange"] = "higher"

In [176]:
value_order = ["lower","same","higher"]
trans.OrgRankChange = trans.OrgRankChange.astype("category")
trans.OrgRankChange = trans.OrgRankChange.cat.set_categories(value_order)

In [177]:
trans.OrgRankChange.unique()

[NaN, 'same', 'higher', 'lower']
Categories (3, object): ['lower', 'same', 'higher']

In [178]:
trans["PositionRankChange"] = np.nan

In [179]:
trans.loc[trans["OrgSame"] & (trans["PositionRank_1"]>trans["PositionRank_2"]),"PositionRankChange"] = "lower"
trans.loc[trans["OrgSame"] & (trans["PositionRank_1"]==trans["PositionRank_2"]),"PositionRankChange"] = "same"
trans.loc[trans["OrgSame"] & (trans["PositionRank_1"]<trans["PositionRank_2"]),"PositionRankChange"] = "higher"

In [180]:
value_order = ["lower","same","higher"]
trans.PositionRankChange = trans.PositionRankChange.astype("category")
trans.PositionRankChange = trans.PositionRankChange.cat.set_categories(value_order)

In [181]:
trans.PositionRankChange.unique()

[NaN, 'same', 'lower', 'higher']
Categories (3, object): ['lower', 'same', 'higher']

### 5. Succession Period - Broad

* KIS-->KJI: 1974-1993 - less institutionalized (stronger intra)
* KJI--->KJU:  2002-2011 - more institutionalized (stronger inter)

* KISlate: 1974-1993 - succession
* KJIearly: 1994-2001 - non-succession
* KJIlate: 2002-2011 - succession
* KJUearly: 2011-2018 - X 

* s1 vs s2: KISlate vs KJIlate

* succession vs non-succession
    * succession: KISlate, KJIlate
    * non-succession: KJIearly (1994-2001)

In [182]:
trans["Succession_Broad"] = np.nan

In [183]:
value_order = ["KIS to KJI","KJI to KJU"]
trans.Succession_Broad = trans.Succession_Broad.astype("category")
trans.Succession_Broad = trans.Succession_Broad.cat.set_categories(value_order)

In [184]:
trans.loc[(trans["CareerStartYear_2"]>1973) & (trans["CareerStartYear_2"]<1994),"Succession_Broad"] = "KIS to KJI"
trans.loc[(trans["CareerStartYear_2"]>2001) & (trans["CareerStartYear_2"]<2012),"Succession_Broad"] = "KJI to KJU"

In [185]:
trans[["Succession_Broad","OrgName_2"]].groupby("Succession_Broad",as_index=False).count()

Unnamed: 0,Succession_Broad,OrgName_2
0,KIS to KJI,674
1,KJI to KJU,648


In [186]:
trans.loc[trans["Succession_Broad"]=="KIS to KJI","CareerStartYear_2"]

34      1977
35      1977
36      1979
37      1980
38      1982
        ... 
3805    1980
3934    1993
4285    1977
4286    1980
4287    1990
Name: CareerStartYear_2, Length: 1006, dtype: int32

### 5.1 IsSuccessionBroad

In [187]:
trans["IsSuccessionBroad"] = np.nan

In [189]:
trans.loc[~trans["Succession_Broad"].isna(),"IsSuccessionBroad"] = True
trans.loc[(trans["CareerStartYear_2"]>1993) & (trans["CareerStartYear_2"]<2002),"IsSuccessionBroad"] = False

In [191]:
select_columns = ["CareerStartYear_2","IsSuccessionBroad","LeaderID"]
group_columns = ["CareerStartYear_2","IsSuccessionBroad"]
trans[select_columns].groupby(group_columns,as_index=False)["LeaderID"].count()

Unnamed: 0,CareerStartYear_2,IsSuccessionBroad,LeaderID
0,1974,True,28
1,1975,True,15
2,1976,True,25
3,1977,True,38
4,1978,True,25
5,1979,True,14
6,1980,True,132
7,1981,True,26
8,1982,True,48
9,1983,True,28


### 6. Succession Period - Narrow

#### Esther's periodization
* KIS-->KJI: 1987-1994 - less institutionalized (stronger intra)
* KJI--->KJU:  2009-2011 - more institutionalized (stronger inter)

In [210]:
trans["Succession_Narrow"] = np.nan

In [211]:
value_order = ["KIS to KJI","KJI to KJU"]
trans.Succession_Narrow = trans.Succession_Narrow.astype("category")
trans.Succession_Narrow = trans.Succession_Narrow.cat.set_categories(value_order)

In [212]:
### Esther's periodization

trans.loc[(trans["CareerStartYear_2"]>1986) & (trans["CareerStartYear_2"]<1995),"Succession_Narrow"] = "KIS to KJI"
trans.loc[(trans["CareerStartYear_2"]>2008) & (trans["CareerStartYear_2"]<2012),"Succession_Narrow"] = "KJI to KJU"

In [213]:
trans[["Succession_Narrow","OrgName_2"]].groupby("Succession_Narrow",as_index=False).count()

Unnamed: 0,Succession_Narrow,OrgName_2
0,KIS to KJI,323
1,KJI to KJU,383


### 7. OrgAdvanceYes

In [214]:
trans["OrgAdvanceYes"] = np.nan

In [215]:
trans.loc[trans["PISame"]==True,"OrgAdvanceYes"] = False
trans.loc[trans["PISame"]==True & (trans["OrgRank_1"]>trans["OrgRank_2"]),"OrgAdvanceYes"] = True

### 8. PositionAdvanceYes

In [216]:
trans["PositionAdvanceYes"] = np.nan

In [217]:
trans.loc[trans["OrgSame"]==True,"PositionAdvanceYes"] = False
trans.loc[trans["OrgSame"]==True & (trans["PositionRank_1"]>trans["PositionRank_2"]),"PositionAdvanceYes"] = True

### 9. InstitutionCategory

In [218]:
trans.columns

Index(['LeaderID', 'CareerString_1', 'CareerDateString_2022_1',
       'CareerStartYear_1', 'CareerStartMonth_1', 'CareerStartDate_1',
       'CareerSubstring_1', 'InstitutionType_1', 'PrimaryInstitution_1',
       'OrgName_1', 'Local_1', 'Position_1', 'IsElected_1', 'OrgRank_1',
       'PositionRank_1', 'CareerString_2', 'CareerDateString_2022_2',
       'CareerStartYear_2', 'CareerStartMonth_2', 'CareerStartDate_2',
       'CareerSubstring_2', 'InstitutionType_2', 'PrimaryInstitution_2',
       'OrgName_2', 'Local_2', 'Position_2', 'IsElected_2', 'OrgRank_2',
       'PositionRank_2', 'OrgAdvance', 'PositionAdvance', 'PISame', 'OrgSame',
       'OrgRankChange', 'PositionRankChange', 'Succession_Broad',
       'IsSuccessionBroad', 'Succession_Narrow', 'OrgAdvanceYes',
       'PositionAdvanceYes'],
      dtype='object')

In [219]:
trans["InstitutionCategory_1"] = trans["InstitutionType_1"].apply(define_institution_category)
trans["InstitutionCategory_2"] = trans["InstitutionType_2"].apply(define_institution_category)

### 10. InstitutionCategorySame

In [220]:
trans["InstitutionCategorySame"] = trans["InstitutionCategory_1"] == trans["InstitutionCategory_2"]

### 11. Succession_Five

#### Jacob's periodization
* KIS-->KJI: 1994-1999 - less institutionalized (stronger intra)
* KJI--->KJU:  2011-2016 - more institutionalized (stronger inter)

In [60]:
trans["Succession_Five"] = np.nan

In [61]:
value_order = ["KIS to KJI","KJI to KJU"]
trans.Succession_Five = trans.Succession_Five.astype("category")
trans.Succession_Five = trans.Succession_Five.cat.set_categories(value_order)

In [62]:
### Jacob's periodization

trans.loc[(trans["CareerStartYear_2"]>1993) & (trans["CareerStartYear_2"]<2000),"Succession_Five"] = "KIS to KJI"
trans.loc[(trans["CareerStartYear_2"]>2010) & (trans["CareerStartYear_2"]<2016),"Succession_Five"] = "KJI to KJU"

In [63]:
trans[["Succession_Five","OrgName_2"]].groupby("Succession_Five",as_index=False).count()

Unnamed: 0,Succession_Five,OrgName_2
0,KIS to KJI,229
1,KJI to KJU,229


### 12. IsSuccession_Five

In [70]:
trans["IsSuccession_Five"] = False

In [71]:
trans.loc[~trans.Succession_Five.isna(),"IsSuccession_Five"] = True

In [72]:
trans[["IsSuccession_Five","OrgName_2"]].groupby("IsSuccession_Five",as_index=False).count()

Unnamed: 0,IsSuccession_Five,OrgName_2
0,False,2470
1,True,458


In [None]:
### 13. Regime X First_Five Variables

In [95]:
trans["Regime"] = np.nan
trans["First_Five"] = np.nan

In [96]:
### Regime periodization

trans.loc[(trans["CareerStartYear_2"]>1993) & (trans["CareerStartYear_2"]<2012),"Regime"] = "KJI"
trans.loc[trans["CareerStartYear_2"]>2011,"Regime"] = "KJU"

In [97]:
### First_Five periodization

trans.loc[(trans["CareerStartYear_2"]>1993) & (trans["CareerStartYear_2"]<2000),"First_Five"] = True
trans.loc[(trans["CareerStartYear_2"]>2010) & (trans["CareerStartYear_2"]<2016),"First_Five"] = True

In [98]:
trans.loc[(~trans["Regime"].isna() & trans["First_Five"].isna()),"First_Five"] = False

In [99]:
trans.groupby(["Regime","First_Five"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,LeaderID,CareerString_1,CareerDateString_2022_1,CareerStartYear_1,CareerStartMonth_1,CareerStartDate_1,CareerSubstring_1,InstitutionType_1,PrimaryInstitution_1,OrgName_1,...,Succession_Narrow,OrgAdvanceYes,PositionAdvanceYes,InstitutionCategory_1,InstitutionCategory_2,InstitutionCategorySame,Succession_Five,Succession_Non,IsSuccession_Five,Three_Periods
Regime,First_Five,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
KJI,False,1055,1055,203,1055,945,1055,181,1055,1055,631,...,472,891,298,1055,1055,1055,0,1055,1055,0
KJI,True,429,429,78,429,402,429,84,429,429,288,...,104,323,71,429,429,429,429,429,429,429
KJU,False,927,927,802,927,887,927,292,927,927,736,...,0,790,346,927,927,927,0,927,927,0
KJU,True,346,346,140,346,319,346,104,346,346,254,...,0,255,75,346,346,346,346,346,346,346


# Hypotheses 1. IsSuccessionBroad: trans count, within & between PI

### 1. IsSuccessionBroad trans count (within & between PI)

In [197]:
stat1_columns = ["PISame","IsSuccessionBroad","OrgName_2"]
stat1_groupby_columns = ["IsSuccessionBroad","PISame"]

In [199]:
stat1 = trans[stat1_columns].groupby(stat1_groupby_columns,as_index=False)["OrgName_2"].agg({"Total Transitions":"count"})

# pivot PISame from column to rows
stat1 = stat1.pivot(index="IsSuccessionBroad",columns="PISame",values="Total Transitions")
pivot_column_labels = ["Between PI Trans","Within PI Trans"]
stat1.columns = pivot_column_labels
stat1 = stat1.reset_index()

stat1

Unnamed: 0,IsSuccessionBroad,Between PI Trans,Within PI Trans
0,False,183,120
1,True,788,534


In [200]:
stat1["Percentage Between PI Trans"] = stat1.apply(lambda x: round(x["Between PI Trans"] / (x["Between PI Trans"] + x["Within PI Trans"]),4) if (x["Between PI Trans"] + x["Within PI Trans"]) > 0 else np.nan,axis=1)
stat1

Unnamed: 0,IsSuccessionBroad,Between PI Trans,Within PI Trans,Percentage Between PI Trans
0,False,183,120,0.604
1,True,788,534,0.5961


In [201]:
# two-sample ttest of proportions: p(between PI trans in s1) > p(between PI trans in s2)
x1 = stat1.iloc[0,1]
n1 = stat1.iloc[0,1] + stat1.iloc[0,2]
x2 = stat1.iloc[1,1]
n2 = stat1.iloc[1,1] + stat1.iloc[1,2]
alpha = .05
print(x1,x2,n1,n2,alpha)
two_sample_ttest_proportions(x1,x2,n1,n2,alpha)

183 788 303 1322 0.05
p1:	 0.6039603960396039
p2:	 0.5960665658093798
p1-p2:	 0.007893830230224164
z:	 0.25272780027386377
pvalue:	 0.4002392817923839


### 2. IsSuccessionBroad, by sending PI: trans count (within & between PI)

In [206]:
stat2_columns = ["InstitutionCategory_1","PISame","IsSuccessionBroad","OrgName_2"]
stat2_groupby_columns = ["InstitutionCategory_1","IsSuccessionBroad","PISame"]

In [222]:
stat2 = trans[stat2_columns].groupby(stat2_groupby_columns,as_index=False)["OrgName_2"].agg({"Total Transitions":"count"})

# pivot PISame from column to rows
stat2 = stat2.pivot(index=["InstitutionCategory_1","IsSuccessionBroad",],columns="PISame",values="Total Transitions")
pivot_column_labels = ["Between PI Trans","Within PI Trans"]
stat2.columns = pivot_column_labels
stat2 = stat2.reset_index()

stat2

Unnamed: 0,InstitutionCategory_1,IsSuccessionBroad,Between PI Trans,Within PI Trans
0,Government,False,97,79
1,Government,True,295,259
2,Military,False,12,7
3,Military,True,126,11
4,Party,False,27,30
5,Party,True,102,239
6,Social,False,47,4
7,Social,True,265,25


In [223]:
stat2["Percentage Between PI Trans"] = stat2.apply(lambda x: round(x["Between PI Trans"] / (x["Between PI Trans"] + x["Within PI Trans"]),4) if (x["Between PI Trans"] + x["Within PI Trans"]) > 0 else np.nan,axis=1)
stat2

Unnamed: 0,InstitutionCategory_1,IsSuccessionBroad,Between PI Trans,Within PI Trans,Percentage Between PI Trans
0,Government,False,97,79,0.5511
1,Government,True,295,259,0.5325
2,Military,False,12,7,0.6316
3,Military,True,126,11,0.9197
4,Party,False,27,30,0.4737
5,Party,True,102,239,0.2991
6,Social,False,47,4,0.9216
7,Social,True,265,25,0.9138


In [224]:
ics = list(trans.InstitutionCategory_1.unique())
ics

['Government', 'Social', 'Party', 'Military']

In [225]:
ttest_by_group_variable("InstitutionCategory_1",ics,stat2,2,3,0,1)


grouping by:  InstitutionCategory_1


 Government 


97 295 176 554 0.05
p1:	 0.5511363636363636
p2:	 0.5324909747292419
p1-p2:	 0.01864538890712175
z:	 0.43215815131725954
pvalue:	 0.33281323632429205


 Social 


47 265 51 290 0.05
p1:	 0.9215686274509803
p2:	 0.9137931034482759
p1-p2:	 0.007775524002704426
z:	 0.18357546919976897
pvalue:	 0.427173254393611


 Party 


27 102 57 341 0.05
p1:	 0.47368421052631576
p2:	 0.2991202346041056
p1-p2:	 0.17456397592221018
z:	 2.6063922955451573
pvalue:	 0.0045750797989165815**


 Military 


12 126 19 137 0.05
p1:	 0.631578947368421
p2:	 0.9197080291970803
p1-p2:	 -0.28812908182865926
z:	 -3.683924175727469
pvalue:	 0.00011483529329048281***


# Hypotheses 2. IsSuccessionNarrow: trans count, within & between PI

### 3. IsSuccessionNarrow trans count (within & between PI)

In [231]:
stat3_columns = ["PISame","IsSuccessionNarrow","OrgName_2"]
stat3_groupby_columns = ["IsSuccessionNarrow","PISame"]

In [232]:
stat3 = trans[stat3_columns].groupby(stat3_groupby_columns,as_index=False)["OrgName_2"].agg({"Total Transitions":"count"})

# pivot PISame from column to rows
stat3 = stat3.pivot(index="IsSuccessionNarrow",columns="PISame",values="Total Transitions")
pivot_column_labels = ["Between PI Trans","Within PI Trans"]
stat3.columns = pivot_column_labels
stat3 = stat3.reset_index()

stat3

KeyError: "['IsSuccessionNarrow'] not in index"

In [200]:
stat3["Percentage Between PI Trans"] = stat3.apply(lambda x: round(x["Between PI Trans"] / (x["Between PI Trans"] + x["Within PI Trans"]),4) if (x["Between PI Trans"] + x["Within PI Trans"]) > 0 else np.nan,axis=1)
stat3

Unnamed: 0,IsSuccessionBroad,Between PI Trans,Within PI Trans,Percentage Between PI Trans
0,False,183,120,0.604
1,True,788,534,0.5961


In [233]:
# two-sample ttest of proportions: p(between PI trans in s1) > p(between PI trans in s2)
x1 = stat3.iloc[0,1]
n1 = stat3.iloc[0,1] + stat3.iloc[0,2]
x2 = stat3.iloc[1,1]
n2 = stat3.iloc[1,1] + stat3.iloc[1,2]
alpha = .05
print(x1,x2,n1,n2,alpha)
two_sample_ttest_proportions(x1,x2,n1,n2,alpha)

False True 375 172 0.05
p1:	 0.0
p2:	 0.005813953488372093
p1-p2:	 -0.005813953488372093
z:	 -1.477912601632068
pvalue:	 0.06971558645202247


### 2. IsSuccessionNarrow, by sending PI: trans count (within & between PI)

In [206]:
stat2_columns = ["InstitutionCategory_1","PISame","IsSuccessionBroad","OrgName_2"]
stat2_groupby_columns = ["InstitutionCategory_1","IsSuccessionBroad","PISame"]

In [222]:
stat2 = trans[stat2_columns].groupby(stat2_groupby_columns,as_index=False)["OrgName_2"].agg({"Total Transitions":"count"})

# pivot PISame from column to rows
stat2 = stat2.pivot(index=["InstitutionCategory_1","IsSuccessionBroad",],columns="PISame",values="Total Transitions")
pivot_column_labels = ["Between PI Trans","Within PI Trans"]
stat2.columns = pivot_column_labels
stat2 = stat2.reset_index()

stat2

Unnamed: 0,InstitutionCategory_1,IsSuccessionBroad,Between PI Trans,Within PI Trans
0,Government,False,97,79
1,Government,True,295,259
2,Military,False,12,7
3,Military,True,126,11
4,Party,False,27,30
5,Party,True,102,239
6,Social,False,47,4
7,Social,True,265,25


In [223]:
stat2["Percentage Between PI Trans"] = stat2.apply(lambda x: round(x["Between PI Trans"] / (x["Between PI Trans"] + x["Within PI Trans"]),4) if (x["Between PI Trans"] + x["Within PI Trans"]) > 0 else np.nan,axis=1)
stat2

Unnamed: 0,InstitutionCategory_1,IsSuccessionBroad,Between PI Trans,Within PI Trans,Percentage Between PI Trans
0,Government,False,97,79,0.5511
1,Government,True,295,259,0.5325
2,Military,False,12,7,0.6316
3,Military,True,126,11,0.9197
4,Party,False,27,30,0.4737
5,Party,True,102,239,0.2991
6,Social,False,47,4,0.9216
7,Social,True,265,25,0.9138


In [224]:
ics = list(trans.InstitutionCategory_1.unique())
ics

['Government', 'Social', 'Party', 'Military']

In [225]:
ttest_by_group_variable("InstitutionCategory_1",ics,stat2,2,3,0,1)


grouping by:  InstitutionCategory_1


 Government 


97 295 176 554 0.05
p1:	 0.5511363636363636
p2:	 0.5324909747292419
p1-p2:	 0.01864538890712175
z:	 0.43215815131725954
pvalue:	 0.33281323632429205


 Social 


47 265 51 290 0.05
p1:	 0.9215686274509803
p2:	 0.9137931034482759
p1-p2:	 0.007775524002704426
z:	 0.18357546919976897
pvalue:	 0.427173254393611


 Party 


27 102 57 341 0.05
p1:	 0.47368421052631576
p2:	 0.2991202346041056
p1-p2:	 0.17456397592221018
z:	 2.6063922955451573
pvalue:	 0.0045750797989165815**


 Military 


12 126 19 137 0.05
p1:	 0.631578947368421
p2:	 0.9197080291970803
p1-p2:	 -0.28812908182865926
z:	 -3.683924175727469
pvalue:	 0.00011483529329048281***
