In [22]:
import pandas as pd
import numpy as np
from datetime import date
from statistics import mean, mode
from pandas.api.types import CategoricalDtype
import math
from scipy.stats import norm

In [23]:
today = date.today()
print(today)

2024-01-07


# Tables

In [24]:
path_tables = "C:/Users/seoul/Dropbox/00 technical/github/nkelites/data/combined data/combined data - 2 tables/"

In [25]:
# tables
filename_careerorglink = "careerorglink.xlsx"
filename_leadercareerlink = "leadercareerlink.xlsx"
filename_orgtree = "orgtree.xlsx"
filename_elected = "positions_elected.xlsx"

In [27]:
# career-org link
col = pd.read_excel(path_tables + filename_careerorglink,dtype="str")
col.shape

(9002, 12)

In [28]:
col.columns

Index(['CareerString', 'CareerDateString_2022', 'IsJob', 'MultipleSubstrings',
       'CareerStartYear', 'CareerStartMonth', 'CareerSubstring',
       'InstitutionType', 'PrimaryInstitution', 'OrgName', 'Position',
       'Notes'],
      dtype='object')

In [29]:
# leader-career link
# lcl = pd.read_excel(path_tables + filename_leadercareerlink,dtype="str")
# lcl.shape

(12617, 3)

In [30]:
# lcl.columns

Index(['LeaderID', 'CareerString', 'CareerDateString_2022'], dtype='object')

In [9]:
# orgtree
# org = pd.read_excel(path_tables + filename_orgtree,dtype="str")
# org.shape

In [10]:
# org.columns

In [11]:
# elected = pd.read_excel(path_tables + filename_elected,dtype="str")
# elected.shape

In [12]:
# elected.columns

# Queries

In [31]:
path_queries = "C:/Users/seoul/Dropbox/00 technical/github/nkelites/data/combined data/combined data - 3 queries/"

In [32]:
filename_leaderjob_all = "leaderjob_electUnelect_inOutgov.xlsx"
filename_leaderjob_no_spa = "leaderjob_no_spa.xlsx"
filename_leaderjobtransition_no_spa = "leaderjobtransition_no_spa.xlsx"

In [15]:
# leader job - all
# ljobs_all = pd.read_excel(path_queries + filename_leaderjob_all,dtype="str")
# ljobs_all.shape

In [33]:
# leader job - no spa
ljobs = pd.read_excel(path_queries + filename_leaderjob_no_spa,dtype="str")
ljobs.shape

(6047, 15)

In [35]:
ljobs.columns

Index(['LeaderID', 'CareerString', 'CareerDateString_2022', 'CareerStartYear',
       'CareerStartMonth', 'CareerStartDate', 'CareerSubstring',
       'InstitutionType', 'PrimaryInstitution', 'OrgName', 'Local', 'Position',
       'IsElected', 'OrgRank', 'PositionRank'],
      dtype='object')

In [17]:
# transitions used for analysis - no SPA - no local-local
trans = pd.read_excel(path_queries + filename_leaderjobtransition_no_spa,dtype="str")
trans.shape

(4306, 31)

In [20]:
trans.columns

Index(['LeaderID', 'CareerString_1', 'CareerDateString_2022_1',
       'CareerStartYear_1', 'CareerStartMonth_1', 'CareerStartDate_1',
       'CareerSubstring_1', 'InstitutionType_1', 'PrimaryInstitution_1',
       'OrgName_1', 'Local_1', 'Position_1', 'IsElected_1', 'OrgRank_1',
       'PositionRank_1', 'CareerString_2', 'CareerDateString_2022_2',
       'CareerStartYear_2', 'CareerStartMonth_2', 'CareerStartDate_2',
       'CareerSubstring_2', 'InstitutionType_2', 'PrimaryInstitution_2',
       'OrgName_2', 'Local_2', 'Position_2', 'IsElected_2', 'OrgRank_2',
       'PositionRank_2', 'OrgAdvance', 'PositionAdvance'],
      dtype='object')

# Analysis - Research Note

In [36]:
path_analysis = "C:/Users/seoul/Dropbox/00 technical/github/nkelites/analysis/"

In [37]:
# analysis sub-paths
study0_path = "2023.10.04 Study 0 - research note/"
study1_path = "2023.10.04 Study 1 - political capital/"
study2_path = "2023.10.04 Study 2 - commitment vs control/"
study3_path = "2023.10.04 Study 3 - reds vs experts/"

In [38]:
# ljobs = ljobs.astype({"CareerStartYear":"int","CareerStartDate":"int"})
# ljobs.dtypes

# Functions

In [39]:
def merge_results(m):
    
    print("\nMerge Results...")
    print("")
    print("\tshape     :",m.shape)
    print("\tleft_only :",m[m["_merge"]=="left_only"].shape)
    print("\tboth      :",m[m["_merge"]=="both"].shape)
    print("\tright_only:",m[m["_merge"]=="right_only"].shape)

In [40]:
# using this on (PI,OrgName) will ensure unique & non-null keys
# using this on a larger df will ensure unique rows and non-null keys, but not unique keys

def unique_non_null_rows(olddf):
    
    df = olddf.copy()
    
    
    ### drop duplicates
    df = df.drop_duplicates(keep="first",ignore_index=True)

    ### drop null rows
    df.dropna(how="all",axis=0,inplace=True)
    
    # drop rows with null PI
    df = df[~(df["PrimaryInstitution"].isna())]
        
    ### drop rows where PI contains stop words
    stop_words_lower = ["uncertain","current","deprecated","please_revise"]
    df = df[ ~ df["PrimaryInstitution"].str.lower().isin(stop_words_lower)]
            

    print("\nUnique Non-null Rows...")
    print("")
    print("\tNon-unique rows:",olddf.shape)
    print("\tUnique rows    :",df.shape)

    ### sort
    df = df.sort_values(["PrimaryInstitution","OrgName"])
    
    return df

In [41]:
def create_time_series(series,group_var,count_var):
    
    yeardist = series.groupby(group_var,as_index=False).count().sort_values(group_var)
    
    max_year = max(series[group_var])
    x = pd.DataFrame({"year":pd.Series(range(min(yeardist[group_var].astype(int)),max(yeardist[group_var].astype(int))+1))})
    
    yeardist[group_var] = yeardist[group_var].astype(int,errors="ignore")
    
    ts = x.merge(yeardist,left_on="year",right_on=group_var,how="left")
    ts.loc[ts[count_var].isna(),count_var]=0
    # merge_results(ts)
    
    ts_columns = ["year",count_var]
    ts = ts[ts_columns]
    
    return ts

In [42]:
# create InstitutionCategory (English) variable InstitutionType

# use with apply. e.g., 
# df["InstitutionCategory] = df["InstitutionType"].apply(define_institution_category)

def define_institution_category(PI):
    
    c = "Social"
    
    if PI=="정권기관":
        c = "Government"
    elif PI=="노동당":
        c = "Party"
    elif PI=="인민군":
        c = "Military"
        
    return c

In [43]:
def two_sample_ttest_proportions(x1,x2,n1,n2,alpha):
    
    pstar = (x1+x2)/(n1+n2)
    p1 = x1/n1
    p2 = x2/n2
    z = (p1-p2) / math.sqrt(pstar*(1-pstar)*((1/n1) + (1/n2)))
    #pvalue = 
    print("p1:\t",p1)
    print("p2:\t",p2)
    print("p1-p2:\t",(p1-p2))
    print("z:\t",z)
    print("pvalue:\t","calculate")

In [193]:
def two_sample_ttest_means(m1,m2,v1,v2,n1,n2,alpha):
    
    num = m1-m2
    den = math.sqrt(v1/n1 + v2/n2)
    t = num/den
    #pvalue = 
    print("m1:\t",m1)
    print("m2:\t",m2)
    print("m1-m2:\t",(m1-m2))
    print("t:\t",t)
    print("pvalue:\t","calculate")

# Format & Covariates

### 1. add col.JointAppointment to ljobs

In [44]:
col.columns

Index(['CareerString', 'CareerDateString_2022', 'IsJob', 'MultipleSubstrings',
       'CareerStartYear', 'CareerStartMonth', 'CareerSubstring',
       'InstitutionType', 'PrimaryInstitution', 'OrgName', 'Position',
       'Notes'],
      dtype='object')

In [46]:
col.MultipleSubstrings

array(['1', '2', '4', '3', '5'], dtype=object)

In [47]:
ljobs.columns

Index(['LeaderID', 'CareerString', 'CareerDateString_2022', 'CareerStartYear',
       'CareerStartMonth', 'CareerStartDate', 'CareerSubstring',
       'InstitutionType', 'PrimaryInstitution', 'OrgName', 'Local', 'Position',
       'IsElected', 'OrgRank', 'PositionRank'],
      dtype='object')

In [50]:
key_columns = ["CareerString","CareerDateString_2022"]
col_columns = key_columns + ["MultipleSubstrings"]

In [52]:
# merge results confirm perfect match
ljobs2 = ljobs.merge(col[col_columns],on=key_columns,how="left")
# ljobs2 = ljobs.merge(col[col_columns],on=key_columns,how="left",indicator=True)
# merge_results(ljobs2)

### 2. Change Variable Type

In [156]:
ljobs2.CareerStartYear = ljobs2.CareerStartYear.astype("int")

In [157]:
ljobs2.MultipleSubstrings = ljobs2.MultipleSubstrings.astype("int")

### 3. Create new dataset - Unique LeaderID, CareerString, CareerStartYear, MultipleAppointments

* group by LeaderID, CareerStartYear and MultipleSubstrings
* in order to count all instances of appointments, including Joint and non-Joint appointments
* but not to double-count Joint Appointments

In [158]:
ljobs2.columns

Index(['LeaderID', 'CareerString', 'CareerDateString_2022', 'CareerStartYear',
       'CareerStartMonth', 'CareerStartDate', 'CareerSubstring',
       'InstitutionType', 'PrimaryInstitution', 'OrgName', 'Local', 'Position',
       'IsElected', 'OrgRank', 'PositionRank', 'MultipleSubstrings',
       'Succession_Broad', 'Succession_Narrow'],
      dtype='object')

In [159]:
analysis_columns = ["LeaderID","CareerString","CareerStartYear","MultipleSubstrings"]

In [160]:
lappt = ljobs2[analysis_columns].drop_duplicates()
lappt.shape

(5405, 4)

In [161]:
lappt.head()

Unnamed: 0,LeaderID,CareerString,CareerStartYear,MultipleSubstrings
0,리원일,노동성 상(유임),1999,1
1,조용원,당 정치국 후보위원,2020,1
2,정경택,당 중앙위원회 정치국 위원 (*당 중앙위원회 제7기 제4차 전원회의에서 보선),2019,1
3,리하일,인민무력부 작전국 국장,1975,1
4,최상건,(前)당 비서국 비서,2021,1


In [162]:
lappt["IsJointAppointment"]=False
lappt.loc[lappt["MultipleSubstrings"].astype("int")>1,"IsJointAppointment"]=True

In [163]:
lappt.head()

Unnamed: 0,LeaderID,CareerString,CareerStartYear,MultipleSubstrings,IsJointAppointment
0,리원일,노동성 상(유임),1999,1,False
1,조용원,당 정치국 후보위원,2020,1,False
2,정경택,당 중앙위원회 정치국 위원 (*당 중앙위원회 제7기 제4차 전원회의에서 보선),2019,1,False
3,리하일,인민무력부 작전국 국장,1975,1,False
4,최상건,(前)당 비서국 비서,2021,1,False


### 4. Succession Period - Broad

* KIS-->KJI: 1974-1993 - less institutionalized (stronger intra)
* KJI--->KJU:  2002-2011 - more institutionalized (stronger inter)

In [164]:
lappt.columns

Index(['LeaderID', 'CareerString', 'CareerStartYear', 'MultipleSubstrings',
       'IsJointAppointment'],
      dtype='object')

In [165]:
lappt["Succession_Broad"] = np.nan

In [166]:
value_order = ["KIS to KJI","KJI to KJU"]
lappt.Succession_Broad = ljobs2.Succession_Broad.astype("category")
lappt.Succession_Broad = ljobs2.Succession_Broad.cat.set_categories(value_order)

In [167]:
lappt.loc[(lappt["CareerStartYear"]>1973) & (lappt["CareerStartYear"]<1994),"Succession_Broad"] = "KIS to KJI"
lappt.loc[(lappt["CareerStartYear"]>2001) & (lappt["CareerStartYear"]<2012),"Succession_Broad"] = "KJI to KJU"

In [168]:
lappt.head()

Unnamed: 0,LeaderID,CareerString,CareerStartYear,MultipleSubstrings,IsJointAppointment,Succession_Broad
0,리원일,노동성 상(유임),1999,1,False,
1,조용원,당 정치국 후보위원,2020,1,False,
2,정경택,당 중앙위원회 정치국 위원 (*당 중앙위원회 제7기 제4차 전원회의에서 보선),2019,1,False,
3,리하일,인민무력부 작전국 국장,1975,1,False,KIS to KJI
4,최상건,(前)당 비서국 비서,2021,1,False,


### 5. Succession Period - Narrow

#### Esther's periodization
* KIS-->KJI: 1987-1994 - less institutionalized (stronger intra)
* KJI--->KJU:  2009-2011 - more institutionalized (stronger inter)

In [169]:
lappt["Succession_Narrow"] = np.nan

In [170]:
value_order = ["KIS to KJI","KJI to KJU"]
lappt.Succession_Narrow = lappt.Succession_Narrow.astype("category")
lappt.Succession_Narrow = lappt.Succession_Narrow.cat.set_categories(value_order)

In [171]:
### Esther's periodization
lappt.loc[(lappt["CareerStartYear"]>1986) & (lappt["CareerStartYear"]<1995),"Succession_Narrow"] = "KIS to KJI"
lappt.loc[(lappt["CareerStartYear"]>2008) & (lappt["CareerStartYear"]<2012),"Succession_Narrow"] = "KJI to KJU"

In [172]:
lappt.head()

Unnamed: 0,LeaderID,CareerString,CareerStartYear,MultipleSubstrings,IsJointAppointment,Succession_Broad,Succession_Narrow
0,리원일,노동성 상(유임),1999,1,False,,
1,조용원,당 정치국 후보위원,2020,1,False,,
2,정경택,당 중앙위원회 정치국 위원 (*당 중앙위원회 제7기 제4차 전원회의에서 보선),2019,1,False,,
3,리하일,인민무력부 작전국 국장,1975,1,False,KIS to KJI,
4,최상건,(前)당 비서국 비서,2021,1,False,,


# Hypothesis Test - # of Joint Appointments in Two Periods

### 1. two-sample t-test of proportion of joint appointments

In [173]:
lappt.columns

Index(['LeaderID', 'CareerString', 'CareerStartYear', 'MultipleSubstrings',
       'IsJointAppointment', 'Succession_Broad', 'Succession_Narrow'],
      dtype='object')

In [174]:
group_columns = ["Succession_Broad"]
ttest1 = lappt.groupby(group_columns,as_index=False).agg({"CareerString":"count","IsJointAppointment":"sum"})
ttest1

Unnamed: 0,Succession_Broad,CareerString,IsJointAppointment
0,KIS to KJI,1415,170
1,KJI to KJU,1445,193


In [175]:
x1 = ttest1.iloc[0,2]
x2 = ttest1.iloc[1,2]
n1 = ttest1.iloc[0,1]
n2 = ttest1.iloc[1,1]
alpha = .05
print(x1,x2,n1,n2,alpha)

170 193 1415 1445 0.05


In [176]:
two_sample_ttest_proportions(x1,x2,n1,n2,alpha)

p1:	 0.12014134275618374
p2:	 0.13356401384083044
p1-p2:	 -0.013422671084646703
z:	 -1.0781309772160719
pvalue:	 calculate


### 2. two-sample t-test of mean  joint appointments

In [181]:
group_columns = ["Succession_Broad"]
ttest2 = lappt.groupby(group_columns,as_index=False).agg({"CareerString":"count","MultipleSubstrings":["mean","var"]})
ttest2

Unnamed: 0_level_0,Succession_Broad,CareerString,MultipleSubstrings,MultipleSubstrings
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,var
0,KIS to KJI,1415,1.158304,0.22669
1,KJI to KJU,1445,1.172318,0.242446


In [184]:
n1 = ttest2.iloc[0,1]
m1 = ttest2.iloc[0,2]
v1 = ttest2.iloc[0,3]

n2 = ttest2.iloc[1,1]
m2 = ttest2.iloc[1,2]
v2 = ttest2.iloc[1,3]

alpha = .05
print(m1,m2,v1,v2,n1,n2,alpha)

1.1583038869257951 1.172318339100346 0.22669019047285835 0.24244649138782193 1415 1445 0.05


In [194]:
two_sample_ttest_means(m1,m2,v1,v2,n1,n2,alpha)

m1:	 1.1583038869257951
m2:	 1.172318339100346
m1-m2:	 -0.014014452174550884
t:	 -0.773832683675395
pvalue:	 calculate
