In [1]:
import pandas as pd
import numpy as np
from datetime import date
from statistics import mean, mode
from pandas.api.types import CategoricalDtype
import math
from scipy.stats import norm

In [2]:
today = date.today()
print(today)

2024-02-16


# Tables

In [3]:
path_tables = "C:/Users/seoul/Dropbox/00 technical/github/nkelites/data/combined data/combined data - 2 tables/"

In [4]:
# tables
filename_careerorglink = "careerorglink.xlsx"
filename_leadercareerlink = "leadercareerlink.xlsx"
filename_orgtree = "orgtree.xlsx"
filename_elected = "positions_elected.xlsx"

In [5]:
# career-org link
col = pd.read_excel(path_tables + filename_careerorglink,dtype="str")
col.shape

(9002, 12)

In [6]:
col.columns

Index(['CareerString', 'CareerDateString_2022', 'IsJob', 'MultipleSubstrings',
       'CareerStartYear', 'CareerStartMonth', 'CareerSubstring',
       'InstitutionType', 'PrimaryInstitution', 'OrgName', 'Position',
       'Notes'],
      dtype='object')

In [7]:
# leader-career link
# lcl = pd.read_excel(path_tables + filename_leadercareerlink,dtype="str")
# lcl.shape

In [8]:
# lcl.columns

In [9]:
# orgtree
# org = pd.read_excel(path_tables + filename_orgtree,dtype="str")
# org.shape

In [10]:
# org.columns

In [11]:
# elected = pd.read_excel(path_tables + filename_elected,dtype="str")
# elected.shape

In [12]:
# elected.columns

# Queries

In [13]:
path_queries = "C:/Users/seoul/Dropbox/00 technical/github/nkelites/data/combined data/combined data - 3 queries/"

In [14]:
filename_leaderjob_all = "leaderjob_electUnelect_inOutgov.xlsx"
filename_leaderjob_no_spa = "leaderjob_no_spa.xlsx"
filename_leaderjobtransition_no_spa = "leaderjobtransition_no_spa.xlsx"

In [15]:
# leader job - all
# ljobs_all = pd.read_excel(path_queries + filename_leaderjob_all,dtype="str")
# ljobs_all.shape

In [16]:
# leader job - no spa
ljobs = pd.read_excel(path_queries + filename_leaderjob_no_spa,dtype="str")
ljobs.shape

(6047, 15)

In [17]:
ljobs.columns

Index(['LeaderID', 'CareerString', 'CareerDateString_2022', 'CareerStartYear',
       'CareerStartMonth', 'CareerStartDate', 'CareerSubstring',
       'InstitutionType', 'PrimaryInstitution', 'OrgName', 'Local', 'Position',
       'IsElected', 'OrgRank', 'PositionRank'],
      dtype='object')

In [18]:
# transitions used for analysis - no SPA - no local-local
trans = pd.read_excel(path_queries + filename_leaderjobtransition_no_spa,dtype="str")
trans.shape

(4306, 31)

In [19]:
trans.columns

Index(['LeaderID', 'CareerString_1', 'CareerDateString_2022_1',
       'CareerStartYear_1', 'CareerStartMonth_1', 'CareerStartDate_1',
       'CareerSubstring_1', 'InstitutionType_1', 'PrimaryInstitution_1',
       'OrgName_1', 'Local_1', 'Position_1', 'IsElected_1', 'OrgRank_1',
       'PositionRank_1', 'CareerString_2', 'CareerDateString_2022_2',
       'CareerStartYear_2', 'CareerStartMonth_2', 'CareerStartDate_2',
       'CareerSubstring_2', 'InstitutionType_2', 'PrimaryInstitution_2',
       'OrgName_2', 'Local_2', 'Position_2', 'IsElected_2', 'OrgRank_2',
       'PositionRank_2', 'OrgAdvance', 'PositionAdvance'],
      dtype='object')

# Analysis - Research Note

In [20]:
path_analysis = "C:/Users/seoul/Dropbox/00 technical/github/nkelites/analysis/"

In [21]:
# analysis sub-paths
study0_path = "2023.10.04 Study 0 - research note/"
study1_path = "2023.10.04 Study 1 - political capital/"
study2_path = "2023.10.04 Study 2 - commitment vs control/"
study3_path = "2023.10.04 Study 3 - reds vs experts/"

In [22]:
# ljobs = ljobs.astype({"CareerStartYear":"int","CareerStartDate":"int"})
# ljobs.dtypes

# Functions

In [23]:
def merge_results(m):
    
    print("\nMerge Results...")
    print("")
    print("\tshape     :",m.shape)
    print("\tleft_only :",m[m["_merge"]=="left_only"].shape)
    print("\tboth      :",m[m["_merge"]=="both"].shape)
    print("\tright_only:",m[m["_merge"]=="right_only"].shape)

In [24]:
# using this on (PI,OrgName) will ensure unique & non-null keys
# using this on a larger df will ensure unique rows and non-null keys, but not unique keys

def unique_non_null_rows(olddf):
    
    df = olddf.copy()
    
    
    ### drop duplicates
    df = df.drop_duplicates(keep="first",ignore_index=True)

    ### drop null rows
    df.dropna(how="all",axis=0,inplace=True)
    
    # drop rows with null PI
    df = df[~(df["PrimaryInstitution"].isna())]
        
    ### drop rows where PI contains stop words
    stop_words_lower = ["uncertain","current","deprecated","please_revise"]
    df = df[ ~ df["PrimaryInstitution"].str.lower().isin(stop_words_lower)]
            

    print("\nUnique Non-null Rows...")
    print("")
    print("\tNon-unique rows:",olddf.shape)
    print("\tUnique rows    :",df.shape)

    ### sort
    df = df.sort_values(["PrimaryInstitution","OrgName"])
    
    return df

In [25]:
def create_time_series(series,group_var,count_var):
    
    yeardist = series.groupby(group_var,as_index=False).count().sort_values(group_var)
    
    max_year = max(series[group_var])
    x = pd.DataFrame({"year":pd.Series(range(min(yeardist[group_var].astype(int)),max(yeardist[group_var].astype(int))+1))})
    
    yeardist[group_var] = yeardist[group_var].astype(int,errors="ignore")
    
    ts = x.merge(yeardist,left_on="year",right_on=group_var,how="left")
    ts.loc[ts[count_var].isna(),count_var]=0
    # merge_results(ts)
    
    ts_columns = ["year",count_var]
    ts = ts[ts_columns]
    
    return ts

In [26]:
# create InstitutionCategory (English) variable InstitutionType

# use with apply. e.g., 
# df["InstitutionCategory] = df["InstitutionType"].apply(define_institution_category)

def define_institution_category(PI):
    
    c = "Social"
    
    if PI=="정권기관":
        c = "Government"
    elif PI=="노동당":
        c = "Party"
    elif PI=="인민군":
        c = "Military"
        
    return c

In [27]:
def two_sample_ttest_proportions(x1,x2,n1,n2,alpha):
    
    pstar = (x1+x2)/(n1+n2)
    p1 = x1/n1
    p2 = x2/n2
    z = (p1-p2) / math.sqrt(pstar*(1-pstar)*((1/n1) + (1/n2)))
    #pvalue = 
    print("p1:\t",p1)
    print("p2:\t",p2)
    print("p1-p2:\t",(p1-p2))
    print("z:\t",z)
    print("pvalue:\t","calculate")

In [28]:
def two_sample_ttest_means(m1,m2,v1,v2,n1,n2,alpha):
    
    num = m1-m2
    den = math.sqrt(v1/n1 + v2/n2)
    t = num/den
    #pvalue = 
    print("m1:\t",m1)
    print("m2:\t",m2)
    print("m1-m2:\t",(m1-m2))
    print("t:\t",t)
    print("pvalue:\t","calculate")

# Format & Covariates

### 1. add col.JointAppointment to ljobs

In [29]:
col.columns

Index(['CareerString', 'CareerDateString_2022', 'IsJob', 'MultipleSubstrings',
       'CareerStartYear', 'CareerStartMonth', 'CareerSubstring',
       'InstitutionType', 'PrimaryInstitution', 'OrgName', 'Position',
       'Notes'],
      dtype='object')

In [30]:
col.MultipleSubstrings

0       1
1       2
2       1
3       1
4       1
       ..
8997    1
8998    1
8999    1
9000    1
9001    1
Name: MultipleSubstrings, Length: 9002, dtype: object

In [31]:
ljobs.columns

Index(['LeaderID', 'CareerString', 'CareerDateString_2022', 'CareerStartYear',
       'CareerStartMonth', 'CareerStartDate', 'CareerSubstring',
       'InstitutionType', 'PrimaryInstitution', 'OrgName', 'Local', 'Position',
       'IsElected', 'OrgRank', 'PositionRank'],
      dtype='object')

In [32]:
key_columns = ["CareerString","CareerDateString_2022"]
col_columns = key_columns + ["MultipleSubstrings"]

In [33]:
# merge results confirm perfect match
ljobs2 = ljobs.merge(col[col_columns],on=key_columns,how="left")
# ljobs2 = ljobs.merge(col[col_columns],on=key_columns,how="left",indicator=True)
# merge_results(ljobs2)

### 2. Change Variable Type

In [34]:
ljobs2.CareerStartYear = ljobs2.CareerStartYear.astype("int")

In [35]:
ljobs2.MultipleSubstrings = ljobs2.MultipleSubstrings.astype("int")

### 3. Create new dataset - Unique LeaderID, CareerString, CareerStartYear, MultipleAppointments

* group by LeaderID, CareerStartYear and MultipleSubstrings
* in order to count all instances of appointments, including Joint and non-Joint appointments
* but not to double-count Joint Appointments

In [36]:
ljobs2.columns

Index(['LeaderID', 'CareerString', 'CareerDateString_2022', 'CareerStartYear',
       'CareerStartMonth', 'CareerStartDate', 'CareerSubstring',
       'InstitutionType', 'PrimaryInstitution', 'OrgName', 'Local', 'Position',
       'IsElected', 'OrgRank', 'PositionRank', 'MultipleSubstrings'],
      dtype='object')

In [37]:
analysis_columns = ["LeaderID","CareerString","CareerStartYear","MultipleSubstrings"]

In [38]:
lappt = ljobs2[analysis_columns].drop_duplicates()
lappt.shape

(5405, 4)

In [39]:
lappt.head()

Unnamed: 0,LeaderID,CareerString,CareerStartYear,MultipleSubstrings
0,리원일,노동성 상(유임),1999,1
1,조용원,당 정치국 후보위원,2020,1
2,정경택,당 중앙위원회 정치국 위원 (*당 중앙위원회 제7기 제4차 전원회의에서 보선),2019,1
3,리하일,인민무력부 작전국 국장,1975,1
4,최상건,(前)당 비서국 비서,2021,1


In [40]:
lappt["IsJointAppointment"]=False
lappt.loc[lappt["MultipleSubstrings"].astype("int")>1,"IsJointAppointment"]=True

In [41]:
lappt.head()

Unnamed: 0,LeaderID,CareerString,CareerStartYear,MultipleSubstrings,IsJointAppointment
0,리원일,노동성 상(유임),1999,1,False
1,조용원,당 정치국 후보위원,2020,1,False
2,정경택,당 중앙위원회 정치국 위원 (*당 중앙위원회 제7기 제4차 전원회의에서 보선),2019,1,False
3,리하일,인민무력부 작전국 국장,1975,1,False
4,최상건,(前)당 비서국 비서,2021,1,False


### 4. Succession Period - Broad

* KIS-->KJI: 1974-1993 - less institutionalized (stronger intra)
* KJI--->KJU:  2002-2011 - more institutionalized (stronger inter)

In [42]:
lappt.columns

Index(['LeaderID', 'CareerString', 'CareerStartYear', 'MultipleSubstrings',
       'IsJointAppointment'],
      dtype='object')

In [43]:
lappt["Succession_Broad"] = np.nan

In [46]:
value_order = ["KIS to KJI","KJI to KJU"]
lappt.Succession_Broad = lappt.Succession_Broad.astype("category")
lappt.Succession_Broad = lappt.Succession_Broad.cat.set_categories(value_order)

In [47]:
lappt.loc[(lappt["CareerStartYear"]>1973) & (lappt["CareerStartYear"]<1994),"Succession_Broad"] = "KIS to KJI"
lappt.loc[(lappt["CareerStartYear"]>2001) & (lappt["CareerStartYear"]<2012),"Succession_Broad"] = "KJI to KJU"

In [48]:
lappt.head()

Unnamed: 0,LeaderID,CareerString,CareerStartYear,MultipleSubstrings,IsJointAppointment,Succession_Broad,Succession_Narrow
0,리원일,노동성 상(유임),1999,1,False,,
1,조용원,당 정치국 후보위원,2020,1,False,,
2,정경택,당 중앙위원회 정치국 위원 (*당 중앙위원회 제7기 제4차 전원회의에서 보선),2019,1,False,,
3,리하일,인민무력부 작전국 국장,1975,1,False,KIS to KJI,
4,최상건,(前)당 비서국 비서,2021,1,False,,


### 5. Succession Period - Narrow

#### Esther's periodization
* KIS-->KJI: 1987-1994 - less institutionalized (stronger intra)
* KJI--->KJU:  2009-2011 - more institutionalized (stronger inter)

In [49]:
lappt["Succession_Narrow"] = np.nan

In [50]:
value_order = ["KIS to KJI","KJI to KJU"]
lappt.Succession_Narrow = lappt.Succession_Narrow.astype("category")
lappt.Succession_Narrow = lappt.Succession_Narrow.cat.set_categories(value_order)

In [51]:
### Esther's periodization
lappt.loc[(lappt["CareerStartYear"]>1986) & (lappt["CareerStartYear"]<1995),"Succession_Narrow"] = "KIS to KJI"
lappt.loc[(lappt["CareerStartYear"]>2008) & (lappt["CareerStartYear"]<2012),"Succession_Narrow"] = "KJI to KJU"

In [52]:
lappt.head()

Unnamed: 0,LeaderID,CareerString,CareerStartYear,MultipleSubstrings,IsJointAppointment,Succession_Broad,Succession_Narrow
0,리원일,노동성 상(유임),1999,1,False,,
1,조용원,당 정치국 후보위원,2020,1,False,,
2,정경택,당 중앙위원회 정치국 위원 (*당 중앙위원회 제7기 제4차 전원회의에서 보선),2019,1,False,,
3,리하일,인민무력부 작전국 국장,1975,1,False,KIS to KJI,
4,최상건,(前)당 비서국 비서,2021,1,False,,


# Descriptive Analysis of Joint Appointments

In [54]:
lappt.shape

(5405, 7)

In [None]:
lappt.head()

In [56]:
col.shape

(9002, 12)

In [59]:
col.head()

Unnamed: 0,CareerString,CareerDateString_2022,IsJob,MultipleSubstrings,CareerStartYear,CareerStartMonth,CareerSubstring,InstitutionType,PrimaryInstitution,OrgName,Position,Notes
0,4.15문학창작단 단장,1989.04,True,1,1989,4.0,,당외곽및사회단체_사회부문(별책),4.15문화창작단,,단장,
1,"1989. 4.15문학창작단 단장, 조선작가동맹 통일문학담당 부위원장",,True,2,1989,4.0,4.15 문화창작단 단장,당외곽및사회단체_사회부문(별책),4.15문화창작단,,단장,
2,2004. 4.15문학창작단 부단장,,True,1,2004,,,당외곽및사회단체_사회부문(별책),4.15문화창작단,,부단장,
3,4.15문학창작단 부단장,2004.04,True,1,2004,4.0,,당외곽및사회단체_사회부문(별책),4.15문화창작단,,부단장,
4,7.7연합기업소 기사장,1989.12,True,1,1989,12.0,,UNCERTAIN,7.7연합기업소,,기사장,


In [63]:
tcol = col[(col["IsJob"]=="True") & ~(col["PrimaryInstitution"]=="최고인민회의")]
tcol.shape

(5930, 12)

In [69]:
tcol[tcol["MultipleSubstrings"].astype(int)>1].shape

(1214, 12)

In [86]:
# overall percentage
tcol[tcol["MultipleSubstrings"].astype(int)>1].shape[0] / tcol.shape[0]

0.20472175379426644

In [89]:
tcol_multiple = tcol.loc[tcol["MultipleSubstrings"].astype(int)>1,["InstitutionType","CareerString"]].groupby("InstitutionType",as_index=False).agg({"CareerString":"count"})
tcol_all = tcol[["InstitutionType","CareerString"]].groupby("InstitutionType",as_index=False).agg({"CareerString":"count"})

In [90]:
tcol_combo = tcol_multiple.merge(tcol_all,on="InstitutionType",how="inner")

In [91]:
tcol_combo.columns = ["InstitutionType","Multiple","Total"]

In [100]:
tcol_combo = tcol_combo[tcol_combo["InstitutionType"]!="UNCERTAIN"]
tcol_combo

Unnamed: 0,InstitutionType,Multiple,Total,Percent
1,국제친선단체,18,258,0.069767
2,노동당,632,1665,0.37958
3,당외곽및사회단체_경제부문,1,35,0.028571
4,당외곽및사회단체_경제부문(별책),2,115,0.017391
5,당외곽및사회단체_근로단체,3,103,0.029126
6,당외곽및사회단체_대외부문,8,81,0.098765
7,당외곽및사회단체_사회부문,11,126,0.087302
8,당외곽및사회단체_사회부문(별책),15,259,0.057915
9,당외곽및사회단체_정치부문,5,137,0.036496
10,당외곽및사회단체_종교부문,3,52,0.057692


In [99]:
# tcol_social = 
tcol_combo[~tcol_combo["InstitutionType"].isin(["노동당","인민군","정권기관"])].agg({"Multiple":"sum","Total":"sum"})
# tcol_social

Multiple      73
Total       1262
dtype: int64

In [101]:
# social
73 / 1262

0.05784469096671949

In [105]:
tcol_combo["Percent"] = tcol_combo["Multiple"]/tcol_combo["Total"]
tcol_combo[tcol_combo["InstitutionType"].isin(["노동당","인민군","정권기관"])]

Unnamed: 0,InstitutionType,Multiple,Total,Percent
2,노동당,632,1665,0.37958
12,인민군,107,570,0.187719
13,정권기관,384,2194,0.175023


In [94]:
tcol_combo

Unnamed: 0,InstitutionType,Multiple,Total,Percent
1,국제친선단체,18,258,0.069767
2,노동당,632,1665,0.37958
3,당외곽및사회단체_경제부문,1,35,0.028571
4,당외곽및사회단체_경제부문(별책),2,115,0.017391
5,당외곽및사회단체_근로단체,3,103,0.029126
6,당외곽및사회단체_대외부문,8,81,0.098765
7,당외곽및사회단체_사회부문,11,126,0.087302
8,당외곽및사회단체_사회부문(별책),15,259,0.057915
9,당외곽및사회단체_정치부문,5,137,0.036496
10,당외곽및사회단체_종교부문,3,52,0.057692


# Hypothesis Test - # of Joint Appointments in Two Periods

### 1. two-sample t-test of proportion of joint appointments

In [173]:
lappt.columns

Index(['LeaderID', 'CareerString', 'CareerStartYear', 'MultipleSubstrings',
       'IsJointAppointment', 'Succession_Broad', 'Succession_Narrow'],
      dtype='object')

In [174]:
group_columns = ["Succession_Broad"]
ttest1 = lappt.groupby(group_columns,as_index=False).agg({"CareerString":"count","IsJointAppointment":"sum"})
ttest1

Unnamed: 0,Succession_Broad,CareerString,IsJointAppointment
0,KIS to KJI,1415,170
1,KJI to KJU,1445,193


In [175]:
x1 = ttest1.iloc[0,2]
x2 = ttest1.iloc[1,2]
n1 = ttest1.iloc[0,1]
n2 = ttest1.iloc[1,1]
alpha = .05
print(x1,x2,n1,n2,alpha)

170 193 1415 1445 0.05


In [176]:
two_sample_ttest_proportions(x1,x2,n1,n2,alpha)

p1:	 0.12014134275618374
p2:	 0.13356401384083044
p1-p2:	 -0.013422671084646703
z:	 -1.0781309772160719
pvalue:	 calculate


### 2. two-sample t-test of mean  joint appointments

In [181]:
group_columns = ["Succession_Broad"]
ttest2 = lappt.groupby(group_columns,as_index=False).agg({"CareerString":"count","MultipleSubstrings":["mean","var"]})
ttest2

Unnamed: 0_level_0,Succession_Broad,CareerString,MultipleSubstrings,MultipleSubstrings
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,var
0,KIS to KJI,1415,1.158304,0.22669
1,KJI to KJU,1445,1.172318,0.242446


In [184]:
n1 = ttest2.iloc[0,1]
m1 = ttest2.iloc[0,2]
v1 = ttest2.iloc[0,3]

n2 = ttest2.iloc[1,1]
m2 = ttest2.iloc[1,2]
v2 = ttest2.iloc[1,3]

alpha = .05
print(m1,m2,v1,v2,n1,n2,alpha)

1.1583038869257951 1.172318339100346 0.22669019047285835 0.24244649138782193 1415 1445 0.05


In [194]:
two_sample_ttest_means(m1,m2,v1,v2,n1,n2,alpha)

m1:	 1.1583038869257951
m2:	 1.172318339100346
m1-m2:	 -0.014014452174550884
t:	 -0.773832683675395
pvalue:	 calculate
