In [40]:
import pandas as pd
import numpy as np
from datetime import date
from statistics import mean, mode
from pandas.api.types import CategoricalDtype

In [41]:
# date on which this notebook session is run; printed so the run date is recorded in the output
today = date.today()
print(today)

2024-02-10


# Tables

In [42]:
# folder prefix for the table workbooks (ends with "/" so file names can be appended directly)
# NOTE(review): machine-specific absolute path - the notebook only runs where this folder exists
path_tables = "C:/Users/seoul/Dropbox/00 technical/github/nkelites/data/combined data/combined data - 2 tables/"

In [43]:
# tables
# file names of the table workbooks, meant to be appended to path_tables;
# the cells below that would read them are currently commented out
filename_careerorglink = "careerorglink.xlsx"
filename_leadercareerlink = "leadercareerlink.xlsx"
filename_orgtree = "orgtree.xlsx"
filename_elected = "positions_elected.xlsx"

In [44]:
# career-org link
# col = pd.read_excel(path_tables + filename_careerorglink,dtype="str")
# col.shape

In [45]:
# col.columns

In [46]:
# leader-career link
# lcl = pd.read_excel(path_tables + filename_leadercareerlink,dtype="str")
# lcl.shape

In [47]:
# lcl.columns

In [48]:
# orgtree
# org = pd.read_excel(path_tables + filename_orgtree,dtype="str")
# org.shape

In [49]:
# org.columns

In [50]:
# elected = pd.read_excel(path_tables + filename_elected,dtype="str")
# elected.shape

In [51]:
# elected.columns

# Queries

In [52]:
# folder prefix for the query workbooks (ends with "/" so file names can be appended directly)
# NOTE(review): machine-specific absolute path - the notebook only runs where this folder exists
path_queries = "C:/Users/seoul/Dropbox/00 technical/github/nkelites/data/combined data/combined data - 3 queries/"

In [53]:
# file names of the query workbooks, appended to path_queries when read;
# only filename_leaderjobtransition_no_spa is read by an active cell below
filename_leaderjob_all = "leaderjob_electUnelect_inOutgov.xlsx"
filename_leaderjob_no_spa = "leaderjob_no_spa.xlsx"
filename_leaderjobtransition_no_spa = "leaderjobtransition_no_spa.xlsx"

In [54]:
# leader jobs
# ljobs_all = pd.read_excel(path_queries + filename_leaderjob_all,dtype="str")
# ljobs_all.shape

In [55]:
# ljobs = pd.read_excel(path_queries + filename_leaderjob_no_spa,dtype="str")
# ljobs.shape

In [56]:
# transitions used for analysis - no SPA - no local-local
# NOTE(review): the recorded outputs show 4306 rows here and 4222 rows after the
# Local-Local filter further down, so this file still appears to contain
# local-local transitions - confirm the description above
# every column is read as text (dtype="str"); selected numeric fields are cast in a later cell
trans = pd.read_excel(path_queries + filename_leaderjobtransition_no_spa,dtype="str")
trans.shape

(4306, 31)

#### change datatypes

In [57]:
# ljobs = ljobs.astype({"CareerStartYear":"int","CareerStartDate":"int"})
# ljobs.dtypes

In [58]:
# cast OrgAdvance / PositionAdvance and the start year / start date fields
# of both jobs from text to integers, then show the resulting dtypes
trans = trans.astype(dict.fromkeys(
    ["OrgAdvance", "PositionAdvance",
     "CareerStartYear_1", "CareerStartYear_2",
     "CareerStartDate_1", "CareerStartDate_2"],
    "int",
))
trans.dtypes

LeaderID                   object
CareerString_1             object
CareerDateString_2022_1    object
CareerStartYear_1           int32
CareerStartMonth_1         object
CareerStartDate_1           int32
CareerSubstring_1          object
InstitutionType_1          object
PrimaryInstitution_1       object
OrgName_1                  object
Local_1                    object
Position_1                 object
IsElected_1                object
OrgRank_1                  object
PositionRank_1             object
CareerString_2             object
CareerDateString_2022_2    object
CareerStartYear_2           int32
CareerStartMonth_2         object
CareerStartDate_2           int32
CareerSubstring_2          object
InstitutionType_2          object
PrimaryInstitution_2       object
OrgName_2                  object
Local_2                    object
Position_2                 object
IsElected_2                object
OrgRank_2                  object
PositionRank_2             object
OrgAdvance    

# Functions

In [59]:
def merge_results(m):
    """Print a short report on the outcome of an indicator merge.

    Parameters
    ----------
    m : pandas.DataFrame
        Merged frame that carries a "_merge" column with the values
        "left_only", "both" and "right_only".

    Returns
    -------
    None
        The overall shape and the shape of each "_merge" subset are printed.
    """
    print("\nMerge Results...")
    print("")
    print("\tshape     :", m.shape)

    # one line per merge outcome, in the fixed order left / both / right
    report_lines = (
        ("\tleft_only :", "left_only"),
        ("\tboth      :", "both"),
        ("\tright_only:", "right_only"),
    )
    for label, outcome in report_lines:
        print(label, m[m["_merge"] == outcome].shape)

In [60]:
# using this on (PI,OrgName) will ensure unique & non-null keys
# using this on a larger df will ensure unique rows and non-null keys, but not unique keys

def unique_non_null_rows(olddf):
    """Return a de-duplicated, cleaned and sorted copy of ``olddf``.

    Steps, in order:
      1. drop exact duplicate rows (first occurrence kept, index renumbered);
      2. drop rows in which every column is null;
      3. drop rows whose "PrimaryInstitution" is null;
      4. drop rows whose "PrimaryInstitution", lower-cased, is exactly one of
         the stop words (whole-value match, not a substring search);
      5. print the shape before and after cleaning;
      6. sort by "PrimaryInstitution", then "OrgName".

    Parameters
    ----------
    olddf : pandas.DataFrame
        Must contain "PrimaryInstitution" and "OrgName" columns.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        The cleaned rows, sorted; the index is the post-deduplication
        numbering and is not reset after filtering or sorting.
    """
    stop_words_lower = ["uncertain","current","deprecated","please_revise"]

    # duplicates first, then rows that carry no information at all
    deduped = olddf.copy().drop_duplicates(keep="first", ignore_index=True)
    deduped = deduped.dropna(how="all", axis=0)

    # a usable row needs a PrimaryInstitution ...
    has_pi = deduped["PrimaryInstitution"].notna()
    with_pi = deduped[has_pi]

    # ... that is not one of the placeholder values
    is_placeholder = with_pi["PrimaryInstitution"].str.lower().isin(stop_words_lower)
    cleaned = with_pi[~is_placeholder]

    print("\nUnique Non-null Rows...")
    print("")
    print("\tNon-unique rows:", olddf.shape)
    print("\tUnique rows    :", cleaned.shape)

    return cleaned.sort_values(["PrimaryInstitution", "OrgName"])

In [61]:
def create_time_series(series,group_var,count_var):
    """Build a gap-free yearly count series from a frame of records.

    Rows are grouped by ``group_var`` (the year) and the non-null values of
    ``count_var`` are counted per year. Every year between the first and the
    last observed year is then listed, with a count of 0 for years that have
    no records.

    Parameters
    ----------
    series : pandas.DataFrame
        Despite the name, a DataFrame holding at least ``group_var`` and
        ``count_var``.
    group_var : str
        Column holding the year, as integers or integer-like strings.
        Rows with a null year are ignored (groupby drops null keys).
    count_var : str
        Column whose non-null values are counted per year.

    Returns
    -------
    pandas.DataFrame
        Columns ["year", count_var], one row per year in ascending order.
        The count column is float when at least one gap year was filled.
        An input without any usable year gives an empty frame with the same
        two columns.

    Raises
    ------
    ValueError
        If a non-null year value cannot be converted to an integer.
    """
    yeardist = series.groupby(group_var,as_index=False).count()

    # convert the year key once, so the range bounds and the merge key agree
    yeardist[group_var] = yeardist[group_var].astype(int)
    yeardist = yeardist.sort_values(group_var)

    # nothing to count: return an empty series instead of failing on min()/max()
    if yeardist.empty:
        return pd.DataFrame({"year": pd.Series(dtype="int64"),
                             count_var: pd.Series(dtype="int64")})

    first_year = yeardist[group_var].min()
    last_year = yeardist[group_var].max()
    x = pd.DataFrame({"year": pd.Series(range(first_year, last_year + 1))})

    # left merge keeps every year of the range; years without records get NaN -> 0
    ts = x.merge(yeardist,left_on="year",right_on=group_var,how="left")
    ts[count_var] = ts[count_var].fillna(0)

    return ts[["year",count_var]]

# Analysis - Research Note

In [62]:
# root folder prefix for analysis output (ends with "/")
# NOTE(review): machine-specific absolute path - the notebook only runs where this folder exists
path_analysis = "C:/Users/seoul/Dropbox/00 technical/github/nkelites/analysis/"

In [63]:
# analysis sub-paths
# one folder name per study, each ending with "/"; presumably appended to path_analysis - confirm at the point of use
study0_path = "2023.10.04 Study 0 - research note/"
study1_path = "2023.10.04 Study 1 - political capital/"
study2_path = "2023.10.04 Study 2 - commitment vs control/"
study3_path = "2023.10.04 Study 3 - reds vs experts/"

# Format & Covariates

### 1. exclude SPA jobs and local-local trans

In [64]:
trans.columns

Index(['LeaderID', 'CareerString_1', 'CareerDateString_2022_1',
       'CareerStartYear_1', 'CareerStartMonth_1', 'CareerStartDate_1',
       'CareerSubstring_1', 'InstitutionType_1', 'PrimaryInstitution_1',
       'OrgName_1', 'Local_1', 'Position_1', 'IsElected_1', 'OrgRank_1',
       'PositionRank_1', 'CareerString_2', 'CareerDateString_2022_2',
       'CareerStartYear_2', 'CareerStartMonth_2', 'CareerStartDate_2',
       'CareerSubstring_2', 'InstitutionType_2', 'PrimaryInstitution_2',
       'OrgName_2', 'Local_2', 'Position_2', 'IsElected_2', 'OrgRank_2',
       'PositionRank_2', 'OrgAdvance', 'PositionAdvance'],
      dtype='object')

In [65]:
trans.shape

(4306, 31)

In [66]:
trans.Local_1.unique()

array(['False', 'True'], dtype=object)

In [67]:
# remove Local-Local ties: keep a transition unless both Local_1 and Local_2
# hold the string "True" (the flags were read as text)
# NOTE(review): the filtered rows are stored in trans2, while the later cells
# visible in this notebook keep using trans - confirm which frame is intended
trans2 = trans.loc[(trans["Local_1"] != "True") | (trans["Local_2"] != "True")]
trans2.shape

(4222, 31)

### 2. add InstitutionCategory_1, InstitutionCategory_2

In [68]:
trans.InstitutionType_1.unique()

array(['정권기관', '국제친선단체', '노동당', '인민군', '당외곽및사회단체_사회부문(별책)',
       '당외곽및사회단체_사회부문', '당외곽및사회단체_체육부문', '당외곽및사회단체_정치부문', '당외곽및사회단체_대외부문',
       '당외곽및사회단체_근로단체', '당외곽및사회단체_종교부문', '당외곽및사회단체_경제부문(별책)',
       '당외곽및사회단체_경제부문'], dtype=object)

### double-check OrgRank

In [69]:
trans.columns

Index(['LeaderID', 'CareerString_1', 'CareerDateString_2022_1',
       'CareerStartYear_1', 'CareerStartMonth_1', 'CareerStartDate_1',
       'CareerSubstring_1', 'InstitutionType_1', 'PrimaryInstitution_1',
       'OrgName_1', 'Local_1', 'Position_1', 'IsElected_1', 'OrgRank_1',
       'PositionRank_1', 'CareerString_2', 'CareerDateString_2022_2',
       'CareerStartYear_2', 'CareerStartMonth_2', 'CareerStartDate_2',
       'CareerSubstring_2', 'InstitutionType_2', 'PrimaryInstitution_2',
       'OrgName_2', 'Local_2', 'Position_2', 'IsElected_2', 'OrgRank_2',
       'PositionRank_2', 'OrgAdvance', 'PositionAdvance'],
      dtype='object')

In [97]:
# select_cols = ["InstitutionType_1","PrimaryInstitution_1","OrgName_1","OrgRank_1"]
select_cols = ["InstitutionType_1","PrimaryInstitution_1","OrgName_1","OrgRank_1"]

In [104]:
# pd.set_option('display.max_rows', None)
# trans[select_cols]

In [105]:
# trans.loc[(trans["PrimaryInstitution_1"]=="내각") & (trans["OrgName_1"]=="인민경비대"),select_cols]
# trans.loc[(trans["PrimaryInstitution_1"]=="노동당") & (trans["OrgName_1"]=="당중앙위원회_X위원회"),select_cols]
# trans.loc[(trans["PrimaryInstitution_1"]=="노동당") & (trans["OrgName_1"]=="당중앙위원회_공업부"),select_cols]
# trans.loc[(trans["PrimaryInstitution_1"].isin(["국방위원회"])) & (trans["OrgName_1"].isin(["호위사령부_호위총국"])),select_cols]

### 3. PISame, OrgSame

In [30]:
trans.columns

Index(['LeaderID', 'CareerString_1', 'CareerDateString_2022_1',
       'CareerStartYear_1', 'CareerStartMonth_1', 'CareerStartDate_1',
       'CareerSubstring_1', 'InstitutionType_1', 'PrimaryInstitution_1',
       'OrgName_1', 'Local_1', 'Position_1', 'IsElected_1', 'OrgRank_1',
       'PositionRank_1', 'CareerString_2', 'CareerDateString_2022_2',
       'CareerStartYear_2', 'CareerStartMonth_2', 'CareerStartDate_2',
       'CareerSubstring_2', 'InstitutionType_2', 'PrimaryInstitution_2',
       'OrgName_2', 'Local_2', 'Position_2', 'IsElected_2', 'OrgRank_2',
       'PositionRank_2', 'OrgAdvance', 'PositionAdvance'],
      dtype='object')

In [31]:
# PISame: True when the two jobs of a transition have the same PrimaryInstitution value
# (a missing value on either side compares as not equal, i.e. False)
trans["PISame"] = trans["PrimaryInstitution_1"].eq(trans["PrimaryInstitution_2"])
trans.PISame.unique()

array([False,  True])

In [58]:
# OrgSame is three-valued:
#   NaN   - the transition crosses primary institutions (PISame is False)
#   False - same primary institution, different OrgName
#   True  - same primary institution and same OrgName
# The column is created with object dtype up front: filling a float NaN column
# with booleans relies on silent upcasting, which recent pandas versions deprecate.
trans["OrgSame"] = pd.Series(np.nan, index=trans.index, dtype="object")
trans.loc[trans["PISame"]==True,"OrgSame"] = False
trans.loc[(trans["PISame"]==True) & (trans["OrgName_1"]==trans["OrgName_2"]),["OrgSame"]] = True
trans.OrgSame.unique()

array([nan, True, False], dtype=object)

### 4. OrgRankChange, PositionRankChange

In [69]:
# start with an all-missing column of object dtype, so the text labels assigned
# in the next cell do not rely on silently upcasting a float column
trans["OrgRankChange"] = pd.Series(np.nan, index=trans.index, dtype="object")

In [70]:
# Label the change in org rank for transitions within the same primary institution.
# The rank columns were read as text, and comparing text orders "10" before "9",
# so they are converted to numbers first. pd.to_numeric raises on a non-numeric
# rank instead of silently mis-ordering it; missing ranks become NaN and leave
# the label missing.
# NOTE(review): a larger OrgRank_1 than OrgRank_2 is labelled "lower" - presumably
# a larger rank value is more senior; confirm against the rank coding.
org_rank_1 = pd.to_numeric(trans["OrgRank_1"])
org_rank_2 = pd.to_numeric(trans["OrgRank_2"])
trans.loc[trans["PISame"] & (org_rank_1>org_rank_2),"OrgRankChange"] = "lower"
trans.loc[trans["PISame"] & (org_rank_1==org_rank_2),"OrgRankChange"] = "same"
trans.loc[trans["PISame"] & (org_rank_1<org_rank_2),"OrgRankChange"] = "higher"

In [79]:
# store OrgRankChange as a categorical whose categories are listed in this
# fixed sequence (the categorical itself is left unordered); tables grouped on
# it then list lower / same / higher in that sequence
value_order = ["lower","same","higher"]
trans["OrgRankChange"] = trans["OrgRankChange"].astype(CategoricalDtype(categories=value_order))

In [80]:
trans.OrgRankChange.unique()

[NaN, 'same', 'higher', 'lower']
Categories (3, object): ['lower', 'same', 'higher']

In [72]:
# start with an all-missing column of object dtype, so the text labels assigned
# in the next cell do not rely on silently upcasting a float column
trans["PositionRankChange"] = pd.Series(np.nan, index=trans.index, dtype="object")

In [73]:
# Label the change in position rank for transitions within the same organization.
# OrgSame holds NaN / False / True, so the mask is built with an explicit
# comparison: only rows where OrgSame is True are labelled.
# The rank columns were read as text, and comparing text orders "10" before "9",
# so they are converted to numbers first. pd.to_numeric raises on a non-numeric
# rank instead of silently mis-ordering it; missing ranks become NaN and leave
# the label missing.
# NOTE(review): a larger PositionRank_1 than PositionRank_2 is labelled "lower" -
# presumably a larger rank value is more senior; confirm against the rank coding.
same_org = trans["OrgSame"].eq(True)
position_rank_1 = pd.to_numeric(trans["PositionRank_1"])
position_rank_2 = pd.to_numeric(trans["PositionRank_2"])
trans.loc[same_org & (position_rank_1>position_rank_2),"PositionRankChange"] = "lower"
trans.loc[same_org & (position_rank_1==position_rank_2),"PositionRankChange"] = "same"
trans.loc[same_org & (position_rank_1<position_rank_2),"PositionRankChange"] = "higher"

In [81]:
# store PositionRankChange as a categorical whose categories are listed in this
# fixed sequence (the categorical itself is left unordered); tables grouped on
# it then list lower / same / higher in that sequence
value_order = ["lower","same","higher"]
trans["PositionRankChange"] = trans["PositionRankChange"].astype(CategoricalDtype(categories=value_order))

In [82]:
trans.PositionRankChange.unique()

[NaN, 'same', 'lower', 'higher']
Categories (3, object): ['lower', 'same', 'higher']

# Statistics

### 1. Number of Elites with at least one transition

In [30]:
trans.columns

Index(['LeaderID', 'CareerString_1', 'CareerDateString_2022_1',
       'CareerStartYear_1', 'CareerStartMonth_1', 'CareerStartDate_1',
       'CareerSubstring_1', 'InstitutionType_1', 'PrimaryInstitution_1',
       'OrgName_1', 'Local_1', 'Position_1', 'IsElected_1', 'OrgRank_1',
       'PositionRank_1', 'CareerString_2', 'CareerDateString_2022_2',
       'CareerStartYear_2', 'CareerStartMonth_2', 'CareerStartDate_2',
       'CareerSubstring_2', 'InstitutionType_2', 'PrimaryInstitution_2',
       'OrgName_2', 'Local_2', 'Position_2', 'IsElected_2', 'OrgRank_2',
       'PositionRank_2', 'OrgAdvance', 'PositionAdvance'],
      dtype='object')

In [33]:
# number of distinct LeaderID values in the transition table (a missing ID would count as one value)
trans["LeaderID"].nunique(dropna=False)

505

### 2. Distribution of Number of Transitions

In [46]:
# columns used for the per-leader tally: LeaderID is the grouping key and
# PrimaryInstitution_1 is the column whose non-null values are counted
stat1_columns = ["LeaderID","PrimaryInstitution_1"]
stat1_group_columns = ["LeaderID"]

In [47]:
# one row per leader; PrimaryInstitution_1 becomes the number of that leader's
# rows in trans with a non-null PrimaryInstitution_1
stat1 = (
    trans.loc[:, stat1_columns]
    .groupby(by=stat1_group_columns, as_index=False)
    .count()
)
stat1

Unnamed: 0,LeaderID,PrimaryInstitution_1
0,강관주,16
1,강능수,14
2,강동윤,4
3,강련학,4
4,강명철,1
...,...,...
500,황민,3
501,황병서,12
502,황봉영,7
503,황순희,14


In [48]:
# in stat1 the PrimaryInstitution_1 column holds a per-leader count, so grouping
# on it groups leaders by how many transitions they have
stat1a_group_columns = ["PrimaryInstitution_1"]
# display names for the two result columns, in order: group key, count
stat1a_column_labels = ["Number of Transitions","Number of Elites"]

In [49]:
# distribution of transition counts: for each per-leader count, how many leaders have it
stat1a = (
    stat1.groupby(by=stat1a_group_columns, as_index=False)
    .count()
    .set_axis(stat1a_column_labels, axis=1)
)
stat1a

Unnamed: 0,Number of Transitions,Number of Elites
0,1,67
1,2,61
2,3,52
3,4,53
4,5,32
5,6,23
6,7,27
7,8,24
8,9,26
9,10,14


### 3. Matrix of Transition Counts by sending and receiving Institution Category

In [None]:
# stat3

### 4. Within/Between Primary Institution, Overall

* use PISame=True to compare change in Org Rank

In [54]:
# PISame is the grouping key; LeaderID is the column whose non-null values are counted
stat4_columns = ["PISame","LeaderID"]
stat4_groupby_columns = ["PISame"]
# display names for the two result columns, in order: group key, count
stat4_column_labels = ["Within Same Institution","Number of Transitions"]

In [55]:
# transitions within vs. across primary institutions: rows of trans per PISame value
# (counted through the non-null LeaderID entries)
stat4 = (
    trans.loc[:, stat4_columns]
    .groupby(by=stat4_groupby_columns, as_index=False)
    .count()
    .set_axis(stat4_column_labels, axis=1)
)
stat4

Unnamed: 0,Within Same Institution,Number of Transitions
0,False,2753
1,True,1553


### 4a. Within/Between Institution, by Institution Category

* use PISame=True to compare change in Org Rank

In [None]:
# stat4a

### 5. Within/Between Org, Overall

* use OrgSame=True to compare change in Position Rank

In [59]:
# OrgSame is the grouping key; LeaderID is the column whose non-null values are counted
stat5_columns = ["OrgSame","LeaderID"]
stat5_groupby_columns = ["OrgSame"]
# display names for the two result columns, in order: group key, count
stat5_column_labels = ["Within Same Organization","Number of Transitions"]

In [60]:
# transitions within vs. across organizations: rows of trans per OrgSame value;
# rows with a missing OrgSame are left out because groupby drops missing keys by default
stat5 = (
    trans.loc[:, stat5_columns]
    .groupby(by=stat5_groupby_columns, as_index=False)
    .count()
    .set_axis(stat5_column_labels, axis=1)
)
stat5

Unnamed: 0,Within Same Institution,Number of Transitions
0,False,1211
1,True,342


In [64]:
stat5["Number of Transitions"].sum()

1553

### 5a. Within/Between Org, by Institution

* use OrgSame=True to compare change in Position Rank

In [None]:
# stat5a

### 6. Change in OrgRank

* PISame = True

In [83]:
# OrgRankChange is the grouping key; LeaderID is the column whose non-null values are counted
stat6_columns = ["OrgRankChange","LeaderID"]
stat6_groupby_columns = ["OrgRankChange"]
# display names for the two result columns, in order: group key, count
stat6_column_labels = ["Change in Org Rank","Number of Transitions"]

In [84]:
# Transitions per OrgRankChange label. OrgRankChange is categorical, so
# observed=False is passed explicitly: every category (lower / same / higher)
# gets a row even when its count is 0, and the table no longer depends on the
# pandas default for `observed`, which differs between pandas versions.
# Rows with a missing label are left out (groupby drops missing keys by default).
stat6 = trans[stat6_columns].groupby(stat6_groupby_columns,as_index=False,observed=False).count()
stat6.columns = stat6_column_labels
stat6

Unnamed: 0,Change in Org Rank,Number of Transitions
0,lower,388
1,same,838
2,higher,327


### 6a. Change in OrgRank, by Institution Category

* PISame = True

In [76]:
# stat6a

### 7. Change in PositionRank

* PISame, OrgSame = True

In [85]:
# PositionRankChange is the grouping key; LeaderID is the column whose non-null values are counted
stat7_columns = ["PositionRankChange","LeaderID"]
stat7_groupby_columns = ["PositionRankChange"]
# display names for the two result columns, in order: group key, count
stat7_column_labels = ["Change in Position Rank","Number of Transitions"]

In [86]:
# Transitions per PositionRankChange label. PositionRankChange is categorical, so
# observed=False is passed explicitly: every category (lower / same / higher)
# gets a row even when its count is 0, and the table no longer depends on the
# pandas default for `observed`, which differs between pandas versions.
# Rows with a missing label are left out (groupby drops missing keys by default).
stat7 = trans[stat7_columns].groupby(stat7_groupby_columns,as_index=False,observed=False).count()
stat7.columns = stat7_column_labels
stat7

Unnamed: 0,Change in Position Rank,Number of Transitions
0,lower,90
1,same,239
2,higher,13


### 7a. Change in PositionRank, by Institution Category

* PISame, OrgSame = True

In [None]:
# stat7a

### 8. Time Series of Transition Counts

In [88]:
# stat8

### 8a. Time Series of Transition Counts, by Institution Category

In [89]:
# stat8a