In [1]:
import pandas as pd
import numpy as np
from datetime import date
from statistics import mean, mode
from pandas.api.types import CategoricalDtype
pd.set_option('display.max_rows', None)

In [2]:
today = date.today()
print(today)

2023-10-21


# Tables

In [3]:
path_tables = "C:/Users/seoul/Dropbox/00 technical/github/nkelites/data/combined data/combined data - 2 tables/"

In [4]:
# tables
filename_careerorglink = "careerorglink.xlsx"
filename_leadercareerlink = "leadercareerlink.xlsx"
filename_orgtree = "orgtree.xlsx"
filename_elected = "positions_elected.xlsx"

In [5]:
col = pd.read_excel(path_tables + filename_careerorglink,dtype="str")
col.shape

(9002, 12)

In [6]:
col.columns

Index(['CareerString', 'CareerDateString_2022', 'IsJob', 'MultipleSubstrings',
       'CareerStartYear', 'CareerStartMonth', 'CareerSubstring',
       'InstitutionType', 'PrimaryInstitution', 'OrgName', 'Position',
       'Notes'],
      dtype='object')

In [7]:
lcl = pd.read_excel(path_tables + filename_leadercareerlink,dtype="str")
lcl.shape

(12617, 3)

In [8]:
lcl.columns

Index(['LeaderID', 'CareerString', 'CareerDateString_2022'], dtype='object')

In [9]:
org = pd.read_excel(path_tables + filename_orgtree,dtype="str")
org.shape

(2368, 19)

In [10]:
org.columns

Index(['InstitutionType', 'OrgType', 'PrimaryInstitution', 'OrgName',
       'PI_Index', 'OrgRank', 'P1', 'P2', 'P3', 'Alias_OrgName',
       'LinkToNext_PI', 'LinkToNext_Org', 'LinkToNext_Year', 'Notes',
       'L1_Index', 'L2_Index', 'L3_Index', 'L4_Index', 'L5_Index'],
      dtype='object')

In [11]:
elected = pd.read_excel(path_tables + filename_elected,dtype="str")
elected.shape

(105, 4)

In [12]:
elected.columns

Index(['PrimaryInstitution', 'OrgName', 'Position', 'IsElected'], dtype='object')

# Queries

In [13]:
path_queries = "C:/Users/seoul/Dropbox/00 technical/github/nkelites/data/combined data/combined data - 3 queries/"

In [14]:
# jobs and job transitions, elected & unelected, in and out of gov
filename_leaderjob_electUnelect_inOutgov = "leaderjob_electUnelect_inOutgov.xlsx"
filename_leaderjobtransition_electUnelect_inOutgov = "leaderjobtransition_electUnelect_inOutgov.xlsx"

In [15]:
# certain jobs
# IsJob == True
# InstitutionType, PrimaryInstitution, OrgName != "Uncertain"
jobs = pd.read_excel(path_queries + filename_leaderjob_electUnelect_inOutgov,dtype="str")
jobs.shape

(8594, 14)

In [16]:
trans = pd.read_excel(path_queries + filename_leaderjobtransition_electUnelect_inOutgov,dtype="str")
trans.shape

(6311, 29)

#### change datatypes

In [17]:
trans = trans.astype({"OrgAdvance":"int","PositionAdvance":"int","CareerStartYear_1":"int"})
trans.dtypes

LeaderID                   object
CareerString_1             object
CareerDateString_2022_1    object
CareerStartYear_1           int32
CareerStartMonth_1         object
CareerStartDate_1          object
CareerSubstring_1          object
InstitutionType_1          object
PrimaryInstitution_1       object
OrgName_1                  object
Position_1                 object
IsElected_1                object
OrgRank_1                  object
PositionRank_1             object
CareerString_2             object
CareerDateString_2022_2    object
CareerStartYear_2          object
CareerStartMonth_2         object
CareerStartDate_2          object
CareerSubstring_2          object
InstitutionType_2          object
PrimaryInstitution_2       object
OrgName_2                  object
Position_2                 object
IsElected_2                object
OrgRank_2                  object
PositionRank_2             object
OrgAdvance                  int32
PositionAdvance             int32
dtype: object

# Functions

In [18]:
def merge_results(m):
    
    print("\nMerge Results...")
    print("")
    print("\tshape     :",m.shape)
    print("\tleft_only :",m[m["_merge"]=="left_only"].shape)
    print("\tboth      :",m[m["_merge"]=="both"].shape)
    print("\tright_only:",m[m["_merge"]=="right_only"].shape)

In [19]:
# using this on (PI,OrgName) will ensure unique & non-null keys
# using this on a larger df will ensure unique rows and non-null keys, but not unique keys

def unique_non_null_rows(olddf):
    
    df = olddf.copy()
    
    
    ### drop duplicates
    df = df.drop_duplicates(keep="first",ignore_index=True)

    ### drop null rows
    df.dropna(how="all",axis=0,inplace=True)
    
    # drop rows with null PI
    df = df[~(df["PrimaryInstitution"].isna())]
        
    ### drop rows where PI contains stop words
    stop_words_lower = ["uncertain","current","deprecated","please_revise"]
    df = df[ ~ df["PrimaryInstitution"].str.lower().isin(stop_words_lower)]
            

    print("\nUnique Non-null Rows...")
    print("")
    print("\tNon-unique rows:",olddf.shape)
    print("\tUnique rows    :",df.shape)

    ### sort
    df = df.sort_values(["PrimaryInstitution","OrgName"])
    
    return df

In [20]:
def create_time_series(series,group_var,count_var):
    
    yeardist = series.groupby(group_var,as_index=False).count().sort_values(group_var)
    
    max_year = max(series[group_var])
    x = pd.DataFrame({"year":pd.Series(range(min(yeardist[group_var].astype(int)),max(yeardist[group_var].astype(int))+1))})
    
    yeardist[group_var] = yeardist[group_var].astype(int,errors="ignore")
    
    ts = x.merge(yeardist,left_on="year",right_on=group_var,how="left")
    ts.loc[ts[count_var].isna(),count_var]=0
    # merge_results(ts)
    
    ts_columns = ["year",count_var]
    ts = ts[ts_columns]
    
    return ts

# Analysis - Research Note

In [21]:
path_analysis = "C:/Users/seoul/Dropbox/00 technical/github/nkelites/analysis/"

In [22]:
# analysis sub-paths
study0_path = "2023.10.04 Study 0 - research note/"
study1_path = "2023.10.04 Study 1 - political capital/"
study2_path = "2023.10.04 Study 2 - commitment vs control/"
study3_path = "2023.10.04 Study 3 - reds vs experts/"

### SPA 대의원

In [27]:
jobs.columns

Index(['LeaderID', 'CareerString', 'CareerDateString_2022', 'CareerStartYear',
       'CareerStartMonth', 'CareerStartDate', 'CareerSubstring',
       'InstitutionType', 'PrimaryInstitution', 'OrgName', 'Position',
       'IsElected', 'OrgRank', 'PositionRank'],
      dtype='object')

In [29]:
spa_analysis_columns = ["LeaderID","CareerStartDate","InstitutionType","PrimaryInstitution","OrgName","Position"]
spajobs = jobs[spa_analysis_columns]
spajobs[spajobs.PrimaryInstitution=="최고인민회의"].head(2)

Unnamed: 0,LeaderID,CareerStartDate,InstitutionType,PrimaryInstitution,OrgName,Position
4,김창섭,201403,정권기관,최고인민회의,,대의원
5,최태복,201403,정권기관,최고인민회의,,대의원


In [64]:
# how many leaders were 최고인민회의 대의원?
spa_delegate_jobs = spajobs[(spajobs.PrimaryInstitution=="최고인민회의") & (spajobs.Position=="대의원")].drop_duplicates()
spa_delegate_jobs.shape

(1490, 6)

In [61]:
# 438 leaders were SPA Reps.
spa_delegates = spa_delegate_jobs.groupby("LeaderID",as_index=False).count()
spa_delegates = list(spa_delegates["LeaderID"])
len(spa_delegates)

438

In [42]:
# On average, each was a rep more than 3 times.
1490/438

3.401826484018265

### SPA career transitions

In [65]:
spa_delegate_alljobs = jobs[jobs.LeaderID.isin(spa_delegates)]
spa_delegate_alljobs.shape

(7748, 14)

In [68]:
spa_delegate_alljobs.head(5)

Unnamed: 0,LeaderID,CareerString,CareerDateString_2022,CareerStartYear,CareerStartMonth,CareerStartDate,CareerSubstring,InstitutionType,PrimaryInstitution,OrgName,Position,IsElected,OrgRank,PositionRank
0,리원일,노동성 상(유임),1999.02,1999,2.0,199902,,정권기관,내각,노동성,상,,1,1
2,정경택,당 중앙위원회 정치국 위원 (*당 중앙위원회 제7기 제4차 전원회의에서 보선),2019.04,2019,4.0,201904,,노동당,노동당,당중앙위원회_정치국,위원,1.0,2,2
3,리하일,인민무력부 작전국 국장,1975.0,1975,,197500,,정권기관,정무원,인민무력부A_작전국,국장,,2,1
4,김창섭,최고인민회의 제13기 대의원,2014.03,2014,3.0,201403,,정권기관,최고인민회의,,대의원,1.0,0,3
5,최태복,최고인민회의 제13기 대의원,2014.03,2014,3.0,201403,,정권기관,최고인민회의,,대의원,1.0,0,3


In [69]:
"리원일" in spa_delegates

True

In [83]:
spa_delegate_alljobs.columns

Index(['LeaderID', 'CareerString', 'CareerDateString_2022', 'CareerStartYear',
       'CareerStartMonth', 'CareerStartDate', 'CareerSubstring',
       'InstitutionType', 'PrimaryInstitution', 'OrgName', 'Position',
       'IsElected', 'OrgRank', 'PositionRank'],
      dtype='object')

In [84]:
# all job pairings of all jobs ever held by SPA delegates
spa_trans_key = ["LeaderID"]
spa_trans_columns = ["LeaderID","CareerStartDate","PrimaryInstitution","OrgName","Position"]
spa_delegate_alljobs = spa_delegate_alljobs[spa_trans_columns]
spa_trans = spa_delegate_alljobs.merge(spa_delegate_alljobs,on=spa_trans_key,how="inner")
spa_trans.shape

(243816, 9)

In [85]:
# all unique job pairings of all jobs ever held by SPA delegates
spa_trans2 = spa_trans[spa_trans.CareerStartDate_x < spa_trans.CareerStartDate_y]

In [86]:
# all unique job pairings of all SPA jobs ever held by SPA delegates
spa_trans3 = spa_trans2[(spa_trans2.PrimaryInstitution_x=="최고인민회의") & (spa_trans2.PrimaryInstitution_y=="최고인민회의")]
spa_trans3.shape

(11883, 9)

In [87]:
# query: how many 
spa_trans3.head(5)

Unnamed: 0,LeaderID,CareerStartDate_x,PrimaryInstitution_x,OrgName_x,Position_x,CareerStartDate_y,PrimaryInstitution_y,OrgName_y,Position_y
51,리원일,199809,최고인민회의,,대의원,200309,최고인민회의,,대의원
52,리원일,199809,최고인민회의,,대의원,200903,최고인민회의,,대의원
53,리원일,199809,최고인민회의,,대의원,201009,최고인민회의,황해북도인민위원회,위원장
58,리원일,199809,최고인민회의,,대의원,200309,최고인민회의,,대의원
59,리원일,199809,최고인민회의,,대의원,201009,최고인민회의,황해북도인민위원회,위원장


In [89]:
# unique prior OrgName, Position
query1_columns = ["PrimaryInstitution_x","OrgName_x","Position_x"]
spa_trans3[query1_columns].drop_duplicates()

Unnamed: 0,PrimaryInstitution_x,OrgName_x,Position_x
51,최고인민회의,,대의원
2511,최고인민회의,외교위원회,위원장
2831,최고인민회의,,의장
5443,최고인민회의,평양시인민위원회_모란봉구역인민위원회,위원장
5456,최고인민회의,법제위원회,위원
6031,최고인민회의,상임위원회,의원
6278,최고인민회의,상임위원회,위원
8898,최고인민회의,상설회의,의원
9381,최고인민회의,상임위원회,참사
9982,최고인민회의,외교위원회,위원


In [91]:
spa_trans3[spa_trans3.Position_x.isin(["후보","후보의원"])]

Unnamed: 0,LeaderID,CareerStartDate_x,PrimaryInstitution_x,OrgName_x,Position_x,CareerStartDate_y,PrimaryInstitution_y,OrgName_y,Position_y
