In [1]:
import pandas as pd
import numpy as np
from datetime import date
from statistics import mean, mode

# Tables

In [2]:
path_tables = "C:/Users/seoul/Dropbox/00 technical/github/nkelites/data/combined data/combined data - 2 tables/"

In [3]:
# tables
filename_careerorglink = "careerorglink.xlsx"
filename_leadercareerlink = "leadercareerlink.xlsx"
filename_orgtree = "orgtree.xlsx"
filename_elected = "positions_elected.xlsx"

In [4]:
col = pd.read_excel(path_tables + filename_careerorglink,dtype="str")
col.shape

(9002, 12)

In [5]:
col.columns

Index(['CareerString', 'CareerDateString_2022', 'IsJob', 'MultipleSubstrings',
       'CareerStartYear', 'CareerStartMonth', 'CareerSubstring',
       'InstitutionType', 'PrimaryInstitution', 'OrgName', 'Position',
       'Notes'],
      dtype='object')

In [6]:
lcl = pd.read_excel(path_tables + filename_leadercareerlink,dtype="str")
lcl.shape

(12617, 3)

In [7]:
lcl.columns

Index(['LeaderID', 'CareerString', 'CareerDateString_2022'], dtype='object')

In [8]:
org = pd.read_excel(path_tables + filename_orgtree,dtype="str")
org.shape

(2368, 19)

In [9]:
org.columns

Index(['InstitutionType', 'OrgType', 'PrimaryInstitution', 'OrgName',
       'PI_Index', 'OrgRank', 'P1', 'P2', 'P3', 'Alias_OrgName',
       'LinkToNext_PI', 'LinkToNext_Org', 'LinkToNext_Year', 'Notes',
       'L1_Index', 'L2_Index', 'L3_Index', 'L4_Index', 'L5_Index'],
      dtype='object')

In [10]:
elected = pd.read_excel(path_tables + filename_elected,dtype="str")
elected.shape

(105, 4)

In [11]:
elected.columns

Index(['PrimaryInstitution', 'OrgName', 'Position', 'IsElected'], dtype='object')

# Queries

In [12]:
path_queries = "C:/Users/seoul/Dropbox/00 technical/github/nkelites/data/combined data/combined data - 3 queries/"

In [13]:
filename_leaderjobtransition_alljobs = "leaderjobtransition_alljobs.xlsx"
# filename_leaderjobtransition_ingov = "leaderjobtransition_ingov.xlsx"

In [14]:
# trans = pd.read_excel(path_queries + filename_leaderjobtransition,dtype="str")

In [15]:
today = date.today()
print(today)

2023-09-01


# Analysis

In [16]:
path_analysis = "C:/Users/seoul/Dropbox/00 technical/github/nkelites/analysis/"

In [17]:
# subpaths
current_subpath = "2023.09.01 analysis/"

# Functions

In [18]:
def merge_results(m):
    """Print a short report on the outcome of an indicator merge.

    Parameters
    ----------
    m : DataFrame
        Result of a merge done with ``indicator=True``; it must carry the
        ``_merge`` column.

    Prints the overall shape and the shape of the ``left_only``, ``both`` and
    ``right_only`` subsets. Returns nothing.
    """
    print("\nMerge Results...")
    print("")
    print("\tshape     :", m.shape)

    # One line per indicator value; labels are padded so the colons line up.
    subsets = (
        ("\tleft_only :", "left_only"),
        ("\tboth      :", "both"),
        ("\tright_only:", "right_only"),
    )
    for label, indicator_value in subsets:
        print(label, m[m["_merge"] == indicator_value].shape)

In [19]:
# using this on (PI,OrgName) will ensure unique & non-null keys
# using this on a larger df will ensure unique rows and non-null keys, but not unique keys

def unique_non_null_rows(olddf):
    """Return a de-duplicated, cleaned and sorted copy of ``olddf``.

    Steps, in order:
      1. drop exact duplicate rows (first occurrence kept, index renumbered);
      2. drop rows in which every column is null;
      3. drop rows whose PrimaryInstitution is null;
      4. drop rows whose PrimaryInstitution equals (case-insensitively) one of
         the placeholder values listed below;
      5. sort by PrimaryInstitution, then OrgName.

    The input frame is not modified. The before/after shapes are printed.
    ``olddf`` must contain the PrimaryInstitution and OrgName columns.
    """
    placeholder_values = ["uncertain","current","deprecated","please_revise"]

    # steps 1-2: exact duplicates, then all-null rows
    deduped = olddf.copy().drop_duplicates(keep="first", ignore_index=True)
    deduped = deduped.dropna(how="all", axis=0)

    # step 3: PrimaryInstitution must be present
    with_pi = deduped[deduped["PrimaryInstitution"].notna()]

    # step 4: PrimaryInstitution must not be a placeholder (whole-value match)
    is_placeholder = with_pi["PrimaryInstitution"].str.lower().isin(placeholder_values)
    cleaned = with_pi[~is_placeholder]

    print("\nUnique Non-null Rows...")
    print("")
    print("\tNon-unique rows:", olddf.shape)
    print("\tUnique rows    :", cleaned.shape)

    # step 5
    return cleaned.sort_values(["PrimaryInstitution","OrgName"])

# Step. Format Jobs

#### merge col with elected - to identify elected positions

In [20]:
position_key_columns = ['PrimaryInstitution', 'OrgName', 'Position']
elected_columns = position_key_columns + ["IsElected"]

In [21]:
col2 = col.merge(elected[elected_columns],on=position_key_columns,how="outer",indicator=True)
merge_results(col2)


Merge Results...

	shape     : (9033, 14)
	left_only : (7261, 14)
	both      : (1741, 14)
	right_only: (31, 14)


In [22]:
# TODO: address this later - decide how to handle the right_only rows (elected positions with no matching row in careerorglink)
# col2[col2["_merge"]=="right_only"][position_key_columns]

In [23]:
col2.columns

Index(['CareerString', 'CareerDateString_2022', 'IsJob', 'MultipleSubstrings',
       'CareerStartYear', 'CareerStartMonth', 'CareerSubstring',
       'InstitutionType', 'PrimaryInstitution', 'OrgName', 'Position', 'Notes',
       'IsElected', '_merge'],
      dtype='object')

In [24]:
# update col with elected positions
col = col2.drop(columns=["_merge"])

In [25]:
col.IsElected.unique()

array([nan, '1'], dtype=object)

#### select jobs that satisfy the criteria for transitions

In [26]:
# A career item is kept for the transition analysis when all of these hold:
#   - IsJob is "True"                         (applied in select_jobs2 only)
#   - CareerStartYear is not null
#   - IsElected is null, i.e. the position is not flagged as elected
#   - none of PrimaryInstitution / OrgName / Position is "UNCERTAIN"

select_jobs1 = (
    col["CareerStartYear"].notna()
    & col["IsElected"].isna()
    & col["PrimaryInstitution"].ne("UNCERTAIN")
    & col["OrgName"].ne("UNCERTAIN")
    & col["Position"].ne("UNCERTAIN")
)
select_jobs2 = select_jobs1 & col["IsJob"].eq("True")
job_columns = [
    'CareerString', 'CareerDateString_2022',
    'CareerStartYear', 'CareerStartMonth', 'CareerSubstring',
    'InstitutionType', 'PrimaryInstitution', 'OrgName', 'Position',
]

In [27]:
jobs = col.loc[select_jobs2,job_columns]
jobs.shape

(4387, 9)

#### descriptive statistics

In [28]:
# not jobs, excluded
col[col["IsJob"]=="False"].shape

(2271, 13)

In [29]:
# jobs, excluded, because elected, because CareerStartYear is unknown, or because PI/OrgName/Position is UNCERTAIN
selection = (col["IsJob"]=="True") & ~select_jobs1
col[selection].shape

(2344, 13)

In [30]:
# jobs, included
col[select_jobs2].shape

(4387, 13)

# Step. add Org and Position metadata from orgtree, such as Rank

In [31]:
org_key_columns = ['PrimaryInstitution', 'OrgName']

In [32]:
orgtree_columns = org_key_columns + ['OrgRank', 'P1', 'P2', 'P3']

In [33]:
jobs2 = jobs.merge(org[orgtree_columns],on=org_key_columns,how="left",indicator=True)
merge_results(jobs2)


Merge Results...

	shape     : (4387, 14)
	left_only : (0, 14)
	both      : (4387, 14)
	right_only: (0, 14)


In [34]:
# Good! No left_only means we are matching on all PI-OrgName keys
# jobs2[jobs2["_merge"]=="left_only"]

In [35]:
jobs2.head(2)

Unnamed: 0,CareerString,CareerDateString_2022,CareerStartYear,CareerStartMonth,CareerSubstring,InstitutionType,PrimaryInstitution,OrgName,Position,OrgRank,P1,P2,P3,_merge
0,4.15문학창작단 단장,1989.04,1989,4,,당외곽및사회단체_사회부문(별책),4.15문화창작단,,단장,0,단장,부단장,,both
1,"1989. 4.15문학창작단 단장, 조선작가동맹 통일문학담당 부위원장",,1989,4,4.15 문화창작단 단장,당외곽및사회단체_사회부문(별책),4.15문화창작단,,단장,0,단장,부단장,,both


In [36]:
# determine PositionRank - i.e., find Position in P1-P3

# row-vectorized formula - meant for use in df.apply()

def determine_Position_Rank(row):
    """Return the tier (1, 2 or 3) whose title list contains the row's Position.

    Row-wise helper meant for ``df.apply(determine_Position_Rank, axis=1)``.

    Parameters
    ----------
    row : mapping / Series
        Must provide "Position", "P1", "P2" and "P3". Each P column is either
        a comma-separated string of position titles or a non-string value
        (e.g. NaN), which is skipped.

    Returns
    -------
    int or float
        1, 2 or 3 for the matching tier; ``np.nan`` when Position is found in
        none of them. If Position appears in more than one tier, the
        highest-numbered tier wins (P3 over P2 over P1).

    Whitespace around Position and around each listed title is ignored, so a
    list written as "부단장, 고문" matches "고문".
    """
    PositionRank = np.nan

    rowPos = row["Position"]
    if isinstance(rowPos, str):
        rowPos = rowPos.strip()

    for rank, tier_column in enumerate(("P1", "P2", "P3"), start=1):
        tier_titles = row[tier_column]
        if not isinstance(tier_titles, str):
            # empty tier (NaN) - nothing to match against
            continue
        if rowPos in [title.strip() for title in tier_titles.split(",")]:
            # no break: a later tier deliberately overrides an earlier match
            PositionRank = rank

    return PositionRank

In [37]:
jobs2["PositionRank"] = jobs2.apply(determine_Position_Rank,axis=1)

In [38]:
jobs2.columns

Index(['CareerString', 'CareerDateString_2022', 'CareerStartYear',
       'CareerStartMonth', 'CareerSubstring', 'InstitutionType',
       'PrimaryInstitution', 'OrgName', 'Position', 'OrgRank', 'P1', 'P2',
       'P3', '_merge', 'PositionRank'],
      dtype='object')

In [39]:
new_jobs_columns = ['CareerString', 'CareerDateString_2022', 'CareerStartYear',
       'CareerStartMonth', 'CareerSubstring', 'InstitutionType',
       'PrimaryInstitution', 'OrgName', 'Position', 'OrgRank', 'PositionRank']

In [40]:
jobs = jobs2[new_jobs_columns]

# Step. merge leadercareerlink & jobs

In [41]:
job_key_columns = ['CareerString', 'CareerDateString_2022']

In [42]:
lcl.head(2)

Unnamed: 0,LeaderID,CareerString,CareerDateString_2022
0,리선권,개성공단 남북공동위원회 통행통신통관 분과위원회,2013.09 ~ 2014.01
1,조경철,故 김정일 국가장의위원회 위원,2011.12


In [43]:
# leaderjoblink - outer join for descriptive statistics
ljob_outer = lcl.merge(jobs,on=job_key_columns,how="outer",indicator=True)
merge_results(ljob_outer)


Merge Results...

	shape     : (12840, 13)
	left_only : (8361, 13)
	both      : (4479, 13)
	right_only: (0, 13)


In [44]:
# no right_only confirms no mismatched career keys in careerorglink
# left-only indicates all of the excluded career items

In [45]:
# leaderjoblink - inner join for the table we will use to make transitions
ljob = lcl.merge(jobs,on=job_key_columns,how="inner",indicator=True)
merge_results(ljob)


Merge Results...

	shape     : (4479, 13)
	left_only : (0, 13)
	both      : (4479, 13)
	right_only: (0, 13)


In [46]:
# instantiate CareerStartDate as an all-NaN *object* column.
# A bare `= np.nan` would create a float64 column; the "YYYYMM" strings written
# into it by the following cells would then be an incompatible-dtype assignment.
ljob["CareerStartDate"] = pd.Series(np.nan, index=ljob.index, dtype="object")

In [47]:
# CareerStartDate when CareerStartMonth is null: year followed by the "00" placeholder month.
# Vectorized string concatenation (instead of a row-wise apply) also behaves
# correctly when the selection is empty.
selection = ljob["CareerStartMonth"].isna()
ljob.loc[selection,"CareerStartDate"] = ljob.loc[selection,"CareerStartYear"].astype(str) + "00"

In [48]:
# CareerStartDate when CareerStartMonth has 2 digits: year + month as-is.
# Vectorized (no row-wise apply), so an empty selection is handled too.
selection = ljob["CareerStartMonth"].notna() & (ljob["CareerStartMonth"].astype(str).str.len() == 2)
ljob.loc[selection,"CareerStartDate"] = (
    ljob.loc[selection,"CareerStartYear"].astype(str)
    + ljob.loc[selection,"CareerStartMonth"].astype(str)
)

In [49]:
# CareerStartDate when CareerStartMonth has 1 digit: year + "0" + month (zero-padded).
# Vectorized (no row-wise apply), so an empty selection is handled too.
selection = ljob["CareerStartMonth"].notna() & (ljob["CareerStartMonth"].astype(str).str.len() == 1)
ljob.loc[selection,"CareerStartDate"] = (
    ljob.loc[selection,"CareerStartYear"].astype(str)
    + "0"
    + ljob.loc[selection,"CareerStartMonth"].astype(str)
)

In [50]:
ljob.columns

Index(['LeaderID', 'CareerString', 'CareerDateString_2022', 'CareerStartYear',
       'CareerStartMonth', 'CareerSubstring', 'InstitutionType',
       'PrimaryInstitution', 'OrgName', 'Position', 'OrgRank', 'PositionRank',
       '_merge', 'CareerStartDate'],
      dtype='object')

In [51]:
ljob_columns = ['LeaderID', 'CareerString', 'CareerDateString_2022','CareerStartYear', 'CareerStartMonth','CareerStartDate','CareerSubstring', 'InstitutionType',
       'PrimaryInstitution', 'OrgName', 'Position','OrgRank', 'PositionRank']

In [52]:
ljob = ljob[ljob_columns]

In [53]:
ljob.head(2)

Unnamed: 0,LeaderID,CareerString,CareerDateString_2022,CareerStartYear,CareerStartMonth,CareerStartDate,CareerSubstring,InstitutionType,PrimaryInstitution,OrgName,Position,OrgRank,PositionRank
0,리원일,노동성 상(유임),1999.02,1999,2.0,199902,,정권기관,내각,노동성,상,1,1
1,리하일,인민무력부 작전국 국장,1975.0,1975,,197500,,정권기관,정무원,인민무력부A_작전국,국장,2,1


#### descriptive statistics

In [54]:
# unique leaders, overall
len(lcl.LeaderID.unique())

637

In [55]:
# unique leaders, with jobs included
len(ljob.LeaderID.unique())

547

In [56]:
# leaders not having jobs that satisfy our criteria; excluded from analysis
len(lcl.LeaderID.unique())-len(ljob.LeaderID.unique())

90

In [57]:
# job items, after pairing with leaders (some jobs reported for more than one leader)
ljob.shape

(4479, 13)

In [58]:
# unique positions
position_key_columns = ['PrimaryInstitution', 'OrgName','Position']
ljob[position_key_columns].drop_duplicates().shape

(1532, 3)

In [59]:
# unique orgs
org_key_columns = ['PrimaryInstitution', 'OrgName']
ljob[org_key_columns].drop_duplicates().shape

(1009, 2)

# Step. calculate time-adjacent transitions for each leader

In [60]:
def calculate_resume_transitions(lc,leaderid):
    """Build the time-adjacent job transitions of one leader.

    Parameters
    ----------
    lc : DataFrame
        Leader-job table. Must contain LeaderID, CareerStartYear,
        CareerStartDate, PrimaryInstitution and OrgName; any other columns are
        carried through.
    leaderid : value compared against ``lc["LeaderID"]``.

    Returns
    -------
    DataFrame
        One row per transition. Every input column other than LeaderID appears
        twice, suffixed ``_1`` (origin job) and ``_2`` (destination job), plus
        the helper column ``CareerStartDate_2_min``. Empty (same columns) when
        the leader has no transition.

    A transition links a job to every job that starts at the earliest
    CareerStartDate strictly later than its own, so jobs sharing a start date
    produce several rows. Dates are compared as stored.
    NOTE(review): this relies on CareerStartDate being same-width "YYYYMM"
    strings so that string order equals chronological order - confirm.

    Also prints "<leaderid> <n> transitions".
    """
    # this leader's dated jobs, one per (year, institution, org)
    t = lc[(lc["LeaderID"]==leaderid) & (lc["CareerStartYear"].notna())]
    t = t.drop_duplicates(["CareerStartYear","PrimaryInstitution","OrgName"],keep="first")
    t = t.sort_values("CareerStartDate",ignore_index=True)

    # every ordered pair of the leader's jobs
    tr = t.merge(t,on="LeaderID",how="inner",suffixes=("_1","_2"))
    tr = tr.sort_values(["CareerStartDate_1","CareerStartDate_2"])

    # keep pairs where job 2 starts strictly after job 1.
    # Column-wise comparison (not a row-wise apply) always yields a boolean
    # Series, including when `tr` is empty.
    tr = tr[tr["CareerStartDate_1"] < tr["CareerStartDate_2"]]

    # earliest following start date for each origin start date
    dates = tr[["CareerStartDate_1","CareerStartDate_2"]]
    mindate = dates.groupby("CareerStartDate_1",as_index=False).agg({"CareerStartDate_2":"min"})
    mindate.columns = ["CareerStartDate_1","CareerStartDate_2_min"]

    tr2 = tr.merge(mindate,on="CareerStartDate_1",how="left")

    # keep only the pairs whose destination is that earliest following date
    tr3 = tr2[tr2["CareerStartDate_2"] == tr2["CareerStartDate_2_min"]]

    print(leaderid,tr3.shape[0],"transitions")

    return tr3

#### a good example of many of the problems we need to address

In [61]:
leaderid = "강관주"
resume = calculate_resume_transitions(ljob,leaderid)

강관주 14 transitions


In [62]:
some_columns = ["LeaderID","CareerStartDate_1","PrimaryInstitution_1","OrgName_1","Position_1","CareerStartDate_2","PrimaryInstitution_2","OrgName_2","Position_2"]
resume[some_columns]

Unnamed: 0,LeaderID,CareerStartDate_1,PrimaryInstitution_1,OrgName_1,Position_1,CareerStartDate_2,PrimaryInstitution_2,OrgName_2,Position_2
0,강관주,196100,체육연구원,체육과학연구소,연구원,197300,노동당,당중앙위원회_문화예술부_X과,과장
1,강관주,196100,체육연구원,체육과학연구소,연구원,197300,평양만경대예술단,섭외부,부부장
12,강관주,197300,노동당,당중앙위원회_문화예술부_X과,과장,198600,노동당,당중앙위원회_조직지도부,부부장
13,강관주,197300,평양만경대예술단,섭외부,부부장,198600,노동당,당중앙위원회_조직지도부,부부장
32,강관주,198600,노동당,당중앙위원회_조직지도부,부부장,198800,노동당,당중앙위원회_통일전선부,부부장
41,강관주,198800,노동당,당중앙위원회_통일전선부,부부장,198903,노동당,당중앙위원회_통일전선부,제1부부장
49,강관주,198903,노동당,당중앙위원회_통일전선부,제1부부장,199008,내각,조국평화통일위원회,부위원장
50,강관주,198903,노동당,당중앙위원회_통일전선부,제1부부장,199008,정무원,조국평화통일위원회,부위원장
56,강관주,199008,내각,조국평화통일위원회,부위원장,199109,보천보전자악단,,고문
57,강관주,199008,정무원,조국평화통일위원회,부위원장,199109,보천보전자악단,,고문


In [63]:
leaderids = list(ljob["LeaderID"].unique())

In [64]:
len(leaderids)

547

In [65]:
def compile_leader_transitions(lc):
    """Stack the job transitions of every leader found in ``lc``.

    Calls ``calculate_resume_transitions`` once per distinct LeaderID, in
    order of first appearance, and concatenates the per-leader frames into a
    single frame with a fresh 0..n-1 index.
    """
    per_leader_frames = [
        calculate_resume_transitions(lc, one_leader)
        for one_leader in lc["LeaderID"].unique()
    ]
    return pd.concat(per_leader_frames, ignore_index=True)

In [66]:
transitions = compile_leader_transitions(ljob)

리원일 9 transitions
리하일 4 transitions
최희태 0 transitions
김기남 21 transitions
김령성 10 transitions
김정숙 26 transitions
박성철a 37 transitions
김창룡 0 transitions
김인식 0 transitions
리영호 2 transitions
최태복 17 transitions
김영일a 15 transitions
주상성 5 transitions
강동윤 2 transitions
김일성 22 transitions
김익현 10 transitions
김일본 21 transitions
박태화 8 transitions
최광 15 transitions
백학림 21 transitions
리종옥 29 transitions
김중린 14 transitions
리을설 13 transitions
오진우 12 transitions
김락희 10 transitions
김의순 12 transitions
양형섭 25 transitions
김영주 13 transitions
김원균 6 transitions
김철만 10 transitions
백인준 10 transitions
황순희 11 transitions
최영림 17 transitions
김영남 20 transitions
전병호 10 transitions
계응태 19 transitions
홍성남 20 transitions
강석숭 13 transitions
김복신 19 transitions
채희정 22 transitions
김영춘 6 transitions
길재경 5 transitions
강관주 14 transitions
박용석 8 transitions
박봉주 11 transitions
전하철 9 transitions
장병규 0 transitions
김국태 13 transitions
김정일 10 transitions
리용무 7 transitions
리종혁 24 transitions
오극렬 6 transitions
김기룡 12 transitions
리종산 2 tra

리택건 3 transitions
김성혜 3 transitions
리현 1 transitions
윤정호 0 transitions
임경재 0 transitions
강영수 3 transitions
김용연 2 transitions
박춘남 4 transitions
승정규 0 transitions
최상려 0 transitions
방강수 0 transitions
박명철b 0 transitions
조경철 0 transitions
김경남 1 transitions
박혁철 0 transitions
전학철 0 transitions
강형봉 2 transitions
리영용 1 transitions
송춘섭 1 transitions
강영철 2 transitions
김영환 2 transitions
최강일 0 transitions
리선권 0 transitions
리길성 1 transitions
최선희 4 transitions
김성 1 transitions
김윤혁b 1 transitions
왕창욱 0 transitions
리제선 0 transitions
강표영 2 transitions
장정남 1 transitions
서홍찬 2 transitions
박영식 1 transitions
리강선 0 transitions
한룡국 0 transitions
김춘섭 0 transitions
리형근 0 transitions
김충성 0 transitions
기광호 4 transitions
고정범 0 transitions
김유일 0 transitions
김재성 2 transitions
손철주 1 transitions
김류호 2 transitions
심영학 1 transitions
강명철 1 transitions
김창엽 0 transitions
한종혁 0 transitions
김정순 1 transitions
강수린 1 transitions
장춘실 0 transitions
손광호 1 transitions
원길우 3 transitions
김일국 3 transitions
김지선 0 transitions
강하국 0 tran

#### add OrgAdvance, PositionAdvance variables

In [67]:
# OrgAdvance = OrgRank of the origin job minus OrgRank of the destination job.
# Computed column-wise with a nullable integer result: a missing rank gives a
# missing OrgAdvance instead of int() raising, and an empty frame is handled.
transitions["OrgAdvance"] = (
    pd.to_numeric(transitions["OrgRank_1"]) - pd.to_numeric(transitions["OrgRank_2"])
).astype("Int64")

In [68]:
# PositionAdvance = PositionRank of the origin job minus that of the destination job.
# PositionRank is NaN when the position was not found in P1-P3; computing
# column-wise with a nullable integer result turns that into a missing
# PositionAdvance instead of int() raising on NaN.
transitions["PositionAdvance"] = (
    pd.to_numeric(transitions["PositionRank_1"]) - pd.to_numeric(transitions["PositionRank_2"])
).astype("Int64")

#### descriptive statistics

In [69]:
# leaders with 1 or more job transition
len(transitions.LeaderID.unique())

444

In [70]:
# leaders with jobs but no transition (e.g., only 1 job, or no job starting strictly later than another); excluded from analysis
len(ljob.LeaderID.unique())-len(transitions.LeaderID.unique())

103

In [71]:
# number of transitions
transitions.shape

(2868, 28)

In [72]:
transition_counts = transitions.groupby("LeaderID")["Position_1"].count()

In [73]:
min(transition_counts)

1

In [74]:
max(transition_counts)

48

In [75]:
mean(transition_counts)

6.45945945945946

In [76]:
mode(transition_counts)

1

#### frequency of leaders by # of job transitions

In [77]:
# For every possible number of transitions 0..max, count how many leaders
# have exactly that many (0 is included so the axis starts at zero).
x_axis = list(range(0, max(transition_counts) + 1))
transition_count_distribution = [
    sum(1 for leader_count in transition_counts if leader_count == n_transitions)
    for n_transitions in x_axis
]

In [78]:
transition_count_freq = pd.DataFrame({"Count":x_axis,"Frequency":transition_count_distribution})

In [79]:
filename_transition_count_freq = "transition_count_freq.xlsx"
transition_count_freq.to_excel(path_analysis + current_subpath + filename_transition_count_freq,index=False)

# Export Query

#### format transitions query for export

In [80]:
transitions.columns

Index(['LeaderID', 'CareerString_1', 'CareerDateString_2022_1',
       'CareerStartYear_1', 'CareerStartMonth_1', 'CareerStartDate_1',
       'CareerSubstring_1', 'InstitutionType_1', 'PrimaryInstitution_1',
       'OrgName_1', 'Position_1', 'OrgRank_1', 'PositionRank_1',
       'CareerString_2', 'CareerDateString_2022_2', 'CareerStartYear_2',
       'CareerStartMonth_2', 'CareerStartDate_2', 'CareerSubstring_2',
       'InstitutionType_2', 'PrimaryInstitution_2', 'OrgName_2', 'Position_2',
       'OrgRank_2', 'PositionRank_2', 'CareerStartDate_2_min', 'OrgAdvance',
       'PositionAdvance'],
      dtype='object')

In [81]:
# omit CareerStartDate_2_min and any other fields we don't need

transitions_columns = ['LeaderID', 'CareerString_1', 'CareerDateString_2022_1',
       'CareerStartYear_1', 'CareerStartMonth_1', 'CareerStartDate_1',
       'CareerSubstring_1', 'InstitutionType_1', 'PrimaryInstitution_1',
        'OrgName_1', 'Position_1', 'OrgRank_1', 'PositionRank_1',
        'CareerString_2', 'CareerDateString_2022_2', 'CareerStartYear_2',
        'CareerStartMonth_2', 'CareerStartDate_2', 'CareerSubstring_2',
        'InstitutionType_2', 'PrimaryInstitution_2', 'OrgName_2', 'Position_2',
        'OrgRank_2', 'PositionRank_2','OrgAdvance','PositionAdvance']

In [82]:
transitions = transitions[transitions_columns]

#### export query

In [83]:
# refresh leaderjobtransition query
transitions.to_excel(path_queries + filename_leaderjobtransition_alljobs,index=False)