In [1]:
import pandas as pd
import numpy as np
from datetime import date

# combined data - 2 tables

In [2]:
path_tables = "C:/Users/seoul/Dropbox/00 technical/github/nkelites/data/combined data/combined data - 2 tables/"

In [3]:
filename_careers = "careers.xlsx"
careers = pd.read_excel(path_tables + filename_careers,dtype="str")
careers.shape

(9001, 12)

In [4]:
careers.columns

Index(['CareerString', 'CareerDateString_2022', 'IsJob', 'MultipleSubstrings',
       'CareerStartYear', 'CareerStartMonth', 'CareerSubstring', 'OrgString',
       'PrimaryInstitution', 'OrgName', 'Position', 'Notes'],
      dtype='object')

In [5]:
filename_orgtree = "orgtree.xlsx"
ot = pd.read_excel(path_tables + filename_orgtree,dtype="str")
ot.shape

(1742, 9)

In [6]:
filename_leadercareerlink = "leadercareerlink.xlsx"
lclink = pd.read_excel(path_tables + filename_leadercareerlink,dtype="str")
lclink.shape

(12617, 3)

In [58]:
filename_positions_elected = "positions_elected.xlsx"
elected = pd.read_excel(path_tables + filename_positions_elected,dtype="str")
elected.shape

(105, 4)

In [59]:
elected[elected.duplicated(keep=False)]

Unnamed: 0,PrimaryInstitution,OrgName,Position,IsElected


In [None]:
# position hierarchy table


In [8]:
# filename_leaderjoblink = "leaderjoblink.xlsx"

In [9]:
# filename_joborglink = "joborglink.xlsx"

# combined data - 3 queries

In [8]:
path_queries = "C:/Users/seoul/Dropbox/00 technical/github/nkelites/data/combined data/combined data - 3 queries/"

In [9]:
filename_leaderjobtransition = "leaderjobtransition.xlsx"
# ljt = pd.read_excel(path_queries + filename_leaderjobtransition,dtype="str")

In [10]:
today = date.today()
print(today)

2023-07-21


# functions

In [11]:
def merge_results(m):
    
    print("\nMerge Results...")
    print("")
    print("\tshape     :",m.shape)
    print("\tleft_only :",m[m["_merge"]=="left_only"].shape)
    print("\tboth      :",m[m["_merge"]=="both"].shape)
    print("\tright_only:",m[m["_merge"]=="right_only"].shape)

In [12]:
# using this on (PI,OrgName) will ensure unique & non-null keys
# using this on a larger df will ensure unique rows and non-null keys, but not unique keys

def unique_non_null_rows(olddf):
    
    df = olddf.copy()
    
    
    ### drop duplicates
    df = df.drop_duplicates(keep="first",ignore_index=True)

    ### drop null rows
    df.dropna(how="all",axis=0,inplace=True)
    
    # drop rows with null PI
    df = df[~(df["PrimaryInstitution"].isna())]
        
    ### drop rows where PI contains stop words
    stop_words_lower = ["uncertain","current","deprecated","please_revise"]
    df = df[ ~ df["PrimaryInstitution"].str.lower().isin(stop_words_lower)]
            

    print("\nUnique Non-null Rows...")
    print("")
    print("\tNon-unique rows:",olddf.shape)
    print("\tUnique rows    :",df.shape)

    ### sort
    df = df.sort_values(["PrimaryInstitution","OrgName"])
    
    return df

# 1. deprecate old version

In [18]:
table_name = "leaderjobtransition"
table_name

'leaderjobtransition'

In [19]:
filename_leaderjobtransition

'leaderjobtransition.xlsx'

In [20]:
filename_deprecated = table_name + "_deprecated_"+str(today)+".xlsx"
filename_deprecated

'leaderjobtransition_deprecated_2023-07-21.xlsx'

In [21]:
# deprecate old query
# oldtable = pd.read_excel(path_queries + filename_careers,dtype="str")
# oldtable.to_excel(path_queries + filename_deprecated,index=False)

# 2. select on careers: IsJob=True, remove elected positions

In [40]:
# get jobs from careers
jobs = careers[(careers["IsJob"]=="True") & (careers["CareerStartYear"].notna())]

In [41]:
jobs.shape

(6397, 12)

In [60]:
elected.shape

(105, 4)

In [61]:
elected = unique_non_null_rows(elected)
elected.shape


Unique Non-null Rows...

	Non-unique rows: (105, 4)
	Unique rows    : (105, 4)


(105, 4)

In [62]:
# remove elected positions
key_columns = ["PrimaryInstitution","OrgName","Position"]
jobs2 = jobs.merge(elected,on=key_columns,how="outer",indicator=True)
jobs2.shape

(6397, 14)

In [63]:
merge_results(jobs2)


Merge Results...

	shape     : (6397, 14)
	left_only : (4678, 14)
	both      : (1719, 14)
	right_only: (0, 14)


In [66]:
jobs2.IsElected.unique()

array([nan, '1'], dtype=object)

In [71]:
jobs2.drop(columns=["_merge"],inplace=True)

In [72]:
jobs2.head(5)

Unnamed: 0,CareerString,CareerDateString_2022,IsJob,MultipleSubstrings,CareerStartYear,CareerStartMonth,CareerSubstring,OrgString,PrimaryInstitution,OrgName,Position,Notes,IsElected
0,1996. 3 외교부 10국장,,True,1,1996,3,,외교부,정무원,외교부,국장,,
1,1973.12 외교부 국장,,True,1,1973,12,,외교부,정무원,외교부,국장,,
2,1983. 3 외교부 국장,,True,1,1983,3,,외교부,정무원,외교부,국장,,
3,1990. 3 외교부 국장,,True,1,1990,3,,외교부,정무원,외교부,국장,,
4,외교부 국장,1973.12,True,1,1973,12,,외교부,정무원,외교부,국장,,


In [73]:
jobs2 = jobs2[jobs2["IsElected"].isna()]
jobs2.shape

(4678, 13)

# 3. merge leadercareerlink & careers tables

In [74]:
lclink.head(2)

Unnamed: 0,LeaderID,CareerString,CareerDateString_2022
0,리선권,개성공단 남북공동위원회 통행통신통관 분과위원회,2013.09 ~ 2014.01
1,조경철,故 김정일 국가장의위원회 위원,2011.12


In [75]:
# leader-job query - left join - includes IsJob True & False
key_columns = ["CareerString","CareerDateString_2022"]
lc = lclink.merge(jobs2,on=key_columns,how="left",indicator=True)
lc.shape

(12857, 15)

In [76]:
merge_results(lc)


Merge Results...

	shape     : (12857, 15)
	left_only : (8032, 15)
	both      : (4825, 15)
	right_only: (0, 15)


In [77]:
# no "left-only" cases when IsJob=True
lc[(lc["IsJob"]=="True") & (lc["_merge"]=="left_only")]

Unnamed: 0,LeaderID,CareerString,CareerDateString_2022,IsJob,MultipleSubstrings,CareerStartYear,CareerStartMonth,CareerSubstring,OrgString,PrimaryInstitution,OrgName,Position,Notes,IsElected,_merge


In [78]:
# leader-job query - inner join - only includes IsJob = True
key_columns = ["CareerString","CareerDateString_2022"]
lc = lclink.merge(jobs2,on=key_columns,how="inner",indicator=False)
lc = lc.sort_values("LeaderID",ignore_index=True)
lc.shape

(4825, 14)

In [79]:
# instantiate CareerStartDate
lc["CareerStartDate"]=np.nan
lc.loc[lc["CareerStartMonth"].isna(),"CareerStartDate"] = lc[lc["CareerStartMonth"].isna()].apply(lambda x: str(x["CareerStartYear"]) + "00",axis=1)
lc.loc[(lc["CareerStartMonth"].notna()) & (lc.apply(lambda x: len(str(x["CareerStartMonth"])),axis=1)==2),"CareerStartDate"] = lc[(lc["CareerStartMonth"].notna()) & (lc.apply(lambda x: len(str(x["CareerStartMonth"])),axis=1)==2)].apply(lambda x: str(x["CareerStartYear"]) + str(x["CareerStartMonth"]),axis=1)
lc.loc[(lc["CareerStartMonth"].notna()) & (lc.apply(lambda x: len(str(x["CareerStartMonth"])),axis=1)==1),"CareerStartDate"] = lc[(lc["CareerStartMonth"].notna()) & (lc.apply(lambda x: len(str(x["CareerStartMonth"])),axis=1)==1)].apply(lambda x: str(x["CareerStartYear"]) + "0" + str(x["CareerStartMonth"]),axis=1)

In [80]:
lc.shape

(4825, 15)

In [81]:
pd.set_option('display.max_rows', None)

# 4. calculate temporally adjacent transitions for each leader

In [113]:
def calculate_resume_transitions(lc,leaderid):

    t = lc[(lc["LeaderID"]==leaderid) & (lc["CareerStartYear"].notna())]
    t = t.drop_duplicates(["CareerStartYear","PrimaryInstitution","OrgName"],keep="first")
    t = t.sort_values("CareerStartDate",ignore_index=True)
    print("resume:",t.shape)
    
    tr = t.merge(t,on="LeaderID",how="inner",suffixes=("_1","_2"))
    # sort based on CareerStartDates
    tr = tr.sort_values(["CareerStartDate_1","CareerStartDate_2"])
    
    # drop keys where CareerStartDate_1 <= CareerStartDate_2
    tr = tr[tr.apply(lambda x: x["CareerStartDate_1"] < x["CareerStartDate_2"],axis=1)]
    #tr.shape
    
    # smallest next startdate
    dates = tr[["CareerStartDate_1","CareerStartDate_2"]]
    #dates
    mindate = dates.groupby("CareerStartDate_1",as_index=False).agg({"CareerStartDate_2":"min"})
    mindate.columns = ["CareerStartDate_1","CareerStartDate_2_min"]
    #mindate
    
    tr2 = tr.merge(mindate,on="CareerStartDate_1",how="left")
    #tr2.shape
    
    tr3 = tr2[tr2.apply(lambda x: x["CareerStartDate_2"] == x["CareerStartDate_2_min"],axis=1)]
    print("transitions:",tr3.shape)
    
    #print(tr3[some_columns])

    return tr3

In [100]:
leaderid = "강관주"
lct = calculate_resume_transitions(lc,leaderid)

resume: (17, 15)
transitions: (21, 30)


In [101]:
some_columns = ["LeaderID","CareerStartDate_1","PrimaryInstitution_1","OrgName_1","Position_1","CareerStartDate_2","PrimaryInstitution_2","OrgName_2","Position_2"]
lct[some_columns]

Unnamed: 0,LeaderID,CareerStartDate_1,PrimaryInstitution_1,OrgName_1,Position_1,CareerStartDate_2,PrimaryInstitution_2,OrgName_2,Position_2
0,강관주,196100,체육연구원,체육과학연구소,연구원,197300,평양만경대예술단,섭외부,부부장
1,강관주,196100,체육연구원,체육과학연구소,연구원,197300,노동당,문화예술부,과장
16,강관주,197300,평양만경대예술단,섭외부,부부장,197505,조선대외문화연락협회,,국장
17,강관주,197300,노동당,문화예술부,과장,197505,조선대외문화연락협회,,국장
44,강관주,197505,조선대외문화연락협회,,국장,198600,노동당,당중앙위원회_조직지도부,부부장
57,강관주,198600,노동당,당중앙위원회_조직지도부,부부장,198800,노동당,당중앙위원회_통일전선부_문화교류국,부부장
58,강관주,198600,노동당,당중앙위원회_조직지도부,부부장,198800,노동당,당중앙위원회_통일전선부,부부장
69,강관주,198800,노동당,당중앙위원회_통일전선부_문화교류국,부부장,198903,노동당,당중앙위원회_통일전선부,부장
70,강관주,198800,노동당,당중앙위원회_통일전선부,부부장,198903,노동당,당중앙위원회_통일전선부,부장
89,강관주,198903,노동당,당중앙위원회_통일전선부,부장,199008,정무원,조국평화통일위원회,부위원장


In [109]:
leaderids = list(lc["LeaderID"].unique())

In [110]:
len(leaderids)

554

In [114]:
def compile_leader_transitions(lc):
    
    leaderids = list(lc["LeaderID"].unique())
    
    leader_resumes = []
    
    for leaderid in leaderids:
        leader_resume = calculate_resume_transitions(lc,leaderid)
        leader_resumes.append(leader_resume)
        
    lct = pd.concat(leader_resumes,ignore_index=True)
    
    return lct

In [117]:
ljt = compile_leader_transitions(lc)

resume: (17, 15)
transitions: (21, 30)
resume: (12, 15)
transitions: (13, 30)
resume: (5, 15)
transitions: (5, 30)
resume: (5, 15)
transitions: (4, 30)
resume: (2, 15)
transitions: (1, 30)
resume: (5, 15)
transitions: (5, 30)
resume: (4, 15)
transitions: (4, 30)
resume: (14, 15)
transitions: (16, 30)
resume: (8, 15)
transitions: (7, 30)
resume: (15, 15)
transitions: (17, 30)
resume: (2, 15)
transitions: (1, 30)
resume: (1, 15)
transitions: (0, 0)
resume: (3, 15)
transitions: (2, 30)
resume: (15, 15)
transitions: (22, 30)
resume: (4, 15)
transitions: (3, 30)
resume: (3, 15)
transitions: (2, 30)
resume: (5, 15)
transitions: (4, 30)
resume: (5, 15)
transitions: (4, 30)
resume: (7, 15)
transitions: (7, 30)
resume: (5, 15)
transitions: (4, 30)
resume: (3, 15)
transitions: (2, 30)
resume: (2, 15)
transitions: (0, 0)
resume: (2, 15)
transitions: (1, 30)
resume: (5, 15)
transitions: (5, 30)
resume: (19, 15)
transitions: (23, 30)
resume: (1, 15)
transitions: (0, 0)
resume: (7, 15)
transitions: 

transitions: (18, 30)
resume: (8, 15)
transitions: (8, 30)
resume: (11, 15)
transitions: (12, 30)
resume: (3, 15)
transitions: (2, 30)
resume: (4, 15)
transitions: (3, 30)
resume: (2, 15)
transitions: (1, 30)
resume: (4, 15)
transitions: (4, 30)
resume: (2, 15)
transitions: (1, 30)
resume: (3, 15)
transitions: (2, 30)
resume: (4, 15)
transitions: (3, 30)
resume: (4, 15)
transitions: (3, 30)
resume: (27, 15)
transitions: (33, 30)
resume: (19, 15)
transitions: (25, 30)
resume: (12, 15)
transitions: (11, 30)
resume: (4, 15)
transitions: (3, 30)
resume: (7, 15)
transitions: (7, 30)
resume: (4, 15)
transitions: (3, 30)
resume: (8, 15)
transitions: (7, 30)
resume: (2, 15)
transitions: (1, 30)
resume: (3, 15)
transitions: (2, 30)
resume: (4, 15)
transitions: (3, 30)
resume: (5, 15)
transitions: (4, 30)
resume: (2, 15)
transitions: (1, 30)
resume: (2, 15)
transitions: (1, 30)
resume: (3, 15)
transitions: (2, 30)
resume: (1, 15)
transitions: (0, 0)
resume: (1, 15)
transitions: (0, 0)
resume: (1

transitions: (6, 30)
resume: (1, 15)
transitions: (0, 0)
resume: (2, 15)
transitions: (1, 30)
resume: (3, 15)
transitions: (0, 0)
resume: (14, 15)
transitions: (18, 30)
resume: (3, 15)
transitions: (2, 30)
resume: (10, 15)
transitions: (15, 30)
resume: (2, 15)
transitions: (1, 30)
resume: (1, 15)
transitions: (0, 0)
resume: (20, 15)
transitions: (24, 30)
resume: (1, 15)
transitions: (0, 0)
resume: (1, 15)
transitions: (0, 0)
resume: (14, 15)
transitions: (15, 30)
resume: (7, 15)
transitions: (6, 30)
resume: (4, 15)
transitions: (4, 30)
resume: (17, 15)
transitions: (23, 30)
resume: (3, 15)
transitions: (2, 30)
resume: (2, 15)
transitions: (1, 30)
resume: (3, 15)
transitions: (2, 30)
resume: (23, 15)
transitions: (29, 30)
resume: (3, 15)
transitions: (2, 30)
resume: (4, 15)
transitions: (3, 30)
resume: (9, 15)
transitions: (10, 30)
resume: (1, 15)
transitions: (0, 0)
resume: (5, 15)
transitions: (4, 30)
resume: (8, 15)
transitions: (8, 30)
resume: (14, 15)
transitions: (15, 30)
resume: 

In [116]:
ljt.shape

(3337, 30)

In [120]:
ljt.head(25)

Unnamed: 0,LeaderID,CareerString_1,CareerDateString_2022_1,IsJob_1,MultipleSubstrings_1,CareerStartYear_1,CareerStartMonth_1,CareerSubstring_1,OrgString_1,PrimaryInstitution_1,...,CareerStartMonth_2,CareerSubstring_2,OrgString_2,PrimaryInstitution_2,OrgName_2,Position_2,Notes_2,IsElected_2,CareerStartDate_2,CareerStartDate_2_min
0,강관주,1961. 체육과학연구소 연구원,,True,1,1961,,,체육과학연구소,체육연구원,...,,만경대예술단 섭외부 부부장,만경대예술단 섭외부,평양만경대예술단,섭외부,부부장,,,197300,197300
1,강관주,1961. 체육과학연구소 연구원,,True,1,1961,,,체육과학연구소,체육연구원,...,,,당 문화부,노동당,문화예술부,과장,,,197300,197300
2,강관주,"1973. 만경대예술단 섭외부 부부장, 당 문화부 과장",,True,2,1973,,만경대예술단 섭외부 부부장,만경대예술단 섭외부,평양만경대예술단,...,5.0,,대외문화연락협회,조선대외문화연락협회,,국장,,,197505,197505
3,강관주,당 문화부 과장,1973.0,True,1,1973,,,당 문화부,노동당,...,5.0,,대외문화연락협회,조선대외문화연락협회,,국장,,,197505,197505
4,강관주,대외문화연락협회 국장,1975.05,True,1,1975,5.0,,대외문화연락협회,조선대외문화연락협회,...,,,노동당 조직지도부,노동당,당중앙위원회_조직지도부,부부장,,,198600,198600
5,강관주,1986. 당 조직지도부 부부장,,True,1,1986,,,노동당 조직지도부,노동당,...,,,노동당 중앙위원회 통일전선부,노동당,당중앙위원회_통일전선부_문화교류국,부부장,,,198800,198800
6,강관주,1986. 당 조직지도부 부부장,,True,1,1986,,,노동당 조직지도부,노동당,...,,,조선노동당 통일전선부,노동당,당중앙위원회_통일전선부,부부장,,,198800,198800
7,강관주,1988. 당 통일전선부 #조총련# 담당 부부장,,True,1,1988,,,노동당 중앙위원회 통일전선부,노동당,...,3.0,,조선노동당 통일전선부 제1부,노동당,당중앙위원회_통일전선부,부장,,,198903,198903
8,강관주,조선노동당 통일전선부 조선인총연합회 담당 부부장,1988.0,True,1,1988,,,조선노동당 통일전선부,노동당,...,3.0,,조선노동당 통일전선부 제1부,노동당,당중앙위원회_통일전선부,부장,,,198903,198903
9,강관주,조선노동당 통일전선부 제1부부장,1989.03,True,1,1989,3.0,,조선노동당 통일전선부 제1부,노동당,...,8.0,,조국통일평화위원회,정무원,조국평화통일위원회,부위원장,,,199008,199008


# 5. export new query

In [118]:
# refresh leaderjobtransition with new data
ljt.to_excel(path_tables + filename_leaderjobtransition,index=False)