In [57]:
import pandas as pd
import numpy as np
from datetime import date

# combined data - 2 tables

In [58]:
path_tables = "C:/Users/seoul/Dropbox/00 technical/github/nkelites/data/combined data/combined data - 2 tables/"

In [59]:
filename_careers = "careers.xlsx"
careers = pd.read_excel(path_tables + filename_careers,dtype="str")
careers.shape

(9001, 12)

In [81]:
careers.columns

Index(['CareerString', 'CareerDateString_2022', 'IsJob', 'MultipleSubstrings',
       'CareerStartYear', 'CareerStartMonth', 'CareerSubstring', 'OrgString',
       'PrimaryInstitution', 'OrgName', 'Position', 'Notes'],
      dtype='object')

In [60]:
filename_orgtree = "orgtree.xlsx"
ot = pd.read_excel(path_tables + filename_orgtree,dtype="str")
ot.shape

(1742, 9)

In [61]:
filename_leadercareerlink = "leadercareerlink.xlsx"
lclink = pd.read_excel(path_tables + filename_leadercareerlink,dtype="str")
lclink.shape

(12617, 3)

In [85]:
filename_pop = "pi_org_pos.xlsx"
pop = pd.read_excel(path_tables + filename_pop,dtype="str")
pop.shape

(2611, 7)

In [62]:
# filename_leaderjoblink = "leaderjoblink.xlsx"

In [63]:
# filename_joborglink = "joborglink.xlsx"

# combined data - 3 queries

In [64]:
path_queries = "C:/Users/seoul/Dropbox/00 technical/github/nkelites/data/combined data/combined data - 3 queries/"

In [65]:
filename_leaderjobtransition = "leaderjobtransition.xlsx"
# ljt = pd.read_excel(path_queries + filename_leaderjobtransition,dtype="str")

In [66]:
today = date.today()
print(today)

2023-07-19


# functions

In [67]:
def merge_results(m):
    
    print("\nMerge Results...")
    print("")
    print("\tshape     :",m.shape)
    print("\tleft_only :",m[m["_merge"]=="left_only"].shape)
    print("\tboth      :",m[m["_merge"]=="both"].shape)
    print("\tright_only:",m[m["_merge"]=="right_only"].shape)

In [68]:
# using this on (PI,OrgName) will ensure unique & non-null keys
# using this on a larger df will ensure unique rows and non-null keys, but not unique keys

def unique_non_null_rows(olddf):
    
    df = olddf.copy()
    
    
    ### drop duplicates
    df = df.drop_duplicates(keep="first",ignore_index=True)

    ### drop null rows
    df.dropna(how="all",axis=0,inplace=True)
    
    # drop rows with null PI
    df = df[~(df["PrimaryInstitution"].isna())]
        
    ### drop rows where PI contains stop words
    stop_words_lower = ["uncertain","current","deprecated","please_revise"]
    df = df[ ~ df["PrimaryInstitution"].str.lower().isin(stop_words_lower)]
            

    print("\nUnique Non-null Rows...")
    print("")
    print("\tNon-unique rows:",olddf.shape)
    print("\tUnique rows    :",df.shape)

    ### sort
    df = df.sort_values(["PrimaryInstitution","OrgName"])
    
    return df

# 0. prepare tables: ElectedPositions, PositionHierarchy, OrgHierarchy

### Elected Postions from PI-OrgName-Position

In [69]:
key_columns = ["PrimaryInstitution","OrgName"]
keyorg = ot[key_columns].drop_duplicates(keep="first")
keyorg.shape

(1742, 2)

In [70]:
keyorg = unique_non_null_rows(keyorg)


Unique Non-null Rows...

	Non-unique rows: (1742, 2)
	Unique rows    : (1742, 2)


In [71]:
jobs = careers[(careers["IsJob"]=="True") & (careers["CareerStartYear"].notna())]

In [72]:
key_columns2 = ["PrimaryInstitution","OrgName","Position"]
keyjobs = jobs[key_columns2].drop_duplicates(keep="first")
keyjobs.shape

(1954, 3)

In [73]:
keyjobs = unique_non_null_rows(keyjobs)


Unique Non-null Rows...

	Non-unique rows: (1954, 3)
	Unique rows    : (1902, 3)


In [74]:
pop_pre = keyorg.merge(keyjobs,how="outer",on=key_columns,indicator=True)
pop_pre.shape

(2626, 4)

In [75]:
merge_results(pop_pre)


Merge Results...

	shape     : (2626, 4)
	left_only : (724, 4)
	both      : (1902, 4)
	right_only: (0, 4)


In [76]:
pop_pre[pop_pre["_merge"]=="left_only"]

Unnamed: 0,PrimaryInstitution,OrgName,Position,_merge
3,6.15공동선언실천남북공동위원회북측위원회,교직원분과,,left_only
4,6.15공동선언실천남북공동위원회북측위원회,노동자분과,,left_only
5,6.15공동선언실천남북공동위원회북측위원회,농업근로자분과,,left_only
6,6.15공동선언실천남북공동위원회북측위원회,문학예술분과,,left_only
7,6.15공동선언실천남북공동위원회북측위원회,보건분과,,left_only
8,6.15공동선언실천남북공동위원회북측위원회,사무국,,left_only
9,6.15공동선언실천남북공동위원회북측위원회,언론분과,,left_only
10,6.15공동선언실천남북공동위원회북측위원회,여성분과,,left_only
11,6.15공동선언실천남북공동위원회북측위원회,종교분과,,left_only
12,6.15공동선언실천남북공동위원회북측위원회,청년학생분과,,left_only


In [77]:
pop_pre["IsElected"]=np.nan
pop_pre["PositionRank"]=np.nan
pop_pre["OrgRank"]=np.nan

In [22]:
# populate PositionRank

In [78]:
# populate OrgRank

In [79]:
# pop_pre.to_excel(path_tables + "pi_org_pos.xlsx",index=False)

# 1. evaluate pop table: ElectedPositions, PositionHierarchy, OrgHierarchy

In [86]:
pop.shape

(2611, 7)

In [91]:
position_list = pop.sort_values("Position").Position.unique()

In [162]:
position_list

array(['10국장', '1등서기관', '1부국장', '1비서', '2군단장', '2등 서기관', '2등서기관', '2부국장',
       '2비서', '3등 서기관', '3등서기관', '3비서', '4비서', '5비서', 'Uncertain',
       'uncertain', '간부', '간부국장', '강좌장', '건설운수부장', '건설총국장', '검열위원장',
       '경공업부장', '경제연구사', '경제정책검열부장', '경제참사', '경호대장', '고등교육상', '고문',
       '고문위원장', '공동의장', '공동즤아', '공병국장', '공사', '공업부장', '공업상', '공장장', '과장',
       '과학기숡국장', '관리위원장', '관장', '교무부장', '교수', '교원', '교장', '국가부주석', '국장',
       '국제기구총국장', '군단장', '군사위원', '급냉구조연구실장', '기계설계총국장', '기사', '기사장', '기자',
       '내무국장', '노동자', '논설위원', '농업과학연구원장', '농업과학원장', '농촌경리위원장', '단장',
       '담당지도원', '당 비서', '당비서', '대미특별대표', '대사', '대사대리', '대외사업국장', '대의원',
       '대표', '대학생 담당 비서', '도시건설국장', '명예교수', '명예부위원장', '명예위원장', '명예참사',
       '명예총재', '무관', '무관보', '문화부장', '문화예술과장', '미주과장', '법제위원', '보위국장',
       '보위사령관', '보위원장', '보좌관', '보천보전자악단장', '부과장', '부구장', '부국장', '부군단장',
       '부기사장', '부단장', '부대표', '부부장', '부사령관', '부사장', '부상', '부소장', '부수상',
       '부실장', '부원', '부원장', '부위원장', '부의장', '부이사장', '부장', '부주석', '부주필',
     

In [92]:
position_list.to_excel(path_tables + "position_list.xlsx",index=False)

AttributeError: 'numpy.ndarray' object has no attribute 'to_excel'

### PositionHierarchy

### OrgTransition Hierarchy

# 1. deprecate old version

In [25]:
table_name = "leaderjobtransition"
table_name

'leaderjobtransition'

In [26]:
filename_leaderjobtransition

'leaderjobtransition.xlsx'

In [27]:
filename_deprecated = table_name + "_deprecated_"+str(today)+".xlsx"
filename_deprecated

'leaderjobtransition_deprecated_2023-07-19.xlsx'

In [28]:
# deprecate old query
# oldtable = pd.read_excel(path_queries + filename_careers,dtype="str")
# oldtable.to_excel(path_queries + filename_deprecated,index=False)

# 2. select on careers: IsJob=True, remove elected positions

In [116]:
# get jobs from careers
jobs = careers[(careers["IsJob"]=="True") & (careers["CareerStartYear"].notna())]

In [117]:
jobs.shape

(6397, 12)

In [118]:
# remove elected positions
key_columns = ["PrimaryInstitution","OrgName","Position"]
jobs = jobs.merge(pop,on=key_columns,how="left")
jobs.shape

(6397, 16)

In [127]:
jobs.drop(columns=["_merge"],inplace=True)

In [128]:
jobs.head(5)

Unnamed: 0,CareerString,CareerDateString_2022,IsJob,MultipleSubstrings,CareerStartYear,CareerStartMonth,CareerSubstring,OrgString,PrimaryInstitution,OrgName,Position,Notes,IsElected,PositionRank,OrgRank
0,노동성 상(유임),1999.02,True,1,1999,2.0,,노동성,내각,노동성,상,,,,
3,인민무력부 작전국 국장,1975.0,True,1,1975,,,인민무력부 작전국,정무원,인민무력부A_작전국,국장,,,,
11,｢근로자｣사 부주필,1972.04,True,1,1972,4.0,,｢근로자｣사,노동당,당중앙위원회_선전선동부_근로자사,부주필,,,,
12,｢근로자｣사 책임주필,1974.12,True,1,1974,12.0,,｢근로자｣사,노동당,당중앙위원회_선전선동부_근로자사,책임주필,,,,
13,｢민족화해협의회｣ 부회장,1999.1,True,1,1999,10.0,,｢민족화해협의회｣,민족화해협의회,,부회장,,,,


In [129]:
jobs.IsElected.unique()

array([nan], dtype=object)

In [130]:
jobs = jobs[jobs["IsElected"].isna()]
jobs.shape

(4753, 15)

# 3. merge leadercareerlink & careers tables

In [131]:
lclink.head(2)

Unnamed: 0,LeaderID,CareerString,CareerDateString_2022
0,리선권,개성공단 남북공동위원회 통행통신통관 분과위원회,2013.09 ~ 2014.01
1,조경철,故 김정일 국가장의위원회 위원,2011.12


In [132]:
# leader-job query - left join - includes IsJob True & False
key_columns = ["CareerString","CareerDateString_2022"]
lc = lclink.merge(jobs,on=key_columns,how="left",indicator=True)
lc.shape

(12883, 17)

In [134]:
merge_results(lc)


Merge Results...

	shape     : (12883, 17)
	left_only : (7968, 17)
	both      : (4915, 17)
	right_only: (0, 17)


In [136]:
# no "left-only" cases when IsJob=True
lc[(lc["IsJob"]=="True") & (lc["_merge"]=="left_only")]

Unnamed: 0,LeaderID,CareerString,CareerDateString_2022,IsJob,MultipleSubstrings,CareerStartYear,CareerStartMonth,CareerSubstring,OrgString,PrimaryInstitution,OrgName,Position,Notes,IsElected,PositionRank,OrgRank,_merge


In [138]:
# leader-job query - inner join - only includes IsJob = True
key_columns = ["CareerString","CareerDateString_2022"]
lc = lclink.merge(jobs,on=key_columns,how="inner",indicator=False)
lc = lc.sort_values("LeaderID",ignore_index=True)
lc.shape

(4915, 16)

In [139]:
# instantiate CareerStartDate
lc["CareerStartDate"]=np.nan
lc.loc[lc["CareerStartMonth"].isna(),"CareerStartDate"] = lc[lc["CareerStartMonth"].isna()].apply(lambda x: str(x["CareerStartYear"]) + "00",axis=1)
lc.loc[(lc["CareerStartMonth"].notna()) & (lc.apply(lambda x: len(str(x["CareerStartMonth"])),axis=1)==2),"CareerStartDate"] = lc[(lc["CareerStartMonth"].notna()) & (lc.apply(lambda x: len(str(x["CareerStartMonth"])),axis=1)==2)].apply(lambda x: str(x["CareerStartYear"]) + str(x["CareerStartMonth"]),axis=1)
lc.loc[(lc["CareerStartMonth"].notna()) & (lc.apply(lambda x: len(str(x["CareerStartMonth"])),axis=1)==1),"CareerStartDate"] = lc[(lc["CareerStartMonth"].notna()) & (lc.apply(lambda x: len(str(x["CareerStartMonth"])),axis=1)==1)].apply(lambda x: str(x["CareerStartYear"]) + "0" + str(x["CareerStartMonth"]),axis=1)

In [140]:
lc.shape

(4915, 17)

In [141]:
pd.set_option('display.max_rows', None)

In [142]:
choigo = lc[lc["PrimaryInstitution"]=="최고인민회의"]
po_columns = ["PrimaryInstitution","OrgName","Position"]
choigo[po_columns].drop_duplicates(po_columns,keep="first").sort_values(po_columns)

Unnamed: 0,PrimaryInstitution,OrgName,Position
256,최고인민회의,법안심의위원회,원장
980,최고인민회의,상임위원회,명예부위원장
590,최고인민회의,상임위원회,참사
1396,최고인민회의,상임위원회,책임주필
1390,최고인민회의,상임위원회_민주조선사,책임주필
927,최고인민회의,외교분과위원회,고문
4683,최고인민회의,외교위원회,자문위원
4392,최고인민회의,중앙검찰소,장
660,최고인민회의,중앙재판소,중앙재판소장
667,최고인민회의,최고재판소,최고재판소장


In [144]:
# remove 최고인민회의
# lc = lc[~((lc["PrimaryInstitution"]=="최고인민회의") & (lc["OrgName"].isna()))]
lc.shape

(4915, 17)

In [147]:
# remove 노동당, 당중앙위원회
# lc = lc[~((lc["PrimaryInstitution"]=="노동당") & (lc["OrgName"]=="당중앙위원회"))]
lc.shape

(4915, 17)

In [148]:
t = lc[(lc["LeaderID"]=="강관주") & (lc["CareerStartYear"].notna())]
t = t.drop_duplicates(["CareerStartYear","PrimaryInstitution","OrgName"],keep="first")
t = t.sort_values("CareerStartDate",ignore_index=True)
t.shape

(17, 17)

In [149]:
t

Unnamed: 0,LeaderID,CareerString,CareerDateString_2022,IsJob,MultipleSubstrings,CareerStartYear,CareerStartMonth,CareerSubstring,OrgString,PrimaryInstitution,OrgName,Position,Notes,IsElected,PositionRank,OrgRank,CareerStartDate
0,강관주,1961. 체육과학연구소 연구원,,True,1,1961,,,체육과학연구소,체육연구원,체육과학연구소,연구원,,,,,196100
1,강관주,"1973. 만경대예술단 섭외부 부부장, 당 문화부 과장",,True,2,1973,,만경대예술단 섭외부 부부장,만경대예술단 섭외부,평양만경대예술단,섭외부,부부장,,,,,197300
2,강관주,"1973. 만경대예술단 섭외부 부부장, 당 문화부 과장",,True,2,1973,,당 문화부 과장,당 문화부,노동당,문화예술부,과장,,,,,197300
3,강관주,1975. 5 대외문화연락협회 국장,,True,1,1975,5.0,,대외문화연락협회,조선대외문화연락협회,,국장,,,,,197505
4,강관주,1986. 당 조직지도부 부부장,,True,1,1986,,,노동당 조직지도부,노동당,당중앙위원회_조직지도부,부부장,,,,,198600
5,강관주,1988. 당 통일전선부 #조총련# 담당 부부장,,True,1,1988,,,노동당 중앙위원회 통일전선부,노동당,당중앙위원회_통일전선부_문화교류국,부부장,,,,,198800
6,강관주,조선노동당 통일전선부 조선인총연합회 담당 부부장,1988.0,True,1,1988,,,조선노동당 통일전선부,노동당,당중앙위원회_통일전선부,부부장,,,,,198800
7,강관주,조선노동당 통일전선부 제1부부장,1989.03,True,1,1989,3.0,,조선노동당 통일전선부 제1부,노동당,당중앙위원회_통일전선부,부장,,,,,198903
8,강관주,조국평화통일위원회 부위원장,1990.08,True,1,1990,8.0,,조국통일평화위원회,정무원,조국평화통일위원회,부위원장,,,,,199008
9,강관주,1990. 8 조국평화통일위원회 부위원장,,True,1,1990,8.0,,조국평화통일위원회,내각,조국평화통일위원회,부위원장,,,,,199008


In [150]:
tr = t.merge(t,on="LeaderID",how="inner",suffixes=("_1","_2"))

In [151]:
# sort based on CareerStartDates
tr = tr.sort_values(["CareerStartDate_1","CareerStartDate_2"])
tr.shape

(289, 33)

In [152]:
# drop keys where CareerStartDate_1 <= CareerStartDate_2
tr = tr[tr.apply(lambda x: x["CareerStartDate_1"] < x["CareerStartDate_2"],axis=1)]
tr.shape

(131, 33)

In [153]:
# smallest next startdate
dates = tr[["CareerStartDate_1","CareerStartDate_2"]]

In [154]:
dates

Unnamed: 0,CareerStartDate_1,CareerStartDate_2
1,196100,197300
2,196100,197300
3,196100,197505
4,196100,198600
5,196100,198800
6,196100,198800
7,196100,198903
8,196100,199008
9,196100,199008
10,196100,199109


In [155]:
mindate = dates.groupby("CareerStartDate_1",as_index=False).agg({"CareerStartDate_2":"min"})
mindate.shape

(11, 2)

In [156]:
mindate

Unnamed: 0,CareerStartDate_1,CareerStartDate_2
0,196100,197300
1,197300,197505
2,197505,198600
3,198600,198800
4,198800,198903
5,198903,199008
6,199008,199109
7,199109,199301
8,199301,199310
9,199310,199702


In [157]:
mindate.columns = ["CareerStartDate_1","CareerStartDate_2_min"]

In [158]:
tr2 = tr.merge(mindate,on="CareerStartDate_1",how="left")
tr2.shape

(131, 34)

In [159]:
tr3 = tr2[tr2.apply(lambda x: x["CareerStartDate_2"] == x["CareerStartDate_2_min"],axis=1)]
tr3.shape

(21, 34)

In [160]:
tr3

Unnamed: 0,LeaderID,CareerString_1,CareerDateString_2022_1,IsJob_1,MultipleSubstrings_1,CareerStartYear_1,CareerStartMonth_1,CareerSubstring_1,OrgString_1,PrimaryInstitution_1,...,OrgString_2,PrimaryInstitution_2,OrgName_2,Position_2,Notes_2,IsElected_2,PositionRank_2,OrgRank_2,CareerStartDate_2,CareerStartDate_2_min
0,강관주,1961. 체육과학연구소 연구원,,True,1,1961,,,체육과학연구소,체육연구원,...,만경대예술단 섭외부,평양만경대예술단,섭외부,부부장,,,,,197300,197300
1,강관주,1961. 체육과학연구소 연구원,,True,1,1961,,,체육과학연구소,체육연구원,...,당 문화부,노동당,문화예술부,과장,,,,,197300,197300
16,강관주,"1973. 만경대예술단 섭외부 부부장, 당 문화부 과장",,True,2,1973,,만경대예술단 섭외부 부부장,만경대예술단 섭외부,평양만경대예술단,...,대외문화연락협회,조선대외문화연락협회,,국장,,,,,197505,197505
17,강관주,"1973. 만경대예술단 섭외부 부부장, 당 문화부 과장",,True,2,1973,,당 문화부 과장,당 문화부,노동당,...,대외문화연락협회,조선대외문화연락협회,,국장,,,,,197505,197505
44,강관주,1975. 5 대외문화연락협회 국장,,True,1,1975,5.0,,대외문화연락협회,조선대외문화연락협회,...,노동당 조직지도부,노동당,당중앙위원회_조직지도부,부부장,,,,,198600,198600
57,강관주,1986. 당 조직지도부 부부장,,True,1,1986,,,노동당 조직지도부,노동당,...,노동당 중앙위원회 통일전선부,노동당,당중앙위원회_통일전선부_문화교류국,부부장,,,,,198800,198800
58,강관주,1986. 당 조직지도부 부부장,,True,1,1986,,,노동당 조직지도부,노동당,...,조선노동당 통일전선부,노동당,당중앙위원회_통일전선부,부부장,,,,,198800,198800
69,강관주,1988. 당 통일전선부 #조총련# 담당 부부장,,True,1,1988,,,노동당 중앙위원회 통일전선부,노동당,...,조선노동당 통일전선부 제1부,노동당,당중앙위원회_통일전선부,부장,,,,,198903,198903
70,강관주,조선노동당 통일전선부 조선인총연합회 담당 부부장,1988.0,True,1,1988,,,조선노동당 통일전선부,노동당,...,조선노동당 통일전선부 제1부,노동당,당중앙위원회_통일전선부,부장,,,,,198903,198903
89,강관주,조선노동당 통일전선부 제1부부장,1989.03,True,1,1989,3.0,,조선노동당 통일전선부 제1부,노동당,...,조국통일평화위원회,정무원,조국평화통일위원회,부위원장,,,,,199008,199008


In [161]:
some_columns = ["LeaderID","CareerStartDate_1","PrimaryInstitution_1","OrgName_1","Position_1","CareerStartDate_2","PrimaryInstitution_2","OrgName_2","Position_2"]
tr3[some_columns]

Unnamed: 0,LeaderID,CareerStartDate_1,PrimaryInstitution_1,OrgName_1,Position_1,CareerStartDate_2,PrimaryInstitution_2,OrgName_2,Position_2
0,강관주,196100,체육연구원,체육과학연구소,연구원,197300,평양만경대예술단,섭외부,부부장
1,강관주,196100,체육연구원,체육과학연구소,연구원,197300,노동당,문화예술부,과장
16,강관주,197300,평양만경대예술단,섭외부,부부장,197505,조선대외문화연락협회,,국장
17,강관주,197300,노동당,문화예술부,과장,197505,조선대외문화연락협회,,국장
44,강관주,197505,조선대외문화연락협회,,국장,198600,노동당,당중앙위원회_조직지도부,부부장
57,강관주,198600,노동당,당중앙위원회_조직지도부,부부장,198800,노동당,당중앙위원회_통일전선부_문화교류국,부부장
58,강관주,198600,노동당,당중앙위원회_조직지도부,부부장,198800,노동당,당중앙위원회_통일전선부,부부장
69,강관주,198800,노동당,당중앙위원회_통일전선부_문화교류국,부부장,198903,노동당,당중앙위원회_통일전선부,부장
70,강관주,198800,노동당,당중앙위원회_통일전선부,부부장,198903,노동당,당중앙위원회_통일전선부,부장
89,강관주,198903,노동당,당중앙위원회_통일전선부,부장,199008,정무원,조국평화통일위원회,부위원장


# 3. export new query

In [56]:
# refresh leaderjobtransition with new data
ljt_new.to_excel(path_tables + filename_careers,index=False)

NameError: name 'ljt_new' is not defined