In [23]:
import pandas as pd
import numpy as np
from datetime import date

# combined data - 2 tables

In [24]:
# Directory holding the combined-data Excel tables. Filenames are appended to this
# string with "+", so it must keep its trailing "/".
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
path_tables = "C:/Users/seoul/Dropbox/00 technical/github/nkelites/data/combined data/combined data - 2 tables/"

In [25]:
# Load the careers table, reading every column as text.
filename_careers = "careers.xlsx"
careers_path = path_tables + filename_careers
careers = pd.read_excel(careers_path, dtype="str")
careers.shape

(9001, 12)

In [26]:
# List the column names of the careers table.
careers.columns

Index(['CareerString', 'CareerDateString_2022', 'IsJob', 'MultipleSubstrings',
       'CareerStartYear', 'CareerStartMonth', 'CareerSubstring', 'OrgString',
       'PrimaryInstitution', 'OrgName', 'Position', 'Notes'],
      dtype='object')

In [27]:
# Load the orgtree table, reading every column as text.
filename_orgtree = "orgtree.xlsx"
orgtree_path = path_tables + filename_orgtree
ot = pd.read_excel(orgtree_path, dtype="str")
ot.shape

(1742, 9)

In [28]:
# Load the leader-career link table, reading every column as text.
filename_leadercareerlink = "leadercareerlink.xlsx"
leadercareerlink_path = path_tables + filename_leadercareerlink
lclink = pd.read_excel(leadercareerlink_path, dtype="str")
lclink.shape

(12617, 3)

In [29]:
# Load the elected-positions table, reading every column as text.
filename_positions_elected = "positions_elected.xlsx"
positions_elected_path = path_tables + filename_positions_elected
elected = pd.read_excel(positions_elected_path, dtype="str")
elected.shape

(105, 5)

In [30]:
# Filename of the PrimaryInstitution-OrgName-Position ("pop") table.
filename_pop = "pi_org_pos.xlsx"
# Loading from disk is disabled; `pop` is instead rebuilt further down by merging
# the orgtree keys with the careers job keys.
# pop = pd.read_excel(path_tables + filename_pop,dtype="str")
# pop.shape

In [31]:
# position hierarchy table

In [32]:
# filename_leaderjoblink = "leaderjoblink.xlsx"

In [33]:
# filename_joborglink = "joborglink.xlsx"

# functions

In [34]:
def merge_results(m):
    """Print a short report describing the outcome of an indicator merge.

    The report shows the overall shape of ``m`` followed by the shape of the
    row subset for each value of its ``_merge`` column ("left_only", "both",
    "right_only").

    Parameters
    ----------
    m : pandas.DataFrame
        Frame containing a ``_merge`` column, e.g. the result of
        ``DataFrame.merge(..., indicator=True)``.

    Returns
    -------
    None
        The report is written to standard output only.
    """
    print("\nMerge Results...")
    print("")
    print("\tshape     :", m.shape)

    # Labels are padded so the colons line up in the printed report.
    for label, origin in (
        ("\tleft_only :", "left_only"),
        ("\tboth      :", "both"),
        ("\tright_only:", "right_only"),
    ):
        subset = m[m["_merge"] == origin]
        print(label, subset.shape)

In [35]:
# using this on (PI,OrgName) will ensure unique (PI,OrgName) rows with a non-null PI (OrgName may still be null)
# using this on a larger df will ensure unique rows and a non-null PI, but not unique keys

def unique_non_null_rows(olddf, stop_words=None):
    """Return the de-duplicated rows of ``olddf`` that have a usable PrimaryInstitution.

    The input frame is never modified. A row is removed when it:

    * exactly duplicates an earlier row,
    * is null in every column,
    * has a null ``PrimaryInstitution``, or
    * has a ``PrimaryInstitution`` equal to a stop word (compared
      case-insensitively and ignoring surrounding whitespace, so a padded
      placeholder such as "Current " is also removed).

    Only ``PrimaryInstitution`` is guaranteed non-null in the result; other
    columns may still hold nulls. Rows are unique as whole rows, so a set of
    key columns is unique only when those are the only columns passed in.

    Parameters
    ----------
    olddf : pandas.DataFrame
        Frame with a ``PrimaryInstitution`` column.
    stop_words : iterable of str, optional
        Placeholder values that mark an unusable PrimaryInstitution. Defaults
        to "uncertain", "current", "deprecated" and "please_revise".

    Returns
    -------
    pandas.DataFrame
        The remaining rows, sorted by ``PrimaryInstitution`` and, when that
        column exists, ``OrgName``.

    Raises
    ------
    KeyError
        If ``olddf`` has no ``PrimaryInstitution`` column.
    """
    if stop_words is None:
        stop_words = ("uncertain", "current", "deprecated", "please_revise")
    stop_words_lower = {str(word).strip().lower() for word in stop_words}

    ### drop duplicates (returns a new frame, so olddf is left untouched)
    df = olddf.drop_duplicates(keep="first", ignore_index=True)

    ### drop rows that are null in every column
    df = df.dropna(how="all", axis=0)

    ### drop rows with null PI
    df = df[df["PrimaryInstitution"].notna()]

    ### drop rows whose PI is a stop word
    # astype(str) keeps the .str accessor usable on empty or non-text columns;
    # strip() lets whitespace-padded placeholders match as well.
    normalized_pi = df["PrimaryInstitution"].astype(str).str.strip().str.lower()
    df = df[~normalized_pi.isin(stop_words_lower)]

    print("\nUnique Non-null Rows...")
    print("")
    print("\tNon-unique rows:", olddf.shape)
    print("\tUnique rows    :", df.shape)

    ### sort (OrgName only when the frame actually has that column)
    sort_columns = [
        column for column in ("PrimaryInstitution", "OrgName") if column in df.columns
    ]
    return df.sort_values(sort_columns)

# 0. prepare pop table: ElectedPositions, PositionHierarchy, OrgHierarchy

### Elected Positions from PI-OrgName-Position

In [97]:
# Preview the first two rows of the orgtree table.
ot.head(2)

Unnamed: 0,InsideGov,PrimaryInstitution,OrgName,ImmediateSuperiorOrg,SupervisesParent,LinkToNext_PI,LinkToNext_Org,LinkToNext_Year,Notes
0,0,4.15문화창작단,,,,Current,Current,Current,
1,0,6.15공동선언실천남북공동위원회북측위원회,교직원분과,,,Current,Current,Current,


In [105]:
# Reduce orgtree to its distinct (InsideGov, PrimaryInstitution, OrgName) rows.
ot_columns = ["InsideGov","PrimaryInstitution","OrgName"]
ot_keys = ot[ot_columns]
keyorg = ot_keys.drop_duplicates(keep="first")
keyorg.shape

(1742, 3)

In [106]:
# Join keys shared by the orgtree keys and the careers job keys.
key_columns = ["PrimaryInstitution","OrgName"]
# Display-only check: the returned frame is not assigned, so keyorg itself is unchanged.
unique_non_null_rows(keyorg)


Unique Non-null Rows...

	Non-unique rows: (1742, 3)
	Unique rows    : (1742, 3)


Unnamed: 0,InsideGov,PrimaryInstitution,OrgName
0,0,4.15문화창작단,
1,0,6.15공동선언실천남북공동위원회북측위원회,교직원분과
2,0,6.15공동선언실천남북공동위원회북측위원회,노동자분과
3,0,6.15공동선언실천남북공동위원회북측위원회,농업근로자분과
4,0,6.15공동선언실천남북공동위원회북측위원회,문학예술분과
...,...,...,...
1737,0,흥남가스화건설장함경남도지구계획위원회,
1738,0,희천기계공장,분공장
1739,0,희천기계공장,
1740,0,희천정밀기계공장,분공장


In [107]:
# Keep career rows that are flagged as jobs and have a start year.
# IsJob is compared with the text "True" because the table was read with dtype="str".
is_job = careers["IsJob"] == "True"
has_start_year = careers["CareerStartYear"].notna()
jobs = careers[is_job & has_start_year]

In [108]:
# Reduce jobs to its distinct (PrimaryInstitution, OrgName, Position) rows.
key_columns2 = ["PrimaryInstitution","OrgName","Position"]
job_keys = jobs[key_columns2]
keyjobs = job_keys.drop_duplicates(keep="first")
keyjobs.shape

(1902, 3)

In [109]:
# Clean the job keys: drop duplicate rows, rows with a null PrimaryInstitution, and rows
# whose PrimaryInstitution is a placeholder stop word; the result is sorted.
keyjobs = unique_non_null_rows(keyjobs)


Unique Non-null Rows...

	Non-unique rows: (1902, 3)
	Unique rows    : (1851, 3)


In [110]:
# Combine orgtree keys (left) with job keys (right) on (PrimaryInstitution, OrgName),
# keeping unmatched rows from both sides and recording each row's origin in "_merge".
pop = pd.merge(keyorg, keyjobs, how="outer", on=key_columns, indicator=True)
pop.shape

(2575, 5)

In [111]:
# Report how many rows came from orgtree only (left_only), careers only (right_only), or both.
merge_results(pop)


Merge Results...

	shape     : (2575, 5)
	left_only : (724, 5)
	both      : (1851, 5)
	right_only: (0, 5)


In [112]:
# Rows present in orgtree but absent from careers.
only_in_orgtree = pop["_merge"] == "left_only"
pop[only_in_orgtree]

Unnamed: 0,InsideGov,PrimaryInstitution,OrgName,Position,_merge
3,0,6.15공동선언실천남북공동위원회북측위원회,교직원분과,,left_only
4,0,6.15공동선언실천남북공동위원회북측위원회,노동자분과,,left_only
5,0,6.15공동선언실천남북공동위원회북측위원회,농업근로자분과,,left_only
6,0,6.15공동선언실천남북공동위원회북측위원회,문학예술분과,,left_only
7,0,6.15공동선언실천남북공동위원회북측위원회,보건분과,,left_only
...,...,...,...,...,...
2555,1,호위사령부,호위총국_평양경비사령부,,left_only
2562,0,화학공업협회,,,left_only
2564,0,활쏘기협회,,,left_only
2570,0,흥남가스화건설장함경남도지구계획위원회,,,left_only


In [113]:
# Remove the merge indicator now that the match counts have been inspected.
# Rebinding with errors="ignore" (instead of an in-place drop) keeps this cell safe to
# re-run: a second execution no longer raises KeyError for the missing "_merge" column.
pop = pop.drop(columns=["_merge"], errors="ignore")

In [66]:
# populate PositionRank

In [67]:
# populate OrgRank

In [68]:
# pop_pre.to_excel(path_tables + "pi_org_pos.xlsx",index=False)

# 0. create position_list 

In [69]:
# NOTE(review): the recorded output below shows 3 columns, but pop as built above has 4
# (InsideGov, PrimaryInstitution, OrgName, Position). The output appears to come from an
# earlier run — re-execute after Restart & Run All to confirm.
pop.shape

(2575, 3)

In [70]:
# Build the list of distinct position titles.
# pop contains orgtree-only rows whose Position is null; drop those first so the
# exported list does not carry an empty entry.
position_list = pop["Position"].dropna().drop_duplicates().to_frame()
type(position_list)

pandas.core.frame.DataFrame

In [71]:
# Order the titles and renumber the index from 0.
position_list = position_list.sort_values(by="Position", ignore_index=True)
position_list

Unnamed: 0,Position
0,1등서기관
1,1부국장
2,1비서
3,2등서기관
4,2부국장
...,...
237,회장
238,후방총국장
239,후보
240,후보위원


In [72]:
# Write the position list next to the source tables, without the index column.
position_list_path = path_tables + "position_list.xlsx"
position_list.to_excel(position_list_path, index=False)

# 0. add ElectedPositions

In [114]:
# Attach the columns of `elected` to pop by matching on
# (PrimaryInstitution, OrgName, Position); "_merge" records where each row came from.
pop2 = pd.merge(pop, elected, on=key_columns2, how="outer", indicator=True)

In [115]:
# Peek at the merged columns, including the _merge indicator.
pop2.head(2)

Unnamed: 0,InsideGov,PrimaryInstitution,OrgName,Position,IsElected,PositionRank,_merge
0,0,4.15문화창작단,,단장,,,left_only
1,0,4.15문화창작단,,부단장,,,left_only


In [116]:
# "both": IsElected = 1; "left_only": IsElected = nan
# Tally pop2 rows by merge origin (matched vs. unmatched against `elected`).
merge_results(pop2)


Merge Results...

	shape     : (2575, 7)
	left_only : (2470, 7)
	both      : (105, 7)
	right_only: (0, 7)


In [117]:
# Remove the merge indicator now that the match counts have been inspected.
# Rebinding with errors="ignore" (instead of an in-place drop) keeps this cell safe to
# re-run: a second execution no longer raises KeyError for the missing "_merge" column.
pop2 = pop2.drop(columns=["_merge"], errors="ignore")

In [118]:
# Preview the first rows of pop2 after the indicator column has been removed.
pop2.head(5)

Unnamed: 0,InsideGov,PrimaryInstitution,OrgName,Position,IsElected,PositionRank
0,0,4.15문화창작단,,단장,,
1,0,4.15문화창작단,,부단장,,
2,0,4.15문화창작단,,,,
3,0,6.15공동선언실천남북공동위원회북측위원회,교직원분과,,,
4,0,6.15공동선언실천남북공동위원회북측위원회,노동자분과,,,


# 0. remove null Positions 

In [119]:
# Keep only rows that actually name a Position.
pop3 = pop2.dropna(subset=["Position"])
pop3

Unnamed: 0,InsideGov,PrimaryInstitution,OrgName,Position,IsElected,PositionRank
0,0,4.15문화창작단,,단장,,
1,0,4.15문화창작단,,부단장,,
16,0,7.7연합기업소,,기사장,,
17,0,北-러시아 친선의원단,,위원장,,
18,0,北-중국 친선의원단,,위원장,,
...,...,...,...,...,...,...
2568,0,황해제철연합기업소,,지배인,,
2569,0,후방군관학교,,교장,,
2571,0,희천기계공장,분공장,지배인,,
2573,0,희천정밀기계공장,분공장,지배인,,


In [120]:
# Confirm every (PrimaryInstitution, OrgName, Position) combination in pop3 is unique:
# the two row counts printed below should match. Display-only; the result is not assigned.
unique_non_null_rows(pop3[key_columns2])


Unique Non-null Rows...

	Non-unique rows: (1810, 3)
	Unique rows    : (1810, 3)


Unnamed: 0,PrimaryInstitution,OrgName,Position
0,4.15문화창작단,,단장
1,4.15문화창작단,,부단장
2,7.7연합기업소,,기사장
3,北-러시아 친선의원단,,위원장
4,北-중국 친선의원단,,위원장
...,...,...,...
1805,황해제철연합기업소,,지배인
1806,후방군관학교,,교장
1807,희천기계공장,분공장,지배인
1808,희천정밀기계공장,분공장,지배인


# 0. divide by PrimaryInstitution & export

In [121]:
# divide

# 내각 (Cabinet), 정무원 (Administration Council) - Jacob
# 노동당 (Workers' Party) - Esther
# everything else - Jeongsu

# include InGov=False
# include IsElected=True

In [122]:
# PrimaryInstitution values routed to each reviewer; every institution not listed here
# goes to Jeongsu.
# (presumably 내각 = Cabinet, 정무원 = Administration Council, 노동당 = Workers' Party — confirm)
jacob_pi = ["내각","정무원"]
esther_pi = ["노동당"]

In [123]:
# Rows whose PrimaryInstitution is on Jacob's list.
in_jacob_pi = pop3["PrimaryInstitution"].isin(jacob_pi)
jacob = pop3[in_jacob_pi]
jacob.shape

(705, 6)

In [124]:
# Sanity check: list the institutions that ended up in Jacob's subset.
jacob.PrimaryInstitution.unique()

array(['내각', '정무원'], dtype=object)

In [125]:
# Rows whose PrimaryInstitution is on Esther's list.
in_esther_pi = pop3["PrimaryInstitution"].isin(esther_pi)
esther = pop3[in_esther_pi]
esther.shape

(263, 6)

In [126]:
# Sanity check: list the institutions that ended up in Esther's subset.
esther.PrimaryInstitution.unique()

array(['노동당'], dtype=object)

In [127]:
# Everything not already assigned to Jacob or Esther.
assigned_pi = jacob_pi + esther_pi
is_assigned = pop3["PrimaryInstitution"].isin(assigned_pi)
jeongsu = pop3[~is_assigned]
jeongsu.shape

(842, 6)

In [128]:
# List the institutions that ended up in Jeongsu's subset.
# NOTE(review): the recorded output shows some names twice, differing only by a trailing
# space (e.g. '북-가나 친선협회' vs '북-가나 친선협회 '); consider stripping whitespace upstream.
jeongsu.PrimaryInstitution.unique()

array(['4.15문화창작단', '7.7연합기업소', '北-러시아 친선의원단', '北-중국 친선의원단 ', '강건종합군관학교',
       '강동지구탄광연합기업소', '강서구역청산엽동농장', '강선제강소', '강원도임업연합기업소', '개성무역총회사',
       '개성방직공장', '개천군협동농장', '공산청년동맹', '광명성경제연합회', '구성공작기계공장', '국가개발은행',
       '국립교향악단', '국립평양예술단', '국무위원회', '국방위원회', '국제무도경기위원회', '국제문제연구소',
       '금성정치대학', '금속및기계수출업총회사', '금수산의사당', '길주펄프공장', '김일성-김정일주의청년동맹',
       '김일성경호대', '김일성군사종합대학', '김일성김정일기금위원회', '김일성사회주의청년동맹', '김책공군대학',
       '김책제철연합기업소', '남북경협제도실무협의회', '남포유리공장', '남흥청년화학연합기업소', '내각B',
       '노농적위군', '농업과학원', '단군민족통일협의회', '단천수산사업소', '대성경제연합체', '대안전기공장',
       '대안중기계연합기업소', '동아시아경기대회협의회', '만경대혁명학원', '만수대예술국장', '만수대예술극장',
       '묘향무역회사', '무산광산연합기업소', '문평제련소', '민족화해협의회', '백두산건축연구원', '백두산창작단',
       '범민족통일음악회', '보위사령부', '보천보전자악단', '부령합금철공장', '북-가나 친선협회',
       '북-가나 친선협회 ', '북-기니비사우 친선협회', '북-나미비아 친선협회', '북-나이지리아 친선협회',
       '북-네팔 친선협회', '북-네팔 친선협회 ', '북-독일 친선의원단', '북-독일 친선협회',
       '북-라오스 친선의원단', '북-라오스 친선협회', '북-라오스 친선협회 ',
       '북-라틴아메리카 및 카리브해 지역 친선협회', '북-라틴아메리카친선협회', '북-러

In [129]:
# export tables: one workbook per reviewer, written next to the source tables.
# index=True keeps each row's index label in the exported sheet.
_exports = (
    (jacob, "position_rank_jacob.xlsx"),
    (esther, "position_rank_esther.xlsx"),
    (jeongsu, "position_rank_jeongsu.xlsx"),
)
for _table, _workbook in _exports:
    _table.to_excel(path_tables + _workbook, index=True)