In [1]:
import os
from datetime import date

import numpy as np
import pandas as pd

# combined data - 2 tables

In [2]:
# Directory that holds the combined "2 tables" workbooks.
# Defaults to the original author's local checkout; set the NKELITES_TABLES_DIR
# environment variable to run the notebook on another machine without editing it.
# os.path.join(..., "") appends a trailing separator only when one is missing, so
# the `path_tables + filename` concatenations used by later cells keep working.
path_tables = os.path.join(
    os.environ.get("NKELITES_TABLES_DIR")
    or "C:/Users/seoul/Dropbox/00 technical/github/nkelites/data/combined data/combined data - 2 tables/",
    "",
)

In [3]:
# Load the careers workbook; dtype="str" reads every column as text.
filename_careers = "careers.xlsx"
careers = pd.read_excel(f"{path_tables}{filename_careers}", dtype="str")
careers.shape

(9001, 12)

In [4]:
careers.columns

Index(['CareerString', 'CareerDateString_2022', 'IsJob', 'MultipleSubstrings',
       'CareerStartYear', 'CareerStartMonth', 'CareerSubstring', 'OrgString',
       'PrimaryInstitution', 'OrgName', 'Position', 'Notes'],
      dtype='object')

In [5]:
# Load the org-tree workbook; dtype="str" reads every column as text.
filename_orgtree = "orgtree.xlsx"
ot = pd.read_excel(f"{path_tables}{filename_orgtree}", dtype="str")
ot.shape

(1742, 9)

In [6]:
# Load the leader-career link workbook; dtype="str" reads every column as text.
filename_leadercareerlink = "leadercareerlink.xlsx"
lclink = pd.read_excel(f"{path_tables}{filename_leadercareerlink}", dtype="str")
lclink.shape

(12617, 3)

In [7]:
# Load the elected-positions workbook; dtype="str" reads every column as text.
filename_positions_elected = "positions_elected.xlsx"
elected = pd.read_excel(f"{path_tables}{filename_positions_elected}", dtype="str")
elected.shape

(105, 5)

In [22]:
# File name of the saved PrimaryInstitution / OrgName / Position table.
# NOTE(review): the load below is commented out, yet a later cell displays
# pop.shape == (2611, 7), which does not match the (2575, 4) frame this
# notebook builds by merging keyorg and keyjobs. Presumably `pop` was read
# from this file in an earlier session — confirm, then restore or remove.
filename_pop = "pi_org_pos.xlsx"
# pop = pd.read_excel(path_tables + filename_pop,dtype="str")
# pop.shape

In [8]:
# position hierarchy table

In [9]:
# filename_leaderjoblink = "leaderjoblink.xlsx"

In [10]:
# filename_joborglink = "joborglink.xlsx"

# functions

In [11]:
def merge_results(m):
    """Print the shape of a merged frame and of each `_merge` category.

    m : DataFrame carrying a "_merge" column whose values are compared
        against "left_only", "both" and "right_only" (the column that
        merge(..., indicator=True) adds).

    Prints only; returns None.
    """
    print("\nMerge Results...")
    print("")
    print("\tshape     :", m.shape)

    # One line per indicator category, in the fixed order left / both / right.
    indicator = m["_merge"]
    for label, category in (
        ("\tleft_only :", "left_only"),
        ("\tboth      :", "both"),
        ("\tright_only:", "right_only"),
    ):
        print(label, m[indicator == category].shape)

In [12]:
# using this on (PI,OrgName) will ensure unique rows with a non-null, non-placeholder PI (OrgName may still be null)
# using this on a larger df will ensure unique rows and a non-null PI, but not unique keys

def unique_non_null_rows(olddf):
    """Return the distinct rows of `olddf` that have a usable PrimaryInstitution.

    Steps, in order:
      1. drop exact duplicate rows (first occurrence kept, index renumbered);
      2. drop rows in which every column is null;
      3. drop rows whose PrimaryInstitution is null;
      4. drop rows whose PrimaryInstitution, lower-cased, is exactly equal to
         one of the placeholder values listed below (an exact match, not a
         substring search);
      5. sort by PrimaryInstitution, then OrgName.

    `olddf` must contain "PrimaryInstitution" and "OrgName" columns and is not
    modified. The shapes before and after filtering are printed as a side
    effect. OrgName is never checked for nulls.
    """
    placeholder_values = ["uncertain", "current", "deprecated", "please_revise"]

    # steps 1-2: distinct rows that are not entirely empty
    rows = olddf.drop_duplicates(keep="first", ignore_index=True)
    rows = rows.dropna(how="all", axis=0)

    # step 3: a null PrimaryInstitution cannot serve as a key
    rows = rows[rows["PrimaryInstitution"].notna()]

    # step 4: placeholder PrimaryInstitution values (case-insensitive)
    is_placeholder = rows["PrimaryInstitution"].str.lower().isin(placeholder_values)
    rows = rows[~is_placeholder]

    print("\nUnique Non-null Rows...")
    print("")
    print("\tNon-unique rows:", olddf.shape)
    print("\tUnique rows    :", rows.shape)

    # step 5
    return rows.sort_values(["PrimaryInstitution", "OrgName"])

# 0. prepare pop table: ElectedPositions, PositionHierarchy, OrgHierarchy

### Elected Positions from PI-OrgName-Position

In [13]:
# Distinct (PrimaryInstitution, OrgName) pairs taken from the org tree.
key_columns = ["PrimaryInstitution", "OrgName"]
keyorg = ot.loc[:, key_columns].drop_duplicates(keep="first")
keyorg.shape

(1742, 2)

In [14]:
# Filter and sort the org-tree keys. The name keyorg is rebound, so re-running
# this cell feeds it its own previous output.
keyorg = unique_non_null_rows(keyorg)


Unique Non-null Rows...

	Non-unique rows: (1742, 2)
	Unique rows    : (1742, 2)


In [15]:
# Keep career rows flagged as jobs that also have a start year.
# IsJob is compared with the string "True" because careers was read with dtype="str".
jobs = careers.loc[
    careers["IsJob"].eq("True") & careers["CareerStartYear"].notna()
]

In [16]:
# Distinct (PrimaryInstitution, OrgName, Position) triples found among the jobs.
key_columns2 = ["PrimaryInstitution", "OrgName", "Position"]
keyjobs = jobs.loc[:, key_columns2].drop_duplicates(keep="first")
keyjobs.shape

(1902, 3)

In [17]:
# Filter and sort the job keys. The name keyjobs is rebound, so re-running
# this cell feeds it its own previous output.
keyjobs = unique_non_null_rows(keyjobs)


Unique Non-null Rows...

	Non-unique rows: (1902, 3)
	Unique rows    : (1851, 3)


In [18]:
# Outer-join the org-tree keys with the job keys on (PrimaryInstitution, OrgName);
# the _merge indicator column records which side each row came from.
pop = pd.merge(keyorg, keyjobs, how="outer", on=key_columns, indicator=True)
pop.shape

(2575, 4)

In [19]:
# Report how many rows of the outer join came from keyorg only, keyjobs only, or both.
merge_results(pop)


Merge Results...

	shape     : (2575, 4)
	left_only : (724, 4)
	both      : (1851, 4)
	right_only: (0, 4)


In [20]:
# Rows whose (PrimaryInstitution, OrgName) key appears in orgtree but in no job row of careers.
pop.loc[pop["_merge"] == "left_only"]

Unnamed: 0,PrimaryInstitution,OrgName,Position,_merge
3,6.15공동선언실천남북공동위원회북측위원회,교직원분과,,left_only
4,6.15공동선언실천남북공동위원회북측위원회,노동자분과,,left_only
5,6.15공동선언실천남북공동위원회북측위원회,농업근로자분과,,left_only
6,6.15공동선언실천남북공동위원회북측위원회,문학예술분과,,left_only
7,6.15공동선언실천남북공동위원회북측위원회,보건분과,,left_only
...,...,...,...,...
2555,호위사령부,호위총국_평양경비사령부,,left_only
2562,화학공업협회,,,left_only
2564,활쏘기협회,,,left_only
2570,흥남가스화건설장함경남도지구계획위원회,,,left_only


In [24]:
# populate PositionRank

In [25]:
# populate OrgRank

In [26]:
# pop_pre.to_excel(path_tables + "pi_org_pos.xlsx",index=False)

# 0. create position_list 

In [14]:
# NOTE(review): the recorded output is (2611, 7), but the keyorg/keyjobs merge
# earlier in this notebook yields (2575, 4). This `pop` therefore came from
# state that the visible cells do not recreate — presumably the commented-out
# load of pi_org_pos.xlsx; confirm before relying on it.
pop.shape

(2611, 7)

In [15]:
# One row per distinct Position value in pop, kept as a single-column DataFrame.
position_list = (
    pop["Position"]
    .drop_duplicates()
    .to_frame()
)
type(position_list)

pandas.core.frame.DataFrame

In [16]:
# Order the positions and renumber the index from 0.
position_list = position_list.sort_values("Position", ignore_index=True)
position_list

Unnamed: 0,Position
0,10국장
1,1등서기관
2,1부국장
3,1비서
4,2군단장
...,...
290,후방총국장
291,후보
292,후보위원
293,후보위원(보선)


In [17]:
# Write the position list next to the input tables; index=False omits the row index.
# An existing position_list.xlsx at that path is overwritten.
position_list.to_excel(path_tables + "position_list.xlsx",index=False)

# 1. evaluate pop table: ElectedPositions, PositionHierarchy, OrgHierarchy

In [None]:
# add IsElected
# NOTE(review): this cell cannot run as written. DataFrame.merge() requires a
# `right` frame, so the bare call raises TypeError, and `pop_pre` is not defined
# in any visible cell. Presumably the intent is to merge in the `elected` table
# loaded earlier; the join keys still have to be chosen — TODO.
pop_pre2 = pop_pre.merge()

### PositionHierarchy

In [32]:
# divide

# 내각, 정무원 - Jacob
# 노동당 - Esther
# 나머지 - Jeongsu

# include InGov=False
# include IsElected=True

In [22]:
# NOTE(review): `pop_pre` is not defined in any visible cell, so this fails on a
# fresh kernel. Plain assignment also aliases rather than copies: afterwards
# `pop` and `pop_pre` are the same object.
pop = pop_pre

In [29]:
# Non-null counts per PrimaryInstitution; show the ten with the most Position values.
(
    pop
    .groupby(["PrimaryInstitution"], as_index=False)
    .count()
    .sort_values(["Position"], ascending=False)
    .head(10)
)

Unnamed: 0,PrimaryInstitution,OrgName,Position,_merge
48,내각,609,452,627
51,노동당,279,263,289
223,정무원,277,253,285
443,최고인민회의,92,92,98
441,총참모부,156,88,166
25,국방위원회,39,32,50
433,중앙인민위원회,24,30,31
23,국무위원회,104,26,109
329,조선문학예술총동맹,36,15,37
399,조선적십자회,12,13,16
