In [1]:
import pandas as pd
import numpy as np

# combined data - 2 tables

In [2]:
# Base folder for the "combined data - 2 tables" files named in the following cells.
# NOTE(review): absolute, machine-specific path; nothing in the cells visible here reads
# path_tables, so confirm whether it (and the filename_* constants) are still needed.
path_tables = "C:/Users/seoul/Dropbox/00 technical/github/nkelites/data/combined data/combined data - 2 tables/"

In [3]:
filename_careers = "careers.xlsx"

In [4]:
filename_leadercareerlink = "leadercareerlink.xlsx"

In [5]:
filename_leaderjoblink = "leaderjoblink.xlsx"

In [6]:
filename_joborglink = "joborglink.xlsx"

# combined data - 1 cleaning/cleaning step 2 - career/

In [7]:
# Base folder for the "cleaning step 2 - career" files; every read and the export in this
# notebook are built as path_cleaning + subpath_* + file name.
# NOTE(review): absolute, machine-specific path; it must be edited to run this notebook elsewhere.
path_cleaning = "C:/Users/seoul/Dropbox/00 technical/github/nkelites/data/combined data/combined data - 1 cleaning/cleaning step 2 - career/"

In [8]:
# Sub-folders of path_cleaning (each ends with "/" so it can be concatenated directly).
# In the cells visible here only subpath_2_4 (orgtree read/export) and
# subpath_2_6 (careers read) are used.
subpath_2_1 = "2.1 career_undivided_unparsed_uncoded/"
subpath_2_2 = "2.2 career_divided_unparsed_uncoded/"
subpath_2_3 = "2.3 joborglink/"
subpath_2_4 = "2.4 orgtree/"
subpath_2_5 = "2.5 position/"
subpath_2_6 = "2.6 career_reassembled/"

In [9]:
# Load the orgtree workbook with every column read as a string.

# The first pass of this notebook read orgtree_4_5_jeonsgu_cleaned.xlsx.
# Later passes re-read orgtree_4_6_0.xlsx, which is also the name used for the export
# further down, so the notebook is re-run on its own (manually revised) output.

# filein_ot = "orgtree_4_5_jeonsgu_cleaned.xlsx"
filein_ot = "orgtree_4_6_0.xlsx"
ot = pd.read_excel(path_cleaning + subpath_2_4 + filein_ot,dtype="str")
ot.shape

(1765, 9)

In [10]:
# careers_4_0: load the reassembled careers table with every column read as a string,
# so flag columns such as IsJob hold the text "True"/"False" rather than booleans
filein_careers = "careers_4_0.xlsx"
careers = pd.read_excel(path_cleaning + subpath_2_6 + filein_careers,dtype="str")
careers.shape

(9001, 12)

# functions: format orgtree

In [11]:
# using this on a (PI,OrgName) frame will ensure unique keys with a non-null PI (OrgName may still be null)
# using this on a larger df will ensure unique rows with a non-null PI, but not unique keys

def unique_non_null_rows(olddf):
    """Return the distinct rows of ``olddf`` that carry a usable PrimaryInstitution.

    Applied in this order:
      1. exact duplicate rows are dropped (first occurrence kept, index renumbered);
      2. rows in which every cell is null are dropped;
      3. rows with a null PrimaryInstitution are dropped;
      4. rows whose PrimaryInstitution equals, ignoring case, one of the
         placeholder words "uncertain", "current", "deprecated" or
         "please_revise" are dropped.

    The shapes before and after filtering are printed. The surviving rows are
    returned sorted by PrimaryInstitution and then OrgName; a null OrgName is
    kept. ``olddf`` itself is not modified.
    """
    placeholder_values = ["uncertain","current","deprecated","please_revise"]

    # steps 1-2: distinct rows, then discard rows that are entirely empty
    deduped = olddf.drop_duplicates(keep="first",ignore_index=True)
    deduped = deduped.dropna(how="all",axis=0)

    # step 3: a row without a PrimaryInstitution cannot be used as a key
    with_pi = deduped[deduped["PrimaryInstitution"].notna()]

    # step 4: exact (case-insensitive) match against the placeholder words
    pi_lower = with_pi["PrimaryInstitution"].str.lower()
    kept = with_pi[~pi_lower.isin(placeholder_values)]

    print("\nUnique Non-null Rows...")
    print("")
    print("\tNon-unique rows:",olddf.shape)
    print("\tUnique rows    :",kept.shape)

    return kept.sort_values(["PrimaryInstitution","OrgName"])

In [12]:
def verify_unique_rows(df):
    """Print diagnostic shapes describing duplicates and null keys in ``df``.

    Three subsets are reported (nothing is returned and ``df`` is not changed):
      * rows that are exact duplicates of another row (all copies counted);
      * rows whose (PrimaryInstitution, OrgName) pair occurs more than once;
      * rows where both PrimaryInstitution and OrgName are null.
    """
    key_columns = ["PrimaryInstitution","OrgName"]

    print("\nVerifying Unique Rows...")
    print("")

    repeated_rows = df.loc[df.duplicated(keep=False)]
    print("\tDuplicate Rows:",repeated_rows.shape)

    repeated_keys = df.loc[df.duplicated(key_columns,keep=False)]
    print("\tDuplicate Keys:",repeated_keys.shape)

    both_keys_null = df["PrimaryInstitution"].isna() & df["OrgName"].isna()
    print("\tNull Rows     :",df.loc[both_keys_null].shape)

In [13]:
def merge_results(m):
    """Print the shape of a merged frame and of each ``_merge`` indicator subset.

    ``m`` must contain the ``_merge`` column produced by
    ``DataFrame.merge(..., indicator=True)``. Nothing is returned.
    """
    print("\nMerge Results...")
    print("")
    print("\tshape     :",m.shape)

    # one line per indicator value, in the order left / both / right
    report_lines = (
        ("\tleft_only :","left_only"),
        ("\tboth      :","both"),
        ("\tright_only:","right_only"),
    )
    for label, side in report_lines:
        print(label,m[m["_merge"]==side].shape)

In [14]:
def concat_PI_OrgName_keys_to_orgtree(ot,keys,drop_merge_column=True):
    """Add any (PrimaryInstitution, OrgName) pairs in ``keys`` that ``ot`` lacks.

    Parameters
    ----------
    ot : DataFrame
        Orgtree containing at least the PrimaryInstitution and OrgName columns.
    keys : DataFrame
        Two-column frame of (institution, org) pairs. The columns may have any
        names; they are interpreted positionally as PrimaryInstitution, OrgName.
        The caller's frame is left untouched.
    drop_merge_column : bool, default True
        If True, remove the ``_merge`` indicator column from the result.

    Returns
    -------
    DataFrame
        Outer merge of ``ot`` with the cleaned keys: every row of ``ot`` plus one
        row per key not already present (its other columns are null).

    Raises
    ------
    ValueError
        Raised by pandas if ``keys`` does not have exactly two columns.

    Diagnostics from unique_non_null_rows, verify_unique_rows and merge_results
    are printed along the way.
    """
    key_columns = ["PrimaryInstitution","OrgName"]

    # Rename on a copy: assigning .columns on the argument itself would silently
    # rename the caller's DataFrame (e.g. its LinkToNext_* columns).
    keys = keys.copy()
    keys.columns = key_columns
    keys = unique_non_null_rows(keys)

    # verify unique keys
    verify_unique_rows(keys)

    m = ot.merge(keys,on=key_columns,how="outer",indicator=True)
    merge_results(m)

    # drop "_merge" column
    if drop_merge_column:
        m = m.drop(columns=["_merge"])

    # verify unique keys in orgtree
    verify_unique_rows(m)

    return m

In [15]:
# function that recursively builds the list of an OrgName and all of its upper-level OrgNames

# (enumerate_org_hierarchy applies it to each PI-OrgName pair in the hierarchy)

def extract_uppperlevel_org(original_org):
    """Return ``original_org`` followed by each of its ancestors, nearest first.

    Levels of an org name are separated by "_"; an ancestor is obtained by
    removing the last "_"-separated component, e.g.
    "A_B_C" -> ["A_B_C", "A_B", "A"]. A name without "_" yields a
    one-element list.
    """
    lineage = [original_org]
    current = original_org

    # climb one level at a time until no separator is left
    while "_" in current:
        current = current.rsplit("_", 1)[0]
        lineage.append(current)

    return lineage

def enumerate_org_hierarchy(df):
    """Expand (PrimaryInstitution, OrgName) pairs into every level of their hierarchy.

    For each input row the output contains the pair itself, one pair per
    upper-level org (see extract_uppperlevel_org) and the institution on its
    own (null OrgName). The expanded pairs are de-duplicated and filtered with
    unique_non_null_rows, checked with verify_unique_rows, and returned sorted
    by the columns of ``df`` with a fresh 0..n-1 index.

    Parameters
    ----------
    df : DataFrame
        Must contain PrimaryInstitution and OrgName columns; a null OrgName
        means "the institution itself". ``df`` is not modified.

    Returns
    -------
    DataFrame
        Frame with the same columns as ``df``.

    Notes
    -----
    * Rows are collected in a list and turned into a DataFrame once, instead of
      concatenating a small frame per input row (quadratic, and concatenating
      onto an empty frame is deprecated in recent pandas).
    * A missing PrimaryInstitution is kept as missing rather than converted to
      the text "nan", so the null-PI filter in unique_non_null_rows removes it.
    """
    key_columns = ["PrimaryInstitution","OrgName"]

    records = []
    for pi, org in zip(df["PrimaryInstitution"], df["OrgName"]):
        pi = pi if pd.isna(pi) else str(pi)

        # the org and its ancestors, followed by the institution-only entry
        levels = [] if pd.isna(org) else extract_uppperlevel_org(str(org))
        levels.append(np.nan)

        records.extend((pi, level) for level in levels)

    # keep the input's column layout (any extra columns come back empty)
    newdf = pd.DataFrame(records, columns=key_columns).reindex(columns=df.columns)

    # unique, non-null keys
    newdf = unique_non_null_rows(newdf)

    # verify unique keys in enumerated hierarchy
    verify_unique_rows(newdf)

    # stats
    print("\nEnumerated Hierarchy Stats...")
    print("")
    print("\tOriginal keys  :",df.shape)
    print("\tEnumerated keys:",newdf.shape)

    return newdf.sort_values(by=list(df.columns),ignore_index=True)

# 1. concat career (PI, Org) to orgtree

In [16]:
careers.IsJob.unique()

array(['False', 'True'], dtype=object)

In [17]:
# (PI, OrgName) pairs taken from the career rows flagged as jobs.
# careers was read with dtype="str", so IsJob is compared with the string "True".
key_columns = ["PrimaryInstitution","OrgName"]
career_keys = careers.loc[careers.IsJob=="True",key_columns]

In [18]:
ot2 = concat_PI_OrgName_keys_to_orgtree(ot,career_keys,drop_merge_column=True)


Unique Non-null Rows...

	Non-unique rows: (6733, 2)
	Unique rows    : (1051, 2)

Verifying Unique Rows...

	Duplicate Rows: (0, 2)
	Duplicate Keys: (0, 2)
	Null Rows     : (0, 2)

Merge Results...

	shape     : (1765, 10)
	left_only : (693, 10)
	both      : (1072, 10)
	right_only: (0, 10)

Verifying Unique Rows...

	Duplicate Rows: (48, 9)
	Duplicate Keys: (56, 9)
	Null Rows     : (0, 9)


# 2. format orgtree

### Step 1. concat (PI-OrgName) from LinkToNext_

In [19]:
link_keys = ot[["LinkToNext_PI","LinkToNext_Org"]]

In [20]:
ot3 = concat_PI_OrgName_keys_to_orgtree(ot2,link_keys,drop_merge_column=True)


Unique Non-null Rows...

	Non-unique rows: (1765, 2)
	Unique rows    : (238, 2)

Verifying Unique Rows...

	Duplicate Rows: (0, 2)
	Duplicate Keys: (0, 2)
	Null Rows     : (0, 2)

Merge Results...

	shape     : (1774, 10)
	left_only : (1504, 10)
	both      : (261, 10)
	right_only: (9, 10)

Verifying Unique Rows...

	Duplicate Rows: (48, 9)
	Duplicate Keys: (56, 9)
	Null Rows     : (0, 9)


### Step 2. enumerate Org hierarchy from (PI, OrgName) and concat to orgtree

In [21]:
key_columns = ["PrimaryInstitution","OrgName"]
ot_keys = ot3[key_columns]
verify_unique_rows(ot_keys)


Verifying Unique Rows...

	Duplicate Rows: (56, 2)
	Duplicate Keys: (56, 2)
	Null Rows     : (0, 2)


In [22]:
ot_keys_enumerated = enumerate_org_hierarchy(ot_keys)


Unique Non-null Rows...

	Non-unique rows: (4350, 2)
	Unique rows    : (1739, 2)

Verifying Unique Rows...

	Duplicate Rows: (0, 2)
	Duplicate Keys: (0, 2)
	Null Rows     : (0, 2)

Enumerated Hierarchy Stats...

	Original keys  : (1774, 2)
	Enumerated keys: (1739, 2)


In [23]:
ot4 = concat_PI_OrgName_keys_to_orgtree(ot3,ot_keys_enumerated,drop_merge_column=True)


Unique Non-null Rows...

	Non-unique rows: (1739, 2)
	Unique rows    : (1739, 2)

Verifying Unique Rows...

	Duplicate Rows: (0, 2)
	Duplicate Keys: (0, 2)
	Null Rows     : (0, 2)

Merge Results...

	shape     : (1775, 10)
	left_only : (0, 10)
	both      : (1774, 10)
	right_only: (1, 10)

Verifying Unique Rows...

	Duplicate Rows: (48, 9)
	Duplicate Keys: (56, 9)
	Null Rows     : (0, 9)


In [24]:
# list the rows whose OrgName text contains the PrimaryInstitution name,
# so they can be checked by eye for a PI repeated inside OrgName
ot4[[str(pi) in str(org) for pi, org in zip(ot4["PrimaryInstitution"], ot4["OrgName"])]]

Unnamed: 0,InsideGov,PrimaryInstitution,OrgName,ImmediateSuperiorOrg,SupervisesParent,LinkToNext_PI,LinkToNext_Org,LinkToNext_Year,Notes
135,0,사회주의노동청년동맹,평양시사회주의노동청년동맹,,,김일성사회주의청년동맹,uncertain,1996,
697,1,내각,내각사무국,,,Current,Current,Current,
698,1,내각,내각정치국,,,Current,Current,Current,
1023,1,노동당,당중앙위원회_선전선동부_조선노동당출판사,당중앙위원회_선전선동부,,Current,Current,Current,
1024,1,노동당,당중앙위원회_선전선동부_조선노동당출판사_근로자사,당중앙위원회_선전선동부_조선노동당출판사,,Current,Current,Current,
1430,1,내각,내각사무국_호위처,내각사무국,,uncertain,uncertain,uncertain,


### Step 3. update ImmediateSuperiorOrg - write fxn for this!!

In [25]:
# set ImmediateSuperiorOrg to NaN before it is recomputed from OrgName
# (object dtype: a bare np.nan would create a float64 column, and writing the
#  parent-org strings into a float column relies on upcasting that pandas >= 2.1 deprecates)
ot4["ImmediateSuperiorOrg"] = pd.Series(np.nan, index=ot4.index, dtype="object")

In [26]:
# update ImmediateSuperiorOrg for those OrgName that have one:
# the superior is OrgName without its last "_"-separated component ("A_B_C" -> "A_B");
# orgs without "_" (and null OrgName) are left as they are
has_parent = ot4["OrgName"].str.contains("_",na=False)
ot4.loc[has_parent,"ImmediateSuperiorOrg"] = ot4.loc[has_parent,"OrgName"].str.rsplit("_",n=1).str[0]

In [27]:
# verify ImmediateSuperiorOrg is correctly formatted
ot4[ot4.OrgName.str.contains("_",na=False)]

Unnamed: 0,InsideGov,PrimaryInstitution,OrgName,ImmediateSuperiorOrg,SupervisesParent,LinkToNext_PI,LinkToNext_Org,LinkToNext_Year,Notes
51,0,김일성사회주청년동맹,중앙위원회_체육부,중앙위원회,,김일성김정일주의청년동맹,중앙위원회_체육부,1996,
62,0,내각,교육성_고등교육부,교육성,,내각,교육위원회_고등교육성,2010,
63,0,내각,교육성_고등교육부_김일성종합대학,교육성_고등교육부,,내각,교육위원회_고등교육성_김일성종합대학,2010,
64,0,내각,교육성_고등교육부_남포대학,교육성_고등교육부,,내각,교육위원회_고등교육성_남포대학,2010,
65,0,내각,교육성_고등교육부_인민경제대학,교육성_고등교육부,,내각,교육위원회_고등교육성_인민경제대학,2010,
...,...,...,...,...,...,...,...,...,...
1768,,내각,고등교육성_평양체육대학,고등교육성,,,,,
1769,,내각,교육위원회_고등교육성_송도정치경제대학,교육위원회_고등교육성,,,,,
1770,,내각,교육위원회_고등교육성_평양음악대학,교육위원회_고등교육성,,,,,
1771,,노동당,비서국_인민무력부B_정치안전국,비서국_인민무력부B,,,,,


In [28]:
# confirm all ImmediateSuperiorOrg are contained in OrgName:
# collect the distinct ImmediateSuperiorOrg values under the column name OrgName so they
# can be outer-merged against ot4 on OrgName in the cells below.
# NOTE(review): PrimaryInstitution is not carried along, so a superior org counts as
# "found" when any institution has an OrgName with that text - confirm this is intended.
# The null value (orgs without a superior) is kept as one of the distinct rows.
iso = ot4[["ImmediateSuperiorOrg"]]
iso.columns = ["OrgName"]
iso = iso.drop_duplicates(keep="first")
iso.shape

(186, 1)

In [29]:
iso.head(2)

Unnamed: 0,OrgName
0,
51,중앙위원회


In [30]:
miso = ot4.merge(iso,on="OrgName",how="outer",indicator=True)
miso.shape

(1775, 10)

In [31]:
# confirms all ImmediateSuperiorOrg are also in OrgName
miso[miso["_merge"]=="right_only"]

Unnamed: 0,InsideGov,PrimaryInstitution,OrgName,ImmediateSuperiorOrg,SupervisesParent,LinkToNext_PI,LinkToNext_Org,LinkToNext_Year,Notes,_merge


### Step 4. replace null vars with Please_Revise - write fxn for this!!

In [32]:
# flag missing InsideGov values for manual revision
replace_var = "InsideGov"
ot4[replace_var] = ot4[replace_var].fillna("Please_Revise")

In [33]:
# flag missing LinkToNext_PI values for manual revision
replace_var = "LinkToNext_PI"
ot4[replace_var] = ot4[replace_var].fillna("Please_Revise")

In [34]:
# flag missing LinkToNext_Org values for manual revision
replace_var = "LinkToNext_Org"
ot4[replace_var] = ot4[replace_var].fillna("Please_Revise")

In [35]:
# flag missing LinkToNext_Year values for manual revision
replace_var = "LinkToNext_Year"
ot4[replace_var] = ot4[replace_var].fillna("Please_Revise")

In [36]:
ot4

Unnamed: 0,InsideGov,PrimaryInstitution,OrgName,ImmediateSuperiorOrg,SupervisesParent,LinkToNext_PI,LinkToNext_Org,LinkToNext_Year,Notes
0,0,4.15문화창작단,,,,Current,Current,Current,
1,0,6.15공동선언실천남북공동위원회북측위원회,교직원분과,,,Current,Current,Current,
2,0,6.15공동선언실천남북공동위원회북측위원회,노동자분과,,,Current,Current,Current,
3,0,6.15공동선언실천남북공동위원회북측위원회,농업근로자분과,,,Current,Current,Current,
4,0,6.15공동선언실천남북공동위원회북측위원회,문학예술분과,,,Current,Current,Current,
...,...,...,...,...,...,...,...,...,...
1770,Please_Revise,내각,교육위원회_고등교육성_평양음악대학,교육위원회_고등교육성,,Please_Revise,Please_Revise,Please_Revise,
1771,Please_Revise,노동당,비서국_인민무력부B_정치안전국,비서국_인민무력부B,,Please_Revise,Please_Revise,Please_Revise,
1772,Please_Revise,사회주의애국청년동맹,돌격대,,,Please_Revise,Please_Revise,Please_Revise,
1773,Please_Revise,사회주의애국청년동맹,중앙위원회_체육부,중앙위원회,,Please_Revise,Please_Revise,Please_Revise,


### Step 5. export orgtree

In [37]:
filein_ot

'orgtree_4_6_0.xlsx'

In [38]:
# Export file name. NOTE(review): this is the same name as filein_ot and the export uses
# the same folder, so the workbook that was loaded at the top is overwritten.
fileout_ot = "orgtree_4_6_0.xlsx"

In [39]:
ot4.to_excel(path_cleaning + subpath_2_4 + fileout_ot,index=False)

### Step 6. manually update the Please_Revise values in the exported file, then re-run 2.5.3 iteratively until the counts below reach zero

In [40]:
# rows whose InsideGov is still flagged Please_Revise (case-insensitive)
ot4.loc[ot4["InsideGov"].str.lower().eq("please_revise")].shape

(10, 9)

In [41]:
# rows whose LinkToNext_PI is still flagged Please_Revise (case-insensitive)
ot4.loc[ot4["LinkToNext_PI"].str.lower().eq("please_revise")].shape

(10, 9)

In [42]:
# rows whose LinkToNext_Org is still flagged Please_Revise (case-insensitive)
ot4.loc[ot4["LinkToNext_Org"].str.lower().eq("please_revise")].shape

(42, 9)

In [43]:
# rows whose LinkToNext_Year is still flagged Please_Revise (case-insensitive)
ot4.loc[ot4["LinkToNext_Year"].str.lower().eq("please_revise")].shape

(10, 9)