In [37]:
import pandas as pd
import numpy as np

# combined data - 2 tables

In [38]:
# Directory of the "combined data - 2 tables" Excel files.
# NOTE(review): absolute, machine-specific path; not referenced by the cells shown in this section.
path_tables = "C:/Users/seoul/Dropbox/00 technical/github/nkelites/data/combined data/combined data - 2 tables/"

In [39]:
# File name of the careers workbook (presumably located under path_tables; not read in the cells shown here).
filename_careers = "careers.xlsx"

In [40]:
# File name of the leader-career link workbook (presumably located under path_tables; not read in the cells shown here).
filename_leadercareerlink = "leadercareerlink.xlsx"

In [41]:
# File name of the leader-job link workbook (presumably located under path_tables; not read in the cells shown here).
filename_leaderjoblink = "leaderjoblink.xlsx"

In [42]:
# File name of the job-organisation link workbook (presumably located under path_tables; not read in the cells shown here).
filename_joborglink = "joborglink.xlsx"

# combined data - 1 cleaning/cleaning step 2 - career/

In [43]:
# Root directory of "cleaning step 2 - career"; the subpath_2_* folders below are appended to it.
# NOTE(review): absolute, machine-specific path.
path_cleaning = "C:/Users/seoul/Dropbox/00 technical/github/nkelites/data/combined data/combined data - 1 cleaning/cleaning step 2 - career/"

In [44]:
# Sub-folders of path_cleaning, one per cleaning sub-step (each ends with "/").
# Only subpath_2_4 (orgtree) and subpath_2_6 (reassembled careers) are used in the cells shown here.
subpath_2_1 = "2.1 career_undivided_unparsed_uncoded/"
subpath_2_2 = "2.2 career_divided_unparsed_uncoded/"
subpath_2_3 = "2.3 joborglink/"
subpath_2_4 = "2.4 orgtree/"
subpath_2_5 = "2.5 position/"
subpath_2_6 = "2.6 career_reassembled/"

In [45]:
# Load the cleaned orgtree workbook, reading every column as a string.
# The frame is bound to `ot` because the later cells read `ot` as the base
# orgtree, and `ot2` is assigned afresh by the first concat step. Loading into
# `ot2` here left `ot` undefined after Restart Kernel -> Run All.
filein_ot = "orgtree_4_5_jeonsgu_cleaned.xlsx"
ot = pd.read_excel(path_cleaning + subpath_2_4 + filein_ot,dtype="str")
ot.shape

(1380, 9)

In [46]:
# Load the reassembled careers workbook (careers_4_0), reading every column as a string,
# then display its shape as a quick sanity check.
filein_careers = "careers_4_0.xlsx"
careers = pd.read_excel(path_cleaning + subpath_2_6 + filein_careers,dtype="str")
careers.shape

(9001, 12)

# functions: format orgtree

In [47]:
# using this on (PI,OrgName) will ensure unique & non-null keys
# using this on a larger df will ensure unique rows and non-null keys, but not unique keys

def unique_non_null_rows(olddf):
    """Return a de-duplicated, cleaned and sorted version of ``olddf``.

    The input frame is left untouched. In order, the function:

    1. drops exact duplicate rows (keeping the first occurrence),
    2. drops rows in which every column is null,
    3. drops rows whose ``PrimaryInstitution`` is null,
    4. drops rows whose lower-cased ``PrimaryInstitution`` is one of the
       placeholder words ("uncertain", "current", "deprecated",
       "please_revise"),
    5. prints the shape before and after, and
    6. sorts by ``PrimaryInstitution`` then ``OrgName``.

    Parameters
    ----------
    olddf : pandas.DataFrame
        Must contain ``PrimaryInstitution`` and ``OrgName`` columns.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, sorted; index labels are those left after the
        de-duplication step renumbered the rows.
    """
    placeholder_words = ["uncertain","current","deprecated","please_revise"]

    # exact duplicates first (this renumbers the index), then fully empty rows
    deduped = olddf.drop_duplicates(keep="first",ignore_index=True).dropna(how="all",axis=0)

    # a key without a PrimaryInstitution is unusable
    with_pi = deduped[deduped["PrimaryInstitution"].notna()]

    # placeholder PIs are bookkeeping markers, not institutions
    is_placeholder = with_pi["PrimaryInstitution"].str.lower().isin(placeholder_words)
    cleaned = with_pi[~is_placeholder]

    print("\nUnique Non-null Rows...")
    print("")
    print("\tNon-unique rows:",olddf.shape)
    print("\tUnique rows    :",cleaned.shape)

    return cleaned.sort_values(["PrimaryInstitution","OrgName"])

In [48]:
def verify_unique_rows(df):
    """Print uniqueness diagnostics for an orgtree-like frame.

    Reports the shape of three subsets of ``df``:

    * rows that are exact duplicates of another row,
    * rows whose (``PrimaryInstitution``, ``OrgName``) key is shared with
      another row,
    * rows in which both key columns are null.

    Nothing is returned and ``df`` is not modified.
    """
    key_columns = ["PrimaryInstitution","OrgName"]

    print("\nVerifying Unique Rows...")
    print("")

    repeated_rows = df.loc[df.duplicated(keep=False)]
    print("\tDuplicate Rows:",repeated_rows.shape)

    repeated_keys = df.loc[df.duplicated(subset=key_columns,keep=False)]
    print("\tDuplicate Keys:",repeated_keys.shape)

    pi_missing = df["PrimaryInstitution"].isna()
    org_missing = df["OrgName"].isna()
    print("\tNull Rows     :",df.loc[pi_missing & org_missing].shape)

In [49]:
def merge_results(m):
    """Print a summary of a merge performed with ``indicator=True``.

    Shows the overall shape of ``m`` followed by the shape of the rows whose
    ``_merge`` value is "left_only", "both" and "right_only". Nothing is
    returned and ``m`` is not modified.
    """
    print("\nMerge Results...")
    print("")
    print("\tshape     :",m.shape)

    # labels are padded so the colons line up in the printed report
    report_lines = (
        ("\tleft_only :","left_only"),
        ("\tboth      :","both"),
        ("\tright_only:","right_only"),
    )
    for label, outcome in report_lines:
        print(label,m.loc[m["_merge"]==outcome].shape)

In [50]:
def concat_PI_OrgName_keys_to_orgtree(ot,keys,drop_merge_column=True):
    """Add (PrimaryInstitution, OrgName) keys that are missing from an orgtree.

    ``keys`` is relabelled to the two key columns, cleaned with
    ``unique_non_null_rows`` and outer-merged onto ``ot``. Keys that are not
    yet in ``ot`` therefore appear as new rows whose other columns are null.
    Diagnostics are printed for the cleaned keys, the merge, and the result.

    Parameters
    ----------
    ot : pandas.DataFrame
        Orgtree containing ``PrimaryInstitution`` and ``OrgName`` columns.
    keys : pandas.DataFrame
        Two-column frame; its first column is treated as PrimaryInstitution
        and its second as OrgName, whatever they are called. The caller's
        frame is not modified.
    drop_merge_column : bool, default True
        If True, remove the ``_merge`` indicator column from the result.

    Returns
    -------
    pandas.DataFrame
        The outer merge of ``ot`` and the cleaned keys.
    """
    key_columns = ["PrimaryInstitution","OrgName"]

    # Relabel a copy: assigning .columns on the argument itself would rename
    # the caller's DataFrame as a side effect.
    keys = keys.copy()
    keys.columns = key_columns
    keys = unique_non_null_rows(keys)

    # verify unique keys
    verify_unique_rows(keys)

    m = ot.merge(keys,on=key_columns,how="outer",indicator=True)
    merge_results(m)

    # drop "_merge" column
    if drop_merge_column:
        m = m.drop(columns=["_merge"])

    # verify unique keys in orgtree
    verify_unique_rows(m)

    return m

In [51]:
# Expand one OrgName into itself plus every ancestor implied by its
# "_"-separated path, most specific first.

def extract_uppperlevel_org(original_org):
    """Return ``original_org`` followed by each of its upper-level orgs.

    Each ancestor is obtained by removing the last "_"-separated segment of
    the previous entry, repeated until no "_" is left.
    For example "a_b_c" gives ["a_b_c", "a_b", "a"].
    """
    lineage = [original_org]
    current = original_org

    # strip the trailing segment until the top level is reached
    while "_" in current:
        current = current.rsplit("_",1)[0]
        lineage.append(current)

    return lineage

def enumerate_org_hierarchy(df):
    """Enumerate every level of the org hierarchy implied by ``df``.

    For each (PrimaryInstitution, OrgName) row the result contains the row's
    own OrgName, every upper-level OrgName produced by
    ``extract_uppperlevel_org``, and a row with a null OrgName for the
    institution itself. The combined keys are cleaned with
    ``unique_non_null_rows``, checked with ``verify_unique_rows`` and sorted.

    Missing values are detected with ``pd.isna`` rather than by comparing
    ``str(value)`` with "nan", so a missing PrimaryInstitution stays missing
    (and is removed by the cleaning step) and a ``None`` OrgName is not turned
    into an org literally named "None".

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``PrimaryInstitution`` and ``OrgName`` columns. It is
        not modified.

    Returns
    -------
    pandas.DataFrame
        Unique keys with the same columns as ``df``, sorted by those columns
        and re-indexed from 0.
    """
    key_columns = ["PrimaryInstitution","OrgName"]

    # Collect all (PI, OrgName) pairs first and build the frame once;
    # concatenating one small frame per row is quadratic in the row count.
    records = []
    for raw_pi, raw_org in zip(df["PrimaryInstitution"],df["OrgName"]):
        pi = raw_pi if pd.isna(raw_pi) else str(raw_pi)

        if pd.isna(raw_org):
            orglist = [np.nan]
        else:
            # the trailing NaN stands for the institution with no sub-org
            orglist = extract_uppperlevel_org(str(raw_org)) + [np.nan]

        records.extend((pi,org) for org in orglist)

    # object dtype keeps the .str accessor usable even if every value is null;
    # reindex reproduces the column layout of the input frame
    newdf = pd.DataFrame(records,columns=key_columns,dtype=object).reindex(columns=df.columns)

    # unique, non-null keys
    newdf = unique_non_null_rows(newdf)

    # verify unique keys in enumerated hierarchy
    verify_unique_rows(newdf)

    # stats
    print("\nEnumerated Hierarchy Stats...")
    print("")
    print("\tOriginal keys  :",df.shape)
    print("\tEnumerated keys:",newdf.shape)

    newdf = newdf.sort_values(by=list(df.columns),ignore_index=True)

    return newdf

# 1. concat career (PI, Org) to orgtree

In [52]:
# Inspect the distinct IsJob values; they are the strings 'True'/'False' because careers was read with dtype="str".
careers.IsJob.unique()

array(['False', 'True'], dtype=object)

In [53]:
# Keep only the (PrimaryInstitution, OrgName) pairs of career rows flagged as jobs.
# IsJob is compared with the string "True" because the column holds strings.
key_columns = ["PrimaryInstitution","OrgName"]
career_keys = careers.loc[careers.IsJob=="True",key_columns]

In [54]:
# Outer-merge the career keys onto the base orgtree `ot`; keys not yet present become new rows in ot2.
ot2 = concat_PI_OrgName_keys_to_orgtree(ot,career_keys,drop_merge_column=True)


Unique Non-null Rows...

	Non-unique rows: (6733, 2)
	Unique rows    : (1034, 2)

Verifying Unique Rows...

	Duplicate Rows: (0, 2)
	Duplicate Keys: (0, 2)
	Null Rows     : (0, 2)

Merge Results...

	shape     : (1669, 10)
	left_only : (635, 10)
	both      : (745, 10)
	right_only: (289, 10)

Verifying Unique Rows...

	Duplicate Rows: (0, 9)
	Duplicate Keys: (0, 9)
	Null Rows     : (0, 9)


# 2. format orgtree

### Step 1. concat (PI-OrgName) from LinkToNext_

In [55]:
# Successor (PI, Org) pairs named in the LinkToNext_* columns.
# They are taken from the base orgtree `ot`; rows added to ot2 by the previous merge
# carry no LinkToNext values, so they would contribute nothing here.
link_keys = ot[["LinkToNext_PI","LinkToNext_Org"]]

In [56]:
# Outer-merge the LinkToNext pairs (treated as PrimaryInstitution/OrgName keys) onto ot2, giving ot3.
ot3 = concat_PI_OrgName_keys_to_orgtree(ot2,link_keys,drop_merge_column=True)


Unique Non-null Rows...

	Non-unique rows: (1380, 2)
	Unique rows    : (211, 2)

Verifying Unique Rows...

	Duplicate Rows: (0, 2)
	Duplicate Keys: (0, 2)
	Null Rows     : (0, 2)

Merge Results...

	shape     : (1695, 10)
	left_only : (1484, 10)
	both      : (185, 10)
	right_only: (26, 10)

Verifying Unique Rows...

	Duplicate Rows: (0, 9)
	Duplicate Keys: (0, 9)
	Null Rows     : (0, 9)


### Step 2. enumerate Org hierarchy from (PI, OrgName) and concat to orgtree

In [57]:
# Extract just the key columns of ot3 and print duplicate/null diagnostics for them
# before enumerating the hierarchy. key_columns is re-declared with the same value as before.
key_columns = ["PrimaryInstitution","OrgName"]
ot_keys = ot3[key_columns]
verify_unique_rows(ot_keys)


Verifying Unique Rows...

	Duplicate Rows: (0, 2)
	Duplicate Keys: (0, 2)
	Null Rows     : (0, 2)


In [58]:
# Expand every key into itself, its "_"-implied upper-level orgs, and a bare-institution row.
ot_keys_enumerated = enumerate_org_hierarchy(ot_keys)


Unique Non-null Rows...

	Non-unique rows: (4162, 2)
	Unique rows    : (1711, 2)

Verifying Unique Rows...

	Duplicate Rows: (0, 2)
	Duplicate Keys: (0, 2)
	Null Rows     : (0, 2)

Enumerated Hierarchy Stats...

	Original keys  : (1695, 2)
	Enumerated keys: (1711, 2)


In [59]:
# Outer-merge the enumerated hierarchy onto ot3 so every upper-level org has its own row in ot4.
ot4 = concat_PI_OrgName_keys_to_orgtree(ot3,ot_keys_enumerated,drop_merge_column=True)


Unique Non-null Rows...

	Non-unique rows: (1711, 2)
	Unique rows    : (1711, 2)

Verifying Unique Rows...

	Duplicate Rows: (0, 2)
	Duplicate Keys: (0, 2)
	Null Rows     : (0, 2)

Merge Results...

	shape     : (1711, 10)
	left_only : (0, 10)
	both      : (1695, 10)
	right_only: (16, 10)

Verifying Unique Rows...

	Duplicate Rows: (0, 9)
	Duplicate Keys: (0, 9)
	Null Rows     : (0, 9)


In [60]:
# List the rows whose OrgName text contains the PrimaryInstitution text, so they
# can be checked by eye for an institution name repeated inside the org name.
# Both values are compared as strings (a null becomes "nan").
ot4.loc[
    [
        str(pi) in str(org)
        for pi, org in zip(ot4["PrimaryInstitution"],ot4["OrgName"])
    ]
]

Unnamed: 0,InsideGov,PrimaryInstitution,OrgName,ImmediateSuperiorOrg,SupervisesParent,LinkToNext_PI,LinkToNext_Org,LinkToNext_Year,Notes
249,1.0,내각,내각사무국,,,Current,Current,Current,
250,1.0,내각,내각정치국,,,Current,Current,Current,
573,1.0,노동당,당중앙위원회_선전선동부_조선노동당출판사,당중앙위원회_선전선동부,,Current,Current,Current,
574,1.0,노동당,당중앙위원회_선전선동부_조선노동당출판사_근로자사,당중앙위원회_선전선동부_조선노동당출판사,,Current,Current,Current,
686,0.0,사회주의노동청년동맹,평양시사회주의노동청년동맹,,,김일성사회주의청년동맹,uncertain,1996,
1397,,내각,내각사무국_호위처,,,,,,


### Step 3. update ImmediateSuperiorOrg (TODO: wrap this step in a function)

In [61]:
# Reset ImmediateSuperiorOrg to all-missing so it can be rebuilt from OrgName.
# The column is created with object dtype because the next step writes strings
# into it: a bare np.nan assignment creates a float64 column, and setting
# strings into a float64 column is an upcast that pandas has deprecated.
ot4["ImmediateSuperiorOrg"] = pd.Series(np.nan,index=ot4.index,dtype="object")

In [62]:
# For every OrgName that has a parent (i.e. contains "_"), the immediate superior
# is the OrgName with its last "_"-separated segment removed.
ot4.loc[ot4["OrgName"].str.contains("_",na=False),"ImmediateSuperiorOrg"] = (
    ot4.loc[ot4["OrgName"].str.contains("_",na=False),"OrgName"]
    .str.rsplit("_",n=1)
    .str[0]
)

In [63]:
# Visual check: show the rows whose OrgName contains "_" so that
# ImmediateSuperiorOrg can be compared against OrgName by eye.
ot4[ot4.OrgName.str.contains("_",na=False)]

Unnamed: 0,InsideGov,PrimaryInstitution,OrgName,ImmediateSuperiorOrg,SupervisesParent,LinkToNext_PI,LinkToNext_Org,LinkToNext_Year,Notes
30,1,국무위원회,국가보위성_정치국,국가보위성,,Current,Current,Current,
32,1,국무위원회,국가체육지도위원회_대외사업국,국가체육지도위원회,,Current,Current,Current,
33,1,국무위원회,국가체육지도위원회_제1부,국가체육지도위원회,,Current,Current,Current,
34,1,국무위원회,국가체육지도위원회_체육기술연맹,국가체육지도위원회,,Current,Current,Current,
35,1,국무위원회,국가체육지도위원회_체육지도국,국가체육지도위원회,,Current,Current,Current,
...,...,...,...,...,...,...,...,...,...
1705,,총참모부,작전총국_x처,작전총국,,,,,
1706,,총참모부,작전총국_x처_x군,작전총국_x처,,,,,
1707,,총참모부,작전총국_x처_x군_제10사단,작전총국_x처_x군,,,,,
1708,,총참모부,작전총국_x처_x군단,작전총국_x처,,,,,


In [64]:
# Build the set of distinct ImmediateSuperiorOrg values, relabelled as "OrgName",
# so it can be merged against the OrgName column to confirm each one exists.
iso = (
    ot4[["ImmediateSuperiorOrg"]]
    .rename(columns={"ImmediateSuperiorOrg":"OrgName"})
    .drop_duplicates(keep="first")
)
iso.shape

(186, 1)

In [65]:
# Peek at the first two distinct superior-org values.
iso.head(2)

Unnamed: 0,OrgName
0,
30,국가보위성


In [66]:
# Outer-merge the distinct superior orgs onto ot4 by OrgName, keeping the _merge indicator.
# NOTE(review): the match is on OrgName alone, not on (PrimaryInstitution, OrgName),
# so a superior org is counted as present if any institution has an org with that name.
miso = ot4.merge(iso,on="OrgName",how="outer",indicator=True)
miso.shape

(1711, 10)

In [67]:
# Superior orgs with no matching OrgName would show up as "right_only";
# an empty result means every ImmediateSuperiorOrg value also occurs as an OrgName.
miso[miso["_merge"]=="right_only"]

Unnamed: 0,InsideGov,PrimaryInstitution,OrgName,ImmediateSuperiorOrg,SupervisesParent,LinkToNext_PI,LinkToNext_Org,LinkToNext_Year,Notes,_merge


### Step 4. replace null values with Please_Revise (TODO: wrap this step in a function)

In [68]:
# Mark missing InsideGov values for manual review.
replace_var = "InsideGov"
ot4[replace_var] = ot4[replace_var].fillna("Please_Revise")

In [69]:
# Mark missing LinkToNext_PI values for manual review.
replace_var = "LinkToNext_PI"
ot4[replace_var] = ot4[replace_var].fillna("Please_Revise")

In [70]:
# Mark missing LinkToNext_Org values for manual review.
replace_var = "LinkToNext_Org"
ot4[replace_var] = ot4[replace_var].fillna("Please_Revise")

In [71]:
# Mark missing LinkToNext_Year values for manual review.
replace_var = "LinkToNext_Year"
ot4[replace_var] = ot4[replace_var].fillna("Please_Revise")

In [72]:
# Display the orgtree after the Please_Revise placeholders have been filled in.
ot4

Unnamed: 0,InsideGov,PrimaryInstitution,OrgName,ImmediateSuperiorOrg,SupervisesParent,LinkToNext_PI,LinkToNext_Org,LinkToNext_Year,Notes
0,0,4.15문화창작단,,,,Current,Current,Current,
1,0,6.15공동선언실천남북공동위원회북측위원회,교직원분과,,,Current,Current,Current,
2,0,6.15공동선언실천남북공동위원회북측위원회,노동자분과,,,Current,Current,Current,
3,0,6.15공동선언실천남북공동위원회북측위원회,농업근로자분과,,,Current,Current,Current,
4,0,6.15공동선언실천남북공동위원회북측위원회,문학예술분과,,,Current,Current,Current,
...,...,...,...,...,...,...,...,...,...
1706,Please_Revise,총참모부,작전총국_x처_x군,작전총국_x처,,Please_Revise,Please_Revise,Please_Revise,
1707,Please_Revise,총참모부,작전총국_x처_x군_제10사단,작전총국_x처_x군,,Please_Revise,Please_Revise,Please_Revise,
1708,Please_Revise,총참모부,작전총국_x처_x군단,작전총국_x처,,Please_Revise,Please_Revise,Please_Revise,
1709,Please_Revise,총참모부,작전총국_x처_제x군단,작전총국_x처,,Please_Revise,Please_Revise,Please_Revise,


### Step 5. export orgtree

In [73]:
# Show the input file name as a reminder when choosing the output file name below.
filein_ot

'orgtree_4_5_jeonsgu_cleaned.xlsx'

In [74]:
# Output workbook name for the reformatted orgtree.
fileout_ot = "orgtree_4_6_0.xlsx"

In [75]:
# Write ot4 to the orgtree folder (same folder the input was read from), without the index column.
ot4.to_excel(path_cleaning + subpath_2_4 + fileout_ot,index=False)

### Step 6. manually update the Please_Revise values and iteratively re-run 2.5.3 until none remain

In [76]:
# Number of rows whose InsideGov still needs manual revision (case-insensitive match).
ot4[ot4["InsideGov"].str.lower()=="please_revise"].shape

(331, 9)

In [77]:
# Number of rows whose LinkToNext_PI still needs manual revision (case-insensitive match).
ot4[ot4["LinkToNext_PI"].str.lower()=="please_revise"].shape

(331, 9)

In [78]:
# Number of rows whose LinkToNext_Org still needs manual revision (case-insensitive match).
ot4[ot4["LinkToNext_Org"].str.lower()=="please_revise"].shape

(349, 9)

In [79]:
# Number of rows whose LinkToNext_Year still needs manual revision (case-insensitive match).
ot4[ot4["LinkToNext_Year"].str.lower()=="please_revise"].shape

(331, 9)