In [175]:
import pandas as pd
import numpy as np

# combined data - 2 tables

In [176]:
# Folder of the "combined data - 2 tables" workbooks.
# NOTE(review): absolute, user-specific Windows path — has to be edited before the notebook can run on another machine.
path_tables = "C:/Users/seoul/Dropbox/00 technical/github/nkelites/data/combined data/combined data - 2 tables/"

In [177]:
# Workbook file names, one per table. Presumably these live under path_tables — TODO confirm;
# none of the four names (nor path_tables) is referenced in the visible part of this notebook.
filename_careers = "careers.xlsx"

In [178]:
filename_leadercareerlink = "leadercareerlink.xlsx"

In [179]:
filename_leaderjoblink = "leaderjoblink.xlsx"

In [180]:
filename_joborglink = "joborglink.xlsx"

# combined data - 1 cleaning/cleaning step 2 - career/

In [181]:
# Root folder of "cleaning step 2 - career"; the subpath_* names below are appended to it.
# NOTE(review): absolute, user-specific Windows path — has to be edited before the notebook can run on another machine.
path_cleaning = "C:/Users/seoul/Dropbox/00 technical/github/nkelites/data/combined data/combined data - 1 cleaning/cleaning step 2 - career/"

In [182]:
# Sub-folder names under path_cleaning (each ends with "/" so a file name can be appended directly).
# Only subpath_2_4 (orgtree) and subpath_2_6 (career_reassembled) are used in the visible part of this notebook.
subpath_2_1 = "2.1 career_undivided_unparsed_uncoded/"
subpath_2_2 = "2.2 career_divided_unparsed_uncoded/"
subpath_2_3 = "2.3 joborglink/"
subpath_2_4 = "2.4 orgtree/"
subpath_2_5 = "2.5 position/"
subpath_2_6 = "2.6 career_reassembled/"

In [183]:
# Org tree workbook read by this pass.
#   - the very first pass read: orgtree_4_5_jeonsgu_cleaned
#   - every later pass re-reads the edited export: orgtree_4_6_0

# filein_ot = "orgtree_4_5_jeonsgu_cleaned.xlsx"
filein_ot = "orgtree_4_6_0.xlsx"

# every column is read as text (dtype="str")
ot = pd.read_excel(
    "".join((path_cleaning, subpath_2_4, filein_ot)),
    dtype="str",
)
ot.shape

(1781, 9)

In [184]:
# Careers workbook (careers_4_0); every column is read as text (dtype="str")
filein_careers = "careers_4_0.xlsx"
careers = pd.read_excel(
    "".join((path_cleaning, subpath_2_6, filein_careers)),
    dtype="str",
)
careers.shape

(9001, 12)

# functions: format orgtree

In [185]:
# using this on (PI,OrgName) will ensure unique & non-null keys
# using this on a larger df will ensure unique rows and non-null keys, but not unique keys

def unique_non_null_rows(olddf):
    """Return the distinct, usable rows of ``olddf`` sorted by (PrimaryInstitution, OrgName).

    A row is kept when it is not an exact duplicate of an earlier row, is not
    entirely null, has a non-null PrimaryInstitution, and its PrimaryInstitution
    is not (case-insensitively) equal to one of the placeholder words
    "uncertain", "current", "deprecated" or "please_revise".

    The input frame is left untouched. The shapes before and after filtering
    are printed.

    Parameters
    ----------
    olddf : pandas.DataFrame
        Must contain the columns "PrimaryInstitution" and "OrgName".

    Returns
    -------
    pandas.DataFrame
        The filtered rows, sorted by ["PrimaryInstitution", "OrgName"]. The
        index is the one assigned right after de-duplication (it is not reset
        again after filtering or sorting).
    """
    placeholder_words_lower = ["uncertain","current","deprecated","please_revise"]

    # exact duplicates go first; the index is renumbered 0..n-1 at this point
    deduplicated = olddf.drop_duplicates(keep="first", ignore_index=True)

    # rows in which every column is null carry no information
    populated = deduplicated.dropna(how="all", axis=0)

    # a usable key needs a real PrimaryInstitution: not null and not a placeholder
    # word (whole-value match, not a substring match)
    institution = populated["PrimaryInstitution"]
    usable = institution.notna() & ~institution.str.lower().isin(placeholder_words_lower)
    kept = populated[usable]

    print("\nUnique Non-null Rows...")
    print("")
    print("\tNon-unique rows:",olddf.shape)
    print("\tUnique rows    :",kept.shape)

    return kept.sort_values(["PrimaryInstitution","OrgName"])

In [186]:
def verify_unique_rows(df):
    """Print a short uniqueness report for ``df``; nothing is returned or modified.

    Three shapes are printed:
      * rows that are exact duplicates of another row (all copies counted),
      * rows whose (PrimaryInstitution, OrgName) key occurs more than once
        (all copies counted),
      * rows in which both PrimaryInstitution and OrgName are null.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "PrimaryInstitution" and "OrgName".
    """
    key_columns = ["PrimaryInstitution","OrgName"]

    print("\nVerifying Unique Rows...")
    print("")

    same_row = df.duplicated(keep=False)
    print("\tDuplicate Rows:",df[same_row].shape)

    same_key = df.duplicated(key_columns, keep=False)
    print("\tDuplicate Keys:",df[same_key].shape)

    # "null row" here means both key columns are null; other columns are not inspected
    no_key = df["PrimaryInstitution"].isna() & df["OrgName"].isna()
    print("\tNull Rows     :",df[no_key].shape)

In [187]:
def merge_results(m):
    """Print the shape of a merged frame and of each ``_merge`` outcome within it.

    Parameters
    ----------
    m : pandas.DataFrame
        Result of a merge made with ``indicator=True`` (it must have a
        "_merge" column).
    """
    print("\nMerge Results...")
    print("")
    print("\tshape     :",m.shape)

    # one line per indicator value; labels are padded to 10 characters so the colons line up
    for outcome in ("left_only", "both", "right_only"):
        print(f"\t{outcome:<10}:",m[m["_merge"]==outcome].shape)

In [188]:
def concat_PI_OrgName_keys_to_orgtree(ot,keys,drop_merge_column=True):
    """Outer-merge a frame of (PrimaryInstitution, OrgName) keys into the org tree.

    Keys that are not yet in ``ot`` come out of the merge as new rows whose
    other columns are null. A report is printed at each stage (key clean-up,
    merge outcome, final uniqueness check).

    Parameters
    ----------
    ot : pandas.DataFrame
        Org tree; must contain "PrimaryInstitution" and "OrgName".
    keys : pandas.DataFrame
        Exactly two columns, interpreted positionally as
        (PrimaryInstitution, OrgName) whatever their current names are.
        The caller's frame is not modified.
    drop_merge_column : bool, default True
        Drop the "_merge" indicator column from the result. When False the
        indicator stays and takes part in the final duplicate-row removal.

    Returns
    -------
    pandas.DataFrame
        The merged org tree after ``unique_non_null_rows``. Exact duplicate
        rows are removed; duplicate keys can remain when ``ot`` holds several
        different rows for the same key.
    """
    key_columns = ["PrimaryInstitution","OrgName"]

    # prepare keys
    # Rename on a private copy: assigning to keys.columns directly would also
    # rename the columns of the frame the caller passed in.
    keys = keys.copy()
    keys.columns = key_columns
    keys = unique_non_null_rows(keys)

    # verify unique keys
    verify_unique_rows(keys)

    m = ot.merge(keys,on=key_columns,how="outer",indicator=True)
    merge_results(m)

    # drop "_merge" column
    if drop_merge_column:
        m = m.drop(columns=["_merge"])

    # remove duplicate rows
    m = unique_non_null_rows(m)

    # verify unique keys in orgtree
    verify_unique_rows(m)

    return m

In [189]:
# function that lists an OrgName followed by every upper-level OrgName above it

# hierarchy levels inside an OrgName are separated by "_"

def extract_uppperlevel_org(original_org):
    """Return ``original_org`` and all of its ancestors, most specific first.

    Each ancestor is the previous entry with its last "_"-separated level
    removed, e.g. "a_b_c" -> ["a_b_c", "a_b", "a"]. A name without "_" yields
    a one-element list.

    Parameters
    ----------
    original_org : str

    Returns
    -------
    list of str
    """
    orglist = [original_org]
    current = original_org

    # peel off the last level until no separator is left
    while "_" in current:
        current = current.rsplit("_", 1)[0]
        orglist.append(current)

    return orglist

def enumerate_org_hierarchy(df):
    """Expand every (PrimaryInstitution, OrgName) key into its full hierarchy.

    For each input row the output holds:
      * the key itself,
      * one key per upper-level org of OrgName (see ``extract_uppperlevel_org``),
      * the institution-only key (PrimaryInstitution, NaN).

    The expanded keys are passed through ``unique_non_null_rows`` and
    ``verify_unique_rows``, and the key counts before and after are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain "PrimaryInstitution" and "OrgName". It is not modified.

    Returns
    -------
    pandas.DataFrame
        Same columns as ``df``, unique keys, sorted by all columns, with a
        fresh 0..n-1 index.
    """
    pi_values = []
    org_values = []

    for raw_pi, raw_org in zip(df["PrimaryInstitution"], df["OrgName"]):

        # A missing PrimaryInstitution must stay missing. Stringifying it would
        # produce the text "nan", which the null-PI filter in
        # unique_non_null_rows does not recognise.
        pi = raw_pi if pd.isna(raw_pi) else str(raw_pi)

        # OrgName is missing when it is null or the text "nan" (the string form of NaN)
        org = str(raw_org)
        if pd.isna(raw_org) or org == "nan":
            orglist = [np.nan]
        else:
            orglist = extract_uppperlevel_org(org) + [np.nan]

        pi_values.extend([pi] * len(orglist))
        org_values.extend(orglist)

    # Build the frame once instead of concatenating a small frame per input row.
    newdf = pd.DataFrame(
        {"PrimaryInstitution": pi_values, "OrgName": org_values},
        dtype=object,
    ).reindex(columns=df.columns)

    # unique, non-null keys
    newdf = unique_non_null_rows(newdf)

    # verify unique keys in enumerated hierarchy
    verify_unique_rows(newdf)

    # stats
    print("\nEnumerated Hierarchy Stats...")
    print("")
    print("\tOriginal keys  :",df.shape)
    print("\tEnumerated keys:",newdf.shape)

    return newdf.sort_values(by=list(df.columns), ignore_index=True)

# 1. concat career (PI, Org) to orgtree

In [190]:
# distinct IsJob values; careers was read with dtype="str", so they are the strings 'False' / 'True'
careers.IsJob.unique()

array(['False', 'True'], dtype=object)

In [191]:
# (PrimaryInstitution, OrgName) of the career rows whose IsJob is the string "True"
key_columns = ["PrimaryInstitution","OrgName"]
career_keys = careers.loc[careers.IsJob=="True",key_columns]

In [192]:
# outer-merge the career keys into the org tree; keys missing from ot would come out as new rows
ot2 = concat_PI_OrgName_keys_to_orgtree(ot,career_keys,drop_merge_column=True)


Unique Non-null Rows...

	Non-unique rows: (6733, 2)
	Unique rows    : (1050, 2)

Verifying Unique Rows...

	Duplicate Rows: (0, 2)
	Duplicate Keys: (0, 2)
	Null Rows     : (0, 2)

Merge Results...

	shape     : (1781, 10)
	left_only : (709, 10)
	both      : (1072, 10)
	right_only: (0, 10)

Unique Non-null Rows...

	Non-unique rows: (1781, 9)
	Unique rows    : (1751, 9)

Verifying Unique Rows...

	Duplicate Rows: (0, 9)
	Duplicate Keys: (16, 9)
	Null Rows     : (0, 9)


In [194]:
# show every row of ot2 whose (PrimaryInstitution, OrgName) key occurs more than once (all copies, keep=False)
key_columns = ["PrimaryInstitution","OrgName"]
ot2[ot2.duplicated(key_columns,keep=False)]

Unnamed: 0,InsideGov,PrimaryInstitution,OrgName,ImmediateSuperiorOrg,SupervisesParent,LinkToNext_PI,LinkToNext_Org,LinkToNext_Year,Notes
131,0,국방과학원,,,,Current,Current,Current,
132,1,국방과학원,,,,Current,Current,Current,
182,0,김일성사회주의청년동맹,중앙위원회,,,사회주의애국청년동맹,중앙위원회,2021,
183,0,김일성사회주의청년동맹,중앙위원회,,,김일성김정일주의청년동맹,중앙위원회,1996,
185,0,김일성사회주의청년동맹,,,,사회주의애국청년동맹,,2021,
186,0,김일성사회주의청년동맹,,,,김일성김정일주의청년동맹,,2016,
187,0,김일성사회주의청년동맹,,,,김일성김정일주의청년동맹,,1996,
610,1,내각,,,,Current,Current,Current,
611,1,내각,,,,current,current,current,
740,1,노동당,,,,Current,Current,Current,


# 2. format orgtree

### Step 1. concat (PI-OrgName) from LinkToNext_

In [148]:
# candidate keys: the (LinkToNext_PI, LinkToNext_Org) pairs
# NOTE(review): taken from ot (the frame as loaded), not from ot2 — confirm this is intended
link_keys = ot[["LinkToNext_PI","LinkToNext_Org"]]

In [149]:
# the two columns are treated positionally as (PrimaryInstitution, OrgName) by the function
ot3 = concat_PI_OrgName_keys_to_orgtree(ot2,link_keys,drop_merge_column=True)


Unique Non-null Rows...

	Non-unique rows: (1781, 2)
	Unique rows    : (244, 2)

Verifying Unique Rows...

	Duplicate Rows: (0, 2)
	Duplicate Keys: (0, 2)
	Null Rows     : (0, 2)

Merge Results...

	shape     : (1781, 10)
	left_only : (1503, 10)
	both      : (278, 10)
	right_only: (0, 10)

Verifying Unique Rows...

	Duplicate Rows: (47, 9)
	Duplicate Keys: (58, 9)
	Null Rows     : (0, 9)


### Step 2. enumerate Org hierarchy from (PI, OrgName) and concat to orgtree

In [150]:
# key columns of the org tree as it stands after the concat steps above
key_columns = ["PrimaryInstitution","OrgName"]
ot_keys = ot3[key_columns]
# report duplicate rows / duplicate keys / null keys before enumerating the hierarchy
verify_unique_rows(ot_keys)


Verifying Unique Rows...

	Duplicate Rows: (58, 2)
	Duplicate Keys: (58, 2)
	Null Rows     : (0, 2)


In [151]:
# expand each key into itself, its upper-level orgs, and the institution-only (null OrgName) key
ot_keys_enumerated = enumerate_org_hierarchy(ot_keys)


Unique Non-null Rows...

	Non-unique rows: (4372, 2)
	Unique rows    : (1742, 2)

Verifying Unique Rows...

	Duplicate Rows: (0, 2)
	Duplicate Keys: (0, 2)
	Null Rows     : (0, 2)

Enumerated Hierarchy Stats...

	Original keys  : (1781, 2)
	Enumerated keys: (1742, 2)


In [152]:
# outer-merge the enumerated keys into the org tree; a key absent from ot3 would come out as a new row
ot4 = concat_PI_OrgName_keys_to_orgtree(ot3,ot_keys_enumerated,drop_merge_column=True)


Unique Non-null Rows...

	Non-unique rows: (1742, 2)
	Unique rows    : (1742, 2)

Verifying Unique Rows...

	Duplicate Rows: (0, 2)
	Duplicate Keys: (0, 2)
	Null Rows     : (0, 2)

Merge Results...

	shape     : (1781, 10)
	left_only : (0, 10)
	both      : (1781, 10)
	right_only: (0, 10)

Verifying Unique Rows...

	Duplicate Rows: (47, 9)
	Duplicate Keys: (58, 9)
	Null Rows     : (0, 9)


In [153]:
# confirm that PrimaryInstitution is not listed inside OrgName inappropriately
# (plain substring test on the string forms of both columns, evaluated row by row;
#  the rows shown are candidates to inspect by eye)
ot4[ot4.apply(lambda row: str(row.PrimaryInstitution) in str(row.OrgName),axis=1)]

Unnamed: 0,InsideGov,PrimaryInstitution,OrgName,ImmediateSuperiorOrg,SupervisesParent,LinkToNext_PI,LinkToNext_Org,LinkToNext_Year,Notes
306,1,내각,내각사무국,,,Current,Current,Current,
307,1,내각,내각사무국_호위처,내각사무국,,uncertain,uncertain,uncertain,
308,1,내각,내각정치국,,,Current,Current,Current,
688,1,노동당,당중앙위원회_선전선동부_조선노동당출판사,당중앙위원회_선전선동부,,Current,Current,Current,
689,1,노동당,당중앙위원회_선전선동부_조선노동당출판사_근로자사,당중앙위원회_선전선동부_조선노동당출판사,,Current,Current,Current,
904,0,사회주의노동청년동맹,평양시사회주의노동청년동맹,,,김일성사회주의청년동맹,uncertain,1996,


### Step 3. update ImmediateSuperiorOrg (TODO: wrap this step in a function)

In [154]:
# set ImmediateSuperiorOrg to NaN
# (whatever the workbook held in this column is discarded here, for every row)
ot4["ImmediateSuperiorOrg"]=np.nan

In [155]:
# update ImmediateSuperiorOrg for those OrgName that have one
# The superior org is OrgName with its last "_"-separated level removed; rows whose
# OrgName is null or has no "_" get NaN. The whole column is assigned in one
# vectorized step, which also works when no OrgName contains "_" and avoids writing
# strings into an existing all-NaN float column.
has_superior = ot4["OrgName"].str.contains("_",na=False)
ot4["ImmediateSuperiorOrg"] = ot4["OrgName"].str.rsplit("_",n=1).str[0].where(has_superior)

In [156]:
# verify ImmediateSuperiorOrg is correctly formatted
# (shows only the rows whose OrgName contains "_", for a visual check of the two columns side by side)
ot4[ot4.OrgName.str.contains("_",na=False)]

Unnamed: 0,InsideGov,PrimaryInstitution,OrgName,ImmediateSuperiorOrg,SupervisesParent,LinkToNext_PI,LinkToNext_Org,LinkToNext_Year,Notes
37,1,국무위원회,국가보위성_정치국,국가보위성,,Current,Current,Current,
39,1,국무위원회,국가체육지도위원회_대외사업국,국가체육지도위원회,,Current,Current,Current,
40,1,국무위원회,국가체육지도위원회_제1부,국가체육지도위원회,,Current,Current,Current,
41,1,국무위원회,국가체육지도위원회_체육기술연맹,국가체육지도위원회,,Current,Current,Current,
42,1,국무위원회,국가체육지도위원회_체육지도국,국가체육지도위원회,,Current,Current,Current,
...,...,...,...,...,...,...,...,...,...
1761,1,호위사령부,호위총국_제2국,호위총국,,Current,Current,Current,
1762,1,호위사령부,호위총국_제3호위부,호위총국,,Current,Current,Current,
1763,1,호위사령부,호위총국_제4국,호위총국,,Current,Current,Current,
1764,1,호위사령부,호위총국_제55처,호위총국,,Current,Current,Current,전시 지휘소


In [157]:
# confirm all ImmediateSuperiorOrg are contained in OrgName
# Build a one-column frame of the distinct ImmediateSuperiorOrg values, renamed to
# "OrgName" so it can be merged against the org tree's OrgName column.
iso = ot4[["ImmediateSuperiorOrg"]]
iso.columns = ["OrgName"]
iso = iso.drop_duplicates(keep="first")
iso.shape

(184, 1)

In [158]:
# the null value (from rows without a superior org) is one of the distinct entries
iso.head(2)

Unnamed: 0,OrgName
0,
37,국가보위성


In [159]:
# outer merge on OrgName only — PrimaryInstitution takes no part in the comparison
miso = ot4.merge(iso,on="OrgName",how="outer",indicator=True)
miso.shape

(1781, 10)

In [160]:
# confirms all ImmediateSuperiorOrg are also in OrgName
# (a right_only row would be a superior org that matches no OrgName in ot4)
miso[miso["_merge"]=="right_only"]

Unnamed: 0,InsideGov,PrimaryInstitution,OrgName,ImmediateSuperiorOrg,SupervisesParent,LinkToNext_PI,LinkToNext_Org,LinkToNext_Year,Notes,_merge


### Step 4. replace null vars with Please_Revise - write fxn for this!!

In [161]:
replace_var = "InsideGov"
ot4.loc[ot4[replace_var].isna(),replace_var]="Please_Revise"

In [162]:
replace_var = "LinkToNext_PI"
ot4.loc[ot4[replace_var].isna(),replace_var]="Please_Revise"

In [163]:
replace_var = "LinkToNext_Org"
ot4.loc[(ot4["LinkToNext_PI"]=="Please_Revise") & (ot4[replace_var].isna()),replace_var]="Please_Revise"

In [164]:
replace_var = "LinkToNext_Year"
ot4.loc[ot4[replace_var].isna(),replace_var]="Please_Revise"

In [165]:
# display the org tree after the Please_Revise fill
ot4

Unnamed: 0,InsideGov,PrimaryInstitution,OrgName,ImmediateSuperiorOrg,SupervisesParent,LinkToNext_PI,LinkToNext_Org,LinkToNext_Year,Notes
0,0,4.15문화창작단,,,,Current,Current,Current,
1,0,6.15공동선언실천남북공동위원회북측위원회,교직원분과,,,Current,Current,Current,
2,0,6.15공동선언실천남북공동위원회북측위원회,노동자분과,,,Current,Current,Current,
3,0,6.15공동선언실천남북공동위원회북측위원회,농업근로자분과,,,Current,Current,Current,
4,0,6.15공동선언실천남북공동위원회북측위원회,문학예술분과,,,Current,Current,Current,
...,...,...,...,...,...,...,...,...,...
1776,0,흥남가스화건설장함경남도지구계획위원회,,,,Uncertain,Uncertain,Uncertain,
1777,0,희천기계공장,분공장,,,Current,Current,Current,
1778,0,희천기계공장,,,,Current,Current,Current,
1779,0,희천정밀기계공장,분공장,,,Current,Current,Current,


### Step 5. export orgtree

In [166]:
# name of the workbook that was read in, shown for comparison with the output name below
filein_ot

'orgtree_4_6_0.xlsx'

In [167]:
# NOTE(review): identical to filein_ot, so the export below overwrites the workbook that was read in
fileout_ot = "orgtree_4_6_0.xlsx"

In [168]:
# write the org tree to the 2.4 orgtree folder, without the DataFrame index
ot4.to_excel(path_cleaning + subpath_2_4 + fileout_ot,index=False)

### Step 6. manually update the Please_Revise entries in the exported workbook, then re-run 2.5.3; repeat until no Please_Revise values remain (all counts below are 0)

In [169]:
# rows still flagged Please_Revise in InsideGov (case-insensitive match); 0 rows means nothing is left to revise
ot4[ot4["InsideGov"].str.lower()=="please_revise"].shape

(0, 9)

In [170]:
# same check for LinkToNext_PI
ot4[ot4["LinkToNext_PI"].str.lower()=="please_revise"].shape

(0, 9)

In [171]:
# same check for LinkToNext_Org
ot4[ot4["LinkToNext_Org"].str.lower()=="please_revise"].shape

(0, 9)

In [172]:
# same check for LinkToNext_Year
ot4[ot4["LinkToNext_Year"].str.lower()=="please_revise"].shape

(0, 9)

In [173]:
# verify no duplicate keys in ot
# (this only prints the counts; it does not stop the notebook when duplicates exist)
key_columns = ["PrimaryInstitution","OrgName"]
ot_keys = ot4[key_columns]
verify_unique_rows(ot_keys)


Verifying Unique Rows...

	Duplicate Rows: (58, 2)
	Duplicate Keys: (58, 2)
	Null Rows     : (0, 2)
