In [1]:
import pandas as pd
import numpy as np

In [2]:
pd.set_option('display.max_rows', None)

# Tables

### 1. LeaderCareerLink (LeaderID, CareerString, CareerDateString_2022)

matches LeaderID with (CareerString, CareerDateString_2022)

- LeaderID
- CareerString
- CareerDateString_2022

### 2. CareerOrgLink (CareerString, CareerDateString_2022, CareerSubstring)

matches (CareerString,CareerDateString_2022,CareerSubstring) with (InstitutionType,PrimaryInstitution,OrgName,Position)

- CareerString
- CareerDateString_2022
- IsJob
- MultipleSubstrings
- CareerStartYear
- CareerStartMonth
- CareerSubstring
- OrgString
- InstitutionType
- PrimaryInstitution
- OrgName
- Position
- Notes

### 3. Orgtree (InstitutionType, PrimaryInstitution, OrgName)*

contains variables relevant to all (PI, Org) including PI/Org Types, Positions, Org/Pos Ranks, PI/Org Links, Aliases

- InstitutionType
- OrgType
- PrimaryInstitution
- OrgName
- PI_Index
- OrgRank
- P1
- P2
- P3
- Alias_OrgName
- LinkToNext_PI
- LinkToNext_Org
- LinkToNext_Year
- Notes

In [3]:
path_tables = "C:/Users/seoul/Dropbox/00 technical/github/nkelites/data/combined data/combined data - 2 tables/"

In [4]:
# current tables
filename_careerorglink = "careerorglink.xlsx"
filename_leadercareerlink = "leadercareerlink.xlsx"
filename_orgtree = "orgtree.xlsx"

In [5]:
# deprecated tables
# filename_careers = "careers.xlsx"
# filename_leaderjoblink = "leaderjoblink.xlsx"
# filename_joborglink = "joborglink.xlsx"

# Data - initial orgtree + manually cleaned data in 2.7 orgtree position & rank

In [6]:
path_cleaning = "C:/Users/seoul/Dropbox/00 technical/github/nkelites/data/combined data/combined data - 1 cleaning/cleaning step 2 - career/"

In [7]:
subpath_2_1 = "2.1 career_undivided_unparsed_uncoded/"
subpath_2_2 = "2.2 career_divided_unparsed_uncoded/"
subpath_2_3 = "2.3 joborglink/"
subpath_2_4 = "2.4 orgtree/"
subpath_2_5 = "2.5 position/"
subpath_2_6 = "2.6 career_reassembled/"
subpath_2_7 = "2.7 orgtree position & rank/"

In [8]:
# 3.0 careerorglink_0.xlsx
# 3.0 careerorglink_1_Jacob.xlsx

# filename_careerorglink_old = "3.0 careerorglink_0.xlsx"
filename_careerorglink_old = "3.0 careerorglink_1_Jacob.xlsx"
col = pd.read_excel(path_cleaning + subpath_2_7 + filename_careerorglink_old,dtype="str")
col.shape

(9002, 12)

In [9]:
col[col["CareerString"].isna()]

Unnamed: 0,CareerString,CareerDateString_2022,IsJob,MultipleSubstrings,CareerStartYear,CareerStartMonth,CareerSubstring,InstitutionType,PrimaryInstitution,OrgName,Position,Notes


In [10]:
# orgtree (orgtree_position_rank)
# 3.0 orgtree_0 
# 3.0 orgtree_1_Jacob

# constructing new file from:
## 2022 북한_기관별_인명록_북한정보포털 게재용
## orgtree table

# filein_orgtree_old = "3.0 orgtree_0.xlsx"
filein_orgtree_old = "3.0 orgtree_1_Jacob.xlsx"
opr = pd.read_excel(path_cleaning + subpath_2_7 + filein_orgtree_old,dtype="str")
opr.shape

(2367, 19)

In [12]:
filename_mismatch_old = "3.0 mismatch_통합.xlsx"
# m = pd.read_excel(path_cleaning + subpath_2_7 + filename_mismatch_old,dtype="str")
# m.shape

In [13]:
# m.head(2)

In [14]:
# updated tables
print("col:\t",col.shape)
# print("m:\t",m.shape)
print("opr:\t",opr.shape)

col:	 (9002, 12)
opr:	 (2367, 19)


# Functions: format orgtree

In [15]:
# using this on (PI,OrgName) will ensure unique & non-null keys
# using this on a larger df will ensure unique rows and non-null keys, but not unique keys

def unique_non_null_rows(olddf):
    
    df = olddf.copy()
    
    
    ### drop duplicates
    df = df.drop_duplicates(keep="first",ignore_index=True)

    ### drop null rows
    df.dropna(how="all",axis=0,inplace=True)
    
    # drop rows with null PI
    df = df[~(df["PrimaryInstitution"].isna())]
        
    ### drop rows where PI contains stop words
    stop_words_lower = ["uncertain","current","deprecated","please_revise"]
    df = df[ ~ df["PrimaryInstitution"].str.lower().isin(stop_words_lower)]
            

    print("\nUnique Non-null Rows...")
    print("")
    print("\tNon-unique rows:",olddf.shape)
    print("\tUnique rows    :",df.shape)

    ### sort
    df = df.sort_values(["PrimaryInstitution","OrgName"])
    
    return df

In [16]:
def verify_unique_rows(df):
    
    key_columns = ["PrimaryInstitution","OrgName"]
    print("\nVerifying Unique Rows...")
    print("")
    print("\tDuplicate Rows:",df[df.duplicated(keep=False)].shape)
    print("\tDuplicate Keys:",df[df.duplicated(key_columns,keep=False)].shape)
    print("\tNull Rows     :",df[df["PrimaryInstitution"].isna() & df["OrgName"].isna()].shape)

In [17]:
def merge_results(m):
    
    print("\nMerge Results...")
    print("")
    print("\tshape     :",m.shape)
    print("\tleft_only :",m[m["_merge"]=="left_only"].shape)
    print("\tboth      :",m[m["_merge"]=="both"].shape)
    print("\tright_only:",m[m["_merge"]=="right_only"].shape)

In [18]:
def update_opr_index(opr,max_level):
    
    # create and clear out existing Level Indicies
    opr["OrgRank"]=np.nan
    opr["L1_Index"]=np.nan
    opr["L2_Index"]=np.nan
    opr["L3_Index"]=np.nan
    opr["L4_Index"]=np.nan
    opr["L5_Index"]=np.nan
    
    # pad PI_Index with missing 0s up to the deepest level (4 or 5?)
    opr["PI_Index"] = opr.apply(lambda x: str(x["PI_Index"]) + ((max_level-1)-str(x["PI_Index"]).count(".")) * ".0",axis=1)
    
    # calculate OrgRank
    opr["OrgRank"] = opr.apply(lambda x: (max_level-1) - str(x["PI_Index"]).count(".0"),axis=1)
    
    # populate Level Indices using PI_Index
    opr.loc[opr["PI_Index"].notna(),"L1_Index"] = opr.loc[opr["PI_Index"].notna()].apply(lambda x: str(x["PI_Index"]).split(".")[0],axis=1)
    opr.loc[opr["PI_Index"].notna(),"L2_Index"] = opr.loc[opr["PI_Index"].notna()].apply(lambda x: str(x["PI_Index"]).split(".")[1],axis=1)
    opr.loc[opr["PI_Index"].notna(),"L3_Index"] = opr.loc[opr["PI_Index"].notna()].apply(lambda x: str(x["PI_Index"]).split(".")[2],axis=1)
    opr.loc[opr["PI_Index"].notna(),"L4_Index"] = opr.loc[opr["PI_Index"].notna()].apply(lambda x: str(x["PI_Index"]).split(".")[3],axis=1)
    opr.loc[opr["PI_Index"].notna(),"L5_Index"] = opr.loc[opr["PI_Index"].notna()].apply(lambda x: str(x["PI_Index"]).split(".")[4],axis=1)

    # sort by Level1, Level2, Level3
    sort_columns = ["L1_Index","L2_Index","L3_Index","L4_Index","L5_Index"]
    #opr.L1_Index.astype(int,errors="ignore")
    #opr.L2_Index.astype(int,errors="ignore")
    #opr.L3_Index.astype(int,errors="ignore")
    opr.sort_values(sort_columns,inplace=True)
    
    return opr

In [19]:
# this function is called by: clean_positions(opr_old):

# written to operate on a dataframe row. 
# use in df.apply()

def compare_position_ranks_vectorized(row):
    
    print_row_flag = False
    
    p1 = row["P1"].split(",")
    p2 = row["P2"].split(",")
    p3 = row["P3"].split(",")
    
    pall = (p1+p2+p3)
    punique = list(set(pall))
    
    pall = [e for e in pall if e!="nan"]
    punique = [e for e in punique if e!="nan"]
    
    # print(len(pall),len(punique))

    if len(pall)>len(punique):
        print(row["PrimaryInstitution"],row["OrgName"])
        print(p1,p2,p3,pall)
        print("")
        print_row_flag = True
        
    return print_row_flag    


In [20]:
# opr = orgtree

def clean_positions(opr_old):
    
    opr_new = opr_old.copy(deep=True)
    
    # remove duplicates within each P1-P3 
    
    opr_new["P1"] = opr_new.apply(lambda x: ",".join(list(set([item.strip() for item in str(x["P1"]).split(",")]))),axis=1)    
    opr_new["P2"] = opr_new.apply(lambda x: ",".join(list(set([item.strip() for item in str(x["P2"]).split(",")]))),axis=1)
    opr_new["P3"] = opr_new.apply(lambda x: ",".join(list(set([item.strip() for item in str(x["P3"]).split(",")]))),axis=1)
    
    print("Duplicates have been removed within each orgtree.P1-P3\n\n")
    
    
    # identify duplicates across each P1-P3 
    
    # calls the function above: def compare_position_ranks_vectorized(row):

    num_of_duplicates = sum(opr_new.apply(compare_position_ranks_vectorized,axis=1))
    
    if num_of_duplicates>0:
        
        print("\n",num_of_duplicates,"duplicates have been identified across orgtree.P1-P3")
        
    else:
        
        print("\nNo duplicates have been identified across orgtree.P1-P3")    
    
    
    return opr_new    

In [21]:
# function identifies which careerorglink.Position are not contained in orgtree.P1-P3

def identify_positions_notin_orgtree(row):
    
    return_row_flag = False
    
    pos = row["Position"]
    
    pall = []
    
    if row["P1"] is not np.nan: p1 = row["P1"].split(","); pall+=p1
    if row["P2"] is not np.nan: p2 = row["P2"].split(","); pall+=p2
    if row["P3"] is not np.nan: p3 = row["P3"].split(","); pall+=p3

    if pos not in pall:
        # print(pos,pall)
        return_row_flag = True
        
    return return_row_flag

In [22]:
### function: adds Positions to orgtree.P1-P3 using a hand-edited table, addPOS (3.0 careerorglink_1_Jacob_resolve_JR.xlsx)

# we can't resolve these Positions with a simultaneous merge because each PI,OrgName may have multiple Positions
# and each Position would be added into a separate row of P1-P3
# instead, we have to resolve each addPOS row, one at a time

def add_Positions_to_orgtree(addPOS,opr_in):

    opr2 = opr_in.copy(deep=True)

    addPOSrange = range(0,addPOS.shape[0])
    addPOSrange

    for i in addPOSrange:

        # print(addPOS.iloc[i,2])
        rowPI = addPOS.iloc[i]["PrimaryInstitution"]
        rowOrg = addPOS.iloc[i]["OrgName"]
        rowPos = addPOS.iloc[i]["Position"]
        rowRes = addPOS.iloc[i]["Resolution"]

        rowRank = "P1"
        if rowRes == "2. Add Position to P2": rowRank="P2"
        if rowRes == "3. Add Position to P3": rowRank="P3"

        # print(rowPI,rowOrg is np.nan)

        if rowOrg is np.nan:
            row_condition = (opr2["PrimaryInstitution"]==rowPI) & (opr2["OrgName"].isnull())
        else:
            row_condition = (opr2["PrimaryInstitution"]==rowPI) & (opr2["OrgName"]==rowOrg)


        oprRankValue = list(opr2.loc[row_condition,rowRank].values)

        oprRankValueNull = "nan" in oprRankValue

        rowPosNotInOprRankValue = rowPos not in oprRankValue[0]

        newPos = ""

        if rowPosNotInOprRankValue:

            if oprRankValueNull:
                newPos = rowPos

            else:
                newPos = oprRankValue[0] + "," + rowPos
                # newPos = newPos.append(rowPos)

            # before resolving
            print(opr2.loc[row_condition,["PrimaryInstitution","OrgName",rowRank]])

            opr2.loc[row_condition,rowRank] = newPos

            # after resolving
            print(opr2.loc[row_condition,["PrimaryInstitution","OrgName",rowRank]])

        else:

            print("rowPos is already contained in the OprRankValue.")
            
            
    return opr2

In [23]:
# opr = update_opr_index(opr,5)

In [24]:
# export opr

# fileout_opr = "1.0 orgtree_position_rank.xlsx"
# opr.to_excel(path_cleaning + subpath_2_7 + fileout_opr,index=False)

# Recall from earlier cleaning

#### Three Cases of  OrgName in NK elite career data

- Case 0: OrgName in 기관별인명록: code these from 1-199
- Case 1: OrgName not in 기관별인명록, but contained in data AND LinkToNext_Year not current: code as 500+
- Case 2: OrgName not in 기관별인명록, but contained in data AND LinkToNext_Year is current: code as 200+

#### Case 0: OrgName in 기관별인명록

- Update 1.0 orgtree_position_rank with OrgName in 기관별인명록
- Code these from 1-199
- Run above routine # 1. orgtree_position_rank: update all besides 노동당, 내각, 정무원
    - to update opr & pr_else3
- Proceed to Cases 1 & 2

#### Resolve. Validation values for resolving null Position_3P

1. OrgName & Position ok. Add Position to P1
2. OrgName & Position ok. Add Position to P2
3. OrgName & Position ok. Add Position to P3
4. Change_OrgName and/or Change_Position
5. Uncertain OrgName
6. Uncertain Position
7. NotJob
8. Multiple CareerSubstring
9. Other - see Notes

# Task 1. Prep & Merge Mismatch

- NotJob
    - careers: change IsJob from True to False
    - m: remove rows 
    - opr: NA
  
- MultipleSubstrings
    - careers: expand rows
    - m: expand rows and code
    - opr: NA
    
- Merge mismatch files

# Task 2. Edit & Validate PI, OrgName, Type, Index

#### Step 0. Uncertain OrgName & Position
    - careerorglink: alter OrgName, Position to Uncertain
    - mismatches: remove rows containing uncertain OrgName & Position 
    - opr: NA
    
#### handle mismatches (Changes, InstitutionType)
- Step 1. using mismatches (Change_PI, Change_Org, Change_Pos), edit careerorglink & mismatches (PI, OrgName, Position) 
- Step 2. for all mismatches (InstitutionType), edit careerorglink & orgtree (InstitutionType)
- Step 3. for mismatches (IType, PI, OrgName) not in orgtree, add to orgtree with PI_Index 801

####  handle orgtree (Changes, InstitutionType)
- Step 4. using orgtree (Change_ x 3), edit careerorglink, mismatches & orgtree (InstitutionType, PI, OrgName)
- Step 5. using orgtree, update InstitutionType in col, for those jobs not in mismatch

#### handle mismatches (Positions)
- Step 6. using mismatches Position, update orgtree P1, P2, P3

# Task 3. Validate careerorglink & orgtree

#### Validated Primary Keys
- Verify unique primary keys for:
    - careerorglink (CareerString,CareerDateString,CareerSubstring)
    - orgtree (PI, OrgName)
- careerorglink (PI, OrgName) contained in orgtree

#### Coded Empty Values
- Identify null InstitutionType, Position

#### Validated Positions
- validate orgtree Positions
    - remove duplicates within each P1-P3
    - resolve duplicates across P1-P3
- Verify all careerorglink.Position are contained in orgtree.P1-P3

### Step. Verify unique primary keys

#### careerorglink primary key is unique

In [25]:
col_key_columns = ["CareerString","CareerDateString_2022","CareerSubstring"]

In [26]:
col_key_condition = col.IsJob==True & col.duplicated(col_key_columns,keep=False)

In [27]:
col[col_key_condition].shape

(0, 12)

#### orgtree primary key is unique

In [28]:
opr_key_columns = ["PrimaryInstitution","OrgName"]

In [29]:
opr_key_condition = opr.duplicated(opr_key_columns,keep=False)

In [30]:
opr[opr_key_condition].sort_values(opr_key_columns).shape

(0, 19)

### Step. careerorglinks (PI, OrgName) have a unique match in orgtree

In [31]:
col_opr_key_condition = (col.IsJob=="True") & (col.OrgName!="UNCERTAIN")

In [32]:
col_opr_keys_columns = opr_key_columns + ["Position"]

In [33]:
col_opr_keys = col.loc[col_opr_key_condition,col_opr_keys_columns]
col_opr_keys.shape

(6549, 3)

In [34]:
col_opr_keys = unique_non_null_rows(col_opr_keys)


Unique Non-null Rows...

	Non-unique rows: (6549, 3)
	Unique rows    : (1761, 3)


In [35]:
comerge = col_opr_keys.merge(opr,on=opr_key_columns,how="outer",indicator=True)
merge_results(comerge)


Merge Results...

	shape     : (3057, 21)
	left_only : (0, 21)
	both      : (1761, 21)
	right_only: (1296, 21)


In [36]:
# left-only indicate careerorglink (PI,OrgName) with no match in orgtree
comerge[comerge["_merge"]=="left_only"].shape

(0, 21)

### Step. Fill in null variables: PI, InstitutionType, Position

#### PI in careerorglink

In [37]:
# PI is null
col[(col.IsJob=="True") & (col.PrimaryInstitution.isnull())].shape

(0, 12)

In [38]:
# PI is UNCERTAIN
col[(col.IsJob=="True") & (col.OrgName=="UNCERTAIN")].shape

(182, 12)

#### InstitutionType in careerorglink

In [39]:
# InstitutionType is null
col[(col.IsJob=="True") & (col.InstitutionType.isnull())].shape

(0, 12)

In [40]:
# InstitutionType is UNCERTAIN
col[(col.IsJob=="True") & (col.OrgName=="UNCERTAIN")].shape

(182, 12)

#### Position in careerorglink

In [41]:
# Position is null
col[(col.IsJob=="True") & (col.Position.isnull())].shape

(0, 12)

In [42]:
# Position is UNCERTAIN 
col[(col.IsJob=="True") & (col.OrgName=="UNCERTAIN")].shape

(182, 12)

#### InstitutionType in orgtree

In [43]:
# InstitutionType is null
opr[(opr.InstitutionType.isnull())].shape

(0, 19)

In [44]:
# InstitutionType is UNCERTAIN
opr[(opr.InstitutionType.str.upper()=="UNCERTAIN")].shape

(39, 19)

### Step. Mismatched InstitutionTypes

#### compare mismatched InstitutionTypes

In [45]:
opr_key_columns = ["PrimaryInstitution","OrgName"]

In [46]:
col_columns = opr_key_columns + ["InstitutionType"]
opr_columns = col_columns

In [47]:
compareIT = col[col_columns].merge(opr[opr_columns],on=opr_key_columns,how="inner",suffixes=("_col","_opr"),indicator=True)
merge_results(compareIT)


Merge Results...

	shape     : (6657, 5)
	left_only : (0, 5)
	both      : (6657, 5)
	right_only: (0, 5)


In [48]:
compareIT[compareIT.InstitutionType_col != compareIT.InstitutionType_opr].shape

(0, 5)

### Step. clean orgtree P1, P2, P3

#### remove duplicates within & across P1, P2, P3

In [49]:
opr = clean_positions(opr)

Duplicates have been removed within each orgtree.P1-P3



No duplicates have been identified across orgtree.P1-P3


### Step. Verify all careerorglink.Position contained in orgtree.P1-P3

In [50]:
opr_key_columns = ["PrimaryInstitution","OrgName"]

In [51]:
col_columns = opr_key_columns + ["IsJob","Position"]

In [52]:
opr_columns = opr_key_columns + ["P1","P2","P3"]

#### format careerorglink

In [53]:
# only select jobs with certain PI, Position
condition1 = (col.IsJob=="True") & (col.PrimaryInstitution!="UNCERTAIN") & (col.Position!="UNCERTAIN")

In [54]:
col_merge = col.loc[condition1,col_columns]
col_merge.shape

(6360, 4)

In [55]:
col_merge.drop_duplicates(inplace=True)
col_merge.shape

(1690, 4)

#### merge

In [56]:
comparePOS = col_merge.merge(opr[opr_columns],on=opr_key_columns,how="inner",indicator=True)
merge_results(comparePOS)


Merge Results...

	shape     : (1682, 8)
	left_only : (0, 8)
	both      : (1682, 8)
	right_only: (0, 8)


In [57]:
comparePOS.head(2)

Unnamed: 0,PrimaryInstitution,OrgName,IsJob,Position,P1,P2,P3,_merge
0,4.15문화창작단,,True,단장,단장,부단장,,both
1,4.15문화창작단,,True,부단장,단장,부단장,,both


#### identify careerorglink Positions not in orgtree P1-P3

In [58]:
comparePOS_edit = comparePOS[comparePOS.apply(identify_positions_notin_orgtree,axis=1)]
comparePOS_edit.shape

(0, 8)

In [59]:
# set condition if Positions are not in orgtree
positions_notin_orgtree = False
if comparePOS_edit.shape[0] > 0:
    positions_notin_orgtree = True
positions_notin_orgtree

False

#### if comparePOS_edit is not null, then run the following procedures 

In [60]:
# input handcoded table

if positions_notin_orgtree:

    filename_handcoded = "3.0 careerorglink_1_Jacob_resolve_JR.xlsx"
    comparePOS_resolved = pd.read_excel(path_cleaning + subpath_2_7 + filename_handcoded,dtype="str")

    print(comparePOS_resolved.shape)
    
else:
    print("All good! All careerorglink.Positions have already been incorporated into ortgree.")

All good! All careerorglink.Positions have already been incorporated into ortgree.


In [61]:
# identify those rows

if positions_notin_orgtree:

    notresolved_key_columns = ['PrimaryInstitution', 'OrgName', 'Position']
    ## which have been handcoded, but not incorporated
    ## which have not been handcoded and require resolution and incorporation
    notresolved = comparePOS_resolved.merge(comparePOS_edit[notresolved_key_columns],on=notresolved_key_columns,how="outer",indicator=True)
    merge_results(notresolved)
    
else:
    print("All good! All careerorglink.Positions have already been incorporated into ortgree.")

All good! All careerorglink.Positions have already been incorporated into ortgree.


In [62]:
# left-only - have been resolved and incorporated - ignore these
# both - have been hand-coded, but not incorporated - subset these and write a function to incorporate them
# right-only - have not been handcoded - I can resolve these manually without handcoding

#### right-only - have not been handcoded - resolve manually without handcoding

In [63]:
if positions_notin_orgtree:

    resolvemanuallyPOS = notresolved[notresolved["_merge"]=="right_only"]
    print(resolvemanuallyPOS.shape)
    
else:
    print("All good! All careerorglink.Positions have already been incorporated into ortgree.")

All good! All careerorglink.Positions have already been incorporated into ortgree.


#### both - have been hand-coded, but not incorporated - subset these and write a function to incorporate them

In [64]:
if positions_notin_orgtree:

    incorporatePOS = notresolved[notresolved["_merge"]=="both"]
    print(incorporatePOS.shape)
    
else:
    print("All good! All careerorglink.Positions have already been incorporated into ortgree.")

All good! All careerorglink.Positions have already been incorporated into ortgree.


#### both & UNCERTAIN: change careerorglink.Position to UNCERTAIN

In [65]:
if positions_notin_orgtree:

    uncertainPOS = incorporatePOS[incorporatePOS["Resolution"]=="4. OrgName is UNCERTAIN"]
    print(uncertainPOS.shape)
    
    if uncertainPOS.shape[0] > 0:
        uncertain_key_columns = ['PrimaryInstitution', 'OrgName', 'Position']
        col2 = col.merge(uncertainPOS[uncertain_key_columns],on=uncertain_key_columns,how="outer",indicator=True)
        col2.loc[col2["_merge"]=="both","Position"]="UNCERTAIN"
        
        # if satsified that col2 has been successfully edited
        col = col2[col.columns]
        
        print("All UNCERTAIN careerorglink.Positions have been incorporated into ortgree.")
        
    else:
        print("All good! All UNCERTAIN careerorglink.Positions have already been incorporated into ortgree.")
    
else:
    print("All good! All careerorglink.Positions have already been incorporated into ortgree.")

All good! All careerorglink.Positions have already been incorporated into ortgree.


#### both & ADD P1,P2,P3: add careerorglink.Position to orgtree.P1-P3

In [66]:
if positions_notin_orgtree:

    addPOS = incorporatePOS[incorporatePOS["Resolution"]!="4. OrgName is UNCERTAIN"]
    print(addPOS.shape)
    
    if addPOS.shape[0] > 0:
        
        # run once to add Positions to opr2
        # run a second time to confirm all positions have been added
        opr2 = add_Positions_to_orgtree(addPOS,opr)
        
        # if satsified that opr2 has been successfully edited
        opr = opr2
        
        print("All P1,P2,P3 careerorglink.Positions have been incorporated into ortgree.P1-P3.")


    else:
        print("Already done. All P1,P2,P3 careerorglink.Positions have already been incorporated into ortgree.P1-P3.")
    
else:
    print("Already done. All careerorglink.Positions have already been incorporated into ortgree.")

Already done. All careerorglink.Positions have already been incorporated into ortgree.


# Format & Export Tables

In [95]:
export_tables = False
export_cleaning = False

In [82]:
# current versions of tables for export
col_export = col
opr_export = opr
# m_export = m

In [83]:
# export filenames
filename_careerorglink = "careerorglink.xlsx"
filename_orgtree = "orgtree.xlsx"

# cyclically revise and overwrite the same files
filename_careerorglink_new = "3.0 careerorglink_1_Jacob.xlsx"
# filename_mismatch_new = "3.0 mismatch_통합.xlsx"
filename_orgtree_new = "3.0 orgtree_1_Jacob.xlsx"

#### format careerorglink

In [84]:
careerorglink_columns = ['CareerString', 'CareerDateString_2022',
                         'IsJob', 'MultipleSubstrings','CareerStartYear', 'CareerStartMonth',
                         'CareerSubstring','InstitutionType', 'PrimaryInstitution', 'OrgName', 'Position', 'Notes']

In [85]:
# check that col has all necessary columns
[item for item in careerorglink_columns if item not in col_export.columns]

[]

In [86]:
# remove these items
[item for item in col_export.columns if item not in careerorglink_columns]

[]

In [87]:
col_export = col_export[careerorglink_columns]

#### format orgtree

In [88]:
orgtree_columns = ['InstitutionType', 'OrgType', 'PrimaryInstitution','OrgName',
                   'PI_Index', 'OrgRank', 'P1', 'P2', 'P3','Alias_OrgName',
                   'LinkToNext_PI','LinkToNext_Org','LinkToNext_Year','Notes',
                   'L1_Index', 'L2_Index','L3_Index', 'L4_Index', 'L5_Index']

In [89]:
# check that orgtree has all necessary columns
[item for item in orgtree_columns if item not in opr_export.columns]

[]

In [90]:
# remove these items
[item for item in opr_export.columns if item not in orgtree_columns]

[]

In [91]:
opr_export = opr_export[orgtree_columns]

In [92]:
# update indices & ranks
opr_export = update_opr_index(opr_export,6)

#### export to cleaning

In [93]:
if export_cleaning:
    col_export.to_excel(path_cleaning + subpath_2_7 + filename_careerorglink_new,index=False)
    opr_export.to_excel(path_cleaning + subpath_2_7 + filename_orgtree_new,index=False)
    # m_export.to_excel(path_cleaning + subpath_2_7 + filename_mismatch_new,index=False)

#### export to tables

In [94]:
if export_tables:
    col_export.to_excel(path_tables + filename_careerorglink,index=False)
    opr_export.to_excel(path_tables + filename_orgtree,index=False)

# Recode & Recategorize some (PI,OrgName)



#### Some More Validation for Later: PI, OrgName
- For 200 & 500 series PrimaryInstitutions, search whether they are contained in OrgName, within a PrimaryInstitution
- Orgs in Orgtree but not in Careers or 기관별인명록 (200, 500 series): delete these from orgtree

#### Do Later
- 중앙위원회
- 인민군, 총참모부, 인민무력부, 
- 국방위원회: 인민군?
- 중앙인민위원회: 정권기관
- 국제친선기관: I will add and match existing ones
- 인민회의 or 인민위원회?
- 노동당 총정치국 --> 인민군 총정치국
- add 북조선노동당 as an alias to 노동당
- alias the PrimaryInstitutions, 내각, 내각A, 내각B, 등 at least when calculating whether transitions are between or across institutions
- 노동당 - fix the old party organization
    - https://encykorea.aks.ac.kr/Article/E0070188
    - 1947년 북조선로동당 ‘5과’로 창설되었다. 6·25전쟁기 남한지역 내 게릴라부대의 지하당 공작 과정에서 대남 공작기구를 확대하면서 당 조직부 내의 연락부를 분리했다. 1961년 4차 당대회에서 남한에서의 지하당 조직 확대와 통일전선 강화 등의 결정과 함께 내무성 등의 대남 공작기구를 통합해 중앙당 ‘연락국’을 신설했다. 1964년 2월 조선로동당 중앙위원회 4기 8차 전원회의에서 ‘3대 혁명력량 강화’노선을 채택하면서 종래의 연락국을 ‘대남사업총국’으로 개칭했다.
    - 1966년 10월 12일 당 중앙위원회 4기 14차 전원회의에서 비서국이 신설되면서 대남사업담당비서 산하에 ‘연락부’, ‘문화부’, ‘조사부’, ‘인민무력부정착국’, ‘조총련’부서를 두었으며, 1978년 1월 통일전선부가, 1990년대 후반 35호실(대외정보조사부)과 작전부가 신설됨으로써 대남 담당 부서의 체계가 구축되었다. 남한 내 정당, 사회단체, 군부에 대한 공작 거점 및 공작 전술 연구개발 업무를 수행하고 있다. 특히 간첩교육과 파견을 직접 담당하고 있으며, 조총련에 대한 활동 지도도 담당하고 있다.

- 내각 & 정무원
    - How different was their organization?
    - https://encykorea.aks.ac.kr/Article/E0066727 - Let's alias. It seems like the name simply changed from 내각 (1948) --> 정무원 (1972) --> 내각 (1998)
    - Do we need to duplicate the entire 내각 structure for 정무원?
    - At the very least, should we swap out 내각 for 정무원 in cases where data are labelled as 내각 but during 정무원 periods (pre-1998)?
    - If the internal organization was quite similar, then maybe we could just alias 내각 and 정무원 and not worry about replicating the organization three
- 인도기관?
    - 국방위원회
    - 국무위원회
- resolve differences between InstitutionType, PrimaryInstitution, OrgName
    - for PI=총참모부, rename PI=인민군, OrgName = 총참모부_; and remove duplicates. e.g., 총참모부_작전총국
    - 노동당, 당중앙위원회
    - many organizations for whom 중앙위원회 is their main leadership body
    - 인민군. remove as PrimaryInstitution, and use as InstitutionType instead
    - https://encykorea.aks.ac.kr/Article/E0066687
    - 인민무력부, 인민무력성, 
    - PrimaryInstitutions
        - 최고사령부
        - 총참모부
        - 주요 사령부
        - 등
- Make sure X.0.0.0 positions are not being duplicated with X.1.0.0 positions, especially in the case of 중앙위원회
- Discuss X in PI_Index
    - 비서국A --> 정무국 --> 비서국B
        - check for duplicates: 당 비서국
    - 당중앙위원회_정치위원회 (폐지) 직급
    - 당중앙위원회_총정치국 - 당/인민군
    - 당중앙위원회_조선혁명박물관당위원회 & other 1.1.X.0.0 - double-check if they are included in 외곽단 - InGov?
    - Change_OrgName to 당중앙위원회_X
        - 비서국_인민무력부B
        - 비서국_인민무력부B_보위국
        - 비서국_인민무력부B_작전국
        - 비서국_인민무력부B_정치안전국

# Re-Index
- See PI-Index Coding Rules in Evernote
- Update X codes with 200 or 500. Delete any 200/500 which don't appear in our data. (They might have been orgs we corrected in the data.)
- 최고인민회의 X지역인민회의 - move from 200 Series to 0 Series
- Drop X지역위원회 and recode 200 Series to 0 Series
- Change coding of 당외곽 및 사회단체
    - if the orgs were found in the 별책, then change their codes from 200 series to 100 series
    - Match, rename and recode (from 500+ to 100+) 500 series 국제친선기관
- code data source rather than index

# Task 3. Add OrgType & other metadata

- Add OrgType


# Future Tasks

- Alias all 500 Series and concurrent Aliases
    - Include all names in orgtree
    - Alias names in both directions, forwards and backwards
    - use PI_Index to indicate a current or deprecated org.
    - I could use PI_Index to indicate what happened to the deprecated org: combined, renamed, deprecated, split. The problem with this is names which have been used cyclically, such as 내각.
- Future Integrity Checks & Data Cleaning
    - Identify inconsistencies in Position Rankings (e.g., 명예위원장)
    - Verify no contrary categories of positions overlapping (e.g., 위원장, 소장)
    - Verify (PI, OrgName) not found in 기관별인명록
    - Research X entries: e.g., 3.525.591: 내각_외무성_X국_대사관
    - Reconcile 기관 across three categories: 기타기관, 당외곽및사회단체, 당외곽및사회단체(별책)
    - Add more orgs
        - Expand any X지역위원회 into full list
        - 내각 - Level 2 or Level 3?
            - orgs after 내각 국토환경보호성_산림총국
            - orgs after 내각 대외경제성_민족경제협력위원회
            - orgs after 내각 보건성_중앙의약품관리소
            - orgs after 내 각 상업성_중앙도매소
            - 내각 정보산업성 우편국, 전화국, 체신관리국, 체신소, 
            - 내각 (45)
        - 당외곽및사회단체 (별책)
    - Discussion about matching levels
        - https://namu.wiki/w/ - consider the OrgRang of 김정은의 겸직
		- 조선민주주의인민공화국 국무위원회 위원장[국가원수]
		- 조선로동당 중앙군사위원장
		- 조선민주주의인민공화국무력 최고사령관
		- 조선로동당 중앙위원회 정치국 상무위원
    - Review UNCERTAIN OrgName & Positions
        - some of these seem resolvable