In [1]:
import pandas as pd
import numpy as np

In [2]:
pd.set_option('display.max_rows', None)

# Tables

### 1. LeaderCareerLink (LeaderID, CareerString, CareerDateString_2022)

matches LeaderID with (CareerString, CareerDateString_2022)

- LeaderID
- CareerString
- CareerDateString_2022

### 2. CareerOrgLink (CareerString, CareerDateString_2022, CareerSubstring)

matches (CareerString,CareerDateString_2022,CareerSubstring) with (InstitutionType,PrimaryInstitution,OrgName,Position)

- CareerString
- CareerDateString_2022
- IsJob
- MultipleSubstrings
- CareerStartYear
- CareerStartMonth
- CareerSubstring
- OrgString
- InstitutionType
- PrimaryInstitution
- OrgName
- Position
- Notes

### 3. Orgtree (InstitutionType, PrimaryInstitution, OrgName)*

contains variables relevant to all (PI, Org) including PI/Org Types, Positions, Org/Pos Ranks, PI/Org Links, Aliases

- InstitutionType
- OrgType
- PrimaryInstitution
- OrgName
- PI_Index
- OrgRank
- P1
- P2
- P3
- Alias_OrgName
- LinkToNext_PI
- LinkToNext_Org
- LinkToNext_Year
- Notes

In [3]:
path_tables = "C:/Users/seoul/Dropbox/00 technical/github/nkelites/data/combined data/combined data - 2 tables/"

In [4]:
# current tables
filename_careerorglink = "careerorglink.xlsx"
filename_leadercareerlink = "leadercareerlink.xlsx"
filename_orgtree = "orgtree.xlsx"

In [5]:
# deprecated tables
# filename_careers = "careers.xlsx"
# filename_leaderjoblink = "leaderjoblink.xlsx"
# filename_joborglink = "joborglink.xlsx"

# Data - initial orgtree + manually cleaned data in 2.7 orgtree position & rank

In [6]:
path_cleaning = "C:/Users/seoul/Dropbox/00 technical/github/nkelites/data/combined data/combined data - 1 cleaning/cleaning step 2 - career/"

In [7]:
subpath_2_1 = "2.1 career_undivided_unparsed_uncoded/"
subpath_2_2 = "2.2 career_divided_unparsed_uncoded/"
subpath_2_3 = "2.3 joborglink/"
subpath_2_4 = "2.4 orgtree/"
subpath_2_5 = "2.5 position/"
subpath_2_6 = "2.6 career_reassembled/"
subpath_2_7 = "2.7 orgtree position & rank/"

In [8]:
# 3.0 careerorglink_0.xlsx
# 3.0 careerorglink_1_Jacob.xlsx

# filename_careerorglink_old = "3.0 careerorglink_0.xlsx"
filename_careerorglink_old = "3.0 careerorglink_1_Jacob.xlsx"
col = pd.read_excel(path_cleaning + subpath_2_7 + filename_careerorglink_old,dtype="str")
col.shape

(9002, 12)

In [9]:
col[col["CareerString"].isna()]

Unnamed: 0,CareerString,CareerDateString_2022,IsJob,MultipleSubstrings,CareerStartYear,CareerStartMonth,CareerSubstring,InstitutionType,PrimaryInstitution,OrgName,Position,Notes


In [10]:
# orgtree (orgtree_position_rank)
# 3.0 orgtree_0 
# 3.0 orgtree_1_Jacob

# constructing new file from:
## 2022 북한_기관별_인명록_북한정보포털 게재용
## orgtree table

# filein_orgtree_old = "3.0 orgtree_0.xlsx"
filein_orgtree_old = "3.0 orgtree_1_Jacob.xlsx"
opr = pd.read_excel(path_cleaning + subpath_2_7 + filein_orgtree_old,dtype="str")
opr.shape

(2364, 19)

In [11]:
opr.head(2)

Unnamed: 0,InstitutionType,OrgType,PrimaryInstitution,OrgName,PI_Index,OrgRank,P1,P2,P3,Alias_OrgName,LinkToNext_PI,LinkToNext_Org,LinkToNext_Year,Notes,L1_Index,L2_Index,L3_Index,L4_Index,L5_Index
0,노동당,,노동당,,1.0.0.0.0.0,0,총비서,,,,Current,Current,Current,,1,0,0,0,0
1,당외곽및사회단체_경제부문(별책),,신포원양수산연합기업소,,1.0.0.0.0.0,0,"지배인,당책임비서","당비서,기사장",,,,,,,1,0,0,0,0


In [12]:
filename_mismatch_old = "3.0 mismatch_통합.xlsx"
# m = pd.read_excel(path_cleaning + subpath_2_7 + filename_mismatch_old,dtype="str")
# m.shape

In [13]:
# m.head(2)

In [14]:
# updated tables
print("col:\t",col.shape)
# print("m:\t",m.shape)
print("opr:\t",opr.shape)

col:	 (9002, 12)
opr:	 (2364, 19)


# Functions: format orgtree

In [15]:
# using this on (PI,OrgName) will ensure unique & non-null keys
# using this on a larger df will ensure unique rows and non-null keys, but not unique keys

def unique_non_null_rows(olddf):
    
    df = olddf.copy()
    
    
    ### drop duplicates
    df = df.drop_duplicates(keep="first",ignore_index=True)

    ### drop null rows
    df.dropna(how="all",axis=0,inplace=True)
    
    # drop rows with null PI
    df = df[~(df["PrimaryInstitution"].isna())]
        
    ### drop rows where PI contains stop words
    stop_words_lower = ["uncertain","current","deprecated","please_revise"]
    df = df[ ~ df["PrimaryInstitution"].str.lower().isin(stop_words_lower)]
            

    print("\nUnique Non-null Rows...")
    print("")
    print("\tNon-unique rows:",olddf.shape)
    print("\tUnique rows    :",df.shape)

    ### sort
    df = df.sort_values(["PrimaryInstitution","OrgName"])
    
    return df

In [16]:
def verify_unique_rows(df):
    
    key_columns = ["PrimaryInstitution","OrgName"]
    print("\nVerifying Unique Rows...")
    print("")
    print("\tDuplicate Rows:",df[df.duplicated(keep=False)].shape)
    print("\tDuplicate Keys:",df[df.duplicated(key_columns,keep=False)].shape)
    print("\tNull Rows     :",df[df["PrimaryInstitution"].isna() & df["OrgName"].isna()].shape)

In [17]:
def merge_results(m):
    
    print("\nMerge Results...")
    print("")
    print("\tshape     :",m.shape)
    print("\tleft_only :",m[m["_merge"]=="left_only"].shape)
    print("\tboth      :",m[m["_merge"]=="both"].shape)
    print("\tright_only:",m[m["_merge"]=="right_only"].shape)

In [18]:
def update_opr_index(opr,max_level):
    
    # create and clear out existing Level Indicies
    opr["OrgRank"]=np.nan
    opr["L1_Index"]=np.nan
    opr["L2_Index"]=np.nan
    opr["L3_Index"]=np.nan
    opr["L4_Index"]=np.nan
    opr["L5_Index"]=np.nan
    
    # pad PI_Index with missing 0s up to the deepest level (4 or 5?)
    opr["PI_Index"] = opr.apply(lambda x: str(x["PI_Index"]) + ((max_level-1)-str(x["PI_Index"]).count(".")) * ".0",axis=1)
    
    # calculate OrgRank
    opr["OrgRank"] = opr.apply(lambda x: (max_level-1) - str(x["PI_Index"]).count(".0"),axis=1)
    
    # populate Level Indices using PI_Index
    opr.loc[opr["PI_Index"].notna(),"L1_Index"] = opr.loc[opr["PI_Index"].notna()].apply(lambda x: str(x["PI_Index"]).split(".")[0],axis=1)
    opr.loc[opr["PI_Index"].notna(),"L2_Index"] = opr.loc[opr["PI_Index"].notna()].apply(lambda x: str(x["PI_Index"]).split(".")[1],axis=1)
    opr.loc[opr["PI_Index"].notna(),"L3_Index"] = opr.loc[opr["PI_Index"].notna()].apply(lambda x: str(x["PI_Index"]).split(".")[2],axis=1)
    opr.loc[opr["PI_Index"].notna(),"L4_Index"] = opr.loc[opr["PI_Index"].notna()].apply(lambda x: str(x["PI_Index"]).split(".")[3],axis=1)
    opr.loc[opr["PI_Index"].notna(),"L5_Index"] = opr.loc[opr["PI_Index"].notna()].apply(lambda x: str(x["PI_Index"]).split(".")[4],axis=1)

    # sort by Level1, Level2, Level3
    sort_columns = ["L1_Index","L2_Index","L3_Index","L4_Index","L5_Index"]
    #opr.L1_Index.astype(int,errors="ignore")
    #opr.L2_Index.astype(int,errors="ignore")
    #opr.L3_Index.astype(int,errors="ignore")
    opr.sort_values(sort_columns,inplace=True)
    
    return opr

In [19]:
# opr = orgtree

def clean_positions(opr_old):
    
    opr_new = opr_old.copy(deep=True)
    
    opr_new["P1"] = opr_new.apply(lambda x: ",".join(list(set([item.strip() for item in str(x["P1"]).split(",")]))),axis=1)    
    opr_new["P2"] = opr_new.apply(lambda x: ",".join(list(set([item.strip() for item in str(x["P2"]).split(",")]))),axis=1)
    opr_new["P3"] = opr_new.apply(lambda x: ",".join(list(set([item.strip() for item in str(x["P3"]).split(",")]))),axis=1)
    
    return opr_new
    

In [150]:
# m = file containing resolved Positions
# opr_old = old orgtree

def update_positions(m,opr_old):
        
    opr_new = opr_old.copy(deep=True)

    ind_PI = list(m.columns).index("PrimaryInstitution")
    ind_Org = list(m.columns).index("OrgName")
    ind_Pos = list(m.columns).index("Position")
    ind_Res = list(m.columns).index("Resolution")
    # print(ind_PI,ind_Org,ind_Pos,ind_Res)
    
    row_range = range(0,m.shape[0])
    # print(row_range)
    
    for i in row_range:
        
        PI = m.iloc[i,ind_PI]
        Org = m.iloc[i,ind_Org]
        Pos = m.iloc[i,ind_Pos]
        Res = m.iloc[i,ind_Res]
        # print(i,PI,Org,Pos,Res)
        
        # select among P1, P2, P3 
        PosRank = "P1"
        if Res == "2. Add Position to P2": PosRank = "P2"
        if Res == "3. Add Position to P3": PosRank = "P3"
        
        # get current Positions
        row_criteria = (opr_new["PrimaryInstitution"]==PI) & (opr_new["OrgName"]==Org)
        
        currPos = list(opr_new.loc[row_criteria,PosRank])
        
        print("currPos:",currPos,currPos == [])
        
        # case 1: currPOS is [np.nan] 
        if currPos == []:
            newPos = Pos.strip()
            PosInCurrPos = False

        # case 2: currPos contains a list of strings
        else:
            currPos = currPos[0].split(",")
            newPos = currPos
            PosInCurrPos = Pos.strip() in currPos

            if not PosInCurrPos:
                newPos.append(Pos.strip())
            
            newPos = ",".join(newPos)
        
        print(i,PI,Org,Pos,PosRank,PosInCurrPos,currPos,newPos)
        
        # update DF with new Positions
        
        print("newPos:",newPos)
        
        print("before:",opr_new.loc[row_criteria,PosRank])

        opr_new.loc[row_criteria,PosRank] = newPos
        
        print("after:",opr_new.loc[row_criteria,PosRank])

        opr_new = clean_positions(opr_new)

    return opr_new

In [151]:
opr2 = update_positions(editPOS2_opr.head(5),opr)

currPos: [] True
0 사회주의애국청년동맹 nan 1비서 P1 False [] 1비서
newPos: 1비서
before: Series([], Name: P1, dtype: object)
after: Series([], Name: P1, dtype: object)
currPos: ['2등서기관'] False
1 정무원 외교부_X국_나미비아대사관 2등서기관 P3 True ['2등서기관'] 2등서기관
newPos: 2등서기관
before: 2044    2등서기관
Name: P3, dtype: object
after: 2044    2등서기관
Name: P3, dtype: object
currPos: ['2등서기관'] False
2 정무원 외교부_X국_나이지리아대사관 2등서기관 P3 True ['2등서기관'] 2등서기관
newPos: 2등서기관
before: 2045    2등서기관
Name: P3, dtype: object
after: 2045    2등서기관
Name: P3, dtype: object
currPos: ['2등서기관'] False
3 정무원 외교부_X국_부르키나파소대사관 2등서기관 P3 True ['2등서기관'] 2등서기관
newPos: 2등서기관
before: 2056    2등서기관
Name: P3, dtype: object
after: 2056    2등서기관
Name: P3, dtype: object
currPos: ['2등서기관'] False
4 정무원 외교부_X국_스웨덴대사관 2등서기관 P3 True ['2등서기관'] 2등서기관
newPos: 2등서기관
before: 2059    2등서기관
Name: P3, dtype: object
after: 2059    2등서기관
Name: P3, dtype: object


In [152]:
opr2.loc[(opr2.PrimaryInstitution=="사회주의애국청년동맹") & (opr2.OrgName.isnull()),"P1"]

339    nan
Name: P1, dtype: object

In [153]:
opr2.loc[(opr2.PrimaryInstitution=="사회주의애국청년동맹") & (opr2.OrgName.isnull()),"P1"] = "1비서"

In [154]:
opr2.loc[(opr2.PrimaryInstitution=="사회주의애국청년동맹") & (opr2.OrgName.isnull()),"P1"]

339    1비서
Name: P1, dtype: object

In [132]:
# opr = update_opr_index(opr,5)

In [22]:
# export opr

# fileout_opr = "1.0 orgtree_position_rank.xlsx"
# opr.to_excel(path_cleaning + subpath_2_7 + fileout_opr,index=False)

# Recall from earlier cleaning

#### Three Cases of  OrgName in NK elite career data

- Case 0: OrgName in 기관별인명록: code these from 1-199
- Case 1: OrgName not in 기관별인명록, but contained in data AND LinkToNext_Year not current: code as 500+
- Case 2: OrgName not in 기관별인명록, but contained in data AND LinkToNext_Year is current: code as 200+

#### Case 0: OrgName in 기관별인명록

- Update 1.0 orgtree_position_rank with OrgName in 기관별인명록
- Code these from 1-199
- Run above routine # 1. orgtree_position_rank: update all besides 노동당, 내각, 정무원
    - to update opr & pr_else3
- Proceed to Cases 1 & 2

#### Resolve. Validation values for resolving null Position_3P

1. OrgName & Position ok. Add Position to P1
2. OrgName & Position ok. Add Position to P2
3. OrgName & Position ok. Add Position to P3
4. Change_OrgName and/or Change_Position
5. Uncertain OrgName
6. Uncertain Position
7. NotJob
8. Multiple CareerSubstring
9. Other - see Notes

# Task 1. Prep & Merge Mismatch

- NotJob
    - careers: change IsJob from True to False
    - m: remove rows 
    - opr: NA
  
- MultipleSubstrings
    - careers: expand rows
    - m: expand rows and code
    - opr: NA
    
- Merge mismatch files

# Task 2. Edit & Validate PI, OrgName, Type, Index

#### Step 0. Uncertain OrgName & Position
    - careerorglink: alter OrgName, Position to Uncertain
    - mismatches: remove rows containing uncertain OrgName & Position 
    - opr: NA
    
#### handle mismatches (Changes, InstitutionType)
- Step 1. using mismatches (Change_PI, Change_Org, Change_Pos), edit careerorglink & mismatches (PI, OrgName, Position) 
- Step 2. for all mismatches (InstitutionType), edit careerorglink & orgtree (InstitutionType)
- Step 3. for mismatches (IType, PI, OrgName) not in orgtree, add to orgtree with PI_Index 801

####  handle orgtree (Changes, InstitutionType)
- Step 4. using orgtree (Change_ x 3), edit careerorglink, mismatches & orgtree (InstitutionType, PI, OrgName)
- Step 5. using orgtree, update InstitutionType in col, for those jobs not in mismatch

#### handle mismatches (Positions)
- Step 6. using mismatches Position, update orgtree P1, P2, P3

# Task 3. Validate careerorglink & orgtree

#### Completed
- Verify unique primary keys for:
    - careerorglink (CareerString,CareerDateString,CareerSubstring)
    - orgtree (PI, OrgName)
- careerorglink (PI, OrgName) contained in orgtree
- Identify null InstitutionType, Position

#### Do Now - Validate InstitutionType, P1-P3
- Verify careerorglink (Position) is contained in P1, P2, P3
- Verify no orgtree Positions overlapping within the same PI, OrgName

#### Do Later - Validate PI, OrgName
- For 200 & 500 series PrimaryInstitutions, search whether they are contained in OrgName, within a PrimaryInstitution
- Orgs in Orgtree but not in Careers or 기관별인명록 (200, 500 series): delete these from orgtree

### Step. Verify unique primary keys

#### careerorglink primary key is unique

In [23]:
col_key_columns = ["CareerString","CareerDateString_2022","CareerSubstring"]

In [24]:
col_key_condition = col.IsJob==True & col.duplicated(col_key_columns,keep=False)

In [25]:
col[col_key_condition].shape

(0, 12)

#### orgtree primary key is unique

In [26]:
opr_key_columns = ["PrimaryInstitution","OrgName"]

In [27]:
opr_key_condition = opr.duplicated(opr_key_columns,keep=False)

In [28]:
opr[opr_key_condition].sort_values(opr_key_columns).shape

(0, 19)

### Step. careerorglinks (PI, OrgName) have a unique match in orgtree

In [29]:
col_opr_key_condition = (col.IsJob=="True") & (col.OrgName!="UNCERTAIN")

In [30]:
col_opr_keys_columns = opr_key_columns + ["Position"]

In [31]:
col_opr_keys = col.loc[col_opr_key_condition,col_opr_keys_columns]
col_opr_keys.shape

(6549, 3)

In [32]:
col_opr_keys = unique_non_null_rows(col_opr_keys)


Unique Non-null Rows...

	Non-unique rows: (6549, 3)
	Unique rows    : (1783, 3)


In [33]:
comerge = col_opr_keys.merge(opr,on=opr_key_columns,how="outer",indicator=True)
merge_results(comerge)


Merge Results...

	shape     : (3079, 21)
	left_only : (0, 21)
	both      : (1783, 21)
	right_only: (1296, 21)


In [34]:
# left-only indicate careerorglink (PI,OrgName) with no match in orgtree
comerge[comerge["_merge"]=="left_only"].shape

(0, 21)

### Step. Fill in null variables: PI, InstitutionType, Position

#### PI in careerorglink

In [35]:
# PI is null
col[(col.IsJob=="True") & (col.PrimaryInstitution.isnull())].shape

(0, 12)

In [36]:
# PI is UNCERTAIN
col[(col.IsJob=="True") & (col.OrgName=="UNCERTAIN")].shape

(182, 12)

#### InstitutionType in careerorglink

In [37]:
# InstitutionType is null
col[(col.IsJob=="True") & (col.InstitutionType.isnull())].shape

(0, 12)

In [38]:
# InstitutionType is UNCERTAIN
col[(col.IsJob=="True") & (col.OrgName=="UNCERTAIN")].shape

(182, 12)

#### Position in careerorglink

In [39]:
# Position is null
col[(col.IsJob=="True") & (col.Position.isnull())].shape

(0, 12)

In [40]:
# Position is UNCERTAIN 
col[(col.IsJob=="True") & (col.OrgName=="UNCERTAIN")].shape

(182, 12)

#### InstitutionType in orgtree

In [41]:
# InstitutionType is null
opr[(opr.InstitutionType.isnull())].shape

(0, 19)

In [42]:
# InstitutionType is UNCERTAIN
opr[(opr.InstitutionType.str.upper()=="UNCERTAIN")].shape

(39, 19)

### Step. Mismatched InstitutionTypes

In [43]:
opr_key_columns = ["PrimaryInstitution","OrgName"]

In [44]:
col_columns = opr_key_columns + ["InstitutionType"]
opr_columns = col_columns

In [45]:
compareIT = col[col_columns].merge(opr[opr_columns],on=opr_key_columns,how="inner",suffixes=("_col","_opr"),indicator=True)
merge_results(compareIT)


Merge Results...

	shape     : (6657, 5)
	left_only : (0, 5)
	both      : (6657, 5)
	right_only: (0, 5)


#### compare mismatched InstitutionTypes

In [46]:
compareIT[compareIT.InstitutionType_col != compareIT.InstitutionType_opr].shape

(0, 5)

In [47]:
compareIT[compareIT.InstitutionType_col != compareIT.InstitutionType_opr]

Unnamed: 0,PrimaryInstitution,OrgName,InstitutionType_col,InstitutionType_opr,_merge


### Step. Verify careerorglink Position contained in orgtree P1, P2, P3

In [48]:
opr_key_columns = ["PrimaryInstitution","OrgName"]

In [49]:
col_columns = opr_key_columns + ["IsJob","Position"]

In [50]:
opr_columns = opr_key_columns + ["P1","P2","P3"]

#### format careerorglink

In [51]:
# only select jobs with certain PI, Position
condition1 = (col.IsJob=="True") & (col.PrimaryInstitution!="UNCERTAIN") & (col.Position!="UNCERTAIN")

In [52]:
col_merge = col.loc[condition1,col_columns]
col_merge.shape

(6539, 4)

In [53]:
col_merge.drop_duplicates(inplace=True)
col_merge.shape

(1780, 4)

#### merge

In [54]:
comparePOS = col_merge.merge(opr[opr_columns],on=opr_key_columns,how="inner",indicator=True)
merge_results(comparePOS)


Merge Results...

	shape     : (1772, 8)
	left_only : (0, 8)
	both      : (1772, 8)
	right_only: (0, 8)


In [55]:
comparePOS.columns

Index(['PrimaryInstitution', 'OrgName', 'IsJob', 'Position', 'P1', 'P2', 'P3',
       '_merge'],
      dtype='object')

In [56]:
# Position not contained in P1, P2, P3
condition2 = comparePOS.apply(lambda x:
                 (str(x["Position"]).strip() not in str(x["P1"]).strip())
                 & (str(x["Position"]).strip() not in str(x["P2"]).strip()) 
                 & (str(x["Position"]).strip() not in str(x["P3"]).strip()),axis=1)

In [57]:
comparePOS[condition2].shape

(385, 8)

In [58]:
comparePOS[condition2]

Unnamed: 0,PrimaryInstitution,OrgName,IsJob,Position,P1,P2,P3,_merge
25,사회주의애국청년동맹,,True,1비서,,,,both
26,사회주의애국청년동맹,,True,부위원장,,,,both
27,사회주의애국청년동맹,,True,비서,,,,both
28,사회주의애국청년동맹,,True,위원장,,,,both
29,사회주의애국청년동맹,,True,조직대표,,,,both
51,조선축구협회,,True,간부,위원장,부위원장,"집행위원,서기장,상무위원,서기장대리,부서기장",both
55,중앙당학교,,True,강좌장,"교장,당비서",부교장,,both
67,김일성경호대,,True,경호대장,,,,both
70,내각,외무성,True,부국장,상,"제1부상,부상","연구원,고문,과장,참사",both
73,내각,외무성,True,순회대사,상,"제1부상,부상","연구원,고문,과장,참사",both


In [69]:
opr[opr.PrimaryInstitution=="사회주의애국청년동맹"]

Unnamed: 0,InstitutionType,OrgType,PrimaryInstitution,OrgName,PI_Index,OrgRank,P1,P2,P3,Alias_OrgName,LinkToNext_PI,LinkToNext_Org,LinkToNext_Year,Notes,L1_Index,L2_Index,L3_Index,L4_Index,L5_Index
339,당외곽및사회단체_근로단체,,사회주의애국청년동맹,,12.0.0.0.0.0,0,,,,,Current,Current,Current,,12,0,0,0,0
340,당외곽및사회단체_근로단체,,사회주의애국청년동맹,중앙위원회,12.1.0.0.0.0,1,"위원장,1비서",부위원장,위원,,Current,Current,Current,,12,1,0,0,0
341,당외곽및사회단체_근로단체,,사회주의애국청년동맹,중앙검사위원회,12.2.0.0.0.0,1,위원장,부위원장,위원,,Current,Current,Current,,12,2,0,0,0
342,당외곽및사회단체_근로단체,,사회주의애국청년동맹,X지역위원회,12.91.0.0.0.0,1,위원장,부위원장,위원,,Current,Current,Current,,12,91,0,0,0
664,당외곽및사회단체_근로단체,,사회주의애국청년동맹,중앙위원회_노동청년부,201.201.201.0.0.0,2,"위원장,지도원",부위원장,과장,,,,Current,,201,201,201,0,0
665,당외곽및사회단체_근로단체,,사회주의애국청년동맹,중앙위원회_청소년과외교양지도국,201.201.201.0.0.0,2,총국장,,,,,,Current,,201,201,201,0,0


In [70]:
editPOS = comparePOS[condition2]

#### resolve Position flag

In [71]:
resolve_position_flag = True

#### export Positions to resolve rank

In [72]:
if resolve_position_flag:

    filename_editPOS = "3.0 careerorglink_1_Jacob_resolve.xlsx"
    # editPOS.to_excel(path_cleaning + subpath_2_7 + filename_editPOS,index=False)

#### manually resolve Position ranks 

#### import resolved Position ranks

In [73]:
if resolve_position_flag:

    filename_editPOS = "3.0 careerorglink_1_Jacob_resolve_JR.xlsx"
    editPOS2 = pd.read_excel(path_cleaning + subpath_2_7 + filename_editPOS,dtype="str")
    print(editPOS2.shape)

(668, 7)


  warn(msg)


In [74]:
# resolve Position ranks: use to update orgtree

if resolve_position_flag:
    editPOS2_opr = editPOS2[editPOS2.Resolution!="4. OrgName is UNCERTAIN"]
    print(editPOS2_opr.shape)

(581, 7)


In [75]:
# uncertain Positions: use to update careerorglink

if resolve_position_flag:
    editPOS2_col = editPOS2[editPOS2.Resolution=="4. OrgName is UNCERTAIN"]
    print(editPOS2_col.shape)

(87, 7)


In [80]:
editPOS2_opr[(editPOS2_opr.PrimaryInstitution=="사회주의애국청년동맹") & (editPOS2_opr.OrgName.isnull())]

Unnamed: 0,PrimaryInstitution,OrgName,Position,P1,P2,P3,Resolution
0,사회주의애국청년동맹,,1비서,,,,1. Add Position to P1
234,사회주의애국청년동맹,,부위원장,,,,2. Add Position to P2
309,사회주의애국청년동맹,,비서,,,,1. Add Position to P1
444,사회주의애국청년동맹,,위원장,,,,1. Add Position to P1
543,사회주의애국청년동맹,,조직대표,,,,1. Add Position to P1


In [77]:
if resolve_position_flag:
    opr2 = update_positions(editPOS2_opr,opr)

currPos: [] True
0 사회주의애국청년동맹 nan 1비서 P1 False [] 1비서
currPos: ['2등서기관'] False
1 정무원 외교부_X국_나미비아대사관 2등서기관 P3 True ['2등서기관'] 2등서기관
currPos: ['2등서기관'] False
2 정무원 외교부_X국_나이지리아대사관 2등서기관 P3 True ['2등서기관'] 2등서기관
currPos: ['2등서기관'] False
3 정무원 외교부_X국_부르키나파소대사관 2등서기관 P3 True ['2등서기관'] 2등서기관
currPos: ['2등서기관'] False
4 정무원 외교부_X국_스웨덴대사관 2등서기관 P3 True ['2등서기관'] 2등서기관
currPos: ['3등 서기관'] False
5 내각 외무성_X국_콩고대사관 3등 서기관 P3 True ['3등 서기관'] 3등 서기관
currPos: ['3등서기관'] False
6 내각 외무성_X국_부룬디대사관 3등서기관 P3 True ['3등서기관'] 3등서기관
currPos: ['3등서기관'] False
7 정무원 외교부_X국_유네스코대표부 3등서기관 P3 True ['3등서기관'] 3등서기관
currPos: ['3등서기관,참사'] False
8 정무원 외교부_X국_인도네시아대사관 3등서기관 P3 True ['3등서기관', '참사'] 3등서기관,참사
currPos: [] True
9 조선축구협회 nan 간부 P3 False [] 간부
currPos: [] True
10 중앙당학교 nan 강좌장 P3 False [] 강좌장
currPos: ['경제참사,참사관'] False
11 내각 외무성_유럽2국_독일대사관 경제참사 P3 True ['경제참사', '참사관'] 경제참사,참사관
currPos: ['경제참사'] False
12 정무원 외교부_X국_독일대사관 경제참사 P3 True ['경제참사'] 경제참사
currPos: [] True
13 김일성경호대 nan 경호대장 P1 False [] 경호대장
currPos: ['과장,고

currPos: [] True
132 남포유리공장 nan 부기사장 P3 False [] 부기사장
currPos: [] True
133 4.15문화창작단 nan 부단장 P2 False [] 부단장
currPos: [] True
134 백두산창작단 nan 부단장 P2 False [] 부단장
currPos: [] True
135 인민군판문점대표부 nan 부대표 P2 False [] 부대표
currPos: ['부대표'] False
136 정무원 외교부_X국_프랑스대표부 부대표 P2 True ['부대표'] 부대표
currPos: [] True
137 무산광산연합기업소 nan 부부장 P3 False [] 부부장
currPos: ['부부장'] False
138 정무원 건재공업부 부부장 P2 True ['부부장'] 부부장
currPos: ['부부장,부장,제1부부장'] False
139 정무원 고등교육부 부부장 P2 True ['부부장', '부장', '제1부부장'] 부부장,부장,제1부부장
currPos: ['부부장'] False
140 정무원 교육성_보통교육부 부부장 P2 True ['부부장'] 부부장
currPos: ['부부장,부장'] False
141 정무원 교통체신위원회_해운부 부부장 P2 True ['부부장', '부장'] 부부장,부장
currPos: ['부부장,부장,제1부부장'] False
142 정무원 금속공업부 부부장 P2 True ['부부장', '부장', '제1부부장'] 부부장,부장,제1부부장
currPos: ['부부장,부장,제1부부장'] False
143 정무원 기계공업부 부부장 P2 True ['부부장', '부장', '제1부부장'] 부부장,부장,제1부부장
currPos: ['부부장,부장'] False
144 정무원 노동행정부 부부장 P2 True ['부부장', '부장'] 부부장,부장
currPos: ['부부장'] False
145 정무원 대외경제사업부 부부장 P2 True ['부부장'] 부부장
currPos: ['부부장'] False
146 정무원 도시경영부 

currPos: ['사령관'] False
264 총참모부 전략군사령부 사령관 P1 True ['사령관'] 사령관
currPos: ['사무국장'] False
265 정무원 조국평화통일위원회 사무국장 P3 True ['사무국장'] 사무국장
currPos: [] True
266 내각 nan 사무장 P3 False [] 사무장
currPos: [] True
267 정무원 nan 사무장 P3 False [] 사무장
currPos: [] True
268 국가체육지도위원회 nan 사무차장 P3 False [] 사무차장
currPos: [] True
269 조선올림픽위원회 nan 사무차장 P3 False [] 사무차장
currPos: [] True
270 조선민주법률가협회 nan 사무총장 P3 False [] 사무총장
currPos: [] True
271 과학백과사전출판사 nan 사장 P1 False [] 사장
currPos: [] True
272 금속및기계수출입총회사 nan 사장 P1 False [] 사장
currPos: [] True
273 묘향무역회사 nan 사장 P1 False [] 사장
currPos: [] True
274 외국문출판사 nan 사장 P1 False [] 사장
currPos: [] True
275 조선문학창작사 nan 사장 P1 False [] 사장
currPos: [] True
276 조선중앙통신사 nan 사장 P1 False [] 사장
currPos: [] True
277 조선흑색금속수출회사 nan 사장 P1 False [] 사장
currPos: [] True
278 평양출판사 nan 사장 P1 False [] 사장
currPos: ['상'] False
279 내각 금속기계공업성 상 P1 True ['상'] 상
currPos: ['상'] False
280 내각 금속화학공업성 상 P1 True ['상'] 상
currPos: ['상'] False
281 내각 외무성A 상 P1 True ['상'] 상
currPos: ['상'] False
282 내각 인

currPos: ['위원장'] False
402 정무원 황해남도행정경제위원회 위원장 P1 True ['위원장'] 위원장
currPos: ['위원장'] False
403 정무원 황해북도행정경제위원회 위원장 P1 True ['위원장'] 위원장
currPos: [] True
404 조선과학기술총연맹 nan 위원장 P1 False [] 위원장
currPos: [] True
405 조선국제합영촉진위원회 nan 위원장 P1 False [] 위원장
currPos: [] True
406 조선그리스도교연맹 nan 위원장 P1 False [] 위원장
currPos: [] True
407 조선금연위원회 nan 위원장 P1 False [] 위원장
currPos: [] True
408 조선기자동맹 nan 위원장 P1 False [] 위원장
currPos: [] True
409 조선농업근로자동맹 nan 위원장 P1 False [] 위원장
currPos: [] True
410 조선대외문화연락협회 nan 위원장 P1 False [] 위원장
currPos: [] True
411 조선도서관협회 nan 위원장 P1 False [] 위원장
currPos: [] True
412 조선민족음악위원회 nan 위원장 P1 False [] 위원장
currPos: [] True
413 조선반핵평화협의회 nan 위원장 P1 False [] 위원장
currPos: [] True
414 조선옷협회 nan 위원장 P1 False [] 위원장
currPos: [] True
415 조선중앙방송위원회 nan 위원장 P1 False [] 위원장
currPos: [] True
416 조선직업총동맹 nan 위원장 P1 False [] 위원장
currPos: [] True
417 조선천도교회중앙지도위원회 nan 위원장 P1 False [] 위원장
currPos: [] True
418 조선혁명박물관 nan 위원장 P1 False [] 위원장
currPos: [] True
419 종군위안부및태평양전쟁피해자대책위원회 nan 위원장 

currPos: [] True
530 노동당 nan 책임비서 P1 False [] 책임비서
currPos: [] True
531 대안중기계연합기업소 nan 책임비서 P1 False [] 책임비서
currPos: [] True
532 무산광산연합기업소 nan 책임비서 P1 False [] 책임비서
currPos: [] True
533 승리자동차연합기업소 nan 책임비서 P1 False [] 책임비서
currPos: [] True
534 안주지구탄광연합기업소 nan 책임비서 P1 False [] 책임비서
currPos: [] True
535 수풍발전소 nan 책임자 P1 False [] 책임자
currPos: [] True
536 과학백과사전출판사 nan 책임주필 P1 False [] 책임주필
currPos: [] True
537 근로자사 nan 책임주필 P1 False [] 책임주필
currPos: [] True
538 노동신문사 nan 책임주필 P1 False [] 책임주필
currPos: [] True
539 민주조선사 nan 책임주필 P1 False [] 책임주필
currPos: ['과장,고문,후보위원,위원,지도원,책임지도원'] False
540 노동당 당중앙위원회 책임지도원 P3 True ['과장', '고문', '후보위원', '위원', '지도원', '책임지도원'] 과장,고문,후보위원,위원,지도원,책임지도원
currPos: ['책임지도원'] False
541 정무원 금속공업부 책임지도원 P3 True ['책임지도원'] 책임지도원
currPos: ['정책국장,서기장,책임참사,위원,참사,중앙위원'] False
542 내각 대외경제성_민족경제협력위원회 책임참사 P3 True ['정책국장', '서기장', '책임참사', '위원', '참사', '중앙위원'] 정책국장,서기장,책임참사,위원,참사,중앙위원
currPos: [] True
543 내각 nan 책임참사 P3 False [] 책임참사
currPos: [] True
544 정무원 nan 책임참사 P3 False [

In [81]:
opr2[(opr2.PrimaryInstitution=="사회주의애국청년동맹") & (opr2.OrgName.isnull())]

Unnamed: 0,InstitutionType,OrgType,PrimaryInstitution,OrgName,PI_Index,OrgRank,P1,P2,P3,Alias_OrgName,LinkToNext_PI,LinkToNext_Org,LinkToNext_Year,Notes,L1_Index,L2_Index,L3_Index,L4_Index,L5_Index
339,당외곽및사회단체_근로단체,,사회주의애국청년동맹,,12.0.0.0.0.0,0,,,,,Current,Current,Current,,12,0,0,0,0


# Format & Export Tables

In [None]:
export_tables = False
export_cleaning = False

In [None]:
# current versions of tables for export
col_export = col
opr_export = opr2
# m_export = m

In [None]:
# export filenames
filename_careerorglink = "careerorglink.xlsx"
filename_orgtree = "orgtree.xlsx"

# cyclically revise and overwrite the same files
filename_careerorglink_new = "3.0 careerorglink_1_Jacob.xlsx"
# filename_mismatch_new = "3.0 mismatch_통합.xlsx"
filename_orgtree_new = "3.0 orgtree_1_Jacob.xlsx"

#### format careerorglink

In [None]:
careerorglink_columns = ['CareerString', 'CareerDateString_2022',
                         'IsJob', 'MultipleSubstrings','CareerStartYear', 'CareerStartMonth',
                         'CareerSubstring','InstitutionType', 'PrimaryInstitution', 'OrgName', 'Position', 'Notes']

In [None]:
# check that col has all necessary columns
[item for item in careerorglink_columns if item not in col_export.columns]

In [None]:
# remove these items
[item for item in col_export.columns if item not in careerorglink_columns]

In [None]:
col_export = col_export[careerorglink_columns]

#### format orgtree

In [None]:
orgtree_columns = ['InstitutionType', 'OrgType', 'PrimaryInstitution','OrgName',
                   'PI_Index', 'OrgRank', 'P1', 'P2', 'P3','Alias_OrgName',
                   'LinkToNext_PI','LinkToNext_Org','LinkToNext_Year','Notes',
                   'L1_Index', 'L2_Index','L3_Index', 'L4_Index', 'L5_Index']

In [None]:
# check that orgtree has all necessary columns
[item for item in orgtree_columns if item not in opr_export.columns]

In [None]:
# remove these items
[item for item in opr_export.columns if item not in orgtree_columns]

In [None]:
opr_export = opr_export[orgtree_columns]

In [None]:
# update indices & ranks
opr_export = update_opr_index(opr_export,6)

#### export to cleaning

In [None]:
if export_cleaning:
    col_export.to_excel(path_cleaning + subpath_2_7 + filename_careerorglink_new,index=False)
    opr_export.to_excel(path_cleaning + subpath_2_7 + filename_orgtree_new,index=False)
    # m_export.to_excel(path_cleaning + subpath_2_7 + filename_mismatch_new,index=False)

#### export to tables

In [None]:
if export_tables:
    col_export.to_excel(path_tables + filename_careerorglink,index=False)
    opr_export.to_excel(path_tables + filename_orgtree,index=False)

# Recode & Recategorize some (PI,OrgName)

- 중앙위원회
- 인민군, 총참모부, 인민무력부, 
- 국방위원회: 인민군?
- 중앙인민위원회: 정권기관
- 국제친선기관: I will add and match existing ones
- 인민회의 or 인민위원회?
- 노동당 총정치국 --> 인민군 총정치국
- add 북조선노동당 as an alias to 노동당
- alias the PrimaryInstitutions, 내각, 내각A, 내각B, 등 at least when calculating whether transitions are between or across institutions
- 노동당 - fix the old party organization
    - https://encykorea.aks.ac.kr/Article/E0070188
    - 1947년 북조선로동당 ‘5과’로 창설되었다. 6·25전쟁기 남한지역 내 게릴라부대의 지하당 공작 과정에서 대남 공작기구를 확대하면서 당 조직부 내의 연락부를 분리했다. 1961년 4차 당대회에서 남한에서의 지하당 조직 확대와 통일전선 강화 등의 결정과 함께 내무성 등의 대남 공작기구를 통합해 중앙당 ‘연락국’을 신설했다. 1964년 2월 조선로동당 중앙위원회 4기 8차 전원회의에서 ‘3대 혁명력량 강화’노선을 채택하면서 종래의 연락국을 ‘대남사업총국’으로 개칭했다.
    - 1966년 10월 12일 당 중앙위원회 4기 14차 전원회의에서 비서국이 신설되면서 대남사업담당비서 산하에 ‘연락부’, ‘문화부’, ‘조사부’, ‘인민무력부정착국’, ‘조총련’부서를 두었으며, 1978년 1월 통일전선부가, 1990년대 후반 35호실(대외정보조사부)과 작전부가 신설됨으로써 대남 담당 부서의 체계가 구축되었다. 남한 내 정당, 사회단체, 군부에 대한 공작 거점 및 공작 전술 연구개발 업무를 수행하고 있다. 특히 간첩교육과 파견을 직접 담당하고 있으며, 조총련에 대한 활동 지도도 담당하고 있다.

- 내각 & 정무원
    - How different was their organization?
    - https://encykorea.aks.ac.kr/Article/E0066727 - Let's alias. It seems like the name simply changed from 내각 (1948) --> 정무원 (1972) --> 내각 (1998)
    - Do we need to duplicate the entire 내각 structure for 정무원?
    - At the very least, should we swap out 내각 for 정무원 in cases where data are labelled as 내각 but during 정무원 periods (pre-1998)?
    - If the internal organization was quite similar, then maybe we could just alias 내각 and 정무원 and not worry about replicating the organization three
- 인도기관?
    - 국방위원회
    - 국무위원회
- resolve differences between InstitutionType, PrimaryInstitution, OrgName
    - for PI=총참모부, rename PI=인민군, OrgName = 총참모부_; and remove duplicates. e.g., 총참모부_작전총국
    - 노동당, 당중앙위원회
    - many organizations for whom 중앙위원회 is their main leadership body
    - 인민군. remove as PrimaryInstitution, and use as InstitutionType instead
    - https://encykorea.aks.ac.kr/Article/E0066687
    - 인민무력부, 인민무력성, 
    - PrimaryInstitutions
        - 최고사령부
        - 총참모부
        - 주요 사령부
        - 등
- Make sure X.0.0.0 positions are not being duplicated with X.1.0.0 positions, especially in the case of 중앙위원회
- Discuss X in PI_Index
    - 비서국A --> 정무국 --> 비서국B
        - check for duplicates: 당 비서국
    - 당중앙위원회_정치위원회 (폐지) 직급
    - 당중앙위원회_총정치국 - 당/인민군
    - 당중앙위원회_조선혁명박물관당위원회 & other 1.1.X.0.0 - double-check if they are included in 외곽단 - InGov?
    - Change_OrgName to 당중앙위원회_X
        - 비서국_인민무력부B
        - 비서국_인민무력부B_보위국
        - 비서국_인민무력부B_작전국
        - 비서국_인민무력부B_정치안전국

# Re-Index
- See PI-Index Coding Rules in Evernote
- Update X codes with 200 or 500. Delete any 200/500 which don't appear in our data. (They might have been orgs we corrected in the data.)
- 최고인민회의 X지역인민회의 - move from 200 Series to 0 Series
- Drop X지역위원회 and recode 200 Series to 0 Series
- Change coding of 당외곽 및 사회단체
    - if the orgs were found in the 별책, then change their codes from 200 series to 100 series
    - Match, rename and recode (from 500+ to 100+) 500 series 국제친선기관
- code data source rather than index

# Task 3. Add OrgType & other metadata

- Add OrgType


# Future Tasks

- Alias all 500 Series and concurrent Aliases
    - Include all names in orgtree
    - Alias names in both directions, forwards and backwards
    - use PI_Index to indicate a current or deprecated org.
    - I could use PI_Index to indicate what happened to the deprecated org: combined, renamed, deprecated, split. The problem with this is names which have been used cyclically, such as 내각.
- Future Integrity Checks & Data Cleaning
    - Identify inconsistencies in Position Rankings (e.g., 명예위원장)
    - Verify no contrary categories of positions overlapping (e.g., 위원장, 소장)
    - Verify (PI, OrgName) not found in 기관별인명록
    - Research X entries: e.g., 3.525.591: 내각_외무성_X국_대사관
    - Reconcile 기관 across three categories: 기타기관, 당외곽및사회단체, 당외곽및사회단체(별책)
    - Add more orgs
        - Expand any X지역위원회 into full list
        - 내각 - Level 2 or Level 3?
            - orgs after 내각 국토환경보호성_산림총국
            - orgs after 내각 대외경제성_민족경제협력위원회
            - orgs after 내각 보건성_중앙의약품관리소
            - orgs after 내 각 상업성_중앙도매소
            - 내각 정보산업성 우편국, 전화국, 체신관리국, 체신소, 
            - 내각 (45)
        - 당외곽및사회단체 (별책)
    - Discussion about matching levels
        - https://namu.wiki/w/ - consider the OrgRang of 김정은의 겸직
		- 조선민주주의인민공화국 국무위원회 위원장[국가원수]
		- 조선로동당 중앙군사위원장
		- 조선민주주의인민공화국무력 최고사령관
		- 조선로동당 중앙위원회 정치국 상무위원
    - Review UNCERTAIN OrgName & Positions
        - some of these seem resolvable