In [1]:
import pandas as pd
import numpy as np

# Tables

### 1. LeaderCareerLink (LeaderID, CareerString, CareerDateString_2022)

matches LeaderID with (CareerString, CareerDateString_2022)

- LeaderID
- CareerString
- CareerDateString_2022

### 2. CareerOrgLink (CareerString, CareerDateString_2022, CareerSubstring)

matches (CareerString,CareerDateString_2022,CareerSubstring) with (InstitutionType,PrimaryInstitution,OrgName,Position)

- CareerString
- CareerDateString_2022
- IsJob
- MultipleSubstrings
- CareerStartYear
- CareerStartMonth
- CareerSubstring
- OrgString
- InstitutionType
- PrimaryInstitution
- OrgName
- Position
- Notes

### 3. Orgtree (InstitutionType, PrimaryInstitution, OrgName)*

contains variables relevant to all (PI, Org) including PI/Org Types, Positions, Org/Pos Ranks, PI/Org Links, Aliases

- InstitutionType
- OrgType
- PrimaryInstitution
- OrgName
- PI_Index
- OrgRank
- P1
- P2
- P3
- LinkToNext_PI
- LinkToNext_Org
- LinkToNext_Year
- Alias_OrgName
- Notes

In [2]:
path_tables = "C:/Users/seoul/Dropbox/00 technical/github/nkelites/data/combined data/combined data - 2 tables/"

In [3]:
# current tables
filename_careerorglink = "careerorglink.xlsx"
filename_leadercareerlink = "leadercareerlink.xlsx"
filename_orgtree = "orgtree.xlsx"

In [4]:
# deprecated tables
# filename_careers = "careers.xlsx"
# filename_leaderjoblink = "leaderjoblink.xlsx"
# filename_joborglink = "joborglink.xlsx"

# Data - initial orgtree + manually cleaned data in 2.7 orgtree position & rank

In [5]:
path_cleaning = "C:/Users/seoul/Dropbox/00 technical/github/nkelites/data/combined data/combined data - 1 cleaning/cleaning step 2 - career/"

In [6]:
subpath_2_1 = "2.1 career_undivided_unparsed_uncoded/"
subpath_2_2 = "2.2 career_divided_unparsed_uncoded/"
subpath_2_3 = "2.3 joborglink/"
subpath_2_4 = "2.4 orgtree/"
subpath_2_5 = "2.5 position/"
subpath_2_6 = "2.6 career_reassembled/"
subpath_2_7 = "2.7 orgtree position & rank/"

In [7]:
# 2.1 careerorglink.xlsx
filename_careerorglink_old = "2.1 careerorglink.xlsx"
col = pd.read_excel(path_cleaning + subpath_2_7 + filename_careerorglink_old,dtype="str")
col.shape

(9002, 13)

In [8]:
col[col["CareerString"].isna()]

Unnamed: 0,CareerString,CareerDateString_2022,IsJob,MultipleSubstrings,CareerStartYear,CareerStartMonth,CareerSubstring,OrgString,InstitutionType,PrimaryInstitution,OrgName,Position,Notes


In [9]:
# 1.1 orgtree_position_rank

# constructing new file from:
## 2022 북한_기관별_인명록_북한정보포털 게재용
## orgtree table

filein_orgtree_old = "1.1 orgtree_position_rank.xlsx"
opr = pd.read_excel(path_cleaning + subpath_2_7 + filein_orgtree_old,dtype="str")
opr.shape

(2267, 23)

In [10]:
opr.head(2)

Unnamed: 0,InsideGov,InstitutionType,OrgType,PrimaryInstitution,OrgName,PI_Index,OrgRank,P1,P2,P3,...,Notes,L1_Index,L2_Index,L3_Index,L4_Index,L5_Index,Change_InstitutionType,Change_PI,Change_OrgName,Alias_OrgName
0,1,인민군,,총참모부,김일성군사종합대학,11.201.0.0,2,,,,...,,11,3,201,0,0,인민군,김일성군사종합대학,,
1,1,인민군,,호위사령부,평양방어사령부,11.0.0.0,2,,,,...,,11,8,201,0,0,인민군,제91수도방어군단,,


In [11]:
# orgtree_copy

# copy of orgtree table

# filein_ot = "orgtree_copy.xlsx"
# ot = pd.read_excel(path_cleaning + subpath_2_7 + filein_ot,dtype="str")
# ot.shape

In [12]:
filename_mismatch_old = "2.1 mismatch_통합.xlsx"
m = pd.read_excel(path_cleaning + subpath_2_7 + filename_mismatch_old,dtype="str")
m.shape

(2245, 15)

In [13]:
m.head(2)

Unnamed: 0,CareerString,CareerStartYear,MultipleSubstrings,CareerSubstring,IsJob,InstitutionType,PrimaryInstitution,OrgName,Position,LinkToNext_Year,Resolution,Change_PI,Change_OrgName,Change_Position,Notes
0,유엔 주재 대표부 1등서기관,1975,1,,True,정권기관,내각,외무성_유엔대표부,1등서기관,Current,3. OrgName & Position ok. Add Position to P3,,,,
1,1975.10 유엔주재 북한대표부 1등서기관,1975,1,,True,정권기관,내각,외무성_유엔대표부,1등서기관,Current,3. OrgName & Position ok. Add Position to P3,,,,


# Functions: format orgtree

In [14]:
# using this on (PI,OrgName) will ensure unique & non-null keys
# using this on a larger df will ensure unique rows and non-null keys, but not unique keys

def unique_non_null_rows(olddf):
    
    df = olddf.copy()
    
    
    ### drop duplicates
    df = df.drop_duplicates(keep="first",ignore_index=True)

    ### drop null rows
    df.dropna(how="all",axis=0,inplace=True)
    
    # drop rows with null PI
    df = df[~(df["PrimaryInstitution"].isna())]
        
    ### drop rows where PI contains stop words
    stop_words_lower = ["uncertain","current","deprecated","please_revise"]
    df = df[ ~ df["PrimaryInstitution"].str.lower().isin(stop_words_lower)]
            

    print("\nUnique Non-null Rows...")
    print("")
    print("\tNon-unique rows:",olddf.shape)
    print("\tUnique rows    :",df.shape)

    ### sort
    df = df.sort_values(["PrimaryInstitution","OrgName"])
    
    return df

In [15]:
def verify_unique_rows(df):
    
    key_columns = ["PrimaryInstitution","OrgName"]
    print("\nVerifying Unique Rows...")
    print("")
    print("\tDuplicate Rows:",df[df.duplicated(keep=False)].shape)
    print("\tDuplicate Keys:",df[df.duplicated(key_columns,keep=False)].shape)
    print("\tNull Rows     :",df[df["PrimaryInstitution"].isna() & df["OrgName"].isna()].shape)

In [16]:
def merge_results(m):
    
    print("\nMerge Results...")
    print("")
    print("\tshape     :",m.shape)
    print("\tleft_only :",m[m["_merge"]=="left_only"].shape)
    print("\tboth      :",m[m["_merge"]=="both"].shape)
    print("\tright_only:",m[m["_merge"]=="right_only"].shape)

In [17]:
def update_opr_index(opr,max_level):
    
    # create and clear out existing Level Indicies
    opr["OrgRank"]=np.nan
    opr["L1_Index"]=np.nan
    opr["L2_Index"]=np.nan
    opr["L3_Index"]=np.nan
    opr["L4_Index"]=np.nan
    opr["L5_Index"]=np.nan
    
    # pad PI_Index with missing 0s up to the deepest level (4 or 5?)
    opr["PI_Index"] = opr.apply(lambda x: str(x["PI_Index"]) + ((max_level-1)-str(x["PI_Index"]).count(".")) * ".0",axis=1)
    
    # calculate OrgRank
    opr["OrgRank"] = opr.apply(lambda x: (max_level-1) - str(x["PI_Index"]).count(".0"),axis=1)
    
    # populate Level Indices using PI_Index
    opr.loc[opr["PI_Index"].notna(),"L1_Index"] = opr.loc[opr["PI_Index"].notna()].apply(lambda x: str(x["PI_Index"]).split(".")[0],axis=1)
    opr.loc[opr["PI_Index"].notna(),"L2_Index"] = opr.loc[opr["PI_Index"].notna()].apply(lambda x: str(x["PI_Index"]).split(".")[1],axis=1)
    opr.loc[opr["PI_Index"].notna(),"L3_Index"] = opr.loc[opr["PI_Index"].notna()].apply(lambda x: str(x["PI_Index"]).split(".")[2],axis=1)
    opr.loc[opr["PI_Index"].notna(),"L4_Index"] = opr.loc[opr["PI_Index"].notna()].apply(lambda x: str(x["PI_Index"]).split(".")[3],axis=1)
    opr.loc[opr["PI_Index"].notna(),"L5_Index"] = opr.loc[opr["PI_Index"].notna()].apply(lambda x: str(x["PI_Index"]).split(".")[4],axis=1)

    # sort by Level1, Level2, Level3
    sort_columns = ["L1_Index","L2_Index","L3_Index","L4_Index","L5_Index"]
    #opr.L1_Index.astype(int,errors="ignore")
    #opr.L2_Index.astype(int,errors="ignore")
    #opr.L3_Index.astype(int,errors="ignore")
    opr.sort_values(sort_columns,inplace=True)
    
    return opr

In [18]:
# opr = update_opr_index(opr,5)

In [19]:
# export opr

# fileout_opr = "1.0 orgtree_position_rank.xlsx"
# opr.to_excel(path_cleaning + subpath_2_7 + fileout_opr,index=False)

# Recall from earlier cleaning

#### Three Cases of  OrgName in NK elite career data

- Case 0: OrgName in 기관별인명록: code these from 1-199
- Case 1: OrgName not in 기관별인명록, but contained in data AND LinkToNext_Year not current: code as 500+
- Case 2: OrgName not in 기관별인명록, but contained in data AND LinkToNext_Year is current: code as 200+

#### Case 0: OrgName in 기관별인명록

- Update 1.0 orgtree_position_rank with OrgName in 기관별인명록
- Code these from 1-199
- Run above routine # 1. orgtree_position_rank: update all besides 노동당, 내각, 정무원
    - to update opr & pr_else3
- Proceed to Cases 1 & 2

#### Resolve. Validation values for resolving null Position_3P

1. OrgName & Position ok. Add Position to P1
2. OrgName & Position ok. Add Position to P2
3. OrgName & Position ok. Add Position to P3
4. Change_OrgName and/or Change_Position
5. Uncertain OrgName
6. Uncertain Position
7. NotJob
8. Multiple CareerSubstring
9. Other - see Notes

# Task 1. Prep & Merge Mismatch

- NotJob
    - careers: change IsJob from True to False
    - m: remove rows 
    - opr: NA
  
- MultipleSubstrings
    - careers: expand rows
    - m: expand rows and code
    - opr: NA
    
- Merge mismatch files

# Task 2. Edit & Validate PI, OrgName, Type, Index

### Step 0. Uncertain OrgName & Position
    - careerorglink: alter OrgName, Position to Uncertain
    - mismatches: remove rows containing uncertain OrgName & Position 
    - opr: NA

In [20]:
m.shape

(2245, 15)

In [21]:
m[m["Resolution"]=="5. Uncertain OrgName"].shape

(8, 15)

In [22]:
m[m["Resolution"]=="6. Uncertain Position"].shape

(12, 15)

In [23]:
m_no_org = m[m["Resolution"]=="5. Uncertain OrgName"]
m_no_pos = m[m["Resolution"]=="6. Uncertain Position"]
m1 = m[~(m["Resolution"]=="5. Uncertain OrgName") & ~(m["Resolution"]=="6. Uncertain Position")]
print(m_no_org.shape)
print(m_no_pos.shape)
print(m1.shape)

(8, 15)
(12, 15)
(2225, 15)


In [24]:
# for 5. Uncertain OrgName, mark OrgName as Uncertain

key_columns = ['CareerString', 'CareerStartYear','CareerSubstring']
col1 = col.merge(m_no_org[key_columns],on=key_columns,how="left",indicator=True)
merge_results(col1)


Merge Results...

	shape     : (9002, 14)
	left_only : (8994, 14)
	both      : (8, 14)
	right_only: (0, 14)


In [25]:
col1.loc[col1["_merge"]=="both","OrgName"]="UNCERTAIN"

In [26]:
col1.drop(columns=["_merge"],inplace=True)

In [27]:
# for 6. Uncertain Position, mark Position as Uncertain

key_columns = ['CareerString', 'CareerStartYear','CareerSubstring']
col2 = col1.merge(m_no_pos[key_columns],on=key_columns,how="left",indicator=True)
merge_results(col2)


Merge Results...

	shape     : (9002, 14)
	left_only : (8990, 14)
	both      : (12, 14)
	right_only: (0, 14)


In [28]:
col2.loc[col2["_merge"]=="both","Position"]="UNCERTAIN"

In [29]:
col2.drop(columns=["_merge"],inplace=True)

In [30]:
# check uncertain positions
col2[col2["Position"].str.upper()=="UNCERTAIN"].shape

(17, 13)

In [31]:
# check uncertain orgnames
col2[col2["OrgName"].str.upper()=="UNCERTAIN"].shape

(176, 13)

In [32]:
# confirm remaining Resolutions only include Org & Position changes
m1.Resolution.sort_values().unique()

array(['1. OrgName & Position ok. Add Position to P1',
       '2. OrgName & Position ok. Add Position to P2',
       '3. OrgName & Position ok. Add Position to P3'], dtype=object)

In [33]:
# updated tables
print("col2:\t",col2.shape)
print("m1:\t",m1.shape)
print("opr:\t",opr.shape)

col2:	 (9002, 13)
m1:	 (2225, 15)
opr:	 (2267, 23)


### Logic of Changing PI, OrgName, Position, InstitutionType

#### in mismatches, Change_PI, Change_OrgName
- indicate that CareerString was coded incorrectly
- DO NOT necessarily indicate that the original PI, OrgName pair are incorrect in orgtree

#### in orgtree, Change_PI, Change_OrgName
- indicate that the PI, OrgName are fundamentally incorrect and should be changed everywhere

### therefore, we should 

#### handle mismatches (Changes, InstitutionType)
- Step 1. using mismatches (Change_PI, Change_Org, Change_Pos), edit careerorglink & mismatches (PI, OrgName, Position) 
- Step 2. for all mismatches (InstitutionType), edit careerorglink & orgtree (InstitutionType)
- Step 3. for mismatches (IType, PI, OrgName) not in orgtree, add to orgtree with PI_Index 801

####  handle orgtree (Changes, InstitutionType)
- Step 4. using orgtree (Change_ x 3), edit careerorglink, mismatches & orgtree (InstitutionType, PI, OrgName)
- Step 5. using orgtree, update InstitutionType in col, for those jobs not in mismatch

#### handle mismatches (Positions)
- Step 6. using mismatches Position, update orgtree P1, P2, P3

### Step 1. using mismatches (Change_ x3), edit careerorglink & mismatches (PI, OrgName, Position) 

- careerorglink: Change PI, OrgName, Position
- mismatches: Change PI, OrgName, Position
- orgtree: NA
    - But we will check this later when we compare careerorglink & orgtree and validate (PI,OrgName) pairs.

In [34]:
#### select mismatches to merge with careerorglink
col_key_columns = ['CareerString', 'CareerStartYear','CareerSubstring',"PrimaryInstitution","OrgName"]
m1_col_columns = col_key_columns + ["Change_PI","Change_OrgName","Change_Position"]

In [35]:
m1.columns

Index(['CareerString', 'CareerStartYear', 'MultipleSubstrings',
       'CareerSubstring', 'IsJob', 'InstitutionType', 'PrimaryInstitution',
       'OrgName', 'Position', 'LinkToNext_Year', 'Resolution', 'Change_PI',
       'Change_OrgName', 'Change_Position', 'Notes'],
      dtype='object')

In [36]:
m1[m1["Change_PI"].notnull()].shape

(852, 15)

In [37]:
m1[m1["Change_OrgName"].notnull()].shape

(553, 15)

In [38]:
# Check no Change_OrgName have null Change_PI
m1[(m1["Change_PI"].isnull()) & (m1["Change_OrgName"].notnull())].shape

(0, 15)

In [39]:
# Check no Change_OrgName are UNCERTAIN. These should have already been removed.
m1[m1["Change_OrgName"].str.upper()=="UNCERTAIN"].shape

(0, 15)

In [40]:
m1_change = m1.loc[m1["Change_PI"].notnull(),m1_col_columns]
m1_change = m1_change.drop_duplicates(keep="first")
m1_change.shape

(816, 8)

#### careerorglink: change PI, OrgName, Position

In [41]:
col3 = col2.merge(m1_change,on=col_key_columns,how="left",suffixes=("","_m"),indicator=True)
merge_results(col3)


Merge Results...

	shape     : (9002, 17)
	left_only : (8179, 17)
	both      : (823, 17)
	right_only: (0, 17)


In [42]:
col3.columns

Index(['CareerString', 'CareerDateString_2022', 'IsJob', 'MultipleSubstrings',
       'CareerStartYear', 'CareerStartMonth', 'CareerSubstring', 'OrgString',
       'InstitutionType', 'PrimaryInstitution', 'OrgName', 'Position', 'Notes',
       'Change_PI', 'Change_OrgName', 'Change_Position', '_merge'],
      dtype='object')

In [43]:
col2.columns

Index(['CareerString', 'CareerDateString_2022', 'IsJob', 'MultipleSubstrings',
       'CareerStartYear', 'CareerStartMonth', 'CareerSubstring', 'OrgString',
       'InstitutionType', 'PrimaryInstitution', 'OrgName', 'Position',
       'Notes'],
      dtype='object')

In [44]:
col3[col3.Change_PI.notnull()].shape

(823, 17)

In [45]:
col3.loc[col3.Change_PI.notnull(),"PrimaryInstitution"] = col3.loc[col3.Change_PI.notnull(),"Change_PI"]

In [46]:
col3.loc[col3.Change_PI.notnull(),"OrgName"] = col3.loc[col3.Change_PI.notnull(),"Change_OrgName"]

In [47]:
col3.loc[col3.Change_Position.notnull(),"Position"] = col3.loc[col3.Change_Position.notnull(),"Change_Position"]

In [48]:
col3 = col3[col2.columns]
col3.shape

(9002, 13)

#### mismatches: change PI, OrgName, Position

In [49]:
m2 = m1.copy(deep=True)

In [50]:
m2.loc[m2.Change_PI.notnull(),"PrimaryInstitution"] = m1.loc[m1.Change_PI.notnull(),"Change_PI"]

In [51]:
m2.loc[m2.Change_PI.notnull(),"OrgName"] = m1.loc[m1.Change_PI.notnull(),"Change_OrgName"]

In [52]:
m2.loc[m2.Change_Position.notnull(),"Position"] = m1.loc[m1.Change_Position.notnull(),"Change_Position"]

In [53]:
m2.columns

Index(['CareerString', 'CareerStartYear', 'MultipleSubstrings',
       'CareerSubstring', 'IsJob', 'InstitutionType', 'PrimaryInstitution',
       'OrgName', 'Position', 'LinkToNext_Year', 'Resolution', 'Change_PI',
       'Change_OrgName', 'Change_Position', 'Notes'],
      dtype='object')

In [54]:
# mismatch: drop Changex3 columns. We've made all necessary changes.
m2.drop(columns=["Change_PI","Change_OrgName","Change_Position"],inplace=True)

In [55]:
# updated tables
print("col3:\t",col3.shape)
print("m2:\t",m2.shape)
print("opr:\t",opr.shape)

col3:	 (9002, 13)
m2:	 (2225, 12)
opr:	 (2267, 23)


### Step 2. for all mismatches InstitutionType, edit careerorglink & orgtree InstitutionType 

- careerorglink: Change InstitutionType
- mismatches: NA
- orgtree: Change InstitutionType

#### select mismatches to merge with careerorglink

In [56]:
col_key_columns = ['CareerString', 'CareerStartYear','CareerSubstring',"PrimaryInstitution","OrgName"]
m2_col_columns = col_key_columns + ["InstitutionType"]
m2_merge = m2[m2_col_columns]
m2_merge = m2_merge.drop_duplicates(keep="first")
m2_merge.shape

(2183, 6)

#### careerorglink: change InstitutionType

In [57]:
# merge col3 with m2 (complete & updated mismatches)
col4 = col3.merge(m2_merge,on=col_key_columns,how="left",suffixes=("","_m"),indicator=True)
merge_results(col4)


Merge Results...

	shape     : (9002, 15)
	left_only : (6804, 15)
	both      : (2198, 15)
	right_only: (0, 15)


In [58]:
col4.columns

Index(['CareerString', 'CareerDateString_2022', 'IsJob', 'MultipleSubstrings',
       'CareerStartYear', 'CareerStartMonth', 'CareerSubstring', 'OrgString',
       'InstitutionType', 'PrimaryInstitution', 'OrgName', 'Position', 'Notes',
       'InstitutionType_m', '_merge'],
      dtype='object')

In [59]:
col4.loc[col4["_merge"]=="both","InstitutionType"] = col4.loc[col4["_merge"]=="both","InstitutionType_m"]

In [60]:
col4 = col4[col3.columns]
col4.shape

(9002, 13)

#### select mismatches to merge with orgtree

In [61]:
opr_key_columns = ["PrimaryInstitution","OrgName"]
m2_org_columns = opr_key_columns + ["InstitutionType"]
m2_merge = m2[m2_org_columns]
m2_merge = m2_merge.drop_duplicates(keep="first")
m2_merge.shape

(521, 3)

#### orgtree: change InstitutionType

In [62]:
# merge opr on m2 (complete & updated mismatches)
opr2 = opr.merge(m2_merge,on=opr_key_columns,how="left",suffixes=("","_m"),indicator=True)
merge_results(opr2)


Merge Results...

	shape     : (2267, 25)
	left_only : (1872, 25)
	both      : (395, 25)
	right_only: (0, 25)


In [63]:
opr2.columns

Index(['InsideGov', 'InstitutionType', 'OrgType', 'PrimaryInstitution',
       'OrgName', 'PI_Index', 'OrgRank', 'P1', 'P2', 'P3', 'LinkToNext_PI',
       'LinkToNext_Org', 'LinkToNext_Year', 'Notes', 'L1_Index', 'L2_Index',
       'L3_Index', 'L4_Index', 'L5_Index', 'Change_InstitutionType',
       'Change_PI', 'Change_OrgName', 'Alias_OrgName', 'InstitutionType_m',
       '_merge'],
      dtype='object')

In [64]:
opr2.loc[opr2["_merge"]=="both","InstitutionType"] = opr2.loc[opr2["_merge"]=="both","InstitutionType_m"]

In [65]:
opr2 = opr2[opr.columns]
opr2.shape

(2267, 23)

In [66]:
opr.columns

Index(['InsideGov', 'InstitutionType', 'OrgType', 'PrimaryInstitution',
       'OrgName', 'PI_Index', 'OrgRank', 'P1', 'P2', 'P3', 'LinkToNext_PI',
       'LinkToNext_Org', 'LinkToNext_Year', 'Notes', 'L1_Index', 'L2_Index',
       'L3_Index', 'L4_Index', 'L5_Index', 'Change_InstitutionType',
       'Change_PI', 'Change_OrgName', 'Alias_OrgName'],
      dtype='object')

In [67]:
# updated tables
print("col4:\t",col4.shape)
print("m2:\t",m2.shape)
print("opr2:\t",opr2.shape)

col4:	 (9002, 13)
m2:	 (2225, 12)
opr2:	 (2267, 23)


### Step 3. for mismatches (IType, PI, OrgName) not in orgtree, add to orgtree with PI_Index 801

#### orgtree:concat mismatches (IType, PI, OrgName) not in orgtree

In [68]:
m2_merge.columns

Index(['PrimaryInstitution', 'OrgName', 'InstitutionType'], dtype='object')

In [69]:
opr2.columns

Index(['InsideGov', 'InstitutionType', 'OrgType', 'PrimaryInstitution',
       'OrgName', 'PI_Index', 'OrgRank', 'P1', 'P2', 'P3', 'LinkToNext_PI',
       'LinkToNext_Org', 'LinkToNext_Year', 'Notes', 'L1_Index', 'L2_Index',
       'L3_Index', 'L4_Index', 'L5_Index', 'Change_InstitutionType',
       'Change_PI', 'Change_OrgName', 'Alias_OrgName'],
      dtype='object')

In [70]:
# merge m2_merge with opr2; identify mismatches not contained in opr2
m2_concat = m2_merge.merge(opr2,on=opr_key_columns,how="outer",suffixes=("","_m"),indicator=True)
merge_results(m2_concat)


Merge Results...

	shape     : (2396, 25)
	left_only : (129, 25)
	both      : (395, 25)
	right_only: (1872, 25)


In [71]:
# the number of new PI, OrgName pairs we'll add to orgtree from mismatches
m2_concat = m2_concat[m2_concat["_merge"]=="left_only"]
m2_concat.shape

(129, 25)

In [72]:
m2_concat["InsideGov"]=np.nan

In [73]:
[item for item in list(m2_concat.columns) if item not in list(opr2.columns)]

['InstitutionType_m', '_merge']

In [74]:
[item for item in list(opr2.columns) if item not in list(m2_concat.columns)]

[]

In [75]:
opr2.columns

Index(['InsideGov', 'InstitutionType', 'OrgType', 'PrimaryInstitution',
       'OrgName', 'PI_Index', 'OrgRank', 'P1', 'P2', 'P3', 'LinkToNext_PI',
       'LinkToNext_Org', 'LinkToNext_Year', 'Notes', 'L1_Index', 'L2_Index',
       'L3_Index', 'L4_Index', 'L5_Index', 'Change_InstitutionType',
       'Change_PI', 'Change_OrgName', 'Alias_OrgName'],
      dtype='object')

In [76]:
m2_concat = m2_concat[opr2.columns]
m2_concat.shape

(129, 23)

In [77]:
m2_concat.InsideGov.unique()

array([nan])

In [78]:
m2_concat.OrgType.unique()

array([nan], dtype=object)

In [79]:
# Designate PI_Index as 801. Later I can re-code this.
m2_concat["PI_Index"]="801.0"

In [80]:
m2_concat.head(5)

Unnamed: 0,InsideGov,InstitutionType,OrgType,PrimaryInstitution,OrgName,PI_Index,OrgRank,P1,P2,P3,...,Notes,L1_Index,L2_Index,L3_Index,L4_Index,L5_Index,Change_InstitutionType,Change_PI,Change_OrgName,Alias_OrgName
3,,정권기관,,내각,외무성_아프리카아랍라틴아메리카국_나이지리아대사관,801.0,,,,,...,,,,,,,,,,
4,,정권기관,,내각,철도성_X국,801.0,,,,,...,,,,,,,,,,
6,,정권기관,,내각,외무성_아프리카아랍라틴아메리카국_콩고민주공화국대사관,801.0,,,,,...,,,,,,,,,,
10,,정권기관,,내각,평안남도농촌경리위원회,801.0,,,,,...,,,,,,,,,,
18,,당외곽및사회단체_사회부문,,평양국제관계대학,,801.0,,,,,...,,,,,,,,,,


In [81]:
opr3 = pd.concat([opr2,m2_concat])

In [82]:
# updated tables
print("col4:\t",col4.shape)
print("m2:\t",m2.shape)
print("opr3:\t",opr3.shape)

col4:	 (9002, 13)
m2:	 (2225, 12)
opr3:	 (2396, 23)


### Step 4. using orgtree (Change_ x2), edit careerorglink, mismatches & orgtree (PI, OrgName)

- careerorglink: change PI, OrgName
- mimatches: removed those with changed InstitutionType, PI, OrgName, because their Position may be incorrect
- orgtree: change InstitutionType, PI, OrgName

#### select opr to merge with careerorglink & mismatches

In [83]:
opr_key_columns = ["PrimaryInstitution","OrgName"]
opr_col_columns = opr_key_columns + ["Change_InstitutionType","Change_PI","Change_OrgName"]

In [84]:
opr3[opr3["Change_PI"].notnull()].shape

(18, 23)

In [85]:
opr3[opr3["Change_OrgName"].notnull()].shape

(36, 23)

In [86]:
opr3[opr3["Change_InstitutionType"].notnull()].shape

(16, 23)

In [87]:
opr3_changes = opr3.loc[(opr3["Change_PI"].notnull()) | (opr3["Change_OrgName"].notnull()) | (opr3["Change_InstitutionType"].notnull()),opr_col_columns]
opr3_changes.shape

(45, 5)

In [88]:
opr3_changes = unique_non_null_rows(opr3_changes)
opr3_changes.shape


Unique Non-null Rows...

	Non-unique rows: (45, 5)
	Unique rows    : (45, 5)


(45, 5)

#### edit careerorglink

In [89]:
col5 = col4.merge(opr3_changes,on=opr_key_columns,how="left",suffixes=("","_m"),indicator=True)
merge_results(col5)


Merge Results...

	shape     : (9002, 17)
	left_only : (8790, 17)
	both      : (212, 17)
	right_only: (0, 17)


In [90]:
# when Change_InstitutionType is not null, change all 3
col5.loc[col5["Change_InstitutionType"].notnull(),"InstitutionType"] = col5.loc[col5["Change_InstitutionType"].notnull(),"Change_InstitutionType"]

In [91]:
col5.loc[col5["Change_InstitutionType"].notnull(),"PrimaryInstitution"] = col5.loc[col5["Change_InstitutionType"].notnull(),"Change_PI"]

In [92]:
col5.loc[col5["Change_InstitutionType"].notnull(),"OrgName"] = col5.loc[col5["Change_InstitutionType"].notnull(),"Change_OrgName"]

In [93]:
# when Change_InstitutionType is null & Change_OrgName is not null, then just change OrgName

In [94]:
col5.loc[col5["Change_InstitutionType"].isnull() & col5["Change_OrgName"].notnull(),"OrgName"] = col5.loc[col5["Change_InstitutionType"].isnull() & col5["Change_OrgName"].notnull(),"Change_OrgName"]

In [95]:
col5 = col5[col4.columns]

In [96]:
col5.shape

(9002, 13)

#### remove mismatches

In [97]:
opr_key_columns = ["PrimaryInstitution","OrgName"]
m3 = m2.merge(opr3_changes,on=opr_key_columns,how="left",suffixes=("","_m"),indicator=True)
merge_results(m3)


Merge Results...

	shape     : (2225, 16)
	left_only : (2112, 16)
	both      : (113, 16)
	right_only: (0, 16)


In [98]:
m3 = m3[m3["_merge"]!="both"]
m3.shape

(2112, 16)

In [99]:
m3 = m3[m2.columns]
m3.shape

(2112, 12)

#### edit orgtree

In [100]:
opr4 = opr3.copy(deep=True)

In [101]:
# when Change_InstitutionType is not null, change all 3
opr4.loc[opr4["Change_InstitutionType"].notnull(),"InstitutionType"] = opr4.loc[opr4["Change_InstitutionType"].notnull(),"Change_InstitutionType"]

In [102]:
opr4.loc[opr4["Change_InstitutionType"].notnull(),"PrimaryInstitution"] = opr4.loc[opr4["Change_InstitutionType"].notnull(),"Change_PI"]

In [103]:
opr4.loc[opr4["Change_InstitutionType"].notnull(),"OrgName"] = opr4.loc[opr4["Change_InstitutionType"].notnull(),"Change_OrgName"]

In [104]:
# when Change_InstitutionType is null & Change_OrgName is not null, then just change OrgName

In [105]:
opr4.loc[opr4["Change_InstitutionType"].isnull() & opr4["Change_OrgName"].notnull(),"OrgName"] = opr4.loc[opr4["Change_InstitutionType"].isnull() & opr4["Change_OrgName"].notnull(),"Change_OrgName"]

In [106]:
# opr3: drop Changex3 columns. We've made all necessary changes.
opr4.drop(columns=["Change_InstitutionType","Change_PI","Change_OrgName"],inplace=True)

In [107]:
opr4 = opr4.drop_duplicates(keep="first")

In [108]:
opr4.columns

Index(['InsideGov', 'InstitutionType', 'OrgType', 'PrimaryInstitution',
       'OrgName', 'PI_Index', 'OrgRank', 'P1', 'P2', 'P3', 'LinkToNext_PI',
       'LinkToNext_Org', 'LinkToNext_Year', 'Notes', 'L1_Index', 'L2_Index',
       'L3_Index', 'L4_Index', 'L5_Index', 'Alias_OrgName'],
      dtype='object')

In [109]:
col5.columns

Index(['CareerString', 'CareerDateString_2022', 'IsJob', 'MultipleSubstrings',
       'CareerStartYear', 'CareerStartMonth', 'CareerSubstring', 'OrgString',
       'InstitutionType', 'PrimaryInstitution', 'OrgName', 'Position',
       'Notes'],
      dtype='object')

In [110]:
m3.columns

Index(['CareerString', 'CareerStartYear', 'MultipleSubstrings',
       'CareerSubstring', 'IsJob', 'InstitutionType', 'PrimaryInstitution',
       'OrgName', 'Position', 'LinkToNext_Year', 'Resolution', 'Notes'],
      dtype='object')

In [111]:
# updated tables
print("col5:\t",col5.shape)
print("m3:\t",m3.shape)
print("opr4:\t",opr4.shape)

col5:	 (9002, 13)
m3:	 (2112, 12)
opr4:	 (2392, 20)


### Step 5. using orgtree, update InstitutionType in careerorglink, for those jobs not in mismatch

- careerorglink: change InstitutionType

#### select opr to merge with careerorglink

In [112]:
opr_key_columns = ["PrimaryInstitution","OrgName"]
opr_col_columns = opr_key_columns + ["InstitutionType"]
opr4_merge = opr4[opr_col_columns]
opr4_merge = opr4_merge.drop_duplicates()
opr4_merge.shape

(2349, 3)

In [113]:
opr4_merge[opr4_merge.duplicated(opr_key_columns,keep=False)].shape

(0, 3)

#### edit careerorglink

In [114]:
col6 = col5.merge(opr4_merge,on=opr_key_columns,how="left",suffixes=("","_m"),indicator=True)
merge_results(col6)


Merge Results...

	shape     : (9002, 15)
	left_only : (2372, 15)
	both      : (6630, 15)
	right_only: (0, 15)


In [115]:
col6.columns

Index(['CareerString', 'CareerDateString_2022', 'IsJob', 'MultipleSubstrings',
       'CareerStartYear', 'CareerStartMonth', 'CareerSubstring', 'OrgString',
       'InstitutionType', 'PrimaryInstitution', 'OrgName', 'Position', 'Notes',
       'InstitutionType_m', '_merge'],
      dtype='object')

In [116]:
# out of curiosity, how many InstitutionType don't match?
col6[(col6["InstitutionType"].isnull()) & (col6["InstitutionType_m"].isnull())].shape

(2382, 15)

In [117]:
# only copy over all non-null InstitutionType from orgtree to careerorglink
col6.loc[col6["InstitutionType_m"].notnull(),"InstitutionType"] = col6.loc[col6["InstitutionType_m"].notnull(),"InstitutionType_m"]

In [118]:
# the number of careers without InstitutionType: NotJobs and Uncertains?
col6[(col6["InstitutionType"].isnull()) & (col6["InstitutionType_m"].isnull())].shape

(2382, 15)

# Issue: if Certain, then update InstitutionType in careerorglink & orgtree

In [119]:
# if Uncertain, then change IsJob to False
# if Certain, then update InstitutionType in careerorglink & orgtree
col6[(col6["InstitutionType"].isnull()) & (col6["IsJob"]=="True")].shape

(217, 15)

In [120]:
col6[(col6["InstitutionType"].isnull()) & (col6["IsJob"]=="True")].head(5)

Unnamed: 0,CareerString,CareerDateString_2022,IsJob,MultipleSubstrings,CareerStartYear,CareerStartMonth,CareerSubstring,OrgString,InstitutionType,PrimaryInstitution,OrgName,Position,Notes,InstitutionType_m,_merge
33,"1951.~57. 루마니아 유학 연도미상 당 조직지도부, 과학교육부, 기계공업부 등...",,True,4,1951.0,,당 기계공업부,,,uncertain,uncertain,uncertain,,,left_only
50,은하수관현악단 가수,연도미상,True,1,,,,은하수관현악단,,은하수관혁악단,,가수,,,left_only
71,당 중앙위원회 고문,2019.03,True,1,2019.0,3.0,,노동당 중앙인민위원회,,노동당,UNCERTAIN,고문,,,left_only
106,"1992. 4 인민군 중장, 인민무력성 총참모부 공병국장",,True,1,1992.0,4.0,인민무력성 총참모부 공병국장,인민무력성 총참모부,,uncertain,uncertain,공병국장,,,left_only
131,남포대학 과장,연도미상,True,1,,,,남포대학,,내각,교육위원회_고등교육성_남포대학,과장,,,left_only


In [121]:
# all non-null InstitutionType have been copied 
col6[(col6["InstitutionType"].isnull()) & (col6["InstitutionType_m"].notnull())].shape

(0, 15)

In [122]:
col6 = col6[col5.columns]

In [123]:
# updated tables
print("col6:\t",col6.shape)
print("m3:\t",m3.shape)
print("opr4:\t",opr4.shape)

col6:	 (9002, 13)
m3:	 (2112, 12)
opr4:	 (2392, 20)


### Step 6. using mismatches Position, update orgtree P1, P2, P3

In [124]:
#### select mismatches to merge with orgtree

In [125]:
m3.columns

Index(['CareerString', 'CareerStartYear', 'MultipleSubstrings',
       'CareerSubstring', 'IsJob', 'InstitutionType', 'PrimaryInstitution',
       'OrgName', 'Position', 'LinkToNext_Year', 'Resolution', 'Notes'],
      dtype='object')

In [126]:
opr_key_columns = ["PrimaryInstitution","OrgName"]
m3_org_columns = opr_key_columns + ["Position","Resolution"]
m3pos_merge = m3[m3_org_columns]
m3pos_merge = m3pos_merge.drop_duplicates(keep="first")
m3pos_merge.shape

(799, 4)

In [127]:
# multiple (PI, OrgName) pairs, each with different Positions and Resolution directives (PI, P2, P3)
# how do I merge these with orgtree, without generating duplicate rows?
# I could process them one at a time
# or I could merge them all at the same time and then aggregate P1, P2, P3 position lists using groupby.apply or transform
# the groupby solution may be more elegant, if I only had to transform one column
# but in this case I need to transform 3 columns at the same time
m3pos_merge[m3pos_merge.duplicated(opr_key_columns)].shape

(285, 4)

#### orgtree: update Positions

In [128]:
def update_positions(m,opr_old):
    
    opr = opr_old.copy(deep=True)

    ind_PI = list(m.columns).index("PrimaryInstitution")
    ind_Org = list(m.columns).index("OrgName")
    ind_Pos = list(m.columns).index("Position")
    ind_Res = list(m.columns).index("Resolution")
    # print(ind_PI,ind_Org,ind_Pos,ind_Res)
    
    row_range = range(0,m.shape[0])
    # print(row_range)
    
    for i in row_range:
        
        PI = m.iloc[i,ind_PI]
        Org = m.iloc[i,ind_Org]
        Pos = m.iloc[i,ind_Pos]
        Res = m.iloc[i,ind_Res]
        # print(i,PI,Org,Pos,Res)
        
        # select among P1, P2, P3 
        PosRank = "P1"
        if Res == "2. OrgName & Position ok. Add Position to P2": PosRank = "P2"
        if Res == "3. OrgName & Position ok. Add Position to P3": PosRank = "P3"
        
        # get current Positions
        currPos = list(opr.loc[(opr["PrimaryInstitution"]==PI) & (opr["OrgName"]==Org),PosRank].values)
        PosInCurrPos = Pos in currPos
        newPos = [item.strip() for item in currPos if item not in [np.nan,"nan"]]

        # edit Positions
        if not PosInCurrPos:
            newPos.append(Pos.strip())
            
        print(i,PI,Org,Pos,PosRank,PosInCurrPos,currPos,",".join(newPos))
        
        # update DF with new Positions
        opr.loc[(opr["PrimaryInstitution"]==PI) & (opr["OrgName"]==Org),PosRank] = ",".join(newPos)

    return opr
        

In [129]:
m3pos_merge.sort_values(["PrimaryInstitution","OrgName"],inplace=True)

In [130]:
 opr5 = update_positions(m3pos_merge,opr4)

0 4.15문화창작단 nan 단장 P1 False [] 단장
1 4.15문화창작단 nan 부단장 P2 False [] 부단장
2 7.7연합기업소 nan 기사장 P1 False [] 기사장
3 강동지구탄광연합기업소 nan 지배인 P1 False [] 지배인
4 강서구역청산협동농장 nan 관리위원장 P1 False [] 관리위원장
5 강선제강소 nan 책임비서 P1 False [] 책임비서
6 강원도인민위원회 nan 부위원장 P2 False [] 부위원장
7 강원도인민위원회 nan 위원장 P1 False [] 위원장
8 강원도임업연합기업소 nan 지배인 P1 False [] 지배인
9 개성무역총회사 nan 총사장 P1 False [] 총사장
10 개성방직공장 3대혁명소조 소조원 P3 False [nan] 소조원
11 개성시농촌경리위원회 nan 위원장 P1 False [] 위원장
12 개성시인민위원회 nan 부위원장 P2 False [] 부위원장
13 개성시인민위원회 nan 위원장 P1 False [] 위원장
14 개천군협동농장 nan 관리위원장 P1 False [] 관리위원장
15 공군사령부 제3전단 부사령관 P2 False [nan] 부사령관
16 공군사령부 제3전단 사령관 P1 False [nan] 사령관
17 공군사령부 nan 부사령관 P2 False [] 부사령관
18 공군사령부 nan 사령관 P1 False [] 사령관
19 공군사령부 nan 정치위원 P3 False [] 정치위원
20 공군사령부 nan 참모장 P1 False [] 참모장
21 공산청년동맹 nan 위원장 P1 False [] 위원장
22 공산청년동맹 nan 서기 P3 False [] 서기
23 과학백과사전출판사 nan 사장 P1 False [] 사장
24 과학백과사전출판사 nan 책임주필 P1 False [] 책임주필
25 광명성경제연합회 nan 회장 P1 False [] 회장
26 구성공작기계공장 nan 지배인 P1 False [] 지배인
27 국가개발은행 nan 이사장 P1 False

261 노동당 당중앙위원회 총비서 P1 False [nan] 총비서
262 노동당 당중앙위원회_39호실 부실장 P2 False [nan] 부실장
263 노동당 당중앙위원회_39호실 제1부부장 P2 False ['부실장'] 부실장,제1부부장
264 노동당 당중앙위원회_X위원회 위원장 P1 False [nan] 위원장
265 노동당 당중앙위원회_X위원회 부위원장 P2 False [nan] 부위원장
266 노동당 당중앙위원회_X위원회 비서 P2 False ['부위원장'] 부위원장,비서
267 노동당 당중앙위원회_간부부 부장 P1 True ['부장'] 부장
268 노동당 당중앙위원회_검열위원회 위원장 P1 True ['위원장'] 위원장
269 노동당 당중앙위원회_검열위원회 제1부위원장 P2 False ['부위원장'] 부위원장,제1부위원장
270 노동당 당중앙위원회_경공업부 부장 P1 True ['부장'] 부장
271 노동당 당중앙위원회_경제부 부장 P1 True ['부장'] 부장
272 노동당 당중앙위원회_경제정책검열부 부장 P1 True ['부장'] 부장
273 노동당 당중앙위원회_공업부 부장 P1 False [nan] 부장
274 노동당 당중앙위원회_과학교육부 부장 P1 True ['부장'] 부장
275 노동당 당중앙위원회_과학교육부 부부장 P2 False ['제1부부장'] 제1부부장,부부장
276 노동당 당중앙위원회_과학교육부 지도원 P3 False [nan] 지도원
277 노동당 당중앙위원회_과학교육부_X과 과장 P1 False [nan] 과장
278 노동당 당중앙위원회_국제부 부장 P1 True ['부장'] 부장
279 노동당 당중앙위원회_국제부 지도원 P3 False [nan] 지도원
280 노동당 당중앙위원회_국제부_X과 과장 P1 False [nan] 과장
281 노동당 당중앙위원회_군사부 부장 P1 True ['부장'] 부장
282 노동당 당중앙위원회_군사위원회 위원 P3 False [nan] 위원
283 노동당 당중앙위원회_군수공업부 제1부부장 P2

608 중앙인민위원회 경제정책위원회 위원 P3 False [nan] 위원
609 중앙인민위원회 대외정책위원회 위원 P3 False [nan] 위원
610 중앙인민위원회 법제위원회 위원장 P1 False [nan] 위원장
611 중앙인민위원회 인민무력부 부부장 P2 False [nan] 부부장
612 중앙인민위원회 인민무력부C 부부장 P2 False [nan] 부부장
613 중앙인민위원회 인민무력부C 제1부부장 P2 False ['부부장'] 부부장,제1부부장
614 중앙인민위원회 인민무력부C 부장 P1 False [nan] 부장
615 중앙인민위원회 인민무력부C_공병국 국장 P1 False [nan] 국장
616 중앙인민위원회 인민무력부C_군수동원총국 국장 P1 False [nan] 국장
617 중앙인민위원회 인민무력부C_보위국 국장 P1 False [nan] 국장
618 중앙인민위원회 인민무력부C_후방총국 총국장 P1 False [nan] 총국장
619 중앙인민위원회 인민무력부_X국 부국장 P2 False [nan] 부국장
620 중앙인민위원회 인민무력부_간부국 국장 P1 False [nan] 국장
621 중앙인민위원회 인민무력부_보위사령부 사령관 P1 False [nan] 사령관
622 중앙인민위원회 인민무력부_정찰국 국장 P1 False [nan] 국장
623 중앙인민위원회 인민무력부_후방총국 총국장 P1 False [nan] 총국장
624 중앙인민위원회 주석부 책임서기 P3 False [nan] 책임서기
625 중앙인민위원회 nan 국가부주석 P2 False [] 국가부주석
626 중앙인민위원회 nan 부주석 P2 False [] 부주석
627 중앙인민위원회 nan 수위 P1 False [] 수위
628 중앙인민위원회 nan 부장 P1 False [] 부장
629 중앙인민위원회 nan 위원 P3 False [] 위원
630 중앙인민위원회 nan 제1부주석 P2 False [] 제1부주석
631 중앙인민위원회 nan 주석 P1 False [] 주석
632 

In [131]:
opr4[(opr4["PrimaryInstitution"]=="국방위원회") & (opr4["OrgName"]=="인민무력부D")]

Unnamed: 0,InsideGov,InstitutionType,OrgType,PrimaryInstitution,OrgName,PI_Index,OrgRank,P1,P2,P3,LinkToNext_PI,LinkToNext_Org,LinkToNext_Year,Notes,L1_Index,L2_Index,L3_Index,L4_Index,L5_Index,Alias_OrgName
2086,1,인민군,,국방위원회,인민무력부D,501.502.0.0.0,1,부장,부부장,,,,2016,,501,502,0,0,0,


In [132]:
opr5[(opr5["PrimaryInstitution"]=="국방위원회") & (opr5["OrgName"]=="인민무력부D")]

Unnamed: 0,InsideGov,InstitutionType,OrgType,PrimaryInstitution,OrgName,PI_Index,OrgRank,P1,P2,P3,LinkToNext_PI,LinkToNext_Org,LinkToNext_Year,Notes,L1_Index,L2_Index,L3_Index,L4_Index,L5_Index,Alias_OrgName
2086,1,인민군,,국방위원회,인민무력부D,501.502.0.0.0,1,부장,"부부장,제1부부장",,,,2016,,501,502,0,0,0,


In [133]:
opr4.shape

(2392, 20)

In [134]:
opr5.shape

(2392, 20)

In [135]:
# drop Resolution from mismatches; actually, it doesn't matter, because at this point, w 

In [136]:
# updated tables
print("Prior to this Step")
print("col6:\t",col6.shape)
print("m3:\t",m3.shape)
print("opr4:\t",opr4.shape)

print("")

print("After to this Step")
print("col6:\t",col6.shape)
print("m3:\t",m3.shape)
print("opr5:\t",opr5.shape)

Prior to this Step
col6:	 (9002, 13)
m3:	 (2112, 12)
opr4:	 (2392, 20)

After to this Step
col6:	 (9002, 13)
m3:	 (2112, 12)
opr5:	 (2392, 20)


# Format & Export Tables

#### format careerorglink

In [137]:
careerorglink_columns = ['CareerString', 'CareerDateString_2022',
                         'IsJob', 'MultipleSubstrings','CareerStartYear', 'CareerStartMonth',
                         'CareerSubstring','InstitutionType', 'PrimaryInstitution', 'OrgName', 'Position', 'Notes']

In [138]:
# check that col has all necessary columns
[item for item in careerorglink_columns if item not in col6.columns]

[]

In [139]:
# remove these items
[item for item in col6.columns if item not in careerorglink_columns]

['OrgString']

In [140]:
col6 = col6[careerorglink_columns]

#### export careerorglink to cleaning

In [141]:
filename_careerorglink_new = "3.0 careerorglink.xlsx"
col6.to_excel(path_cleaning + subpath_2_7 + filename_careerorglink_new,index=False)

#### export careerorglink to tables. write over the old one.

In [142]:
filename_careerorglink = "careerorglink.xlsx"
col6.to_excel(path_tables + filename_careerorglink,index=False)

#### export mismatches to cleaning

In [143]:
filename_mismatch_new = "3.0 mismatch_통합.xlsx"
m3.to_excel(path_cleaning + subpath_2_7 + filename_mismatch_new,index=False)

#### format orgtree

In [144]:
orgtree_columns = ['InstitutionType', 'OrgType', 'PrimaryInstitution','OrgName',
                   'PI_Index', 'OrgRank', 'P1', 'P2', 'P3','Alias_OrgName',
                   'LinkToNext_PI','LinkToNext_Org','LinkToNext_Year','Notes',
                   'L1_Index', 'L2_Index','L3_Index', 'L4_Index', 'L5_Index']

In [145]:
# check that orgtree has all necessary columns
[item for item in orgtree_columns if item not in opr5.columns]

[]

In [146]:
# remove these items
[item for item in opr5.columns if item not in orgtree_columns]

['InsideGov']

In [147]:
opr5 = opr5[orgtree_columns]

In [148]:
# update indices & ranks
opr5 = update_opr_index(opr5,6)

#### export orgtree to cleaning

In [149]:
filename_orgtree_new = "3.0 orgtree.xlsx"
opr5.to_excel(path_cleaning + subpath_2_7 + filename_orgtree_new,index=False)

#### export orgtree to tables

In [150]:
filename_orgtree = "orgtree.xlsx"
opr5.to_excel(path_tables + filename_orgtree,index=False)

- Review Integrity Checks & Edit Mismatches
    - Validate InstitutionType, OrgType
    - Data in Careers but not in Orgtree, including Aliases)
    - Orgs in Orgtree but not in Careers or 기관별인명록
    - For 200 & 500 series PrimaryInstitutions, search whether they are contained in OrgName, within a PrimaryInstitution
    - Verify no Positions overlapping within the same PI, OrgName
    - Verify that all (CareerString,CareerDateString,CareerSubstring) are uniquely matched to a (PI, OrgName). Not multiple encodings

- Recode & Recategorize some (PI,OrgName)
    - 중앙위원회
    - 인민군, 총참모부, 인민무력부, 
    - 국방위원회: 인민군?
    - 중앙인민위원회: 정권기관
    - 국제친선기관: I will add and match existing ones
    - 인민회의 or 인민위원회?
    - 노동당 총정치국 --> 인민군 총정치국
    - add 북조선노동당 as an alias to 노동당
    - alias the PrimaryInstitutions, 내각, 내각A, 내각B, 등 at least when calculating whether transitions are between or across institutions
    - 노동당 - fix the old party organization
        - https://encykorea.aks.ac.kr/Article/E0070188
        - 1947년 북조선로동당 ‘5과’로 창설되었다. 6·25전쟁기 남한지역 내 게릴라부대의 지하당 공작 과정에서 대남 공작기구를 확대하면서 당 조직부 내의 연락부를 분리했다. 1961년 4차 당대회에서 남한에서의 지하당 조직 확대와 통일전선 강화 등의 결정과 함께 내무성 등의 대남 공작기구를 통합해 중앙당 ‘연락국’을 신설했다. 1964년 2월 조선로동당 중앙위원회 4기 8차 전원회의에서 ‘3대 혁명력량 강화’노선을 채택하면서 종래의 연락국을 ‘대남사업총국’으로 개칭했다.
        - 1966년 10월 12일 당 중앙위원회 4기 14차 전원회의에서 비서국이 신설되면서 대남사업담당비서 산하에 ‘연락부’, ‘문화부’, ‘조사부’, ‘인민무력부정착국’, ‘조총련’부서를 두었으며, 1978년 1월 통일전선부가, 1990년대 후반 35호실(대외정보조사부)과 작전부가 신설됨으로써 대남 담당 부서의 체계가 구축되었다. 남한 내 정당, 사회단체, 군부에 대한 공작 거점 및 공작 전술 연구개발 업무를 수행하고 있다. 특히 간첩교육과 파견을 직접 담당하고 있으며, 조총련에 대한 활동 지도도 담당하고 있다.
        
    - 내각 & 정무원
        - How different was their organization?
        - https://encykorea.aks.ac.kr/Article/E0066727 - Let's alias. It seems like the name simply changed from 내각 (1948) --> 정무원 (1972) --> 내각 (1998)
        - Do we need to duplicate the entire 내각 structure for 정무원?
        - At the very least, should we swap out 내각 for 정무원 in cases where data are labelled as 내각 but during 정무원 periods (pre-1998)?
        - If the internal organization was quite similar, then maybe we could just alias 내각 and 정무원 and not worry about replicating the organization three
    - 인도기관?
        - 국방위원회
        - 국무위원회
    - resolve differences between InstitutionType, PrimaryInstitution, OrgName
        - for PI=총참모부, rename PI=인민군, OrgName = 총참모부_; and remove duplicates. e.g., 총참모부_작전총국
        - 노동당, 당중앙위원회
        - many organizations for whom 중앙위원회 is their main leadership body
        - 인민군. remove as PrimaryInstitution, and use as InstitutionType instead
        - https://encykorea.aks.ac.kr/Article/E0066687
        - 인민무력부, 인민무력성, 
		- PrimaryInstitutions
			- 최고사령부
			- 총참모부
			- 주요 사령부
			- 등
    - Make sure X.0.0.0 positions are not being duplicated with X.1.0.0 positions, especially in the case of 중앙위원회
    - Discuss X in PI_Index
        - 비서국A --> 정무국 --> 비서국B
            - check for duplicates: 당 비서국
        - 당중앙위원회_정치위원회 (폐지) 직급
        - 당중앙위원회_총정치국 - 당/인민군
        - 당중앙위원회_조선혁명박물관당위원회 & other 1.1.X.0.0 - double-check if they are included in 외곽단 - InGov?
        - Change_OrgName to 당중앙위원회_X
            - 비서국_인민무력부B
            - 비서국_인민무력부B_보위국
            - 비서국_인민무력부B_작전국
            - 비서국_인민무력부B_정치안전국





- Re-Index
    - See PI-Index Coding Rules in Evernote
    - Update X codes with 200 or 500. Delete any 200/500 which don't appear in our data. (They might have been orgs we corrected in the data.)
    - 최고인민회의 X지역인민회의 - move from 200 Series to 0 Series
    - Drop X지역위원회 and recode 200 Series to 0 Series
    - Change coding of 당외곽 및 사회단체
        - if the orgs were found in the 별책, then change their codes from 200 series to 100 series
        - Match, rename and recode (from 500+ to 100+) 500 series 국제친선기관
    - code data source rather than index



# Task 3. Add OrgType & other metadata

- Add OrgType


# Future Tasks

- Future Integrity Checks & Data Cleaning
    - Identify inconsistencies in Position Rankings (e.g., 명예위원장)
    - Verify no contrary categories of positions overlapping (e.g., 위원장, 소장)
    - Verify (PI, OrgName) not found in 기관별인명록
    - Research X entries: e.g., 3.525.591: 내각_외무성_X국_대사관
    - Reconcile 기관 across three categories: 기타기관, 당외곽및사회단체, 당외곽및사회단체(별책)
    - Add more orgs
        - Expand any X지역위원회 into full list
        - 내각 - Level 2 or Level 3?
            - orgs after 내각 국토환경보호성_산림총국
            - orgs after 내각 대외경제성_민족경제협력위원회
            - orgs after 내각 보건성_중앙의약품관리소
            - orgs after 내 각 상업성_중앙도매소
            - 내각 정보산업성 우편국, 전화국, 체신관리국, 체신소, 
            - 내각 (45)
        - 당외곽및사회단체 (별책)
    - Discussion about matching levels
        - https://namu.wiki/w/ - consider the OrgRang of 김정은의 겸직
		- 조선민주주의인민공화국 국무위원회 위원장[국가원수]
		- 조선로동당 중앙군사위원장
		- 조선민주주의인민공화국무력 최고사령관
		- 조선로동당 중앙위원회 정치국 상무위원
    - Review UNCERTAIN OrgName & Positions
        - some of these seem resolvable