In [1]:
import pandas as pd
import numpy as np
from datetime import date
from statistics import mean, mode
from pandas.api.types import CategoricalDtype
pd.set_option('display.max_rows', None)

In [2]:
today = date.today()
print(today)

2023-10-30


# Cleaning - Career

In [3]:
path_cleaning_career = "C:/Users/seoul/Dropbox/00 technical/github/nkelites/data/combined data/combined data - 1 cleaning/cleaning step 2 - career/"

In [4]:
subpath_2_8 = "2.8 orgtree local/"

# Tables

In [5]:
path_tables = "C:/Users/seoul/Dropbox/00 technical/github/nkelites/data/combined data/combined data - 2 tables/"

In [6]:
# tables
filename_careerorglink = "careerorglink.xlsx"
filename_leadercareerlink = "leadercareerlink.xlsx"
filename_orgtree = "orgtree.xlsx"
filename_elected = "positions_elected.xlsx"

In [9]:
# col = pd.read_excel(path_tables + filename_careerorglink,dtype="str")
# col.shape

(9002, 12)

In [10]:
# col.columns

Index(['CareerString', 'CareerDateString_2022', 'IsJob', 'MultipleSubstrings',
       'CareerStartYear', 'CareerStartMonth', 'CareerSubstring',
       'InstitutionType', 'PrimaryInstitution', 'OrgName', 'Position',
       'Notes'],
      dtype='object')

In [11]:
# lcl = pd.read_excel(path_tables + filename_leadercareerlink,dtype="str")
# lcl.shape

(12617, 3)

In [15]:
# lcl.columns

In [16]:
org = pd.read_excel(path_tables + filename_orgtree,dtype="str")
org.shape

(2368, 19)

In [17]:
org.columns

Index(['InstitutionType', 'OrgType', 'PrimaryInstitution', 'OrgName',
       'PI_Index', 'OrgRank', 'P1', 'P2', 'P3', 'Alias_OrgName',
       'LinkToNext_PI', 'LinkToNext_Org', 'LinkToNext_Year', 'Notes',
       'L1_Index', 'L2_Index', 'L3_Index', 'L4_Index', 'L5_Index'],
      dtype='object')

In [19]:
# elected = pd.read_excel(path_tables + filename_elected,dtype="str")
# elected.shape

In [20]:
# elected.columns

# Queries

In [21]:
path_queries = "C:/Users/seoul/Dropbox/00 technical/github/nkelites/data/combined data/combined data - 3 queries/"

In [22]:
# jobs and job transitions, elected & unelected, in and out of gov
filename_leaderjob_electUnelect_inOutgov = "leaderjob_electUnelect_inOutgov.xlsx"
filename_leaderjobtransition_electUnelect_inOutgov = "leaderjobtransition_electUnelect_inOutgov.xlsx"

filename_leaderjob_no_spa = "leaderjob_no_spa.xlsx"
filename_leaderjobtransition_electUnelect_no_spa = "leaderjobtransition_no_spa.xlsx"

In [25]:
# jobs, certain, no SPA
# jobs = pd.read_excel(path_queries + filename_leaderjob_no_spa,dtype="str")
# jobs.shape

In [26]:
# transitions, certain, no SPA
# trans = pd.read_excel(path_queries + filename_leaderjobtransition_electUnelect_no_spa,dtype="str")
# trans.shape

# Functions

In [27]:
def merge_results(m):
    
    print("\nMerge Results...")
    print("")
    print("\tshape     :",m.shape)
    print("\tleft_only :",m[m["_merge"]=="left_only"].shape)
    print("\tboth      :",m[m["_merge"]=="both"].shape)
    print("\tright_only:",m[m["_merge"]=="right_only"].shape)

In [28]:
# using this on (PI,OrgName) will ensure unique & non-null keys
# using this on a larger df will ensure unique rows and non-null keys, but not unique keys

def unique_non_null_rows(olddf):
    
    df = olddf.copy()
    
    
    ### drop duplicates
    df = df.drop_duplicates(keep="first",ignore_index=True)

    ### drop null rows
    df.dropna(how="all",axis=0,inplace=True)
    
    # drop rows with null PI
    df = df[~(df["PrimaryInstitution"].isna())]
        
    ### drop rows where PI contains stop words
    stop_words_lower = ["uncertain","current","deprecated","please_revise"]
    df = df[ ~ df["PrimaryInstitution"].str.lower().isin(stop_words_lower)]
            

    print("\nUnique Non-null Rows...")
    print("")
    print("\tNon-unique rows:",olddf.shape)
    print("\tUnique rows    :",df.shape)

    ### sort
    df = df.sort_values(["PrimaryInstitution","OrgName"])
    
    return df

In [29]:
def create_time_series(series,group_var,count_var):
    
    yeardist = series.groupby(group_var,as_index=False).count().sort_values(group_var)
    
    max_year = max(series[group_var])
    x = pd.DataFrame({"year":pd.Series(range(min(yeardist[group_var].astype(int)),max(yeardist[group_var].astype(int))+1))})
    
    yeardist[group_var] = yeardist[group_var].astype(int,errors="ignore")
    
    ts = x.merge(yeardist,left_on="year",right_on=group_var,how="left")
    ts.loc[ts[count_var].isna(),count_var]=0
    # merge_results(ts)
    
    ts_columns = ["year",count_var]
    ts = ts[ts_columns]
    
    return ts

# Orgs_local - prep new table

### military & ngo; party & gov: InstitutionType, PrimaryInstitution, OrgName

In [30]:
org.InstitutionType.unique()

array(['노동당', '당외곽및사회단체_경제부문(별책)', '당외곽및사회단체_사회부문(별책)', '정권기관',
       '당외곽및사회단체_대외부문', '당외곽및사회단체_사회부문', '인민군', '당외곽및사회단체_근로단체',
       '당외곽및사회단체_경제부문', '당외곽및사회단체_체육부문', '당외곽및사회단체_정치부문', '당외곽및사회단체_종교부문',
       'UNCERTAIN', '국제친선단체'], dtype=object)

In [31]:
org.shape

(2368, 19)

In [32]:
institutiontype_split = ["노동당","정권기관"]

In [33]:
# military & ngo
org_military_ngo = org[~org.InstitutionType.isin(institutiontype_split)]
org_military_ngo.shape

(911, 19)

In [34]:
# party & gov
org_party_gov = org[org.InstitutionType.isin(institutiontype_split)]
org_party_gov.shape

(1457, 19)

In [38]:
# add Local field
org_military_ngo["Local"] = 0
org_party_gov["Local"] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  org_military_ngo["Local"] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  org_party_gov["Local"] = 0


### export military & ngo PI,OrgName

In [39]:
filename_org_military_ngo = "org_military_ngo.xlsx"
filename_org_party_gov = "org_party_gov.xlsx"

In [40]:
# export military & ngo list for manual editing
org_military_ngo.to_excel(path_cleaning_career + subpath_2_8 + filename_org_military_ngo,index=False)

In [41]:
# export party & gov list for manual editing
org_party_gov.to_excel(path_cleaning_career + subpath_2_8 + filename_org_party_gov,index=False)

### import manually edited files

In [148]:
# filenames of manually edited org_local files
filename_org_military_ngo_edited = "org_military_ngo_edited1_Esther.xlsx"
filename_org_party_gov_edited = "org_party_gov_edited1_Jacob.xlsx"

In [149]:
# import manually edited org_local files
org_military_ngo2 = pd.read_excel(path_cleaning_career + subpath_2_8 + filename_org_military_ngo_edited)

In [150]:
# re-code Local to T/F
org_military_ngo2.loc[org_military_ngo2.Local.isna(),"Local"]=False
org_military_ngo2.loc[org_military_ngo2.Local == 0,"Local"]=False
org_military_ngo2.loc[org_military_ngo2.Local == 1,"Local"]=True
org_military_ngo2.Local.unique()

array([True, False], dtype=object)

In [151]:
org_party_gov2 = pd.read_excel(path_cleaning_career + subpath_2_8 + filename_org_party_gov_edited)

In [152]:
# re-code Local to T/F
org_party_gov2.loc[org_party_gov2.Local.isna(),"Local"]=False
org_party_gov2.loc[org_party_gov2.Local == 0,"Local"]=False
org_party_gov2.loc[org_party_gov2.Local == 1,"Local"]=True
org_party_gov2.Local.unique()

array([False, True], dtype=object)

### concatenate and export as a table 

In [153]:
# extract relevant fields and concatenate
local_columns = ["InstitutionType","PrimaryInstitution","OrgName","Local"]
org_military_ngo3 = org_military_ngo2[local_columns]
org_party_gov3 = org_party_gov2[local_columns]

In [154]:
org_military_ngo3.head(2)

Unnamed: 0,InstitutionType,PrimaryInstitution,OrgName,Local
0,UNCERTAIN,7.7연합기업소,,True
1,UNCERTAIN,강서구역청산엽동농장,,True


In [155]:
org_party_gov3.head()

Unnamed: 0,InstitutionType,PrimaryInstitution,OrgName,Local
0,노동당,노동당,당중앙검사위원회,False
1,노동당,노동당,당중앙군사위원회,False
2,노동당,노동당,당중앙위원회,False
3,노동당,노동당,당중앙위원회_38호실,False
4,노동당,노동당,당중앙위원회_39호실,False


In [156]:
org_local = pd.concat([org_party_gov3,org_military_ngo3])

In [157]:
# export org_local as a Table
org_local.to_excel(path_cleaning_career + subpath_2_8 + filename_org_local,index=False)

### merge org_local field to orgtree

In [158]:
org_local.head()

Unnamed: 0,InstitutionType,PrimaryInstitution,OrgName,Local
0,노동당,노동당,당중앙검사위원회,False
1,노동당,노동당,당중앙군사위원회,False
2,노동당,노동당,당중앙위원회,False
3,노동당,노동당,당중앙위원회_38호실,False
4,노동당,노동당,당중앙위원회_39호실,False


In [159]:
org_local.groupby("Local").count()

Unnamed: 0_level_0,InstitutionType,PrimaryInstitution,OrgName
Local,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,2099,2099,1595
True,269,269,165


In [160]:
org_local.shape

(2368, 4)

In [161]:
org_party_gov3.shape

(1457, 4)

In [162]:
org_military_ngo3.shape

(911, 4)

In [163]:
1457 + 911

2368

In [164]:
2099 + 269

2368

In [168]:
org_key = ["InstitutionType","PrimaryInstitution","OrgName"]
org2 = org.merge(org_local,on=org_key,how="left",indicator=False)
# org2 = org.merge(org_local,on=org_key,how="left",indicator=True)
# merge_results(org2)

In [169]:
org2.head()

Unnamed: 0,InstitutionType,OrgType,PrimaryInstitution,OrgName,PI_Index,OrgRank,P1,P2,P3,Alias_OrgName,LinkToNext_PI,LinkToNext_Org,LinkToNext_Year,Notes,L1_Index,L2_Index,L3_Index,L4_Index,L5_Index,Local
0,노동당,,노동당,,1.0.0.0.0.0,0,"위원장,책임비서,총비서",제1비서,,,Current,Current,Current,,1,0,0,0,0,False
1,당외곽및사회단체_경제부문(별책),,신포원양수산연합기업소,,1.0.0.0.0.0,0,"지배인,당책임비서","기사장,당비서",,,,,,,1,0,0,0,0,True
2,당외곽및사회단체_경제부문(별책),,태천3호발전소,,1.0.0.0.0.0,0,소장,부소장,,,,,,,1,0,0,0,0,True
3,당외곽및사회단체_경제부문(별책),,평양곡산공장,,1.0.0.0.0.0,0,"지배인,당책임비서","기사장,당비서",부기사장,,,,,,1,0,0,0,0,False
4,당외곽및사회단체_경제부문(별책),,강계고려약공장,,1.0.0.0.0.0,0,지배인,,,,,,,,1,0,0,0,0,True


In [170]:
org2.shape

(2368, 20)

In [171]:
# update orgtree Table
# org.to_excel(path_tables + filename_orgtree,index=False)