In [1]:
import pandas as pd
import numpy as np
from datetime import date
from statistics import mean, mode
from pandas.api.types import CategoricalDtype

In [2]:
today = date.today()
print(today)

2023-10-05


# Tables

In [3]:
path_tables = "C:/Users/seoul/Dropbox/00 technical/github/nkelites/data/combined data/combined data - 2 tables/"

In [4]:
# tables
filename_careerorglink = "careerorglink.xlsx"
filename_leadercareerlink = "leadercareerlink.xlsx"
filename_orgtree = "orgtree.xlsx"
filename_elected = "positions_elected.xlsx"

In [5]:
col = pd.read_excel(path_tables + filename_careerorglink,dtype="str")
col.shape

(9002, 12)

In [6]:
col.columns

Index(['CareerString', 'CareerDateString_2022', 'IsJob', 'MultipleSubstrings',
       'CareerStartYear', 'CareerStartMonth', 'CareerSubstring',
       'InstitutionType', 'PrimaryInstitution', 'OrgName', 'Position',
       'Notes'],
      dtype='object')

In [7]:
lcl = pd.read_excel(path_tables + filename_leadercareerlink,dtype="str")
lcl.shape

(12617, 3)

In [8]:
lcl.columns

Index(['LeaderID', 'CareerString', 'CareerDateString_2022'], dtype='object')

In [9]:
org = pd.read_excel(path_tables + filename_orgtree,dtype="str")
org.shape

(2368, 19)

In [10]:
org.columns

Index(['InstitutionType', 'OrgType', 'PrimaryInstitution', 'OrgName',
       'PI_Index', 'OrgRank', 'P1', 'P2', 'P3', 'Alias_OrgName',
       'LinkToNext_PI', 'LinkToNext_Org', 'LinkToNext_Year', 'Notes',
       'L1_Index', 'L2_Index', 'L3_Index', 'L4_Index', 'L5_Index'],
      dtype='object')

In [11]:
elected = pd.read_excel(path_tables + filename_elected,dtype="str")
elected.shape

(105, 4)

In [12]:
elected.columns

Index(['PrimaryInstitution', 'OrgName', 'Position', 'IsElected'], dtype='object')

# Queries

In [13]:
path_queries = "C:/Users/seoul/Dropbox/00 technical/github/nkelites/data/combined data/combined data - 3 queries/"

In [14]:
filename_leaderjobtransition = "leaderjobtransition_alljobs.xlsx"
# filename_leaderjobtransition = "leaderjobtransition_ingov.xlsx"

In [15]:
trans = pd.read_excel(path_queries + filename_leaderjobtransition,dtype="str")
trans.shape

(2868, 27)

#### change datatypes

In [16]:
trans = trans.astype({"OrgAdvance":"int","PositionAdvance":"int","CareerStartYear_1":"int"})
trans.dtypes

LeaderID                   object
CareerString_1             object
CareerDateString_2022_1    object
CareerStartYear_1           int32
CareerStartMonth_1         object
CareerStartDate_1          object
CareerSubstring_1          object
InstitutionType_1          object
PrimaryInstitution_1       object
OrgName_1                  object
Position_1                 object
OrgRank_1                  object
PositionRank_1             object
CareerString_2             object
CareerDateString_2022_2    object
CareerStartYear_2          object
CareerStartMonth_2         object
CareerStartDate_2          object
CareerSubstring_2          object
InstitutionType_2          object
PrimaryInstitution_2       object
OrgName_2                  object
Position_2                 object
OrgRank_2                  object
PositionRank_2             object
OrgAdvance                  int32
PositionAdvance             int32
dtype: object

#### filter out UNCERTAIN - I thought I already did this

In [17]:
certain_criteria = (trans["InstitutionType_1"]!="UNCERTAIN") & (trans["InstitutionType_2"]!="UNCERTAIN")
trans = trans[certain_criteria]
trans.shape

(2774, 27)

# Functions

In [18]:
def merge_results(m):
    
    print("\nMerge Results...")
    print("")
    print("\tshape     :",m.shape)
    print("\tleft_only :",m[m["_merge"]=="left_only"].shape)
    print("\tboth      :",m[m["_merge"]=="both"].shape)
    print("\tright_only:",m[m["_merge"]=="right_only"].shape)

In [19]:
# using this on (PI,OrgName) will ensure unique & non-null keys
# using this on a larger df will ensure unique rows and non-null keys, but not unique keys

def unique_non_null_rows(olddf):
    
    df = olddf.copy()
    
    
    ### drop duplicates
    df = df.drop_duplicates(keep="first",ignore_index=True)

    ### drop null rows
    df.dropna(how="all",axis=0,inplace=True)
    
    # drop rows with null PI
    df = df[~(df["PrimaryInstitution"].isna())]
        
    ### drop rows where PI contains stop words
    stop_words_lower = ["uncertain","current","deprecated","please_revise"]
    df = df[ ~ df["PrimaryInstitution"].str.lower().isin(stop_words_lower)]
            

    print("\nUnique Non-null Rows...")
    print("")
    print("\tNon-unique rows:",olddf.shape)
    print("\tUnique rows    :",df.shape)

    ### sort
    df = df.sort_values(["PrimaryInstitution","OrgName"])
    
    return df

In [20]:
def create_time_series(series,group_var,count_var):
    
    yeardist = series.groupby(group_var,as_index=False).count().sort_values(group_var)
    
    max_year = max(series[group_var])
    x = pd.DataFrame({"year":pd.Series(range(min(yeardist[group_var].astype(int)),max(yeardist[group_var].astype(int))+1))})
    
    yeardist[group_var] = yeardist[group_var].astype(int,errors="ignore")
    
    ts = x.merge(yeardist,left_on="year",right_on=group_var,how="left")
    ts.loc[ts[count_var].isna(),count_var]=0
    # merge_results(ts)
    
    ts_columns = ["year",count_var]
    ts = ts[ts_columns]
    
    return ts

# Analysis - Research Note

In [21]:
path_analysis = "C:/Users/seoul/Dropbox/00 technical/github/nkelites/analysis/"

In [22]:
# analysis sub-paths
study0_path = "2023.10.04 Study 0 - research note/"
study1_path = "2023.10.04 Study 1 - political capital/"
study2_path = "2023.10.04 Study 2 - commitment vs control/"
study3_path = "2023.10.04 Study 3 - reds vs experts/"

### leaders starting new jobs by year

In [23]:
lcl.head(2)

Unnamed: 0,LeaderID,CareerString,CareerDateString_2022
0,리선권,개성공단 남북공동위원회 통행통신통관 분과위원회,2013.09 ~ 2014.01
1,조경철,故 김정일 국가장의위원회 위원,2011.12


In [24]:
col.head(2)

Unnamed: 0,CareerString,CareerDateString_2022,IsJob,MultipleSubstrings,CareerStartYear,CareerStartMonth,CareerSubstring,InstitutionType,PrimaryInstitution,OrgName,Position,Notes
0,4.15문학창작단 단장,1989.04,True,1,1989,4,,당외곽및사회단체_사회부문(별책),4.15문화창작단,,단장,
1,"1989. 4.15문학창작단 단장, 조선작가동맹 통일문학담당 부위원장",,True,2,1989,4,4.15 문화창작단 단장,당외곽및사회단체_사회부문(별책),4.15문화창작단,,단장,


In [25]:
jobs = col[(col.IsJob=="True") & (col.InstitutionType!="UNCERTAIN") & (col.CareerStartYear.notna())]
jobs.shape

(6173, 12)

In [26]:
elected_key = ["PrimaryInstitution","OrgName","Position"]
jobs2 = jobs.merge(elected,on=elected_key,how="left")
# merge_results(jobs2)
jobs2.shape

(6173, 13)

In [27]:
jobs2.loc[jobs2.IsElected.isna(),"IsElected"] = 0
jobs2.groupby(["IsElected"]).count()

Unnamed: 0_level_0,CareerString,CareerDateString_2022,IsJob,MultipleSubstrings,CareerStartYear,CareerStartMonth,CareerSubstring,InstitutionType,PrimaryInstitution,OrgName,Position,Notes
IsElected,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,4494,2398,4494,4494,4494,3930,763,4494,4494,3126,4494,102
1,1679,947,1679,1679,1679,1667,869,1679,1679,1108,1679,2


In [28]:
lj_key = ["CareerString","CareerDateString_2022"]
leader_jobs = lcl.merge(jobs2,how="inner",on=lj_key,indicator=True)
merge_results(leader_jobs)


Merge Results...

	shape     : (8767, 15)
	left_only : (0, 15)
	both      : (8767, 15)
	right_only: (0, 15)


In [29]:
ly_columns = ["LeaderID","CareerStartYear","IsElected"]
leader_year_all = leader_jobs[ly_columns].drop_duplicates(keep="first")
leader_year_all.shape

(4533, 3)

In [30]:
leader_year_all.head(2)

Unnamed: 0,LeaderID,CareerStartYear,IsElected
0,리원일,1999,0
1,조용원,2020,1


In [31]:
leader_year_noelect = leader_year_all[leader_year_all["IsElected"]==0].drop_duplicates(keep="first")
leader_year_noelect.shape

(2458, 3)

### time series

In [32]:
# not elected + elected
leader_year_all_ts = create_time_series(leader_year_all,group_var="CareerStartYear",count_var="LeaderID")

In [33]:
leader_year_all_ts

Unnamed: 0,year,LeaderID
0,1937,1.0
1,1938,0.0
2,1939,0.0
3,1940,0.0
4,1941,0.0
...,...,...
81,2018,52.0
82,2019,264.0
83,2020,46.0
84,2021,184.0


In [34]:
# not elected only
leader_year_noelect_ts = create_time_series(leader_year_noelect,"CareerStartYear","LeaderID")

In [35]:
leader_year_noelect_ts

Unnamed: 0,year,LeaderID
0,1937,1.0
1,1938,0.0
2,1939,0.0
3,1940,0.0
4,1941,0.0
...,...,...
80,2017,36.0
81,2018,31.0
82,2019,82.0
83,2020,23.0


In [36]:
# combined ts
leader_year_all_ts.rename(columns={"LeaderID": "Elected & Unelected"}, inplace=True)
leader_year_noelect_ts.rename(columns={"LeaderID": "Unelected Only"}, inplace=True)
leader_year_combined_ts = leader_year_all_ts.merge(leader_year_noelect_ts,on="year",how="left")
leader_year_combined_ts

Unnamed: 0,year,Elected & Unelected,Unelected Only
0,1937,1.0,1.0
1,1938,0.0,0.0
2,1939,0.0,0.0
3,1940,0.0,0.0
4,1941,0.0,0.0
...,...,...,...
81,2018,52.0,31.0
82,2019,264.0,82.0
83,2020,46.0,23.0
84,2021,184.0,59.0


### export time series

In [37]:
filename_stem = "leader_year_ts.xlsx"
leader_year_combined_ts.to_excel(path_analysis + study0_path + filename_stem,index=False)