In [107]:
import pandas as pd
import numpy as np
from datetime import date
from statistics import mean, mode
from pandas.api.types import CategoricalDtype

In [108]:
today = date.today()
print(today)

2023-12-17


# Tables

In [109]:
path_tables = "C:/Users/seoul/Dropbox/00 technical/github/nkelites/data/combined data/combined data - 2 tables/"

In [110]:
# tables
filename_careerorglink = "careerorglink.xlsx"
filename_leadercareerlink = "leadercareerlink.xlsx"
filename_orgtree = "orgtree.xlsx"
filename_elected = "positions_elected.xlsx"

In [111]:
# career-org link
col = pd.read_excel(path_tables + filename_careerorglink,dtype="str")
col.shape

(9002, 12)

In [112]:
col.columns

Index(['CareerString', 'CareerDateString_2022', 'IsJob', 'MultipleSubstrings',
       'CareerStartYear', 'CareerStartMonth', 'CareerSubstring',
       'InstitutionType', 'PrimaryInstitution', 'OrgName', 'Position',
       'Notes'],
      dtype='object')

In [113]:
# leader-career link
lcl = pd.read_excel(path_tables + filename_leadercareerlink,dtype="str")
lcl.shape

(12617, 3)

In [8]:
lcl.columns

Index(['LeaderID', 'CareerString', 'CareerDateString_2022'], dtype='object')

In [9]:
# orgtree
org = pd.read_excel(path_tables + filename_orgtree,dtype="str")
org.shape

(2368, 20)

In [10]:
org.columns

Index(['InstitutionType', 'OrgType', 'PrimaryInstitution', 'OrgName',
       'PI_Index', 'OrgRank', 'P1', 'P2', 'P3', 'Alias_OrgName',
       'LinkToNext_PI', 'LinkToNext_Org', 'LinkToNext_Year', 'Notes',
       'L1_Index', 'L2_Index', 'L3_Index', 'L4_Index', 'L5_Index', 'Local'],
      dtype='object')

In [11]:
# elected = pd.read_excel(path_tables + filename_elected,dtype="str")
# elected.shape

In [12]:
# elected.columns

# Queries

In [13]:
path_queries = "C:/Users/seoul/Dropbox/00 technical/github/nkelites/data/combined data/combined data - 3 queries/"

In [14]:
filename_leaderjob_all = "leaderjob_electUnelect_inOutgov.xlsx"
filename_leaderjob_no_spa = "leaderjob_no_spa.xlsx"
filename_leaderjobtransition_no_spa = "leaderjobtransition_no_spa.xlsx"

In [15]:
# leader jobs
ljobs_all = pd.read_excel(path_queries + filename_leaderjob_all,dtype="str")
ljobs_all.shape

(8594, 14)

In [16]:
# ljobs = pd.read_excel(path_queries + filename_leaderjob_no_spa,dtype="str")
# ljobs.shape

In [17]:
# trans = pd.read_excel(path_queries + filename_leaderjobtransition_no_spa,dtype="str")
# trans.shape

#### change datatypes

In [18]:
# ljobs = ljobs.astype({"CareerStartYear":"int","CareerStartDate":"int"})
# ljobs.dtypes

In [19]:
# trans = trans.astype({"OrgAdvance":"int","PositionAdvance":"int",
#                      "CareerStartYear_1":"int","CareerStartYear_2":"int",
#                      "CareerStartDate_1":"int","CareerStartDate_2":"int"})
# trans.dtypes

# Functions

In [20]:
def merge_results(m):
    
    print("\nMerge Results...")
    print("")
    print("\tshape     :",m.shape)
    print("\tleft_only :",m[m["_merge"]=="left_only"].shape)
    print("\tboth      :",m[m["_merge"]=="both"].shape)
    print("\tright_only:",m[m["_merge"]=="right_only"].shape)

In [21]:
# using this on (PI,OrgName) will ensure unique & non-null keys
# using this on a larger df will ensure unique rows and non-null keys, but not unique keys

def unique_non_null_rows(olddf):
    
    df = olddf.copy()
    
    
    ### drop duplicates
    df = df.drop_duplicates(keep="first",ignore_index=True)

    ### drop null rows
    df.dropna(how="all",axis=0,inplace=True)
    
    # drop rows with null PI
    df = df[~(df["PrimaryInstitution"].isna())]
        
    ### drop rows where PI contains stop words
    stop_words_lower = ["uncertain","current","deprecated","please_revise"]
    df = df[ ~ df["PrimaryInstitution"].str.lower().isin(stop_words_lower)]
            

    print("\nUnique Non-null Rows...")
    print("")
    print("\tNon-unique rows:",olddf.shape)
    print("\tUnique rows    :",df.shape)

    ### sort
    df = df.sort_values(["PrimaryInstitution","OrgName"])
    
    return df

In [22]:
def create_time_series(series,group_var,count_var):
    
    yeardist = series.groupby(group_var,as_index=False).count().sort_values(group_var)
    
    max_year = max(series[group_var])
    x = pd.DataFrame({"year":pd.Series(range(min(yeardist[group_var].astype(int)),max(yeardist[group_var].astype(int))+1))})
    
    yeardist[group_var] = yeardist[group_var].astype(int,errors="ignore")
    
    ts = x.merge(yeardist,left_on="year",right_on=group_var,how="left")
    ts.loc[ts[count_var].isna(),count_var]=0
    # merge_results(ts)
    
    ts_columns = ["year",count_var]
    ts = ts[ts_columns]
    
    return ts

# Analysis - Research Note

In [23]:
path_analysis = "C:/Users/seoul/Dropbox/00 technical/github/nkelites/analysis/"

In [24]:
# analysis sub-paths
study0_path = "2023.10.04 Study 0 - research note/"
study1_path = "2023.10.04 Study 1 - political capital/"
study2_path = "2023.10.04 Study 2 - commitment vs control/"
study3_path = "2023.10.04 Study 3 - reds vs experts/"

# Statistics - elites, orgs and jobs

# Elites stats

In [25]:
# include all jobs: elected/not-elected, SPA/not-SPA, local/central

In [26]:
lcl.shape

(12617, 3)

In [27]:
# total elites
lcl.LeaderID.unique().shape

(637,)

In [28]:
ljobs_all.columns

Index(['LeaderID', 'CareerString', 'CareerDateString_2022', 'CareerStartYear',
       'CareerStartMonth', 'CareerStartDate', 'CareerSubstring',
       'InstitutionType', 'PrimaryInstitution', 'OrgName', 'Position',
       'IsElected', 'OrgRank', 'PositionRank'],
      dtype='object')

In [29]:
# total elites with jobs
ljobs_all.LeaderID.unique().shape

(607,)

## Org stats

In [30]:
org.shape

(2368, 20)

In [31]:
org.columns

Index(['InstitutionType', 'OrgType', 'PrimaryInstitution', 'OrgName',
       'PI_Index', 'OrgRank', 'P1', 'P2', 'P3', 'Alias_OrgName',
       'LinkToNext_PI', 'LinkToNext_Org', 'LinkToNext_Year', 'Notes',
       'L1_Index', 'L2_Index', 'L3_Index', 'L4_Index', 'L5_Index', 'Local'],
      dtype='object')

In [32]:
not_social_orgs = ["Party","Military","Government"]

In [33]:
org2 = org

In [34]:
org2.shape

(2368, 20)

In [35]:
org2["InstitutionCategory"] = org2["InstitutionType"].copy()

In [36]:
org2 = org2[~(org2["InstitutionType"]=="UNCERTAIN")]
org2.shape

(2329, 21)

In [37]:
# rename levels of InstitutionCategory
org2.loc[org2["InstitutionCategory"]=="노동당","InstitutionCategory"]="Party"
org2.loc[org2["InstitutionCategory"]=="정권기관","InstitutionCategory"]="Government"
org2.loc[org2["InstitutionCategory"]=="인민군","InstitutionCategory"]="Military"
org2.loc[~(org2["InstitutionCategory"].isin(not_social_orgs)),"InstitutionCategory"]="Social"

In [38]:
org2.InstitutionCategory.unique()

array(['Party', 'Social', 'Government', 'Military'], dtype=object)

In [39]:
# rename levels of InstitutionCategory
org2.loc[org2["Local"]=="True","Local"]="Local"
org2.loc[org2["Local"]=="False","Local"]="Central"

In [40]:
org2.Local.unique()

array(['Central', 'Local'], dtype=object)

### org stat1 - Number of Organizations by Institution

In [41]:
stat1_columns = ["InstitutionCategory","OrgName"]
stat1_column_labels = ["Institution","Number of Organizations"]

In [42]:
stat1 = org2[stat1_columns].groupby("InstitutionCategory",as_index=False).count()
stat1.columns = stat1_column_labels
stat1

Unnamed: 0,Institution,Number of Organizations
0,Government,1301
1,Military,136
2,Party,120
3,Social,201


In [43]:
# export to excel


### org stat2 - Number of Organizations by Institution and Rank

In [44]:
stat2_columns = ["InstitutionCategory","OrgRank","OrgName"]
stat2_groupby_columns = ["InstitutionCategory","OrgRank"]
stat2_column_labels = ["Institution","Rank","Number of Organizations"]

In [45]:
stat2 = org2[stat2_columns].groupby(stat2_groupby_columns,as_index=False).count()
stat2.columns = stat2_column_labels
stat2

Unnamed: 0,Institution,Rank,Number of Organizations
0,Government,0,45
1,Government,1,243
2,Government,2,856
3,Government,3,151
4,Government,4,6
5,Military,0,19
6,Military,1,70
7,Military,2,8
8,Military,3,24
9,Military,4,15


In [46]:
# export to excel


### org stat3 - Number of Organizations by Institution and Rank

In [47]:
stat3_columns = ["InstitutionCategory","Local","OrgName"]
stat3_groupby_columns = ["InstitutionCategory","Local"]
stat3_column_labels = ["Institution","Local or Central","Number of Organizations"]

In [48]:
stat3 = org2[stat3_columns].groupby(stat3_groupby_columns,as_index=False).count()
stat3.columns = stat3_column_labels
stat3

Unnamed: 0,Institution,Local or Central,Number of Organizations
0,Government,Central,1221
1,Government,Local,80
2,Military,Central,111
3,Military,Local,25
4,Party,Central,86
5,Party,Local,34
6,Social,Central,175
7,Social,Local,26


# Jobs stats

In [49]:
ljobs_all.shape

(8594, 14)

In [50]:
ljobs_all.columns

Index(['LeaderID', 'CareerString', 'CareerDateString_2022', 'CareerStartYear',
       'CareerStartMonth', 'CareerStartDate', 'CareerSubstring',
       'InstitutionType', 'PrimaryInstitution', 'OrgName', 'Position',
       'IsElected', 'OrgRank', 'PositionRank'],
      dtype='object')

In [51]:
jobs_columns = ['LeaderID','CareerStartYear','CareerStartDate', 'InstitutionType', 'PrimaryInstitution', 'OrgName', 'Position', 'IsElected', 'OrgRank', 'PositionRank']

In [52]:
ljobs_test = unique_non_null_rows(ljobs_all[jobs_columns])
ljobs_test.shape


Unique Non-null Rows...

	Non-unique rows: (8594, 10)
	Unique rows    : (6116, 10)


(6116, 10)

In [53]:
jobs2 = unique_non_null_rows(ljobs_all[jobs_columns])
jobs2.shape


Unique Non-null Rows...

	Non-unique rows: (8594, 10)
	Unique rows    : (6116, 10)


(6116, 10)

### job stats1 - Distribution of Number of Jobs in Elite Resumes

In [54]:
jstat1_columns = ["LeaderID","Position"]
jstat1_groupby_columns = ["LeaderID"]

In [55]:
jstat1 = jobs2[jstat1_columns].groupby(jstat1_groupby_columns,as_index=False).count()
# jstat1.columns = stat3_column_labels
jstat1.sort_values("Position",ascending=False,inplace=True)
jstat1

Unnamed: 0,LeaderID,Position
406,양형섭,58
328,박성철a,51
108,김영남,48
146,김일성,48
273,리종옥,47
...,...,...
571,포희성,1
279,리충길,1
69,김만길,1
329,박성철b,1


### jobs stat2: Distribution of the number of jobs in each elite resume

In [56]:
# jstat2_columns = ["Position","Position"]
jstat2_groupby_columns = ["Position"]
jstat2_column_labels = ["Number of Jobs in Resume","Number of Elites"]

In [57]:
jstat2 = jstat1.groupby(jstat2_groupby_columns,as_index=False).count()
jstat2.sort_values("Position",ascending=True,inplace=True)
jstat2.columns = jstat2_column_labels
jstat2

Unnamed: 0,Number of Jobs in Resume,Number of Elites
0,1,41
1,2,57
2,3,52
3,4,44
4,5,39
5,6,33
6,7,48
7,8,40
8,9,33
9,10,24


### job stat3 - Number of Government Jobs by SPA

In [62]:
stat3_columns = ["SPA","OrgName"]
stat3_groupby_columns = ["SPA"]
stat3_column_labels = ["SPA","Number of Jobs"]

In [63]:
jobs3 = jobs2[jobs2["InstitutionType"]=="정권기관"]
jobs3.loc[:,["SPA"]]=False
jobs3.loc[jobs3["PrimaryInstitution"]=="최고인민회의",["SPA"]] = True

In [64]:
stat3 = jobs3[stat3_columns].groupby(stat3_groupby_columns,as_index=False).count()
stat3.columns = stat3_column_labels
stat3

Unnamed: 0,SPA,Number of Jobs
0,False,1099
1,True,362


### jobs stat4: average jobs per year, omitting 최고인민회의

In [65]:
jobs2.columns

Index(['LeaderID', 'CareerStartYear', 'CareerStartDate', 'InstitutionType',
       'PrimaryInstitution', 'OrgName', 'Position', 'IsElected', 'OrgRank',
       'PositionRank'],
      dtype='object')

In [66]:
jobs4 = jobs2[~(jobs2.PrimaryInstitution=="최고인민회의")]
jobs4.shape

(4246, 10)

In [67]:
jstat4_columns = ["CareerStartYear","Position"]
jstat4_groupby_columns = ["CareerStartYear"]
# jstat4_column_labels = ["Number of Jobs in Resume","Number of Elites"]

In [68]:
jstat4 = jobs4[jstat4_columns].groupby(jstat4_groupby_columns,as_index=False).count()
jstat4.sort_values('CareerStartYear',ascending=True,inplace=True)
jstat4

Unnamed: 0,CareerStartYear,Position
0,1937,1
1,1945,6
2,1946,5
3,1947,1
4,1948,9
...,...,...
74,2018,51
75,2019,184
76,2020,48
77,2021,235


In [69]:
mean(list(jstat4.Position))

53.74683544303797

In [70]:
jstat4["Largerthan53"]=False
jstat4.loc[jstat4.Position>53,"Largerthan53"]=True

In [71]:
jstat4[jstat4.Position<54]

Unnamed: 0,CareerStartYear,Position,Largerthan53
0,1937,1,False
1,1945,6,False
2,1946,5,False
3,1947,1,False
4,1948,9,False
5,1949,2,False
6,1950,11,False
7,1951,7,False
8,1952,9,False
9,1953,4,False


### job stat5 - Number of Local Jobs

In [72]:
stat5_columns = ["Local","OrgName"]
stat5_groupby_columns = ["Local"]
stat5_column_labels = ["Local","Number of Jobs"]

In [73]:
org2.columns

Index(['InstitutionType', 'OrgType', 'PrimaryInstitution', 'OrgName',
       'PI_Index', 'OrgRank', 'P1', 'P2', 'P3', 'Alias_OrgName',
       'LinkToNext_PI', 'LinkToNext_Org', 'LinkToNext_Year', 'Notes',
       'L1_Index', 'L2_Index', 'L3_Index', 'L4_Index', 'L5_Index', 'Local',
       'InstitutionCategory'],
      dtype='object')

In [74]:
# merge leader-job linklist with orgtree to get Local and InstitutionCategory :)
ot_columns = ["PrimaryInstitution","OrgName","Local","InstitutionCategory"]
pk_columns = ["PrimaryInstitution","OrgName"]
jobs5 = jobs2.merge(org2[ot_columns],on=pk_columns,how="left",indicator=False)
# merge_results(jobs4)

In [75]:
jobs5.InstitutionCategory

0       Social
1       Social
2       Social
3       Social
4       Social
         ...  
6111    Social
6112    Social
6113    Social
6114    Social
6115    Social
Name: InstitutionCategory, Length: 6116, dtype: object

In [76]:
stat5 = jobs5[stat5_columns].groupby(stat5_groupby_columns,as_index=False).count()
stat5.columns = stat5_column_labels
stat5

Unnamed: 0,Local,Number of Jobs
0,Central,3217
1,Local,266


### job stat6 - Percent of Local Jobs for Each Institution Category

In [77]:
stat6_columns = ["InstitutionCategory","IsLocal"]
stat6_groupby_columns = ["InstitutionCategory"]
stat6_column_labels = ["InstitutionCategory","Total Job Count","Local Job Count","Percent Local Jobs "]

In [78]:
jobs5.loc[:,"IsLocal"] = False
jobs5.loc[jobs5["Local"]=="Local","IsLocal"] = True

In [79]:
# use job5, which is job2 + Local variable from orgtree
stat6 = jobs5[stat6_columns].groupby(stat6_groupby_columns,as_index=False)["IsLocal"].agg({"Job Count":"count","Local Job Count":"sum"})
stat6.loc[:,"Percent Local Jobs"] = stat6.apply(lambda x: round(x["Local Job Count"]/x["Job Count"]*100,2),axis=1)
stat6.columns = stat6_column_labels
stat6

Unnamed: 0,InstitutionCategory,Total Job Count,Local Job Count,Percent Local Jobs
0,Government,3291,90,2.73
1,Military,352,62,17.61
2,Party,1620,130,8.02
3,Social,853,60,7.03


### job stat7 - Number of Jobs by Organization Rank

In [80]:
stat7_columns = ["OrgRank","OrgName"]
stat7_groupby_columns = ["OrgRank"]
stat7_column_labels = ["Organization Rank","Number of Jobs"]

In [81]:
# use job5, which is job2 + Local variable from orgtree
stat7 = jobs5[stat7_columns].groupby(stat7_groupby_columns,as_index=False).count()
stat7.columns = stat7_column_labels
stat7

Unnamed: 0,Organization Rank,Number of Jobs
0,0,197
1,1,2046
2,2,911
3,3,304
4,4,25


### job stat8 - Percent of Jobs by Org Rank, for Each Institution 

In [92]:
stat8_columns = ["InstitutionCategory","OrgRank","OrgName"]
stat8_groupby_columns = ["InstitutionCategory","OrgRank"]
stat8_column_labels = ["Institution Category","Organization Rank","Number of Jobs"]

In [93]:
# use job5, which is job2 + Local variable from orgtree
stat8 = jobs5[stat8_columns].groupby(stat8_groupby_columns,as_index=False).count()
stat8.columns = stat8_column_labels
stat8

Unnamed: 0,Institution Category,Organization Rank,Number of Jobs
0,Government,0,71
1,Government,1,1015
2,Government,2,166
3,Government,3,204
4,Government,4,5
5,Military,0,43
6,Military,1,60
7,Military,2,9
8,Military,3,60
9,Military,4,19


In [94]:
stat8_catsum_columns = ["Institution Category","Number of Jobs"]
stat8_catsum_groupby_columns = ["Institution Category"]
stat8_catsum_column_labels = ["Institution Category","Total Jobs by Category"]
stat8_catsum = stat8[stat8_catsum_columns].groupby(stat8_catsum_groupby_columns,as_index=False).agg({"Number of Jobs":"sum"})
stat8_catsum.columns = stat8_catsum_column_labels
stat8_catsum

Unnamed: 0,Institution Category,Total Jobs by Category
0,Government,1461
1,Military,191
2,Party,1609
3,Social,222


In [95]:
stat8 = stat8.merge(stat8_catsum,on="Institution Category",how="left")
stat8

Unnamed: 0,Institution Category,Organization Rank,Number of Jobs,Total Jobs by Category
0,Government,0,71,1461
1,Government,1,1015,1461
2,Government,2,166,1461
3,Government,3,204,1461
4,Government,4,5,1461
5,Military,0,43,191
6,Military,1,60,191
7,Military,2,9,191
8,Military,3,60,191
9,Military,4,19,191


In [98]:
stat8["Percent Jobs by Rank"] = round(stat8["Number of Jobs"] / stat8["Total Jobs by Category"] * 100,2)
stat8

Unnamed: 0,Institution Category,Organization Rank,Number of Jobs,Total Jobs by Category,Percent Jobs by Rank
0,Government,0,71,1461,4.86
1,Government,1,1015,1461,69.47
2,Government,2,166,1461,11.36
3,Government,3,204,1461,13.96
4,Government,4,5,1461,0.34
5,Military,0,43,191,22.51
6,Military,1,60,191,31.41
7,Military,2,9,191,4.71
8,Military,3,60,191,31.41
9,Military,4,19,191,9.95


### job stat9 - Number of Jobs by Position Rank

In [99]:
stat9_columns = ["PositionRank","OrgName"]
stat9_groupby_columns = ["PositionRank"]
stat9_column_labels = ["Position Rank","Number of Jobs"]

In [100]:
# use job5, which is job2 + Local variable from orgtree
stat9 = jobs5[stat9_columns].groupby(stat9_groupby_columns,as_index=False).count()
stat9.columns = stat9_column_labels
stat9

Unnamed: 0,Position Rank,Number of Jobs
0,1,1401
1,2,735
2,3,1347


### job stat10 - Percent of Jobs by Position Rank, for each Institution Category

In [101]:
stat10_columns = ["InstitutionCategory","PositionRank","OrgName"]
stat10_groupby_columns = ["InstitutionCategory","PositionRank"]
stat10_column_labels = ["Institution Category","Position Rank","Number of Jobs"]

In [102]:
# use job5, which is job2 + Local variable from orgtree
stat10 = jobs5[stat10_columns].groupby(stat10_groupby_columns,as_index=False).count()
stat10.columns = stat10_column_labels
stat10

Unnamed: 0,Institution Category,Position Rank,Number of Jobs
0,Government,1,779
1,Government,2,326
2,Government,3,356
3,Military,1,146
4,Military,2,37
5,Military,3,8
6,Party,1,363
7,Party,2,310
8,Party,3,936
9,Social,1,113


In [103]:
stat10_catsum_columns = ["Institution Category","Number of Jobs"]
stat10_catsum_groupby_columns = ["Institution Category"]
stat10_catsum_column_labels = ["Institution Category","Total Jobs by Category"]
stat10_catsum = stat10[stat10_catsum_columns].groupby(stat10_catsum_groupby_columns,as_index=False).agg({"Number of Jobs":"sum"})
stat10_catsum.columns = stat10_catsum_column_labels
stat10_catsum

Unnamed: 0,Institution Category,Total Jobs by Category
0,Government,1461
1,Military,191
2,Party,1609
3,Social,222


In [104]:
stat10 = stat10.merge(stat10_catsum,on="Institution Category",how="left")
stat10

Unnamed: 0,Institution Category,Position Rank,Number of Jobs,Total Jobs by Category
0,Government,1,779,1461
1,Government,2,326,1461
2,Government,3,356,1461
3,Military,1,146,191
4,Military,2,37,191
5,Military,3,8,191
6,Party,1,363,1609
7,Party,2,310,1609
8,Party,3,936,1609
9,Social,1,113,222


In [106]:
stat10["Percent Jobs by Rank"] = round(stat10["Number of Jobs"] / stat10["Total Jobs by Category"] * 100,2)
stat10

Unnamed: 0,Institution Category,Position Rank,Number of Jobs,Total Jobs by Category,Percent Jobs by Rank
0,Government,1,779,1461,53.32
1,Government,2,326,1461,22.31
2,Government,3,356,1461,24.37
3,Military,1,146,191,76.44
4,Military,2,37,191,19.37
5,Military,3,8,191,4.19
6,Party,1,363,1609,22.56
7,Party,2,310,1609,19.27
8,Party,3,936,1609,58.17
9,Social,1,113,222,50.9
