In [1]:
import pandas as pd
import numpy as np
from datetime import date
from statistics import mean, mode
from pandas.api.types import CategoricalDtype

In [2]:
today = date.today()
print(today)

2023-12-03


# Tables

In [8]:
path_tables = "C:/Users/seoul/Dropbox/00 technical/github/nkelites/data/combined data/combined data - 2 tables/"

In [43]:
# tables
filename_careerorglink = "careerorglink.xlsx"
filename_leadercareerlink = "leadercareerlink.xlsx"
filename_orgtree = "orgtree.xlsx"
filename_elected = "positions_elected.xlsx"

In [10]:
col = pd.read_excel(path_tables + filename_careerorglink,dtype="str")
col.shape

(9002, 12)

In [11]:
col.columns

Index(['CareerString', 'CareerDateString_2022', 'IsJob', 'MultipleSubstrings',
       'CareerStartYear', 'CareerStartMonth', 'CareerSubstring',
       'InstitutionType', 'PrimaryInstitution', 'OrgName', 'Position',
       'Notes'],
      dtype='object')

In [12]:
lcl = pd.read_excel(path_tables + filename_leadercareerlink,dtype="str")
lcl.shape

(12617, 3)

In [13]:
lcl.columns

Index(['LeaderID', 'CareerString', 'CareerDateString_2022'], dtype='object')

In [72]:
org = pd.read_excel(path_tables + filename_orgtree,dtype="str")
org.shape

(2368, 20)

In [73]:
org.columns

Index(['InstitutionType', 'OrgType', 'PrimaryInstitution', 'OrgName',
       'PI_Index', 'OrgRank', 'P1', 'P2', 'P3', 'Alias_OrgName',
       'LinkToNext_PI', 'LinkToNext_Org', 'LinkToNext_Year', 'Notes',
       'L1_Index', 'L2_Index', 'L3_Index', 'L4_Index', 'L5_Index', 'Local'],
      dtype='object')

In [16]:
# elected = pd.read_excel(path_tables + filename_elected,dtype="str")
# elected.shape

In [17]:
# elected.columns

# Queries

In [18]:
path_queries = "C:/Users/seoul/Dropbox/00 technical/github/nkelites/data/combined data/combined data - 3 queries/"

In [45]:
filename_leaderjob_all = "leaderjob_electUnelect_inOutgov.xlsx"
filename_leaderjob_no_spa = "leaderjob_no_spa.xlsx"
filename_leaderjobtransition_no_spa = "leaderjobtransition_no_spa.xlsx"

In [46]:
ljobs_all = pd.read_excel(path_queries + filename_leaderjob_all,dtype="str")
ljobs_all.shape

(8594, 14)

In [20]:
# ljobs = pd.read_excel(path_queries + filename_leaderjob_no_spa,dtype="str")
# ljobs.shape

In [21]:
# trans = pd.read_excel(path_queries + filename_leaderjobtransition_no_spa,dtype="str")
# trans.shape

#### change datatypes

In [23]:
# ljobs = ljobs.astype({"CareerStartYear":"int","CareerStartDate":"int"})
# ljobs.dtypes

In [26]:
# trans = trans.astype({"OrgAdvance":"int","PositionAdvance":"int",
#                      "CareerStartYear_1":"int","CareerStartYear_2":"int",
#                      "CareerStartDate_1":"int","CareerStartDate_2":"int"})
# trans.dtypes

# Functions

In [27]:
def merge_results(m):
    """Print a short diagnostic summary of a merged DataFrame.

    Reports the overall shape of ``m`` followed by the shape of the subset of
    rows for each value of its ``"_merge"`` column ("left_only", "both",
    "right_only").

    Parameters
    ----------
    m : DataFrame
        Must contain a ``"_merge"`` column (presumably the indicator column
        added by ``DataFrame.merge(..., indicator=True)`` -- confirm at call sites).

    Returns
    -------
    None
        The summary is written to stdout only.
    """
    print("\nMerge Results...")
    print("")
    print("\tshape     :",m.shape)

    # One line per indicator value, always in the order left / both / right.
    report_lines = (
        ("\tleft_only :","left_only"),
        ("\tboth      :","both"),
        ("\tright_only:","right_only"),
    )
    for label,outcome in report_lines:
        matching = m["_merge"]==outcome
        print(label,m[matching].shape)

In [28]:
# using this on (PI,OrgName) will ensure unique & non-null keys
# using this on a larger df will ensure unique rows and non-null keys, but not unique keys

def unique_non_null_rows(olddf):
    """Return a de-duplicated, cleaned and sorted copy of ``olddf``.

    Steps, in order:
      1. drop exact duplicate rows (first occurrence kept, index renumbered);
      2. drop rows in which every column is null;
      3. drop rows whose "PrimaryInstitution" is null;
      4. drop rows whose "PrimaryInstitution", lower-cased, is one of the
         placeholder values "uncertain", "current", "deprecated", "please_revise";
      5. sort by "PrimaryInstitution", then "OrgName".

    The shapes before and after cleaning are printed to stdout.

    Parameters
    ----------
    olddf : DataFrame
        Must contain "PrimaryInstitution" and "OrgName" columns. It is not modified.

    Returns
    -------
    DataFrame
        The cleaned rows. Index labels are those assigned after step 1, so
        they are not contiguous once rows have been filtered and sorted.
    """
    stop_words_lower = ["uncertain","current","deprecated","please_revise"]

    # Steps 1-2: exact duplicates first, then rows that carry no data at all.
    # Both calls return new frames, so the caller's frame is left untouched.
    deduped = olddf.drop_duplicates(keep="first",ignore_index=True)
    deduped = deduped.dropna(how="all",axis=0)

    # Step 3: a row without a PrimaryInstitution cannot be keyed.
    has_institution = deduped["PrimaryInstitution"].notna()
    keyed = deduped[has_institution]

    # Step 4: placeholder institutions are matched case-insensitively.
    is_placeholder = keyed["PrimaryInstitution"].str.lower().isin(stop_words_lower)
    cleaned = keyed[~is_placeholder]

    print("\nUnique Non-null Rows...")
    print("")
    print("\tNon-unique rows:",olddf.shape)
    print("\tUnique rows    :",cleaned.shape)

    # Step 5
    return cleaned.sort_values(["PrimaryInstitution","OrgName"])

In [29]:
def create_time_series(series,group_var,count_var):
    """Build a gap-free yearly count series from row-level data.

    Rows of ``series`` are grouped by ``group_var`` and the non-null values of
    ``count_var`` are counted per group. The result is then expanded so that
    every year between the smallest and largest observed year (inclusive)
    has a row; years with no observations get a count of 0.

    Parameters
    ----------
    series : DataFrame
        Row-level data (a DataFrame despite the parameter name). Must contain
        the ``group_var`` and ``count_var`` columns.
    group_var : str
        Name of the year column. Its non-null values must be convertible to
        ``int`` (e.g. "1998" or 1998). Rows with a null year are ignored.
    count_var : str
        Name of the column whose non-null entries are counted per year.

    Returns
    -------
    DataFrame
        Two columns, ``"year"`` (int) and ``count_var`` (count, 0 for missing
        years), ordered by year. Empty if there is no non-null year.

    Raises
    ------
    ValueError
        If a non-null year value cannot be converted to ``int``.
    """
    # No usable year at all: return an empty frame with the usual two columns
    # instead of failing inside min()/max().
    if series[group_var].dropna().empty:
        return pd.DataFrame({"year":pd.Series(dtype="int64"),
                             count_var:pd.Series(dtype="float64")})

    yeardist = series.groupby(group_var,as_index=False).count()

    # Convert the year key once, strictly, before it is used for the range and
    # as a merge key; a value that is not a year should fail loudly here
    # rather than produce a merge on mismatched dtypes.
    yeardist[group_var] = yeardist[group_var].astype(int)

    first_year = int(yeardist[group_var].min())
    last_year = int(yeardist[group_var].max())
    x = pd.DataFrame({"year":range(first_year,last_year+1)})

    # The left join keeps every calendar year; unmatched years come back as
    # NaN and are turned into explicit zero counts.
    ts = x.merge(yeardist,left_on="year",right_on=group_var,how="left")
    ts[count_var] = ts[count_var].fillna(0)

    return ts[["year",count_var]]

# Analysis - Research Note

In [30]:
path_analysis = "C:/Users/seoul/Dropbox/00 technical/github/nkelites/analysis/"

In [31]:
# analysis sub-paths
study0_path = "2023.10.04 Study 0 - research note/"
study1_path = "2023.10.04 Study 1 - political capital/"
study2_path = "2023.10.04 Study 2 - commitment vs control/"
study3_path = "2023.10.04 Study 3 - reds vs experts/"

# Statistics - elites, orgs and jobs

## Elites stats

In [32]:
# include all jobs: elected/not-elected, SPA/not-SPA, local/central

In [36]:
lcl.shape

(12617, 3)

In [39]:
# total elites
lcl.LeaderID.unique().shape

(637,)

In [47]:
ljobs_all.columns

Index(['LeaderID', 'CareerString', 'CareerDateString_2022', 'CareerStartYear',
       'CareerStartMonth', 'CareerStartDate', 'CareerSubstring',
       'InstitutionType', 'PrimaryInstitution', 'OrgName', 'Position',
       'IsElected', 'OrgRank', 'PositionRank'],
      dtype='object')

In [48]:
# total elites with jobs
ljobs_all.LeaderID.unique().shape

(607,)

## Org stats

In [74]:
org.shape

(2368, 20)

In [75]:
org.columns

Index(['InstitutionType', 'OrgType', 'PrimaryInstitution', 'OrgName',
       'PI_Index', 'OrgRank', 'P1', 'P2', 'P3', 'Alias_OrgName',
       'LinkToNext_PI', 'LinkToNext_Org', 'LinkToNext_Year', 'Notes',
       'L1_Index', 'L2_Index', 'L3_Index', 'L4_Index', 'L5_Index', 'Local'],
      dtype='object')

In [100]:
not_social_orgs = ["Party","Military","Government"]

In [137]:
# Work on an independent copy: a plain `org2 = org` only binds a second name to
# the same DataFrame, so columns added to org2 would also appear on `org`.
org2 = org.copy()

In [166]:
org2.shape

(2329, 21)

In [138]:
org2["InstitutionCategory"] = org2["InstitutionType"].copy()

In [139]:
# Drop organizations whose institution type is the "UNCERTAIN" placeholder.
# .copy() makes the filtered frame independent of the frame it was sliced from,
# so later .loc assignments on org2 are unambiguous (no SettingWithCopyWarning).
org2 = org2[~(org2["InstitutionType"]=="UNCERTAIN")].copy()
org2.shape

(2329, 21)

In [140]:
# Quick look at the raw category counts before relabelling.
# The column list is spelled out because stat1_columns is only defined in a
# later cell; referencing it here fails with NameError on a fresh kernel.
org2[["InstitutionCategory","OrgName"]].groupby("InstitutionCategory").count()

Unnamed: 0_level_0,OrgName
InstitutionCategory,Unnamed: 1_level_1
국제친선단체,1
노동당,120
당외곽및사회단체_경제부문,11
당외곽및사회단체_경제부문(별책),7
당외곽및사회단체_근로단체,33
당외곽및사회단체_대외부문,8
당외곽및사회단체_사회부문,58
당외곽및사회단체_사회부문(별책),30
당외곽및사회단체_정치부문,28
당외곽및사회단체_종교부문,18


In [141]:
# Relabel InstitutionCategory: the three core institution types get English
# names; every remaining value (including missing ones) is grouped as "Social".
org2["InstitutionCategory"] = (
    org2["InstitutionCategory"]
    .replace({"노동당":"Party","정권기관":"Government","인민군":"Military"})
    .where(lambda labels: labels.isin(not_social_orgs),"Social")
)

In [143]:
org2.InstitutionCategory.unique()

array(['Party', 'Social', 'Government', 'Military'], dtype=object)

In [142]:
# rename levels of Local: the string "True" becomes "Local", "False" becomes "Central"
org2.loc[org2["Local"]=="True","Local"]="Local"
org2.loc[org2["Local"]=="False","Local"]="Central"

In [144]:
org2.Local.unique()

array(['Central', 'Local'], dtype=object)

### org stat1 - Number of Organizations by Institution

In [145]:
stat1_columns = ["InstitutionCategory","OrgName"]
stat1_column_labels = ["Institution","Number of Organizations"]

In [146]:
# Number of organizations per institution category.
# Uses stat1_columns (defined in the previous cell); the former name
# `stat_columns` is not defined anywhere and raised NameError.
stat1 = org2[stat1_columns].groupby("InstitutionCategory",as_index=False).count()
stat1.columns = stat1_column_labels
stat1

Unnamed: 0,Institution,Number of Organizations
0,Government,1301
1,Military,136
2,Party,120
3,Social,201


In [147]:
# export to excel


### org stat2 - Number of Organizations by Institution and Rank

In [148]:
stat2_columns = ["InstitutionCategory","OrgRank","OrgName"]
stat2_groupby_columns = ["InstitutionCategory","OrgRank"]
stat2_column_labels = ["Institution","Rank","Number of Organizations"]

In [149]:
stat2 = org2[stat2_columns].groupby(stat2_groupby_columns,as_index=False).count()
stat2.columns = stat2_column_labels
stat2

Unnamed: 0,Institution,Rank,Number of Organizations
0,Government,0,45
1,Government,1,243
2,Government,2,856
3,Government,3,151
4,Government,4,6
5,Military,0,19
6,Military,1,70
7,Military,2,8
8,Military,3,24
9,Military,4,15


In [150]:
# export to excel


### org stat3 - Number of Organizations by Institution and Locality (Local vs. Central)

In [151]:
stat3_columns = ["InstitutionCategory","Local","OrgName"]
stat3_groupby_columns = ["InstitutionCategory","Local"]
stat3_column_labels = ["Institution","Local or Central","Number of Organizations"]

In [152]:
stat3 = org2[stat3_columns].groupby(stat3_groupby_columns,as_index=False).count()
stat3.columns = stat3_column_labels
stat3

Unnamed: 0,Institution,Local or Central,Number of Organizations
0,Government,Central,1221
1,Government,Local,80
2,Military,Central,111
3,Military,Local,25
4,Party,Central,86
5,Party,Local,34
6,Social,Central,175
7,Social,Local,26


## Jobs stats

In [154]:
ljobs_all.shape

(8594, 14)

In [153]:
ljobs_all.columns

Index(['LeaderID', 'CareerString', 'CareerDateString_2022', 'CareerStartYear',
       'CareerStartMonth', 'CareerStartDate', 'CareerSubstring',
       'InstitutionType', 'PrimaryInstitution', 'OrgName', 'Position',
       'IsElected', 'OrgRank', 'PositionRank'],
      dtype='object')

In [182]:
jobs_columns = ['LeaderID','CareerStartYear','CareerStartDate', 'InstitutionType', 'PrimaryInstitution', 'OrgName', 'Position', 'IsElected', 'OrgRank', 'PositionRank']

In [183]:
ljobs_test = unique_non_null_rows(ljobs_all[jobs_columns])
ljobs_test.shape


Unique Non-null Rows...

	Non-unique rows: (8594, 10)
	Unique rows    : (6116, 10)


(6116, 10)

In [184]:
jobs2 = unique_non_null_rows(ljobs_all[jobs_columns])
jobs2.shape


Unique Non-null Rows...

	Non-unique rows: (8594, 10)
	Unique rows    : (6116, 10)


(6116, 10)

### job stats1 - Distribution of Number of Jobs in Elite Resumes

In [185]:
jstat1_columns = ["LeaderID","Position"]
jstat1_groupby_columns = ["LeaderID"]

In [186]:
# Number of (de-duplicated) job rows per elite, largest resumes first.
jstat1 = (
    jobs2[jstat1_columns]
    .groupby(jstat1_groupby_columns,as_index=False)
    .count()
    .sort_values("Position",ascending=False)
)
jstat1

Unnamed: 0,LeaderID,Position
406,양형섭,58
328,박성철a,51
108,김영남,48
146,김일성,48
273,리종옥,47
...,...,...
571,포희성,1
279,리충길,1
69,김만길,1
329,박성철b,1


In [187]:
# jstat1_columns = ["Position","Position"]
jstat2_groupby_columns = ["Position"]
jstat2_column_labels = ["Number of Jobs in Resume","Number of Elites"]

In [188]:
# Distribution of resume lengths: how many elites have 1 job, 2 jobs, ...
jstat2 = (
    jstat1
    .groupby(jstat2_groupby_columns,as_index=False)
    .count()
    .sort_values("Position",ascending=True)
)
jstat2.columns = jstat2_column_labels
jstat2

Unnamed: 0,Number of Jobs in Resume,Number of Elites
0,1,41
1,2,57
2,3,52
3,4,44
4,5,39
5,6,33
6,7,48
7,8,40
8,9,33
9,10,24


### average jobs per year

In [189]:
jobs2.columns

Index(['LeaderID', 'CareerStartYear', 'CareerStartDate', 'InstitutionType',
       'PrimaryInstitution', 'OrgName', 'Position', 'IsElected', 'OrgRank',
       'PositionRank'],
      dtype='object')

In [202]:
jobs3 = jobs2[~(jobs2.PrimaryInstitution=="최고인민회의")]
jobs3.shape

(4246, 10)

In [203]:
jstat3_columns = ["CareerStartYear","Position"]
jstat3_groupby_columns = ["CareerStartYear"]
# jstat3_column_labels = ["Number of Jobs in Resume","Number of Elites"]

In [204]:
# Number of job rows starting in each year, in chronological order.
jstat3 = (
    jobs3[jstat3_columns]
    .groupby(jstat3_groupby_columns,as_index=False)
    .count()
    .sort_values('CareerStartYear',ascending=True)
)
jstat3

Unnamed: 0,CareerStartYear,Position
0,1937,1
1,1945,6
2,1946,5
3,1947,1
4,1948,9
...,...,...
74,2018,51
75,2019,184
76,2020,48
77,2021,235


In [205]:
mean(list(jstat3.Position))

53.74683544303797

In [206]:
# Flag years whose job count exceeds 53, i.e. the yearly mean computed in the
# previous cell (53.75) rounded down.
jstat3["Largerthan53"] = jstat3.Position>53

In [209]:
jstat3[jstat3.Position<54]

Unnamed: 0,CareerStartYear,Position,Largerthan53
0,1937,1,False
1,1945,6,False
2,1946,5,False
3,1947,1,False
4,1948,9,False
5,1949,2,False
6,1950,11,False
7,1951,7,False
8,1952,9,False
9,1953,4,False
