In [1]:
import pandas as pd
import numpy as np
from datetime import date
from statistics import mean, mode
from pandas.api.types import CategoricalDtype

In [2]:
today = date.today()
print(today)

2023-11-26


# Tables

In [3]:
path_tables = "C:/Users/seoul/Dropbox/00 technical/github/nkelites/data/combined data/combined data - 2 tables/"

In [4]:
# tables
filename_careerorglink = "careerorglink.xlsx"
filename_leadercareerlink = "leadercareerlink.xlsx"
filename_orgtree = "orgtree.xlsx"
filename_elected = "positions_elected.xlsx"

In [5]:
# col = pd.read_excel(path_tables + filename_careerorglink,dtype="str")
# col.shape

In [6]:
# col.columns

In [7]:
# lcl = pd.read_excel(path_tables + filename_leadercareerlink,dtype="str")
# lcl.shape

In [8]:
# lcl.columns

In [9]:
# org = pd.read_excel(path_tables + filename_orgtree,dtype="str")
# org.shape

In [10]:
# org.columns

In [11]:
# elected = pd.read_excel(path_tables + filename_elected,dtype="str")
# elected.shape

In [12]:
# elected.columns

# Queries

In [13]:
path_queries = "C:/Users/seoul/Dropbox/00 technical/github/nkelites/data/combined data/combined data - 3 queries/"

In [14]:
filename_leaderjob_no_spa = "leaderjob_no_spa.xlsx"
filename_leaderjobtransition_no_spa = "leaderjobtransition_no_spa.xlsx"

In [15]:
ljobs = pd.read_excel(path_queries + filename_leaderjob_no_spa,dtype="str")
ljobs.shape

(6047, 15)

In [32]:
trans = pd.read_excel(path_queries + filename_leaderjobtransition_no_spa,dtype="str")
trans.shape

(4306, 31)

#### change datatypes

In [33]:
ljobs = ljobs.astype({"CareerStartYear":"int","CareerStartDate":"int"})
ljobs.dtypes

LeaderID                 object
CareerString             object
CareerDateString_2022    object
CareerStartYear           int32
CareerStartMonth         object
CareerStartDate           int32
CareerSubstring          object
InstitutionType          object
PrimaryInstitution       object
OrgName                  object
Local                    object
Position                 object
IsElected                object
OrgRank                  object
PositionRank             object
dtype: object

In [34]:
trans = trans.astype({"OrgAdvance":"int","PositionAdvance":"int",
                      "CareerStartYear_1":"int","CareerStartYear_2":"int",
                      "CareerStartDate_1":"int","CareerStartDate_2":"int"})
trans.dtypes

LeaderID                   object
CareerString_1             object
CareerDateString_2022_1    object
CareerStartYear_1           int32
CareerStartMonth_1         object
CareerStartDate_1           int32
CareerSubstring_1          object
InstitutionType_1          object
PrimaryInstitution_1       object
OrgName_1                  object
Local_1                    object
Position_1                 object
IsElected_1                object
OrgRank_1                  object
PositionRank_1             object
CareerString_2             object
CareerDateString_2022_2    object
CareerStartYear_2           int32
CareerStartMonth_2         object
CareerStartDate_2           int32
CareerSubstring_2          object
InstitutionType_2          object
PrimaryInstitution_2       object
OrgName_2                  object
Local_2                    object
Position_2                 object
IsElected_2                object
OrgRank_2                  object
PositionRank_2             object
OrgAdvance    

# Functions

In [35]:
def merge_results(m):
    """Print a short summary of a merge performed with ``indicator=True``.

    Parameters
    ----------
    m : pandas.DataFrame
        Merge result that contains a ``_merge`` column.

    Prints the overall shape of ``m`` followed by the shape of the subset of
    rows for each ``_merge`` value (``left_only``, ``both``, ``right_only``).
    Returns nothing.
    """
    indicator = m["_merge"]

    print("\nMerge Results...")
    print("")
    print("\tshape     :", m.shape)

    # one line per merge outcome; labels are padded so the colons line up
    outcomes = (
        ("\tleft_only :", "left_only"),
        ("\tboth      :", "both"),
        ("\tright_only:", "right_only"),
    )
    for label, outcome in outcomes:
        print(label, m[indicator == outcome].shape)

In [36]:
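# on a frame of just (PrimaryInstitution, OrgName): yields unique key pairs with a non-null PrimaryInstitution (OrgName may still be null)
# on a wider frame: rows are unique and PrimaryInstitution is non-null, but (PrimaryInstitution, OrgName) keys may repeat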

def unique_non_null_rows(olddf):
    """Return a de-duplicated, cleaned and sorted copy of ``olddf``.

    Steps, in order:
      1. drop exact duplicate rows (keeping the first, index renumbered);
      2. drop rows in which every column is null;
      3. drop rows whose ``PrimaryInstitution`` is null;
      4. drop rows whose ``PrimaryInstitution`` equals (case-insensitively)
         one of the stop words below -- this is an exact match, not a
         substring search;
      5. sort by ``PrimaryInstitution`` then ``OrgName``.

    Parameters
    ----------
    olddf : pandas.DataFrame
        Must contain ``PrimaryInstitution`` and ``OrgName`` columns.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame. Index labels are those assigned after step 1,
        so they are not consecutive once rows have been filtered and sorted.

    Also prints the shape before and after cleaning.
    """
    stop_words_lower = ["uncertain","current","deprecated","please_revise"]

    # steps 1-2: duplicates, then fully-empty rows
    deduped = olddf.drop_duplicates(keep="first", ignore_index=True)
    deduped = deduped.dropna(how="all", axis=0)

    # step 3: a row is unusable without a PrimaryInstitution
    has_institution = deduped["PrimaryInstitution"].notna()
    deduped = deduped[has_institution]

    # step 4: placeholder values that do not name a real institution
    is_placeholder = deduped["PrimaryInstitution"].str.lower().isin(stop_words_lower)
    cleaned = deduped[~is_placeholder]

    print("\nUnique Non-null Rows...")
    print("")
    print("\tNon-unique rows:", olddf.shape)
    print("\tUnique rows    :", cleaned.shape)

    # step 5
    return cleaned.sort_values(["PrimaryInstitution","OrgName"])

In [37]:
def create_time_series(series,group_var,count_var):
    """Build a gap-free yearly count table from a DataFrame.

    Rows are grouped by ``group_var`` and the non-null values of
    ``count_var`` are counted per group. The result has one row for every
    year between the smallest and largest observed year (inclusive); years
    with no rows get a count of 0.

    Parameters
    ----------
    series : pandas.DataFrame
        Despite the name this must be a DataFrame, since it is grouped with
        ``as_index=False`` and indexed by column name.
    group_var : str
        Column holding the year. Values must be castable to ``int``
        (e.g. ``1999`` or ``"1999"``). Rows with a null year are ignored,
        because ``groupby`` drops null keys by default.
    count_var : str
        Column whose non-null values are counted; must differ from
        ``group_var``.

    Returns
    -------
    pandas.DataFrame
        Columns ``["year", count_var]``. Empty (with those columns) when
        ``series`` has no non-null years.

    Raises
    ------
    ValueError
        If a year value cannot be cast to ``int``.
    """
    yeardist = series.groupby(group_var,as_index=False).count()

    # nothing to count: return an empty table instead of failing in min()/max()
    if yeardist.empty:
        return pd.DataFrame(columns=["year", count_var])

    # cast once, up front, so the year range and the merge key share a dtype
    yeardist[group_var] = yeardist[group_var].astype(int)

    # the range is taken from the grouped keys (nulls already dropped), not
    # from the raw column, so a null year cannot break the comparison
    first_year = yeardist[group_var].min()
    last_year = yeardist[group_var].max()
    x = pd.DataFrame({"year": range(first_year, last_year + 1)})

    # left merge keeps every year of the range; unmatched years come back as NaN
    ts = x.merge(yeardist,left_on="year",right_on=group_var,how="left")
    ts[count_var] = ts[count_var].fillna(0)

    return ts[["year", count_var]]

# Analysis - Research Note

In [38]:
path_analysis = "C:/Users/seoul/Dropbox/00 technical/github/nkelites/analysis/"

In [39]:
# analysis sub-paths
study0_path = "2023.10.04 Study 0 - research note/"
study1_path = "2023.10.04 Study 1 - political capital/"
study2_path = "2023.10.04 Study 2 - commitment vs control/"
study3_path = "2023.10.04 Study 3 - reds vs experts/"

### jobs - by national-local

In [40]:
ljobs.head(2)

Unnamed: 0,LeaderID,CareerString,CareerDateString_2022,CareerStartYear,CareerStartMonth,CareerStartDate,CareerSubstring,InstitutionType,PrimaryInstitution,OrgName,Local,Position,IsElected,OrgRank,PositionRank
0,리원일,노동성 상(유임),1999.02,1999,2,199902,,정권기관,내각,노동성,False,상,,1,1
1,조용원,당 정치국 후보위원,2020.01,2020,1,202001,,노동당,노동당,당중앙위원회_정치국,False,후보위원,1.0,2,3


In [41]:
ljobs.shape

(6047, 15)

In [63]:
# number of jobs, by local (True) and national (False)
count_vars = ["LeaderID","Local"]
ljobs[count_vars].groupby("Local").count()

Unnamed: 0_level_0,LeaderID
Local,Unnamed: 1_level_1
False,5580
True,467


In [85]:
leaders = list(ljobs.LeaderID.unique())

In [86]:
# total leaders
len(leaders)

584

In [74]:
# leaders who have had 1+ local jobs (Local was read as text, so it holds the string "True"/"False")
leaders_local = list(
    ljobs.loc[ljobs["Local"] == "True", "LeaderID"].unique()
)

In [76]:
len(leaders_local)

161

In [78]:
# number of leaders who have NEVER had local jobs
len(list(ljobs.LeaderID.unique())) - len(leaders_local)

423

In [79]:
# leaders who have had 1+ national jobs (Local was read as text, so it holds the string "True"/"False")
leaders_national = list(
    ljobs.loc[ljobs["Local"] == "False", "LeaderID"].unique()
)

In [81]:
len(leaders_national)

565

In [89]:
# leaders who have NEVER had national jobs
# (membership is tested against a set so each lookup is O(1); the order of `leaders` is preserved)
national_ids = set(leaders_national)
leaders_onlylocal = [leader for leader in leaders if leader not in national_ids]

In [90]:
# number of leaders who have NEVER had national jobs
len(leaders_onlylocal)

19

### trans - by national-local

In [43]:
trans.columns

Index(['LeaderID', 'CareerString_1', 'CareerDateString_2022_1',
       'CareerStartYear_1', 'CareerStartMonth_1', 'CareerStartDate_1',
       'CareerSubstring_1', 'InstitutionType_1', 'PrimaryInstitution_1',
       'OrgName_1', 'Local_1', 'Position_1', 'IsElected_1', 'OrgRank_1',
       'PositionRank_1', 'CareerString_2', 'CareerDateString_2022_2',
       'CareerStartYear_2', 'CareerStartMonth_2', 'CareerStartDate_2',
       'CareerSubstring_2', 'InstitutionType_2', 'PrimaryInstitution_2',
       'OrgName_2', 'Local_2', 'Position_2', 'IsElected_2', 'OrgRank_2',
       'PositionRank_2', 'OrgAdvance', 'PositionAdvance'],
      dtype='object')

In [47]:
count_vars2 = ["LeaderID","Local_1","Local_2"]
trans[count_vars2].groupby(["Local_1","Local_2"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,LeaderID
Local_1,Local_2,Unnamed: 2_level_1
False,False,3873
False,True,133
True,False,216
True,True,84


In [83]:
3873 + 133 + 216 + 84

4306

In [84]:
trans.shape

(4306, 31)

In [53]:
trans.columns

Index(['LeaderID', 'CareerString_1', 'CareerDateString_2022_1',
       'CareerStartYear_1', 'CareerStartMonth_1', 'CareerStartDate_1',
       'CareerSubstring_1', 'InstitutionType_1', 'PrimaryInstitution_1',
       'OrgName_1', 'Local_1', 'Position_1', 'IsElected_1', 'OrgRank_1',
       'PositionRank_1', 'CareerString_2', 'CareerDateString_2022_2',
       'CareerStartYear_2', 'CareerStartMonth_2', 'CareerStartDate_2',
       'CareerSubstring_2', 'InstitutionType_2', 'PrimaryInstitution_2',
       'OrgName_2', 'Local_2', 'Position_2', 'IsElected_2', 'OrgRank_2',
       'PositionRank_2', 'OrgAdvance', 'PositionAdvance'],
      dtype='object')

In [56]:
# columns to show when viewing/exporting example transitions (used for every local/national transition subset)
view_columns = ['LeaderID',
                'CareerStartDate_1','InstitutionType_1', 'PrimaryInstitution_1','OrgName_1', 'Position_1',
                'CareerStartDate_2','InstitutionType_2', 'PrimaryInstitution_2','OrgName_2', 'Position_2',
                'OrgAdvance', 'PositionAdvance']


In [104]:
# split transitions by the locality of the first (_1) and second (_2) job;
# Local_1 / Local_2 were read as text, so they hold the strings "True" / "False"
first_local = trans["Local_1"] == "True"
first_national = trans["Local_1"] == "False"
second_local = trans["Local_2"] == "True"
second_national = trans["Local_2"] == "False"

local_to_local = trans.loc[first_local & second_local, view_columns]
local_to_national = trans.loc[first_local & second_national, view_columns]
national_to_local = trans.loc[first_national & second_local, view_columns]
national_to_national = trans.loc[first_national & second_national, view_columns]

In [61]:
local_to_local.to_excel(path_analysis + study0_path + "local/local_to_local.xlsx",index=False)
local_to_national.to_excel(path_analysis + study0_path + "local/local_to_national.xlsx",index=False)
national_to_local.to_excel(path_analysis + study0_path + "local/national_to_local.xlsx",index=False)

### resumes - leaders who had local jobs

In [94]:
# full job histories of the leaders in leaders_onlylocal, ordered by leader and then by start date
sort_columns = ["LeaderID","CareerStartDate"]
is_onlylocal_leader = ljobs["LeaderID"].isin(leaders_onlylocal)
resume_onlylocal = ljobs.loc[is_onlylocal_leader].sort_values(by=sort_columns)

In [95]:
resume_onlylocal.to_excel(path_analysis + study0_path + "local/resume_onlylocal.xlsx",index=False)

In [96]:
# leaders who had local-national job transitions
leaders_local_to_national = list(local_to_national.LeaderID.unique())

In [97]:
len(leaders_local_to_national)

133

In [99]:
# leaders who had national-local job transitions
leaders_national_to_local = list(national_to_local.LeaderID.unique())

In [100]:
len(leaders_national_to_local)

85

In [102]:
# leaders who had local-local job transitions
leaders_local_to_local = list(local_to_local.LeaderID.unique())

In [103]:
len(leaders_local_to_local)

54

In [105]:
# leaders who had national-national job transitions
leaders_national_national = list(national_to_national.LeaderID.unique())

In [106]:
len(leaders_national_national)

470