In [1]:
import pandas as pd
import numpy as np
from datetime import date
from statistics import mean, mode
from pandas.api.types import CategoricalDtype

In [2]:
today = date.today()
print(today)

2024-02-15


# Tables

In [3]:
path_tables = "C:/Users/seoul/Dropbox/00 technical/github/nkelites/data/combined data/combined data - 2 tables/"

In [4]:
# tables
filename_careerorglink = "careerorglink.xlsx"
filename_leadercareerlink = "leadercareerlink.xlsx"
filename_orgtree = "orgtree.xlsx"
filename_elected = "positions_elected.xlsx"

In [5]:
# career-org link
col = pd.read_excel(path_tables + filename_careerorglink,dtype="str")
col.shape

(9002, 12)

In [6]:
col.columns

Index(['CareerString', 'CareerDateString_2022', 'IsJob', 'MultipleSubstrings',
       'CareerStartYear', 'CareerStartMonth', 'CareerSubstring',
       'InstitutionType', 'PrimaryInstitution', 'OrgName', 'Position',
       'Notes'],
      dtype='object')

In [7]:
# leader-career link
lcl = pd.read_excel(path_tables + filename_leadercareerlink,dtype="str")
lcl.shape

(12617, 3)

In [8]:
lcl.columns

Index(['LeaderID', 'CareerString', 'CareerDateString_2022'], dtype='object')

In [9]:
# orgtree
org = pd.read_excel(path_tables + filename_orgtree,dtype="str")
org.shape

(2367, 20)

In [10]:
org.columns

Index(['InstitutionType', 'OrgType', 'PrimaryInstitution', 'OrgName',
       'PI_Index', 'OrgRank', 'P1', 'P2', 'P3', 'Alias_OrgName',
       'LinkToNext_PI', 'LinkToNext_Org', 'LinkToNext_Year', 'Notes',
       'L1_Index', 'L2_Index', 'L3_Index', 'L4_Index', 'L5_Index', 'Local'],
      dtype='object')

In [11]:
# elected = pd.read_excel(path_tables + filename_elected,dtype="str")
# elected.shape

In [12]:
# elected.columns

# Queries

In [13]:
path_queries = "C:/Users/seoul/Dropbox/00 technical/github/nkelites/data/combined data/combined data - 3 queries/"

In [14]:
filename_leaderjob_all = "leaderjob_electUnelect_inOutgov.xlsx"
filename_leaderjob_no_spa = "leaderjob_no_spa.xlsx"
filename_leaderjobtransition_no_spa = "leaderjobtransition_no_spa.xlsx"

In [15]:
# leader jobs
ljobs_all = pd.read_excel(path_queries + filename_leaderjob_all,dtype="str")
ljobs_all.shape

(8594, 14)

In [16]:
# ljobs = pd.read_excel(path_queries + filename_leaderjob_no_spa,dtype="str")
# ljobs.shape

In [17]:
# trans = pd.read_excel(path_queries + filename_leaderjobtransition_no_spa,dtype="str")
# trans.shape

#### change datatypes

In [18]:
# ljobs = ljobs.astype({"CareerStartYear":"int","CareerStartDate":"int"})
# ljobs.dtypes

In [19]:
# trans = trans.astype({"OrgAdvance":"int","PositionAdvance":"int",
#                      "CareerStartYear_1":"int","CareerStartYear_2":"int",
#                      "CareerStartDate_1":"int","CareerStartDate_2":"int"})
# trans.dtypes

# Functions

In [20]:
def merge_results(m):
    
    print("\nMerge Results...")
    print("")
    print("\tshape     :",m.shape)
    print("\tleft_only :",m[m["_merge"]=="left_only"].shape)
    print("\tboth      :",m[m["_merge"]=="both"].shape)
    print("\tright_only:",m[m["_merge"]=="right_only"].shape)

In [21]:
# using this on (PI,OrgName) will ensure unique & non-null keys
# using this on a larger df will ensure unique rows and non-null keys, but not unique keys

def unique_non_null_rows(olddf):
    
    df = olddf.copy()
    
    
    ### drop duplicates
    df = df.drop_duplicates(keep="first",ignore_index=True)

    ### drop null rows
    df.dropna(how="all",axis=0,inplace=True)
    
    # drop rows with null PI
    df = df[~(df["PrimaryInstitution"].isna())]
        
    ### drop rows where PI contains stop words
    stop_words_lower = ["uncertain","current","deprecated","please_revise"]
    df = df[ ~ df["PrimaryInstitution"].str.lower().isin(stop_words_lower)]
            

    print("\nUnique Non-null Rows...")
    print("")
    print("\tNon-unique rows:",olddf.shape)
    print("\tUnique rows    :",df.shape)

    ### sort
    df = df.sort_values(["PrimaryInstitution","OrgName"])
    
    return df

In [22]:
def create_time_series(series,group_var,count_var):
    
    yeardist = series.groupby(group_var,as_index=False).count().sort_values(group_var)
    
    max_year = max(series[group_var])
    x = pd.DataFrame({"year":pd.Series(range(min(yeardist[group_var].astype(int)),max(yeardist[group_var].astype(int))+1))})
    
    yeardist[group_var] = yeardist[group_var].astype(int,errors="ignore")
    
    ts = x.merge(yeardist,left_on="year",right_on=group_var,how="left")
    ts.loc[ts[count_var].isna(),count_var]=0
    # merge_results(ts)
    
    ts_columns = ["year",count_var]
    ts = ts[ts_columns]
    
    return ts

# Analysis - Research Note

In [23]:
path_analysis = "C:/Users/seoul/Dropbox/00 technical/github/nkelites/analysis/"

In [24]:
# analysis sub-paths
study0_path = "2023.10.04 Study 0 - research note/"
study1_path = "2023.10.04 Study 1 - political capital/"
study2_path = "2023.10.04 Study 2 - commitment vs control/"
study3_path = "2023.10.04 Study 3 - reds vs experts/"

# Statistics - elites, orgs and jobs

# Elites stats

In [25]:
# include all jobs: elected/not-elected, SPA/not-SPA, local/central

In [26]:
lcl.shape

(12617, 3)

In [27]:
lcl.head()

Unnamed: 0,LeaderID,CareerString,CareerDateString_2022
0,리선권,개성공단 남북공동위원회 통행통신통관 분과위원회,2013.09 ~ 2014.01
1,조경철,故 김정일 국가장의위원회 위원,2011.12
2,리영수,근로단체부장 해임 * 후임: 리일환,2014
3,리설주,금수산궁전 개관식 참석시 동행,2013.08
4,리원일,노동성 상(유임),1999.02


In [28]:
# total elites
lcl.LeaderID.unique().shape

(637,)

In [29]:
ljobs_all.columns

Index(['LeaderID', 'CareerString', 'CareerDateString_2022', 'CareerStartYear',
       'CareerStartMonth', 'CareerStartDate', 'CareerSubstring',
       'InstitutionType', 'PrimaryInstitution', 'OrgName', 'Position',
       'IsElected', 'OrgRank', 'PositionRank'],
      dtype='object')

In [30]:
# total elites with jobs
ljobs_all.LeaderID.unique().shape

(607,)

## Org stats

In [31]:
org.shape

(2367, 20)

In [32]:
org.columns

Index(['InstitutionType', 'OrgType', 'PrimaryInstitution', 'OrgName',
       'PI_Index', 'OrgRank', 'P1', 'P2', 'P3', 'Alias_OrgName',
       'LinkToNext_PI', 'LinkToNext_Org', 'LinkToNext_Year', 'Notes',
       'L1_Index', 'L2_Index', 'L3_Index', 'L4_Index', 'L5_Index', 'Local'],
      dtype='object')

In [33]:
not_social_orgs = ["Party","Military","Government"]

In [34]:
org2 = org

In [35]:
org2.shape

(2367, 20)

In [36]:
org2["InstitutionCategory"] = org2["InstitutionType"].copy()

In [37]:
org2 = org2[~(org2["InstitutionType"]=="UNCERTAIN")]
org2.shape

(2328, 21)

In [38]:
# rename levels of InstitutionCategory
org2.loc[org2["InstitutionCategory"]=="노동당","InstitutionCategory"]="Party"
org2.loc[org2["InstitutionCategory"]=="정권기관","InstitutionCategory"]="Government"
org2.loc[org2["InstitutionCategory"]=="인민군","InstitutionCategory"]="Military"
org2.loc[~(org2["InstitutionCategory"].isin(not_social_orgs)),"InstitutionCategory"]="Social"

In [39]:
org2.InstitutionCategory.unique()

array(['Party', 'Social', 'Government', 'Military'], dtype=object)

In [40]:
# rename levels of InstitutionCategory
org2.loc[org2["Local"]=="True","Local"]="Local"
org2.loc[org2["Local"]=="False","Local"]="Central"

In [41]:
org2.Local.unique()

array(['Central', 'Local'], dtype=object)

### double-check OrgRank

In [42]:
select_cols = ["InstitutionType","PrimaryInstitution","OrgName","OrgRank"]

In [43]:
#pd.set_option('display.max_rows', None)
#org2[select_cols]

### org stat1 - Number of Organizations by Institution

In [44]:
stat1_columns = ["InstitutionCategory","OrgName"]
stat1_column_labels = ["Institution","Number of Organizations"]

In [45]:
stat1 = org2[stat1_columns].groupby("InstitutionCategory",as_index=False).count()
stat1.columns = stat1_column_labels
stat1

Unnamed: 0,Institution,Number of Organizations
0,Government,1300
1,Military,136
2,Party,120
3,Social,201


In [46]:
# export to excel


### org stat2 - Number of Organizations by Institution and Rank

In [47]:
stat2_columns = ["InstitutionCategory","OrgRank","OrgName"]
stat2_groupby_columns = ["InstitutionCategory","OrgRank"]
stat2_column_labels = ["Institution","Rank","Number of Organizations"]

In [48]:
stat2 = org2[stat2_columns].groupby(stat2_groupby_columns,as_index=False).count()
stat2.columns = stat2_column_labels
stat2

Unnamed: 0,Institution,Rank,Number of Organizations
0,Government,0,0
1,Government,1,258
2,Government,2,886
3,Government,3,150
4,Government,4,6
5,Military,0,0
6,Military,1,82
7,Military,2,13
8,Military,3,23
9,Military,4,11


In [49]:
# export to excel


### org stat3 - Number of Organizations by Institution and Rank

In [50]:
stat3_columns = ["InstitutionCategory","Local","OrgName"]
stat3_groupby_columns = ["InstitutionCategory","Local"]
stat3_column_labels = ["Institution","Local or Central","Number of Organizations"]

In [51]:
stat3 = org2[stat3_columns].groupby(stat3_groupby_columns,as_index=False).count()
stat3.columns = stat3_column_labels
stat3

Unnamed: 0,Institution,Local or Central,Number of Organizations
0,Government,Central,1220
1,Government,Local,80
2,Military,Central,111
3,Military,Local,25
4,Party,Central,86
5,Party,Local,34
6,Social,Central,175
7,Social,Local,26


# org - Example of OrgTree

In [93]:
select_cols = ["PrimaryInstitution","OrgName","P1","P2","P3","OrgRank"]
sort_cols = ["PrimaryInstitution","OrgName"]
select_rows = (org2["PrimaryInstitution"]=="노동당") & (org2["OrgName"].str.contains("당중앙위원회"))
orgtree_sample_party = org2.loc[select_rows,select_cols].sort_values(sort_cols).head(50)
orgtree_sample_party

Unnamed: 0,PrimaryInstitution,OrgName,P1,P2,P3,OrgRank
28,노동당,당중앙위원회,총비서,,"과장,책임지도원,위원,후보위원,지도원,고문",1
122,노동당,당중앙위원회_38호실,실장,,,2
61,노동당,당중앙위원회_39호실,실장,"부실장,제1부부장",,2
62,노동당,당중앙위원회_X부,부장,"부부장,제1부부장",,2
2232,노동당,당중앙위원회_X위원회,위원장,"비서,부위원장",,2
106,노동당,당중앙위원회_간부부,부장,,,2
93,노동당,당중앙위원회_강원도당위원회,책임비서,비서,,2
96,노동당,당중앙위원회_강원도당위원회_문천군당위원회,책임비서,비서,,3
94,노동당,당중앙위원회_강원도당위원회_문천시당위원회,책임비서,비서,,3
95,노동당,당중앙위원회_강원도당위원회_원산시당위원회,책임비서,비서,,3


In [94]:
select_cols = ["PrimaryInstitution","OrgName","P1","P2","P3","OrgRank"]
sort_cols = ["PrimaryInstitution","OrgName"]
select_rows = (org2["PrimaryInstitution"]=="내각")
orgtree_sample_gov = org2.loc[select_rows,select_cols].sort_values(sort_cols).head(50)
orgtree_sample_gov

Unnamed: 0,PrimaryInstitution,OrgName,P1,P2,P3,OrgRank
1583,내각,225국,국장,부국장,,1
1584,내각,225국_X,,,,2
699,내각,간석지건설지도국,국장,부국장,,1
707,내각,간석지건설지도국_간석지설계사업소,지배인,기사장,,2
739,내각,건설건재공업성,상,부상,,1
742,내각,건설건재공업성_과학기술국,국장,부국장,,2
740,내각,건설건재공업성_돌가강관리국,국장,부국장,,2
741,내각,건설건재공업성_발전소건설국,국장,부국장,,2
747,내각,건설건재공업성_비품제작소,소장,부소장,,2
743,내각,건설건재공업성_산업건설관리국,국장,부국장,,2


In [95]:
# export samples of orgtree
orgtree_sample_party.to_excel(path_analysis + study0_path + "orgtree_sample_party.xlsx",index=False)
orgtree_sample_gov.to_excel(path_analysis + study0_path + "orgtree_sample_gov.xlsx",index=False)

# Jobs stats

In [52]:
ljobs_all.shape

(8594, 14)

In [53]:
ljobs_all.columns

Index(['LeaderID', 'CareerString', 'CareerDateString_2022', 'CareerStartYear',
       'CareerStartMonth', 'CareerStartDate', 'CareerSubstring',
       'InstitutionType', 'PrimaryInstitution', 'OrgName', 'Position',
       'IsElected', 'OrgRank', 'PositionRank'],
      dtype='object')

In [54]:
jobs_columns = ['LeaderID','CareerStartYear','CareerStartDate', 'InstitutionType', 'PrimaryInstitution', 'OrgName', 'Position', 'IsElected', 'OrgRank', 'PositionRank']

In [55]:
ljobs_test = unique_non_null_rows(ljobs_all[jobs_columns])
ljobs_test.shape


Unique Non-null Rows...

	Non-unique rows: (8594, 10)
	Unique rows    : (6116, 10)


(6116, 10)

In [56]:
jobs2 = unique_non_null_rows(ljobs_all[jobs_columns])
jobs2.shape


Unique Non-null Rows...

	Non-unique rows: (8594, 10)
	Unique rows    : (6116, 10)


(6116, 10)

In [57]:
select_cols = ["InstitutionType","OrgName"]
# jobs2[select_cols].groupby("InstitutionType").count()
jobs2[jobs2.PrimaryInstitution=="최고인민회의"].shape

(1870, 10)

### job stats1 - Distribution of Number of Jobs in Elite Resumes

In [58]:
jstat1_columns = ["LeaderID","Position"]
jstat1_groupby_columns = ["LeaderID"]

In [59]:
jstat1 = jobs2[jstat1_columns].groupby(jstat1_groupby_columns,as_index=False).count()
# jstat1.columns = stat3_column_labels
jstat1.sort_values("Position",ascending=False,inplace=True)
jstat1

Unnamed: 0,LeaderID,Position
406,양형섭,58
328,박성철a,51
108,김영남,48
146,김일성,48
273,리종옥,47
...,...,...
571,포희성,1
279,리충길,1
69,김만길,1
329,박성철b,1


### jobs stat2: Distribution of the number of jobs in each elite resume

In [60]:
# jstat2_columns = ["Position","Position"]
jstat2_groupby_columns = ["Position"]
jstat2_column_labels = ["Number of Jobs in Resume","Number of Elites"]

In [61]:
jstat2 = jstat1.groupby(jstat2_groupby_columns,as_index=False).count()
jstat2.sort_values("Position",ascending=True,inplace=True)
jstat2.columns = jstat2_column_labels
jstat2

Unnamed: 0,Number of Jobs in Resume,Number of Elites
0,1,41
1,2,57
2,3,52
3,4,44
4,5,39
5,6,33
6,7,48
7,8,40
8,9,33
9,10,24


### job stat3 - Number of Government Jobs by SPA

In [62]:
stat3_columns = ["SPA","OrgName"]
stat3_groupby_columns = ["SPA"]
stat3_column_labels = ["SPA","Number of Jobs"]

In [63]:
jobs3 = jobs2[jobs2["InstitutionType"]=="정권기관"]
jobs3.loc[:,["SPA"]]=False
jobs3.loc[jobs3["PrimaryInstitution"]=="최고인민회의",["SPA"]] = True

In [64]:
stat3 = jobs3[stat3_columns].groupby(stat3_groupby_columns,as_index=False).count()
stat3.columns = stat3_column_labels
stat3

Unnamed: 0,SPA,Number of Jobs
0,False,1099
1,True,362


### jobs stat4: average jobs per year, omitting 최고인민회의

In [65]:
jobs2.columns

Index(['LeaderID', 'CareerStartYear', 'CareerStartDate', 'InstitutionType',
       'PrimaryInstitution', 'OrgName', 'Position', 'IsElected', 'OrgRank',
       'PositionRank'],
      dtype='object')

In [66]:
jobs4 = jobs2[~(jobs2.PrimaryInstitution=="최고인민회의")]
jobs4.shape

(4246, 10)

In [67]:
# 
# jobs2.shape[0] - 1870

In [68]:
jstat4_columns = ["CareerStartYear","Position"]
jstat4_groupby_columns = ["CareerStartYear"]
# jstat4_column_labels = ["Number of Jobs in Resume","Number of Elites"]

In [69]:
jstat4 = jobs4[jstat4_columns].groupby(jstat4_groupby_columns,as_index=False).count()
jstat4.sort_values('CareerStartYear',ascending=True,inplace=True)
jstat4

Unnamed: 0,CareerStartYear,Position
0,1937,1
1,1945,6
2,1946,5
3,1947,1
4,1948,9
...,...,...
74,2018,51
75,2019,184
76,2020,48
77,2021,235


In [70]:
mean(list(jstat4.Position))

53.74683544303797

In [71]:
jstat4["Largerthan53"]=False
jstat4.loc[jstat4.Position>53,"Largerthan53"]=True

In [72]:
jstat4[jstat4.Position<54]

Unnamed: 0,CareerStartYear,Position,Largerthan53
0,1937,1,False
1,1945,6,False
2,1946,5,False
3,1947,1,False
4,1948,9,False
5,1949,2,False
6,1950,11,False
7,1951,7,False
8,1952,9,False
9,1953,4,False


### Decide - whether to calculate following stats for jobs_all or jobs_no SPA

In [73]:
# jobs2 = jobs_all, jobs4 = jobs_no SPA
#jobs_decide = jobs2
jobs_decide = jobs4

In [384]:
jobs4.shape

(4246, 10)

In [76]:
# unique Leader IDs in jobs4
len(jobs4.LeaderID.unique())

584

### job stat5 - Number of Local Jobs

In [365]:
stat5_columns = ["Local","OrgName"]
stat5_groupby_columns = ["Local"]
stat5_column_labels = ["Local","Number of Jobs"]

In [366]:
org2.columns

Index(['InstitutionType', 'OrgType', 'PrimaryInstitution', 'OrgName',
       'PI_Index', 'OrgRank', 'P1', 'P2', 'P3', 'Alias_OrgName',
       'LinkToNext_PI', 'LinkToNext_Org', 'LinkToNext_Year', 'Notes',
       'L1_Index', 'L2_Index', 'L3_Index', 'L4_Index', 'L5_Index', 'Local',
       'InstitutionCategory'],
      dtype='object')

In [388]:
# merge leader-job linklist with orgtree to get Local and InstitutionCategory :)
ot_columns = ["PrimaryInstitution","OrgName","Local","InstitutionCategory"]
pk_columns = ["PrimaryInstitution","OrgName"]
jobs5 = jobs_decide.merge(org2[ot_columns],on=pk_columns,how="left",indicator=True)
merge_results(jobs5)


Merge Results...

	shape     : (4246, 13)
	left_only : (0, 13)
	both      : (4246, 13)
	right_only: (0, 13)


In [389]:
jobs5.InstitutionCategory

0       Social
1       Social
2       Social
3       Social
4       Social
         ...  
4241    Social
4242    Social
4243    Social
4244    Social
4245    Social
Name: InstitutionCategory, Length: 4246, dtype: object

In [390]:
stat5 = jobs5[stat5_columns].groupby(stat5_groupby_columns,as_index=False).count()
stat5.columns = stat5_column_labels
stat5

Unnamed: 0,Local,Number of Jobs
0,Central,2897
1,Local,224


In [383]:
2897 + 224

3121

### job stat6 - Percent of Local Jobs for Each Institution Category

In [370]:
stat6_columns = ["InstitutionCategory","IsLocal"]
stat6_groupby_columns = ["InstitutionCategory"]
stat6_column_labels = ["InstitutionCategory","Total Job Count","Local Job Count","Percent Local Jobs "]

In [371]:
jobs5.loc[:,"IsLocal"] = False
jobs5.loc[jobs5["Local"]=="Local","IsLocal"] = True

In [372]:
# use job5, which is job2 + Local variable from orgtree
stat6 = jobs5[stat6_columns].groupby(stat6_groupby_columns,as_index=False)["IsLocal"].agg({"Job Count":"count","Local Job Count":"sum"})
stat6.loc[:,"Percent Local Jobs"] = stat6.apply(lambda x: round(x["Local Job Count"]/x["Job Count"]*100,2),axis=1)
stat6.columns = stat6_column_labels
stat6

Unnamed: 0,InstitutionCategory,Total Job Count,Local Job Count,Percent Local Jobs
0,Government,1421,48,3.38
1,Military,352,62,17.61
2,Party,1620,130,8.02
3,Social,853,60,7.03


In [391]:
# 4246
# 1099 + 191 + 1609 + 222
1421 + 352 + 1620 + 853

4246

### job stat7 - Number of Jobs by Organization Rank

In [398]:
jobs5.head()

Unnamed: 0,LeaderID,CareerStartYear,CareerStartDate,InstitutionType,PrimaryInstitution,OrgName,Position,IsElected,OrgRank,PositionRank,Local,InstitutionCategory,_merge
0,강능수,1989,198904,당외곽및사회단체_사회부문(별책),4.15문화창작단,,단장,,0,1,Central,Social,both
1,안동춘,2004,200400,당외곽및사회단체_사회부문(별책),4.15문화창작단,,부단장,,0,2,Central,Social,both
2,안동춘,2004,200404,당외곽및사회단체_사회부문(별책),4.15문화창작단,,부단장,,0,2,Central,Social,both
3,홍서헌,2007,200704,국제친선단체,北-러시아 친선의원단,,위원장,,0,1,Central,Social,both
4,홍서헌,1999,199903,국제친선단체,北-러시아 친선의원단,,위원장,,0,1,Central,Social,both


In [399]:
stat7_columns = ["OrgRank","PrimaryInstitution"]
stat7_groupby_columns = ["OrgRank"]
stat7_column_labels = ["Organization Rank","Number of Jobs"]

In [400]:
# use job5, which is job2 + Local variable from orgtree
stat7 = jobs5[stat7_columns].groupby(stat7_groupby_columns,as_index=False).count()
stat7.columns = stat7_column_labels
stat7

Unnamed: 0,Organization Rank,Number of Jobs
0,0,1125
1,1,1769
2,2,1000
3,3,324
4,4,21
5,5,7


### job stat8 - Percent of Jobs by Org Rank, for Each Institution 

In [401]:
jobs5.shape

(4246, 13)

In [403]:
stat8_columns = ["InstitutionCategory","OrgRank","PrimaryInstitution"]
stat8_groupby_columns = ["InstitutionCategory","OrgRank"]
stat8_column_labels = ["Institution Category","Organization Rank","Number of Jobs"]

In [404]:
# use job5, which is job2 + Local variable from orgtree
stat8 = jobs5[stat8_columns].groupby(stat8_groupby_columns,as_index=False).count()
stat8.columns = stat8_column_labels
stat8

Unnamed: 0,Institution Category,Organization Rank,Number of Jobs
0,Government,0,322
1,Government,1,696
2,Government,2,191
3,Government,3,207
4,Government,4,5
5,Military,0,161
6,Military,1,88
7,Military,2,22
8,Military,3,59
9,Military,4,15


In [405]:
stat8_catsum_columns = ["Institution Category","Number of Jobs"]
stat8_catsum_groupby_columns = ["Institution Category"]
stat8_catsum_column_labels = ["Institution Category","Total Jobs by Category"]
stat8_catsum = stat8[stat8_catsum_columns].groupby(stat8_catsum_groupby_columns,as_index=False).agg({"Number of Jobs":"sum"})
stat8_catsum.columns = stat8_catsum_column_labels
stat8_catsum

Unnamed: 0,Institution Category,Total Jobs by Category
0,Government,1421
1,Military,352
2,Party,1620
3,Social,853


In [406]:
# 4246
1421 + 352 + 1620 + 853

4246

In [407]:
stat8 = stat8.merge(stat8_catsum,on="Institution Category",how="left")
stat8

Unnamed: 0,Institution Category,Organization Rank,Number of Jobs,Total Jobs by Category
0,Government,0,322,1421
1,Government,1,696,1421
2,Government,2,191,1421
3,Government,3,207,1421
4,Government,4,5,1421
5,Military,0,161,352
6,Military,1,88,352
7,Military,2,22,352
8,Military,3,59,352
9,Military,4,15,352


In [408]:
stat8["Percent Jobs by Rank"] = round(stat8["Number of Jobs"] / stat8["Total Jobs by Category"] * 100,2)
stat8

Unnamed: 0,Institution Category,Organization Rank,Number of Jobs,Total Jobs by Category,Percent Jobs by Rank
0,Government,0,322,1421,22.66
1,Government,1,696,1421,48.98
2,Government,2,191,1421,13.44
3,Government,3,207,1421,14.57
4,Government,4,5,1421,0.35
5,Military,0,161,352,45.74
6,Military,1,88,352,25.0
7,Military,2,22,352,6.25
8,Military,3,59,352,16.76
9,Military,4,15,352,4.26


### job stat9 - Number of Jobs by Position Rank

In [409]:
stat9_columns = ["PositionRank","PrimaryInstitution"]
stat9_groupby_columns = ["PositionRank"]
stat9_column_labels = ["Position Rank","Number of Jobs"]

In [410]:
# use job5, which is job2 + Local variable from orgtree
stat9 = jobs5[stat9_columns].groupby(stat9_groupby_columns,as_index=False).count()
stat9.columns = stat9_column_labels
stat9

Unnamed: 0,Position Rank,Number of Jobs
0,1,1861
1,2,1057
2,3,1328


In [412]:
# 4246
1861 + 1057 + 1328

4246

### job stat10 - Percent of Jobs by Position Rank, for each Institution Category

In [413]:
stat10_columns = ["InstitutionCategory","PositionRank","PrimaryInstitution"]
stat10_groupby_columns = ["InstitutionCategory","PositionRank"]
stat10_column_labels = ["Institution Category","Position Rank","Number of Jobs"]

In [414]:
# use job5, which is job2 + Local variable from orgtree
stat10 = jobs5[stat10_columns].groupby(stat10_groupby_columns,as_index=False).count()
stat10.columns = stat10_column_labels
stat10

Unnamed: 0,Institution Category,Position Rank,Number of Jobs
0,Government,1,775
1,Government,2,422
2,Government,3,224
3,Military,1,211
4,Military,2,89
5,Military,3,52
6,Party,1,371
7,Party,2,313
8,Party,3,936
9,Social,1,504


In [415]:
stat10_catsum_columns = ["Institution Category","Number of Jobs"]
stat10_catsum_groupby_columns = ["Institution Category"]
stat10_catsum_column_labels = ["Institution Category","Total Jobs by Category"]
stat10_catsum = stat10[stat10_catsum_columns].groupby(stat10_catsum_groupby_columns,as_index=False).agg({"Number of Jobs":"sum"})
stat10_catsum.columns = stat10_catsum_column_labels
stat10_catsum

Unnamed: 0,Institution Category,Total Jobs by Category
0,Government,1421
1,Military,352
2,Party,1620
3,Social,853


In [416]:
stat10 = stat10.merge(stat10_catsum,on="Institution Category",how="left")
stat10

Unnamed: 0,Institution Category,Position Rank,Number of Jobs,Total Jobs by Category
0,Government,1,775,1421
1,Government,2,422,1421
2,Government,3,224,1421
3,Military,1,211,352
4,Military,2,89,352
5,Military,3,52,352
6,Party,1,371,1620
7,Party,2,313,1620
8,Party,3,936,1620
9,Social,1,504,853


In [417]:
stat10["Percent Jobs by Rank"] = round(stat10["Number of Jobs"] / stat10["Total Jobs by Category"] * 100,2)
stat10

Unnamed: 0,Institution Category,Position Rank,Number of Jobs,Total Jobs by Category,Percent Jobs by Rank
0,Government,1,775,1421,54.54
1,Government,2,422,1421,29.7
2,Government,3,224,1421,15.76
3,Military,1,211,352,59.94
4,Military,2,89,352,25.28
5,Military,3,52,352,14.77
6,Party,1,371,1620,22.9
7,Party,2,313,1620,19.32
8,Party,3,936,1620,57.78
9,Social,1,504,853,59.09
