In [1]:
import pandas as pd
from collections import defaultdict
import numpy as np

# Read the data

In [2]:
def main(csv_file):
    """
    Given an appropriate csv_file, output the relevant columns.

    Returns df with columns [user, start, end, normalized_company, industry]
    """

    employ_data = pd.read_csv(csv_file, sep="\t", header=None,
                              names=[i for i in range(34)], low_memory=False)
    # column info from taxonomy file
    name = ['user', 'name', 'birth', 'gender', 'primary',
            'primary_weight', 'secondary', 'secondary_weight',
            'city', 'country', 'education', 'elite', 'start',
            '.', 'end', '??', '/', 'length', 'role', 'department',
            'company', 'normalized_company', 'ticker', 'exchange',
            'public', 'location_company', 'industry', 'educational',
            'degree', 'elite_education', 'major', 'department', 'FIGI',
            'last_update']
#     drop = ['length', 'gender', 'primary',
#         'primary_weight', 'secondary', 'secondary_weight',
#         'city', 'country', 'education', 'elite', '.', '??',
#         '/', 'department', 'exchange',
#         'public', 'location_company', 'educational', 'degree', 'elite_education',
#         'major', 'department', 'FIGI', 'last_update']
    employ_data.columns = name
    return employ_data

def standardize_dates(company, missing_start = '1900-01-01', missing_end = '2018-01-01'):
    """
    Converts start date and end date to datetime objects, and converts None values to the specified missing 
    dates.

    Returns the modified dataframe
    """
    company_data = raw_data[company].copy()
    company_data['start'] = company_data['start'].str.replace('None', missing_start)
    company_data['end'] = company_data['end'].str.replace('None', missing_end)
    company_data['start'] = pd.to_datetime(company_data['start'])
    company_data['end'] = pd.to_datetime(company_data['end'])
    return company_data

def get_users(company_name, company_data, worked_date = '2008-01-01', missing_start = '1900-01-01', missing_end = '2018-01-01'):
    """"
    Returns the users who worked at a given company on worked_date, that does not have both start and
    end dates missing
    
    worked_date: string specifying the date on which to extract employees from. 
                 Must be coercible into a datetime object
    missing_start: default value for missing start dates
    missing_end: default value for missing end dates
    """
    worked_date = pd.to_datetime(worked_date)
    missing_start = pd.to_datetime(missing_start)
    missing_end = pd.to_datetime(missing_end)
    x = company_data
    
    company_tickers = {'db': 'DB', 'leh': 'LEH', 'gs': 'GS', 'ms': 'MS^E', 'ubs': 'UBS'}
    # conditions: start and end not both missing, worked before/after 2008-01-01, ticker matches company
    mask = ~((x['start'] == missing_start) & (x['end'] == missing_end)) & \
        (x['start'] < worked_date) & \
        (x['end'] > worked_date) & \
        (x['ticker'] == company_tickers[company_name])
    return company_data[mask]['user'].unique()

In [3]:
# data without datetime features, and none values for some dates
raw_data = {'db': main('./Data/DB_profiles.csv'),
            'gs': main('./Data/GS_profiles.csv'),
            'leh': main('./Data/LEH_profiles.csv'),
            'ms': main('./Data/MS_profiles.csv'),
            'ubs': main('./Data/UBS_profiles.csv')
            }

# add datetime features for each company
for company in raw_data.keys():
    raw_data[company] = standardize_dates(company)
    
# get the users that worked there as of 2008-01-01
users = {company_name: get_users(company_name, company_data) for company_name, company_data in raw_data.items()}

# filter entries to people that worked there as of 2008-01-01
data = {}
for company, company_data in raw_data.items():
    company_users = users[company]
    data[company] = company_data[company_data['user'].isin(company_users)]


# Extracting and categorizing job types as of 2008

In [4]:
def job_title_2008(company_name, company_data):
    """"
    Return each user's job title at the given company as of 2008-01-01
    """
    date_2008 = pd.to_datetime('2008-01-01')
    missing_start = pd.to_datetime('1900-01-01')
    missing_end = pd.to_datetime('2018-01-01')

    company_tickers = {'db': 'DB', 'leh': 'LEH', 'gs': 'GS', 'ms': 'MS^E', 'ubs': 'UBS'}

    x = company_data
    mask = ~((x['start'] == missing_start) & (x['end'] == missing_end)) & \
           (x['start'] < date_2008) & \
           (x['end'] > date_2008) & \
           (x['ticker'] == company_tickers[company_name])
    return company_data[mask]

In [5]:
drop = ['length', 'gender', 'primary',
        'primary_weight', 'secondary', 'secondary_weight',
        'city', 'country', 'education', 'elite', '.', '??',
        '/', 'department', 'exchange',
        'public', 'location_company', 'educational', 'degree', 'elite_education',
        'major', 'department', 'FIGI', 'last_update', 'industry', 'birth', 'company']

matching_data = {company_name: company_data.drop(labels=drop, axis=1) for company_name, company_data in data.items()}



job_title_as_of_2008 = {company_name: job_title_2008(company_name, company_data) for company_name, company_data in
                  matching_data.items()}

all_data = pd.concat(job_title_as_of_2008.values())
# only person missing a role in the entire data set
all_data = all_data.drop(11512)

In [202]:
# begin extracting job titles
directors = set(all_data[(all_data.role.str.contains(r'director|MD,md', case=False))
                         | (all_data.role.str.match(r'ed|md', case=False))].user)
all_roles = directors.copy()

analysts = set(all_data[all_data.role.str.contains('analyst|Anaylst', case=False)].user).difference(all_roles)
all_roles = all_roles.union(analysts)

vps = set(all_data[all_data.role.str.contains('president|vp', case=False)].user).difference(all_roles)
all_roles = all_roles.union(vps)

assocs = set(all_data[all_data.role.str.contains('associate', case=False)].user).difference(all_roles)
all_roles = all_roles.union(assocs)

accountants = set(
    all_data[all_data.role.str.contains('accountant|account executive|accounting', case=False)].user).difference(
    all_roles)
all_roles = all_roles.union(accountants)

consultants = set(all_data[all_data.role.str.contains('consultant', case=False)].user).difference(all_roles)
all_roles = all_roles.union(consultants)

missing = set(all_data[all_data.role.str.match(r'-|\?|\.', case=False)].user).difference(all_roles)
all_roles = all_roles.union(missing)

developers = set(
    all_data[all_data.role.str.contains(r'developer|engineer|system administrator', case=False)].user).difference(
    all_roles)
all_roles = all_roles.union(developers)

interns = set(all_data[all_data.role.str.contains('intern|trainee|apprentice', case=False)].user).difference(all_roles)
all_roles = all_roles.union(interns)

specialists = set(
    all_data[all_data.role.str.contains('specialist|administrator|research|expert', case=False)].user).difference(
    all_roles)
all_roles = all_roles.union(specialists)

sales = set(all_data[all_data.role.str.contains('sales', case=False)].user).difference(all_roles)
all_roles = all_roles.union(sales)

traders = set(all_data[all_data.role.str.contains(r'trader|trading|Portfolio Management', case=False)].user).difference(
    all_roles)
all_roles = all_roles.union(traders)

bankers = set(all_data[all_data.role.str.contains(r'banking|banker|finance', case=False)].user).difference(all_roles)
all_roles = all_roles.union(bankers)

controllers = set(all_data[all_data.role.str.contains('controller', case=False)].user).difference(all_roles)
all_roles = all_roles.union(controllers)

partners = set(all_data[all_data.role.str.contains('partner', case=False)].user).difference(all_roles)
all_roles = all_roles.union(partners)

counsels = set(all_data[all_data.role.str.contains('counsel', case=False)].user).difference(all_roles)
all_roles = all_roles.union(counsels)

recruiters = set(all_data[all_data.role.str.contains('recruiter|human resources', case=False)].user).difference(
    all_roles)
all_roles = all_roles.union(recruiters)

advisors = set(all_data[all_data.role.str.contains('advisor|adviseur', case=False)].user).difference(all_roles)
all_roles = all_roles.union(advisors)

assistants = set(
    all_data[all_data.role.str.contains('assistant|support|services|receptionist', case=False)].user).difference(
    all_roles)
all_roles = all_roles.union(assistants)

managers = set(all_data[all_data.role.str.contains(
    r'manager|supervisor|team lead|head|lead|coordinator|representative|process executive',
    case=False)].user).difference(all_roles)
all_roles = all_roles.union(managers)

others = set(all_data.user).difference(all_roles)

ValueError: cannot index with vector containing NA / NaN values

In [203]:
# zip all sets and all job title names
all_sets = [directors, analysts, vps, assocs, advisors, assistants, consultants, managers, missing, developers, interns,
            specialists, sales, traders, bankers, controllers, partners, counsels, recruiters, accountants, others]
job_titles = ['director', 'analyst', 'vp', 'assoc', 'advisor', 'assistant', 'consultant', 'manager', 'missing',
              'developer', 'intern', 'specialist', 'sale', 'trader', 'banker', 'controller', 'partner', 'counsel',
              'recruiter', 'accountant', 'other']

zipped = list(zip(all_sets, job_titles))


def to_dict(dictionary, users, job_title):
    """Map users to job_title in the given dictionary"""
    for user in users:
        dictionary.update({user: job_title})


user_to_job_type = {}
[to_dict(user_to_job_type, x, y) for x, y in zipped]
user_to_job_type.update({'c0a3eb6a-59db-3a30-8a39-99a7cc8b9ce1': 'specialist'})
user_to_job_type.update({'5f425323-1cdf-3e81-a08e-35b483c42da9': 'missing'})

# Missing industry mapping

In [8]:
# read all the csv files
profile = pd.read_csv('./Data/profile_industry_mappings.csv', header=None, names=[i for i in range(5)], dtype={4: str})
profile.drop([0, 2], axis='columns', inplace=True)
profile.rename(mapper={1: 'company', 3: 'norm', 4: "ind"}, axis='columns', inplace=True)

mturk = pd.read_csv('./Data/industries_MTurkers_20170711.csv', header=None, encoding='latin-1')
mturk.drop([1], axis='columns', inplace=True)
mturk.rename(mapper={0: 'company', 2: "ind"}, axis='columns', inplace=True)

finance = pd.read_csv('./Data/Finance.csv', dtype={'Industry': str})
finance.drop([finance.columns[0], finance.columns[2], finance.columns[4]], axis='columns', inplace=True)
finance.rename(mapper={'Normalized Company Name': 'norm', 'Industry': "ind"}, axis='columns', inplace=True)

manual = pd.read_csv('./Data/manual_industry_mappings.csv', encoding='latin-1', header=None, dtype={2: str})
manual.drop([1], axis='columns', inplace=True)
manual.rename(mapper={0: 'norm', 2: "ind"}, axis='columns', inplace=True)

industries_2019 = pd.read_csv('./Data/missing_industries_2019.csv', header=None, dtype={2: str})
industries_2019 = industries_2019[~(industries_2019[1] == 1)].copy()

industries_2019.drop([1], axis = 'columns', inplace = True)
industries_2019.rename(mapper={0: 'company', 2: "ind"}, axis='columns', inplace=True)
industries_2019 = industries_2019[~pd.isnull(industries_2019.ind)].copy()

#mturk industry is given as "ind_x", profile industry is given as "ind_y"
company_comb = pd.merge(mturk, profile, on='company', how='outer')
#prioritize mturk data
company_comb['combined'] = company_comb['ind_x'].combine_first(company_comb['ind_y'])

#mturk industry is given as "ind", profile industry is given as "combined"
company_comb = pd.merge(industries_2019, company_comb, on='company', how='outer')
#prioritize manual entry data
company_comb['combined'] = company_comb['ind'].combine_first(company_comb['combined'])

#merge manual and finance files, prioritizing manual
norm_comb = pd.merge(manual, finance, on = 'norm', how = 'outer')
norm_comb['combined'] = norm_comb['ind_x'].combine_first(norm_comb['ind_y'])
#merge manual/finance and profile[norm], prioritizing manual/finance
norm_comb = pd.merge(norm_comb, profile, on = 'norm', how = 'outer')
norm_comb['combined'] = norm_comb['combined'].combine_first(norm_comb['ind'])

# convert the columns of the aggredated dataframe into a dictionary where the key is the company name
# and the value is the industry code
norm_mapping = dict(zip(norm_comb.norm, norm_comb.combined))
company_mapping = dict(zip(company_comb.company, company_comb.combined))
# set the default value if the company is not found to NaN
norm_mapping = defaultdict(lambda: np.NaN, norm_mapping)
company_mapping = defaultdict(lambda: np.NaN, company_mapping)

In [160]:
def add_industry_labels(company_data):
    company_data = company_data.copy()
    # convert to lowercase for more accurate matching
    company_data['normalized_company_lower'] = company_data['normalized_company'].str.lower()
    company_data['company_lower'] = company_data['company'].str.lower()
    # apply norm_mapping and company_mapping to upper and lower case versions
    company_data['company_mapped'] = company_data['company'].apply(lambda y: company_mapping[y])
    company_data['normalized_company_mapped'] = company_data['normalized_company'].apply(lambda y: norm_mapping[y])
    company_data['company_lower_mapped'] = company_data['normalized_company_lower'].apply(lambda y: norm_mapping[y])
    company_data['normalized_company_lower_mapped'] = company_data['normalized_company_lower'].apply(lambda y: norm_mapping[y])
    # combines all mappings. Prioritize Existing industry code > MTurk/profle(company) > 
    # manual/finance/profile(normalized_company) > manual/finance/profile(normalized_company_lower) 
    company_data['industry_two'] = company_data['industry'].combine_first(company_data['company_mapped'])
    company_data['industry_three'] = company_data['industry_two'].combine_first(company_data['normalized_company_mapped'])
    company_data['industry_four'] = company_data['industry_three'].combine_first(company_data['company_lower_mapped'])
    company_data['industry_five'] = company_data['industry_four'].combine_first(company_data['normalized_company_lower_mapped'])
    company_data['industry'] = company_data['industry_five']
    # drop the temporary columns
    company_data.drop(['normalized_company_lower', 'company_lower', 'company_mapped', 'normalized_company_mapped', 'company_lower_mapped','normalized_company_lower_mapped', 'industry_two', 'industry_three', 'industry_four','industry_five'], axis=1, inplace=True)
    return company_data

def before_2016(company_data):
    """
    Return entries before 2016-1-1, excluding those with no start or end time, or are educational.
    """
    mask = (company_data['start'] <= pd.to_datetime('2016-1-1')) & (company_data['end'] >= pd.to_datetime('2016-1-1')) & ~((company_data['start'] == pd.to_datetime('1900-01-01')) & (company_data['end'] == pd.to_datetime('2018-01-01'))) & (~company_data['ticker'].isin(['UNIVERSITY', 'SCHOOL']) & ~(company_data.educational) & ~(company_data['industry'].isin(['UNIVERSITY', 'SCHOOL'])))
    return company_data[mask]


def labeled_2016(company_data):
    # combines filter and mask
    labeled = add_industry_labels(company_data)
    return before_2016(labeled)

# Most informative skills

In [161]:
import scipy.stats as st
def n_most_informative_skills(n, all_data):
    skills = list(all_data.primary.unique())
    information_gains = []
    num_people = len(all_data)
    num_lehman = sum(all_data.is_lehman)
    num_non_lehman = num_people - num_lehman
    entropy_parent = st.entropy([num_lehman, num_non_lehman],base=2)
    
    for skill in skills:
        with_skill = all_data[all_data.primary == skill]
        without_skill = all_data[~(all_data.primary == skill)]
        num_with_skill = len(with_skill)
        num_without_skill = len(without_skill)
        
        with_skill_lehman = sum(with_skill.is_lehman)
        with_skill_non_lehman = num_with_skill - with_skill_lehman
        entropy_with_skill = st.entropy([with_skill_lehman, with_skill_non_lehman],base=2)
        
        without_skill_lehman = sum(without_skill.is_lehman)
        without_skill_non_lehman = num_without_skill - without_skill_lehman
        entropy_without_skill = st.entropy([without_skill_lehman, without_skill_non_lehman],base=2)
        
        conditional_entropy = (num_with_skill/num_people * entropy_with_skill 
                               + num_without_skill/num_people * entropy_without_skill)
        
        information_gain_split = entropy_parent - conditional_entropy
        information_gains.append(information_gain_split)
        
    sorted_ig_indices = np.flip(np.argsort(information_gains))
    skills_by_ig = [skills[index] for index in sorted_ig_indices]
    return skills_by_ig[0:n]    

# Logistic Regression

In [351]:
import statsmodels.discrete.discrete_model as sm
# prepare data for regression by dropping irrelevant names
drop = ['length', 'name',
        'primary_weight', 'secondary', 'secondary_weight', 'elite_education',
        'city', 'country', '.', '??',
        '/', 'department', 'exchange',
        'public', 'location_company',
        'major', 'department', 'FIGI', 'last_update', 'degree']

regression_data = {company_name: company_data.drop(labels=drop, axis=1) for company_name, company_data in data.items()}

#additional step of labeling missing industries and only look at those with jobs as of 2016-1-1
all_data_2016 = pd.concat(regression_data.values())
employed_2016 = labeled_2016(all_data_2016)
employ_2016_users = list(employed_2016.user.unique())

regression_data = {company_name: job_title_2008(company_name, company_data) for company_name, company_data in
                   regression_data.items()}

In [352]:
non_lehman = pd.concat([regression_data['db'], regression_data['gs'], regression_data['ms'], regression_data['ubs']])
non_lehman['is_lehman'] = 0

lehman = regression_data['leh'].copy()
lehman['is_lehman'] = 1

all_data = pd.concat([lehman, non_lehman])
all_data = all_data[all_data.user.isin(employ_2016_users)]

# fill in missing births to the median date, 1976
index = all_data[all_data.birth.isin(['None', '2000'])].index
all_data.loc[index, ['birth']] = '1976'
FINAL_REGRESSION_DATA = all_data

In [378]:
def return_matches(num_matches, num_skills):
    all_data = FINAL_REGRESSION_DATA.copy()
    informative_skills = n_most_informative_skills(num_skills, all_data)
    not_informative = ~all_data.primary.isin(informative_skills)
    all_data.loc[not_informative, 'primary'] = 0
    all_data['job_category'] = all_data.user.apply(lambda x: user_to_job_type[x])

    X = all_data[['birth', 'gender', 'primary', 'education', 'elite']].copy()
    X['education'] = X['education'].apply(str)
    X['gender'] = X['gender'].apply(str)
    X['birth'] = X['birth'].astype(int)
    X['elite'] = X['elite'].astype(int)
    X = pd.get_dummies(data=X, drop_first=True)
    X = sm.tools.add_constant(X)

    y = all_data['is_lehman']

    # regress y on X
    logit = sm.Logit(y, X)
    results = logit.fit(maxiter = 100)

    # get propensities
    all_data['propensity'] = results.predict(X)

    # Begin matching process. Map each user to its propensity
    user_to_propensity = dict(zip(all_data.user, all_data.propensity))

    # get lehman and non-lehman guys
    lehman = all_data[all_data['is_lehman'] == 1]
    non_lehman = all_data[all_data['is_lehman'] == 0]
    
    def flatten(list_of_lists):
        return [item for sublist in list_of_lists for item in sublist]

    def get_closest(n, row):
        # return n nearest neighbors of closest match containing the same job title
        role = row.job_category
        score = row.propensity
        others_by_role = non_lehman[non_lehman.job_category == role].reset_index().loc[:, ['user', 'propensity']]
        absolute_propensity = np.absolute(others_by_role['propensity'] - score)
        ind = np.argsort(absolute_propensity)
        matches = others_by_role.loc[ind[0:n]].user.to_list()
        return matches


    # get closest match for each lehman guy
    lehman['matches'] = lehman.apply(lambda x : get_closest(num_matches, x), axis=1)

    return lehman.copy()

# Matched stayed in finance

In [385]:
one = return_matches(num_matches = 1, num_skills = 5)
two = return_matches(num_matches = 2, num_skills = 5)
five = return_matches(num_matches = 5, num_skills = 5)

Optimization terminated successfully.
         Current function value: 0.363000
         Iterations 8


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Optimization terminated successfully.
         Current function value: 0.363000
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.363000
         Iterations 8


In [377]:
def return_match_stayed(num_matches = 5, num_skills = 5):    
    matched_data = return_matches(num_matches, num_skills)
    unique_matches = set(flatten(list(matched_data.matches)))
    drop = ['length', 'gender', 'primary',
            'primary_weight', 'secondary', 'secondary_weight',
            'city', 'country', 'education', 'elite', '.', '??',
            '/', 'department', 'exchange',
            'public', 'location_company', 'degree', 'elite_education',
            'major', 'department', 'FIGI', 'last_update']

    finance_data = {company_name : company_data.drop(labels=drop, axis=1) for company_name, company_data in data.items()}

    labeled_2016_job_data = {company_name: labeled_2016(company_data) for company_name, company_data in finance_data.items()}

    lehman_to_matches = dict(zip(matched_data.user, matched_data.matches))

    lehman = labeled_2016_job_data['leh'].copy()
    lehman['matches'] = lehman.user.apply(lambda x : lehman_to_matches[x])


    non_lehman =  pd.concat([labeled_2016_job_data['db'], 
                             labeled_2016_job_data['gs'], 
                             labeled_2016_job_data['ms'], 
                             labeled_2016_job_data['ubs']])
    matches = non_lehman[non_lehman.user.isin(unique_matches)]

    matches = matches.groupby('user').first().reset_index()
    matches['stayed_finance'] = matches['industry'].str.startswith('52', na=False)

    match_to_stayed_finance = dict(zip(matches.user, matches.stayed_finance))

    lehman_2016_job_data = lehman.copy()
    lehman_2016_job_data = lehman_2016_job_data.groupby('user').first()
    lehman_2016_job_data = lehman_2016_job_data.explode('matches')

    lehman_2016_job_data['match_stayed_finance'] = lehman_2016_job_data.matches.apply(lambda match: match_to_stayed_finance[match])
    lehman_2016_job_data['lehman_stayed_finance'] = lehman_2016_job_data.industry.str.startswith('52', na = False)
    lehman_2016_job_data['job_category'] = lehman_2016_job_data.index.to_series().apply(lambda x: user_to_job_type[x])
    return lehman_2016_job_data

In [386]:
yOne = return_match_stayed(one)
yTwo = return_match_stayed(two)
yFive = return_match_stayed(five)

# Probit propensities

In [387]:
one_match = calculate_propensity_SE_by_roles(yOne)

In [388]:
two_matches = calculate_propensity_SE_by_roles(yTwo)

In [389]:
five_matches = calculate_propensity_SE_by_roles(yFive)

In [393]:
pd.concat([one_match, two_matches, five_matches]).to_latex()

'\\begin{tabular}{lrrrrr}\n\\toprule\n{} &  full sample &        vp &   analyst &  director &     assoc \\\\\n\\midrule\n0 &     0.059643 &  0.117491 &  0.031251 &  0.106497 &  0.102683 \\\\\n1 &     0.002249 &  0.004079 &  0.005242 &  0.007279 &  0.007929 \\\\\n0 &     0.059643 &  0.117491 &  0.031251 &  0.106497 &  0.102683 \\\\\n1 &     0.002249 &  0.004079 &  0.005242 &  0.007279 &  0.007929 \\\\\n0 &     0.059643 &  0.117491 &  0.031251 &  0.106497 &  0.102683 \\\\\n1 &     0.002249 &  0.004079 &  0.005242 &  0.007279 &  0.007929 \\\\\n\\bottomrule\n\\end{tabular}\n'

In [390]:
one_match

Unnamed: 0,full sample,vp,analyst,director,assoc
0,0.059643,0.117491,0.031251,0.106497,0.102683
1,0.002249,0.004079,0.005242,0.007279,0.007929


In [None]:
def calculate_propensity_SE_by_roles(data, roles = ['full sample', 'vp', 'analyst', 'director', 'assoc']):
    dataDict = {}
    for role in roles:
        dataDict[role] = calculate_propensity_SE(role, data)
    return pd.DataFrame.from_dict(dataDict)

In [327]:
def calculate_propensity_SE(job_category, lehman_2016_job_data):
    """names: [full sample, vp, analyst, director, assoc]"""
    lehman_guys = lehman_2016_job_data[['lehman_stayed_finance', 'job_category']].copy().rename({'lehman_stayed_finance' : 'stayed_finance', 'job_category' : 'job'}, axis = 1)
    lehman_guys['is_lehman'] = 1
    non_lehman_guys = lehman_2016_job_data[['match_stayed_finance', 'job_category']].copy().rename({'match_stayed_finance' : 'stayed_finance', 'job_category' : 'job'}, axis = 1)
    non_lehman_guys['is_lehman'] = 0

    data_by_job = pd.concat([lehman_guys,non_lehman_guys])
    if job_category != 'full sample':
        data_by_job = data_by_job[data_by_job.job == job_category]
    
    assert len(data_by_job != 0 ), 'job category not valid'
    X = data_by_job['is_lehman']
    y = data_by_job['stayed_finance']
    
    model = sm.Probit(y, X)
    results = model.fit(disp = 0)
    summary = results.get_margeff().summary_frame().iloc[:, 0:2]
    dydx = summary.iloc[0,0]
    SE = summary.iloc[0,1]
    return (dydx, SE)

In [None]:
analyst associate vp MD

# Visualization

In [None]:
roles = lehman_2016_job_data.job_category.unique()
stayed_finance = {'role' : [], 'lehman_stayed' : [], 'match_stayed' : [], 'total' : [], 'proportion_lehman':[], 'proportion_match' :[], 'zscore': []}

for role in roles:
    role_data = lehman_2016_job_data[lehman_2016_job_data.job_category == role]
    lehman_stayed = sum(role_data.industry.str.startswith('52', na = False))
    match_stayed = sum(role_data.match_stayed_finance)
    total = len(role_data)
    prop_lehman = lehman_stayed/total
    prop_match = match_stayed/total
    zscore = (prop_lehman - prop_match) / (prop_lehman * prop_match *(2/total))**(1/2)
    stayed_finance['role'].append(role)
    stayed_finance['lehman_stayed'].append(lehman_stayed)
    stayed_finance['match_stayed'].append(match_stayed)
    stayed_finance['total'].append(total)
    stayed_finance['proportion_lehman'].append(prop_lehman)
    stayed_finance['proportion_match'].append(prop_match)
    stayed_finance['zscore'].append(zscore)
role_data = lehman_2016_job_data[lehman_2016_job_data.job_category == role]

lehman_stayed = sum(lehman_2016_job_data.industry.str.startswith('52', na = False))
match_stayed = sum(lehman_2016_job_data.match_stayed_finance)
total = len(lehman_2016_job_data)
prop_lehman = lehman_stayed/total
prop_match = match_stayed/total
zscore = (prop_lehman - prop_match) / (prop_lehman * prop_match *(2/total))**(1/2)

stayed_finance['role'].append('all_roles')
stayed_finance['lehman_stayed'].append(lehman_stayed)
stayed_finance['match_stayed'].append(match_stayed)
stayed_finance['total'].append(total)
stayed_finance['proportion_lehman'].append(lehman_stayed/total)
stayed_finance['proportion_match'].append(match_stayed/total)
stayed_finance['zscore'].append(zscore)

df = pd.DataFrame(stayed_finance)

df = df[df.total>= 200]

toPlot = df[['role', 'proportion_lehman', 'proportion_match']].set_index('role').stack().reset_index()
toPlot = toPlot.rename({'level_1' : 'company', 0 : 'proportion'}, axis = 1)

from matplotlib import pyplot
import seaborn as sns
%matplotlib inline

sns.set(rc={'figure.figsize':(20,5)})

sns.lineplot( x = 'role', y = 'proportion', hue = 'company', data = toPlot);