In [44]:
import pandas as pd
from collections import defaultdict
import numpy as np

In [45]:
def main(csv_file):
    """
    Load one tab-separated, header-less profile export into a DataFrame.

    No rows or columns are filtered here: all 34 columns are returned and
    callers drop whatever they do not need.

    Parameters
    ----------
    csv_file : str, path object or file-like
        Passed straight to ``pd.read_csv``.

    Returns
    -------
    pd.DataFrame
        The file contents labelled with the taxonomy column names below.
        The label 'department' occurs twice (positions 19 and 31), so
        ``df['department']`` selects two columns.
    """
    # column info from taxonomy file
    name = ['user', 'name', 'birth', 'gender', 'primary',
            'primary_weight', 'secondary', 'secondary_weight',
            'city', 'country', 'education', 'elite', 'start',
            '.', 'end', '??', '/', 'length', 'role', 'department',
            'company', 'normalized_company', 'ticker', 'exchange',
            'public', 'location_company', 'industry', 'educational',
            'degree', 'elite_education', 'major', 'department', 'FIGI',
            'last_update']

    # Read with positional integer labels and relabel afterwards: the label
    # list contains a duplicate ('department'), which recent pandas versions
    # refuse when passed directly as `names`. The number of positional labels
    # is derived from the list so the two can never drift apart.
    employ_data = pd.read_csv(csv_file, sep="\t", header=None,
                              names=list(range(len(name))), low_memory=False)
    employ_data.columns = name
    return employ_data


# Raw per-bank profile tables keyed by a short bank code. Dates are not yet
# datetime objects here, and some dates hold 'None' instead of a value.
raw_data = {
    bank: main('./Data/{}_profiles.csv'.format(bank.upper()))
    for bank in ('db', 'gs', 'leh', 'ms', 'ubs')
}

In [46]:
def standardize_dates(company, source=None):
    """
    Return a copy of one company's table with 'start' and 'end' as datetimes.

    The text 'None' is replaced by a sentinel date before parsing:
    1900-01-01 for 'start' and 2018-01-01 for 'end'.

    Columns that are already datetime are left as they are, so calling this
    on an already standardized table (e.g. when the cell is re-run) is a
    no-op instead of an error.

    Parameters
    ----------
    company : hashable
        Key of the table to standardize.
    source : mapping, optional
        Mapping from company key to DataFrame. Defaults to the module-level
        ``raw_data``.

    Returns
    -------
    pd.DataFrame
        A modified copy; the table stored in the mapping is not changed.
    """
    tables = raw_data if source is None else source
    company_data = tables[company].copy()
    for column, sentinel in (('start', '1900-01-01'), ('end', '2018-01-01')):
        values = company_data[column]
        # The .str accessor only exists on text columns; skip the replacement
        # when the column has already been converted.
        if not pd.api.types.is_datetime64_any_dtype(values):
            values = values.str.replace('None', sentinel)
        company_data[column] = pd.to_datetime(values)
    return company_data


# Swap every raw table for its date-standardized copy, keeping the same keys.
raw_data.update({company: standardize_dates(company) for company in list(raw_data)})

In [47]:
def get_users(company_name, company_data):
    """
    Return the unique user ids employed at the given company across 2008-01-01.

    A row qualifies when its ticker is the company's ticker, it starts
    strictly before and ends strictly after 2008-01-01, and its start and end
    are not both the missing-date sentinels (1900-01-01 / 2018-01-01).
    """
    company_tickers = {'db': 'DB', 'leh': 'LEH', 'gs': 'GS', 'ms': 'MS^E', 'ubs': 'UBS'}
    cutoff = pd.to_datetime('2008-01-01')
    sentinel_start = pd.to_datetime('1900-01-01')
    sentinel_end = pd.to_datetime('2018-01-01')

    starts = company_data['start']
    ends = company_data['end']

    both_dates_missing = (starts == sentinel_start) & (ends == sentinel_end)
    spans_cutoff = (starts < cutoff) & (ends > cutoff)
    at_company = company_data['ticker'] == company_tickers[company_name]

    selected = ~both_dates_missing & spans_cutoff & at_company
    return company_data.loc[selected, 'user'].unique()


# User ids, per company, of everyone whose job at that company started
#   strictly before and ended strictly after 2008-01-01.
users = {company_name: get_users(company_name, company_data) for company_name, company_data in raw_data.items()}

In [48]:
# Keep every row (the whole job history) of the users selected per company.
data = {
    company: company_data[company_data['user'].isin(users[company])]
    for company, company_data in raw_data.items()
}

# BREAKS

In [None]:
# Columns not needed for the break analysis. Given the labels assigned in
# main(), dropping them leaves 'user', 'start' and 'ticker'.
drop = [
    'length', 'gender', 'primary',
    'primary_weight', 'secondary', 'secondary_weight',
    'city', 'country', 'education', 'elite', '.', '??',
    '/', 'department', 'exchange',
    'public', 'location_company', 'educational', 'degree', 'elite_education',
    'major', 'department', 'FIGI', 'last_update', 'name', 'birth', 'end', 'role',
    'company', 'normalized_company', 'industry',
]

breaks_data = {key: frame.drop(columns=drop) for key, frame in data.items()}

In [None]:
def prop_breaks(company_name, company_data):
    """
    Count the users with a "TIME_OFF" or "MISSING" entry starting after 2008-01-01.

    Parameters
    ----------
    company_name : str
        Key into the module-level ``users`` mapping; its entry supplies the
        denominator.
    company_data : pd.DataFrame
        Needs the columns 'user', 'start' (datetime) and 'ticker'.

    Returns
    -------
    tuple
        ``(company_name, num_breaks, num_company_users)`` where ``num_breaks``
        is the number of distinct users with at least one qualifying entry
        and ``num_company_users`` is ``len(users[company_name])``.
    """
    num_company_users = len(users[company_name])

    date_2008 = pd.to_datetime('2008-01-01')
    # look only at entries that start after 2008
    after_2008 = company_data[company_data['start'] > date_2008]
    # A user counts once however many break entries they have; isin + nunique
    # replaces a per-user Python callback with two vectorised steps.
    is_break = after_2008['ticker'].isin(['TIME_OFF', 'MISSING'])
    num_breaks = int(after_2008.loc[is_break, 'user'].nunique())

    return company_name, num_breaks, num_company_users


prop_breaks_company = [prop_breaks(company_name, company_data) for company_name, company_data in breaks_data.items()]
# one (company_name, num_breaks, num_company_users) tuple per company, in a list
prop_breaks_company

# MATCHING BY ROLE

In [49]:
# Columns not needed for role matching. Given the labels assigned in main(),
# what remains is 'user', 'name', 'start', 'end', 'role',
# 'normalized_company' and 'ticker'.
drop = [
    'length', 'gender', 'primary',
    'primary_weight', 'secondary', 'secondary_weight',
    'city', 'country', 'education', 'elite', '.', '??',
    '/', 'department', 'exchange',
    'public', 'location_company', 'educational', 'degree', 'elite_education',
    'major', 'department', 'FIGI', 'last_update', 'industry', 'birth', 'company',
]
matching_data = {key: frame.drop(columns=drop) for key, frame in data.items()}

In [50]:
def job_2008(company_name, company_data):
    """
    Return the rows describing jobs held at the given company on 2008-01-01.

    A row is kept when its ticker is the company's ticker, it starts strictly
    before and ends strictly after 2008-01-01, and its start and end are not
    both the missing-date sentinels (1900-01-01 / 2018-01-01).
    """
    company_tickers = {'db': 'DB', 'leh': 'LEH', 'gs': 'GS', 'ms': 'MS^E', 'ubs': 'UBS'}
    cutoff = pd.to_datetime('2008-01-01')
    sentinel_start = pd.to_datetime('1900-01-01')
    sentinel_end = pd.to_datetime('2018-01-01')

    starts = company_data['start']
    ends = company_data['end']

    both_dates_missing = (starts == sentinel_start) & (ends == sentinel_end)
    spans_cutoff = (starts < cutoff) & (ends > cutoff)
    at_company = company_data['ticker'] == company_tickers[company_name]

    return company_data.loc[~both_dates_missing & spans_cutoff & at_company]

In [51]:
# Per company: the rows for the job each selected user held on 2008-01-01.
job_as_of_2008 = {
    key: job_2008(key, frame)
    for key, frame in matching_data.items()
}

In [52]:
all_data = pd.concat(list(job_as_of_2008.values()))
# Remove rows that have no role. This is done by value rather than by index
# label: the concatenated frame keeps each company's own row labels, so a
# label such as 11512 is not unique and dropping it would also discard
# unrelated rows from other companies.
all_data = all_data[all_data['role'].notna()]

In [53]:
# Assign each user to at most one job category from the text of `role`.
# Categories are tested in the order written; `.difference(all_roles)` removes
# users already claimed by an earlier category, so earlier categories win.
# `str.contains` searches anywhere in the role text, `str.match` only at its
# start; every pattern is applied case-insensitively (case = False).
# NOTE(review): in r'director|MD,md' the comma is a literal character, so the
# second alternative only matches the text "MD,md" — 'MD|md' may have been
# intended; confirm before changing, since it would alter the categories.
directors = set(all_data[(all_data.role.str.contains(r'director|MD,md', case = False)) 
                         | (all_data.role.str.match(r'ed|md', case = False))].user)
all_roles = directors.copy()

analysts = set(all_data[all_data.role.str.contains('analyst|Anaylst', case = False)].user).difference(all_roles)
all_roles = all_roles.union(analysts)

vps = set(all_data[all_data.role.str.contains('president|vp', case = False)].user).difference(all_roles)
all_roles = all_roles.union(vps)

assocs = set(all_data[all_data.role.str.contains('associate', case = False)].user).difference(all_roles)
all_roles = all_roles.union(assocs)

accountants = set(all_data[all_data.role.str.contains('accountant|account executive|accounting',case = False)].user).difference(all_roles)
all_roles = all_roles.union(accountants)

consultants = set(all_data[all_data.role.str.contains('consultant', case = False)].user).difference(all_roles)
all_roles = all_roles.union(consultants)

# roles whose text starts with '-', '?' or '.' are treated as missing
missing = set(all_data[all_data.role.str.match(r'-|\?|\.', case = False)].user).difference(all_roles)
all_roles = all_roles.union(missing)

developers = set(all_data[all_data.role.str.contains(r'developer|engineer|system administrator', case = False)].user).difference(all_roles)
all_roles = all_roles.union(developers)

interns = set(all_data[all_data.role.str.contains('intern|trainee|apprentice', case = False)].user).difference(all_roles)
all_roles = all_roles.union(interns)

specialists = set(all_data[all_data.role.str.contains('specialist|administrator|research|expert', case = False)].user).difference(all_roles)
all_roles = all_roles.union(specialists)

sales = set(all_data[all_data.role.str.contains('sales', case = False)].user).difference(all_roles)
all_roles = all_roles.union(sales)

traders = set(all_data[all_data.role.str.contains(r'trader|trading|Portfolio Management', case = False)].user).difference(all_roles)
all_roles = all_roles.union(traders)

bankers = set(all_data[all_data.role.str.contains(r'banking|banker|finance', case = False)].user).difference(all_roles)
all_roles = all_roles.union(bankers)

controllers = set(all_data[all_data.role.str.contains('controller', case = False)].user).difference(all_roles)
all_roles = all_roles.union(controllers)

partners = set(all_data[all_data.role.str.contains('partner', case = False)].user).difference(all_roles)
all_roles = all_roles.union(partners)

counsels = set(all_data[all_data.role.str.contains('counsel', case = False)].user).difference(all_roles)
all_roles = all_roles.union(counsels)

recruiters = set(all_data[all_data.role.str.contains('recruiter|human resources', case = False)].user).difference(all_roles)
all_roles = all_roles.union(recruiters)

advisors = set(all_data[all_data.role.str.contains('advisor|adviseur', case = False)].user).difference(all_roles)
all_roles = all_roles.union(advisors)

assistants = set(all_data[all_data.role.str.contains('assistant|support|services|receptionist', case = False)].user).difference(all_roles)
all_roles = all_roles.union(assistants)

managers = set(all_data[all_data.role.str.contains(r'manager|supervisor|team lead|head|lead|coordinator|representative|process executive', case = False)].user).difference(all_roles)
all_roles = all_roles.union(managers)

# everyone not claimed by any pattern above
others = set(all_data.user).difference(all_roles)

In [54]:
# remaining = all_data[~all_data.user.isin(all_roles)].copy()

# remaining.role.value_counts().to_csv('./Deliverables/uncategorized_data.csv')

In [55]:
# Category sets and their labels, position by position.
all_sets = [directors, analysts, vps, assocs, advisors, assistants, consultants, managers, missing, developers, interns, specialists, sales, traders, bankers, controllers, partners, counsels, recruiters, accountants, others]
job_titles = ['director', 'analyst','vp', 'assoc','advisor','assistant','consultant','manager','missing','developer', 'intern', 'specialist','sale','trader','banker','controller','partner','counsel', 'recruiter','accountant','other']

# zip() silently truncates to the shorter list, which would leave a whole
# category without a label, so fail loudly if the two lists drift apart.
assert len(all_sets) == len(job_titles), "every category set needs exactly one label"

zipped = list(zip(all_sets,job_titles))

def to_dict(dictionary, users, job_title):
    """Record `job_title` for every id in `users`; `dictionary` is updated in place."""
    dictionary.update(dict.fromkeys(users, job_title))

# user id -> job category label
full_mapping = {}
for user_set, job_title in zipped:
    to_dict(full_mapping, user_set, job_title)

# Hand-assigned categories for two specific user ids; these override whatever
# the pattern-based assignment produced.
# NOTE(review): presumably these users are absent from all_data here but appear
# in frames categorized later — confirm.
full_mapping.update({'c0a3eb6a-59db-3a30-8a39-99a7cc8b9ce1' : 'specialist'})
full_mapping.update({'5f425323-1cdf-3e81-a08e-35b483c42da9' : 'missing'})

In [56]:
# Look up each row's category by user id; an unmapped id raises KeyError.
all_data['job_category'] = [full_mapping[user] for user in all_data.user]

In [None]:
# all_data.to_csv('./Deliverables/all_data_categorized.csv')

## REGRESSION

In [None]:
# Columns not needed for the propensity regression. Given the labels assigned
# in main(), what remains is 'user', 'birth', 'gender', 'primary',
# 'education', 'elite', 'start', 'end', 'role' and 'ticker'.
drop = [
    'length', 'name', 'industry',
    'primary_weight', 'secondary', 'secondary_weight', 'elite_education',
    'city', 'country', '.', '??',
    '/', 'department', 'exchange',
    'public', 'location_company',
    'major', 'department', 'FIGI', 'last_update', 'company', 'normalized_company', 'educational', 'degree',
]

# Trim the columns, then keep only the job each user held on 2008-01-01.
regression_data = {
    key: job_2008(key, frame.drop(columns=drop))
    for key, frame in data.items()
}

In [None]:
# Label the treatment group (Lehman) and the comparison pool (other banks).
lehman = regression_data['leh'].copy()
lehman['is_lehman'] = 1

non_lehman = pd.concat([regression_data[key] for key in ('db', 'gs', 'ms', 'ubs')])
non_lehman['is_lehman'] = 0

In [None]:
# def condition(company, dataset):
#     company_tickers = {'db': 'DB', 'leh': 'LEH', 'gs': 'GS', 'ms': 'MS^E', 'ubs': 'UBS'}
#     company_ticker = company_tickers[company]
#     dataset = dataset[dataset['ticker'] == company_ticker]
#     company_users = users[company]
#     return dataset[dataset['user'].isin(company_users)].groupby('user').last()

# regress_data = {company_name: condition(company_name, company_data) for company_name, company_data in data.items()}

# lehman = regress_data['leh']
# lehman['is_lehman'] = 1

# non_lehman = pd.concat([regress_data['db'], regress_data['gs'], regress_data['ms'], regress_data['ubs']])
# non_lehman['is_lehman'] = 0

In [None]:
all_data = pd.concat([lehman, non_lehman])

# Overwrite unusable birth years ('None' or '2000') with the placeholder '1976'.
# The rows are selected with a boolean mask, not by index label: after the
# concat the labels of the individual company frames overlap, so
# `.loc[labels, ...]` would also overwrite rows that merely share a label.
unknown_birth = all_data['birth'].isin(['None', '2000'])
index = all_data.index[unknown_birth]  # labels of the affected rows, for inspection only
all_data.loc[unknown_birth, 'birth'] = '1976'

In [None]:
# Primary skills kept as their own level; every other value is collapsed to 0.
informative_skills = ['Operations Management', 'Insurance', 'Business Development', 'Product Management',  '-1']

is_informative = all_data['primary'].isin(informative_skills)
not_informative = ~is_informative
all_data.loc[not_informative, 'primary'] = 0

all_data

In [None]:
# Design matrix: text for the categorical features, integers for the rest.
X = all_data[['birth', 'gender', 'primary', 'education', 'elite']].assign(
    education=lambda frame: frame['education'].apply(str),
    gender=lambda frame: frame['gender'].apply(str),
    birth=lambda frame: frame['birth'].astype(int),
    elite=lambda frame: frame['elite'].astype(int),
)

In [None]:
# Regression target: 1 for Lehman rows, 0 for the other banks.
y = all_data.loc[:, 'is_lehman']

In [None]:
import statsmodels.discrete.discrete_model as sm

In [None]:
# One-hot encode the text columns, dropping the first level of each as the
# reference category. dtype=float keeps the indicator columns numeric: with
# boolean indicators next to integer columns the frame converts to an object
# array, which the model cannot fit.
X = pd.get_dummies(data=X, drop_first=True, dtype=float)
X = sm.tools.add_constant(X)

In [None]:
# Logistic regression of Lehman membership on the encoded features.
logit = sm.Logit(endog=y, exog=X)
results = logit.fit()

In [None]:
# Attach each row's fitted probability of being Lehman and its job category.
all_data['propensity'] = results.predict(exog=X)
all_data['job_category'] = [full_mapping[user] for user in all_data.user]

In [None]:
# user id -> propensity; for a repeated user id the last row wins.
user_to_propensity = {
    user: score
    for user, score in zip(all_data['user'], all_data['propensity'])
}

In [None]:
# Independent copies, not views of all_data: columns are added to `lehman`
# further down, and writing into a slice triggers SettingWithCopyWarning and
# may not be reflected in the frame.
lehman = all_data[all_data['is_lehman'] == 1].copy()
non_lehman = all_data[all_data['is_lehman'] == 0].copy()

In [None]:
def get_closest(row):
    """
    Return the user id of the non-Lehman row closest in propensity to `row`.

    Only rows of the module-level `non_lehman` frame with the same
    job_category as `row` are candidates.
    """
    same_role = non_lehman[non_lehman.job_category == row.job_category]
    distance = (same_role.set_index('user')['propensity'] - row.propensity).abs()
    return distance.idxmin()


# The same non-Lehman user may be chosen for several Lehman rows.
lehman['match'] = lehman.apply(get_closest, axis=1)

In [None]:
# Propensity of the matched user, for comparison with the row's own score.
lehman['match_propensity'] = [user_to_propensity[matched] for matched in lehman.match]

In [None]:
#lehman.to_csv('./Deliverables/lehman_matches_job_titles_skills.csv')

# Matching on skills

In [None]:
# Same column selection as the regression section. Given the labels assigned
# in main(), what remains is 'user', 'birth', 'gender', 'primary',
# 'education', 'elite', 'start', 'end', 'role' and 'ticker'.
drop = [
    'length', 'name', 'industry',
    'primary_weight', 'secondary', 'secondary_weight', 'elite_education',
    'city', 'country', '.', '??',
    '/', 'department', 'exchange',
    'public', 'location_company',
    'major', 'department', 'FIGI', 'last_update', 'company', 'normalized_company', 'educational', 'degree',
]

# Trim the columns, then keep only the job each user held on 2008-01-01.
skills_data = {
    key: job_2008(key, frame.drop(columns=drop))
    for key, frame in data.items()
}

In [None]:
# Rebuild the labelled groups from the skills tables; `primary` is forced to
# text so every skill value compares as a string.
lehman = skills_data['leh'].copy()
lehman['is_lehman'] = 1

non_lehman = pd.concat([skills_data[key] for key in ('db', 'gs', 'ms', 'ubs')])
non_lehman['is_lehman'] = 0

all_data = pd.concat([non_lehman, lehman])
all_data['primary'] = all_data['primary'].astype(str)

In [None]:
# Distinct primary skills, in the order value_counts() returns them.
skills = all_data['primary'].value_counts().index.tolist()

In [None]:
# p_lehman = len(lehman) / len(all_data)
# p_other = 1 - p_lehman

# entropy_parent = - (p_lehman * np.log2(p_lehman) + p_other * np.log2(p_other))

# split_fin = all_data[all_data.primary == 'Banking and Finance']
# split_non_fin = all_data[~(all_data.primary == 'Banking and Finance')]

# #finance skill split
# p_fin_lehman = sum(split_fin.is_lehman)/len(split_fin)
# p_fin_other = 1 - p_fin_lehman

# entropy_fin = - (p_fin_lehman * np.log2(p_fin_lehman) + p_fin_other * np.log2(p_fin_other))

# #non-finance skill split
# p_non_fin_lehman = sum(split_non_fin.is_lehman)/len(split_non_fin)
# p_non_fin_other = 1 - p_non_fin_lehman

# entropy_non_fin = - (p_non_fin_lehman * np.log2(p_non_fin_lehman) + p_non_fin_other * np.log2(p_non_fin_other))

# # [Weighted avg]Entropy(children) = 
# # (no. of examples in left child node) / (total no. of examples in parent node) * (entropy of left node) 
# # + 
# # (no. of examples in right child node)/ (total no. of examples in parent node) * (entropy of right node)

# n = len(all_data)
# left = len(split_fin)
# right = len(split_non_fin)
# entropy_split = left/n * entropy_fin + right/n * entropy_non_fin

In [None]:
# p_lehman = len(lehman) / len(all_data)
# p_other = 1 - p_lehman
# entropy_parent = - (p_lehman * np.log2(p_lehman) + p_other * np.log2(p_other))
# n = len(all_data)

# IG = []
# for skill in skills:
#     split = all_data[all_data.primary == skill]
#     split_no = all_data[~(all_data.primary == skill)]
    
#     #look at people with the skill first
#     p_split_lehman = sum(split.is_lehman)/len(split)
#     p_split_other = 1 - p_split_lehman

#     entropy_split = - (p_split_lehman * np.log2(p_split_lehman) + p_split_other * np.log2(p_split_other))
    
#     #look at people without the skill next
#     p_no_lehman = sum(split_no.is_lehman)/len(split_no)
#     p_no_other = 1 - p_no_lehman

#     entropy_no_split = - (p_no_lehman * np.log2(p_no_lehman) + p_no_other * np.log2(p_no_other))
    
#     #weight by number in each split
#     left = len(split)
#     right = len(split_no)
#     entropy_children = left/n * entropy_split + right/n * entropy_no_split
    
#     IG.append(entropy_parent - entropy_children)

In [None]:
import scipy.stats as st

In [None]:
#all_data = all_data[~(all_data['primary'] == '-1')].copy()

In [None]:
# Information gain of the binary split "primary skill == s" for predicting
# Lehman membership, computed for every skill in `skills`.
#
# The class shares are derived from all_data itself so this cell does not rely
# on variables left over from an earlier run.
p_lehman = all_data.is_lehman.mean()
p_other = 1 - p_lehman
entropy_parent = st.entropy([p_lehman, p_other], base=2)
n = len(all_data)


IG = []
for skill in skills:
    has_skill = all_data.primary == skill
    split = all_data[has_skill]
    split_no = all_data[~has_skill]

    # sizes of the two partitions, used as weights below
    left = len(split)
    right = len(split_no)

    # st.entropy normalises raw counts to probabilities and counts an empty
    # class as 0, so a pure partition gives 0 instead of failing on log2(0).
    # An entirely empty partition has no entropy to contribute.
    # people with the skill
    split_lehman = split.is_lehman.sum()
    split_other = left - split_lehman
    entropy_split = st.entropy([split_lehman, split_other], base=2) if left else 0.0

    # people without the skill
    no_lehman = split_no.is_lehman.sum()
    no_other = right - no_lehman
    entropy_no_split = st.entropy([no_lehman, no_other], base=2) if right else 0.0

    # weight each partition by its share of all rows
    entropy_children = left/n * entropy_split + right/n * entropy_no_split

    IG.append(entropy_parent - entropy_children)

In [None]:
IG

In [None]:
# Positions into `skills`, ordered from highest to lowest information gain.
best_with_missing = np.argsort(IG)[::-1]

best_with_missing

In [None]:
# best_with_out_missing = np.flip(np.argsort(IG))

# best_with_out_missing

In [None]:
skills

# Proportion breaks with all matching

In [None]:
# Columns not needed for the break analysis. Given the labels assigned in
# main(), dropping them leaves 'user', 'start' and 'ticker'.
drop = [
    'length', 'gender', 'primary',
    'primary_weight', 'secondary', 'secondary_weight',
    'city', 'country', 'education', 'elite', '.', '??',
    '/', 'department', 'exchange',
    'public', 'location_company', 'educational', 'degree', 'elite_education',
    'major', 'department', 'FIGI', 'last_update', 'name', 'birth', 'end', 'role',
    'company', 'normalized_company', 'industry',
]

breaks_data = {key: frame.drop(columns=drop) for key, frame in data.items()}

In [None]:
# Lookup from Lehman user id to the id of their matched non-Lehman user.
# NOTE(review): this needs `lehman` to still carry the `match` column added in
# the propensity-matching step. The skills section rebinds `lehman` to a frame
# built from skills_data, which has no such column — confirm the intended
# execution order before relying on a top-to-bottom run.
lehman_to_match = dict(zip(lehman.user, lehman.match))

In [None]:
# Lehman job histories, each row tagged with that user's matched counterpart.
lehman = breaks_data['leh'].copy()
lehman['match'] = [lehman_to_match[user] for user in lehman.user]

prop_breaks('leh', lehman)

In [None]:
# Job histories of the non-Lehman users that were picked as a match.
non_lehman = pd.concat([breaks_data[key] for key in ('db', 'gs', 'ms', 'ubs')])

matches = lehman.match.unique().tolist()

non_lehman_matches = non_lehman.loc[non_lehman.user.isin(matches)]

In [None]:
date_2008 = pd.to_datetime('2008-01-01')
# look only at data after 2008
after_2008 = non_lehman_matches[non_lehman_matches['start'] > date_2008]
# groupby user, aggregate by looking at the ticker and seeing if the person has had any time off
y = after_2008.groupby('user').ticker.agg(lambda x: any((x == 'TIME_OFF') | (x == 'MISSING')))

w = y.to_frame()

# ids of matched users with at least one break entry after 2008
r = list(w[w.ticker == True].index)

# Add `took_break` as a proper boolean column. Writing
# `frame.loc['took_break'] = False` would instead append a ROW labelled
# 'took_break', leaving the column as True/NaN once the flagged users are set.
# assign() also returns a new frame, so the slice of non_lehman is not mutated.
non_lehman_matches = non_lehman_matches.assign(took_break=non_lehman_matches.user.isin(r))

In [None]:
# matched user id -> whether that user took a break after 2008
user_to_break = dict(zip(non_lehman_matches['user'], non_lehman_matches['took_break']))

# one row per Lehman user (first entry of each), tagged with the match's flag
x = lehman.groupby('user').first()

x['break'] = [user_to_break[matched] for matched in x['match']]

In [None]:
sum(x['break'])

In [None]:
#x.to_csv('./Deliverables/matched_breaks.csv')