In [1]:
import pandas as pd
from collections import defaultdict
import numpy as np

# MATCHING BY ROLE

In [2]:
def main(csv_file):
    """
    Load one tab-separated, header-less profile export and keep the columns
    needed for role matching.

    csv_file: path or file-like object handed straight to pd.read_csv.

    Returns a dataframe with the columns user, name, start, end, role,
    normalized_company and ticker (in that order).
    """
    # column info from taxonomy file; note that 'department' occurs twice
    column_labels = [
        'user', 'name', 'birth', 'gender', 'primary', 'primary_weight',
        'secondary', 'secondary_weight', 'city', 'country', 'education',
        'elite', 'start', '.', 'end', '??', '/', 'length', 'role',
        'department', 'company', 'normalized_company', 'ticker', 'exchange',
        'public', 'location_company', 'industry', 'educational', 'degree',
        'elite_education', 'major', 'department', 'FIGI', 'last_update',
    ]

    # columns that are irrelevant for the matching
    unused_labels = [
        'length', 'gender', 'primary', 'primary_weight', 'secondary',
        'secondary_weight', 'city', 'country', 'education', 'elite', '.',
        '??', '/', 'department', 'exchange', 'public', 'location_company',
        'educational', 'degree', 'elite_education', 'major', 'FIGI',
        'last_update', 'industry', 'birth', 'company',
    ]

    # Read into 34 positional placeholder columns first and attach the real
    # labels afterwards (presumably because the label list is not unique).
    profiles = pd.read_csv(csv_file, sep="\t", header=None,
                           names=list(range(34)), low_memory=False)
    profiles.columns = column_labels
    return profiles.drop(columns=unused_labels)


# Raw per-company frames exactly as loaded by main(): 'start' and 'end' are
# not parsed as datetimes yet, and some dates are missing.
profile_files = {
    'db': './Data/DB_profiles.csv',
    'gs': './Data/GS_profiles.csv',
    'leh': './Data/LEH_profiles.csv',
    'ms': './Data/MS_profiles.csv',
    'ubs': './Data/UBS_profiles.csv',
}
raw_data = {company_key: main(path) for company_key, path in profile_files.items()}

def standardize_dates(company, source=None):
    """
    Converts start date and end date to datetime objects, and converts missing
    values to the minimum (1900-01-01) or maximum (2018-01-01) date.

    company: key of the dataframe to convert.
    source: optional mapping of company key -> dataframe; defaults to the
        module-level raw_data.

    Returns the modified dataframe (a copy; the source frame is left untouched).
    """
    frames = raw_data if source is None else source
    company_data = frames[company].copy()

    placeholders = {'start': '1900-01-01', 'end': '2018-01-01'}
    for column, placeholder in placeholders.items():
        # Assign the result back instead of calling replace(inplace=True) on a
        # column selection: that form is not guaranteed to write through to
        # the frame (it does not under pandas Copy-on-Write).
        parsed = pd.to_datetime(company_data[column].replace('None', placeholder))
        # Missing dates can also arrive as NaN rather than the string 'None'
        # (newer pandas treats "None" as a null marker when reading CSVs);
        # those parse to NaT and get the same placeholder.
        company_data[column] = parsed.fillna(pd.Timestamp(placeholder))
    return company_data


# Per-company frames with 'start'/'end' converted to datetimes, keyed exactly
# like raw_data.
data = {}

for company in raw_data:
    data[company] = standardize_dates(company)
 

In [3]:
def get_most_recent_entry(company):
    """
    Returns where each person worked as of 2008-01-01, according to Fedyk's conditioning.

    A row of data[company] is kept when all of the following hold:
      * start and end are not both the missing-date placeholders,
      * start is strictly before and end strictly after 2008-01-01,
      * ticker equals the ticker registered for `company`.
    """
    # NOTE(review): 'MS^E' is taken as-is from the original lookup - confirm it
    # is the ticker value actually present in the MS export.
    company_tickers = {'db': 'DB', 'leh': 'LEH', 'gs': 'GS', 'ms': 'MS^E', 'ubs': 'UBS'}
    cutoff = pd.to_datetime('2008-01-01')
    start_placeholder = pd.to_datetime('1900-01-01')
    end_placeholder = pd.to_datetime('2018-01-01')

    profiles = data[company].copy()

    # "not (start missing and end missing)" written as "either one is present"
    has_a_date = (profiles['start'] != start_placeholder) | (profiles['end'] != end_placeholder)
    spans_cutoff = (profiles['start'] < cutoff) & (profiles['end'] > cutoff)
    at_company = profiles['ticker'] == company_tickers[company]

    return profiles[has_a_date & spans_cutoff & at_company]

# One filtered frame per company: the positions held there on 2008-01-01.
recent_entries = {key: get_most_recent_entry(key) for key in data}

In [4]:
# Stack the per-company snapshots. Each frame keeps its own row labels, so a
# label can occur once per company in the combined frame.
all_data = pd.concat(recent_entries.values())

# Remove entries that have no role (originally a single person, row label 11512).
# Filtering by value instead of by label avoids dropping unrelated rows that
# share the label in another company's frame, and does not raise if the label
# is absent.
# NOTE(review): 'None' is treated as missing here because the date columns use
# it as their missing marker - confirm the role column does the same.
role_missing = all_data['role'].isna() | (all_data['role'] == 'None')
all_data = all_data.loc[~role_missing].copy()

In [5]:
def _claim_users(pattern, claimed, anchored=False):
    """
    Return the users with a role matching `pattern` who are not in `claimed`,
    and add them to `claimed` in place.

    pattern: regular expression, applied case-insensitively to all_data.role.
    claimed: set of users already assigned to an earlier category.
    anchored: if True the pattern must match at the start of the role
        (str.match); otherwise it may match anywhere (str.contains).
    """
    roles = all_data['role']
    # na=False keeps the mask strictly boolean when a role is missing, so the
    # row selection below cannot fail on NaN entries.
    if anchored:
        hits = roles.str.match(pattern, case=False, na=False)
    else:
        hits = roles.str.contains(pattern, case=False, na=False)
    fresh = set(all_data.loc[hits, 'user']) - claimed
    claimed |= fresh
    return fresh


# Categories are assigned in priority order: a user belongs to the first
# category whose pattern matches any of their roles, so the order of the
# statements below is significant.
all_roles = set()

# NOTE(review): the anchored r'ed|md' matches every role that starts with
# 'ed' or 'md' (e.g. 'Editor') - confirm that is intended.
directors = (_claim_users(r'director|MD,md', all_roles)
             | _claim_users(r'ed|md', all_roles, anchored=True))

analysts = _claim_users('analyst|Anaylst', all_roles)

vps = _claim_users('president|vp', all_roles)

assocs = _claim_users('associate', all_roles)

accountants = _claim_users('accountant|account executive|accounting', all_roles)

consultants = _claim_users('consultant', all_roles)

# roles that start with '-', '?' or '.' are treated as missing
missing = _claim_users(r'-|\?|\.', all_roles, anchored=True)

developers = _claim_users(r'developer|engineer|system administrator', all_roles)

# NOTE(review): 'intern' also matches 'internal' and 'international'.
interns = _claim_users('intern|trainee|apprentice', all_roles)

specialists = _claim_users('specialist|administrator|research|expert', all_roles)

sales = _claim_users('sales', all_roles)

traders = _claim_users(r'trader|trading|Portfolio Management', all_roles)

bankers = _claim_users(r'banking|banker|finance', all_roles)

controllers = _claim_users('controller', all_roles)

partners = _claim_users('partner', all_roles)

counsels = _claim_users('counsel', all_roles)

recruiters = _claim_users('recruiter|human resources', all_roles)

advisors = _claim_users('advisor|adviseur', all_roles)

assistants = _claim_users('assistant|support|services|receptionist', all_roles)

managers = _claim_users(r'manager|supervisor|team lead|head|lead|coordinator|representative|process executive', all_roles)

# everyone whose roles matched none of the patterns above
others = set(all_data.user).difference(all_roles)

In [6]:
# Disabled diagnostic: when uncommented, selects the rows whose user is not in
# all_roles and writes the value counts of their roles to a CSV for inspection.
# remaining = all_data[~all_data.user.isin(all_roles)].copy()

# remaining.role.value_counts().to_csv('./Deliverables/uncategorized_data.csv')

In [7]:
# User sets and their category labels, in matching positions.
all_sets = [directors, analysts, vps, assocs, advisors, assistants, consultants,
            managers, missing, developers, interns, specialists, sales, traders,
            bankers, controllers, partners, counsels, recruiters, accountants, others]
job_titles = ['director', 'analyst', 'vp', 'assoc', 'advisor', 'assistant', 'consultant',
              'manager', 'missing', 'developer', 'intern', 'specialist', 'sale', 'trader',
              'banker', 'controller', 'partner', 'counsel', 'recruiter', 'accountant', 'other']

# zip() silently truncates to the shorter list, so guard the pairing explicitly
assert len(all_sets) == len(job_titles), "every user set needs exactly one job title"

#associate each set with a name
zipped = list(zip(all_sets, job_titles))

def to_dict(dictionary, users, job_title):
    """
    Map every user in `users` to `job_title` inside `dictionary`.

    The dictionary is updated in place (existing entries for the same users
    are overwritten) and nothing is returned.
    """
    dictionary.update(dict.fromkeys(users, job_title))

#process the sets, mapping each user to a job title
full_mapping = {}
# A plain loop rather than a list comprehension: to_dict only has side effects,
# so a comprehension would build (and display) a list of None values.
for users_in_category, category_title in zipped:
    to_dict(full_mapping, users_in_category, category_title)

# two stragglers when doing the regression model, not relevant here
# full_mapping.update({'c0a3eb6a-59db-3a30-8a39-99a7cc8b9ce1' : 'specialist'})
# full_mapping.update({'5f425323-1cdf-3e81-a08e-35b483c42da9' : 'missing'})

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [8]:
# Label every row with the category of its user; a user that is absent from
# full_mapping raises KeyError.
all_data['job_category'] = all_data['user'].map(full_mapping.__getitem__)

# preview of the first 50 labelled rows
all_data.head(50)

Unnamed: 0,user,name,start,end,role,normalized_company,ticker,job_category
2,32c1439f-81b4-37cd-85b1-88c0f004b586,Adam Gelder,1997-05-01,2015-04-01,Director Head of Regional Sales - EMEA Trust &...,Deutsche Bank,DB,director
9,52239a8f-09fb-375b-8e3f-39504d5b2619,Charlotte Jones,2005-05-01,2009-08-01,Global Head of Accounting Policy and Advisory ...,Deutsche Bank,DB,accountant
22,e9fb0588-ca3c-3b95-b103-81468b47a3d5,Paloma Berdejo,2007-07-01,2012-01-01,"Managing Director Country Head Equities Spain,...",Deutsche Bank,DB,director
34,b774d84f-4c75-318a-9dc6-0b1e3da84258,Stefan Boecker,2007-07-01,2010-09-01,"Head of CRM Portfolio Management APAC,head",Deutsche Bank,DB,trader
38,9bf965bf-33ee-373e-85ac-50bab4be811c,Simon Hallbäck,2005-08-01,2008-05-01,"Associate in Credit Risk Management,associate,...",Deutsche Bank,DB,assoc
47,8193d67a-9c79-3c06-b39c-206aa15f604f,Stijn Geens,2006-12-01,2008-08-01,"Operations Officer,operations officer",Deutsche Bank,DB,other
55,1247f6df-be06-3807-abbc-02cb894e7839,Ramona Pearson,2007-10-01,2008-07-01,Transaction Management Group - Change the Bank...,Deutsche Bank,DB,other
63,8b887838-a895-3cab-b741-abddf751ce6f,Sushant Chandak,2007-11-01,2009-01-01,"Process Executive-Fixed Income Operations,oper...",Deutsche Bank,DB,manager
71,0a71a22a-2652-3240-af7a-5dde5aadcbbd,Birgit Frolik-Hoffmeister,2006-03-01,2010-03-01,Telefonische Privatkundenbetreuung,Deutsche Bank,DB,other
81,3a295c28-9da7-368a-b1c0-9a528b5c5579,Sean Okoe Quist,2004-01-01,2012-01-01,"Vice President,vice president",Deutsche Bank,DB,vp


In [9]:
# Export disabled: uncomment to write the categorized frame to the Deliverables folder.
#all_data.to_csv('./Deliverables/all_data_categorized.csv')