## Repository for useful python scripts for data processing

# In [1]:
## standard modules to import
import numpy as np
import pandas as pd
import numpy as np
import html
import re
from nameparser import HumanName

# standard pandas display options
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
# None means "never truncate cell contents"; the old -1 spelling was deprecated
# in pandas 1.0 and is rejected by newer releases.
pd.set_option('display.max_colwidth', None)


# In [None]:
# === useful functions ===

def seniority_assignment(df):
    """Add a 'Seniority' column to ``df`` based on keywords in its 'Title' column.

    Levels are applied in increasing order of seniority (Senior, Director, VP,
    Executive), so when a title matches several levels the highest one wins.
    Rows whose title matches nothing (including missing titles) are labelled
    'Entry', unless they already carry a value in an existing 'Seniority' column.

    Args:
        df: DataFrame with a string 'Title' column. It is modified in place.

    Returns:
        The same ``df`` object, with the 'Seniority' column filled in.
    """
    # Keywords are matched as case-insensitive substrings of the title.
    SENIOR = ['Senior', 'Sr', 'manager', 'mgr', 'lead', 'head', 'principle', 'principal']
    DIRECTOR = ['Director', 'Dir']
    VP = [' vice', 'President', 'VP', 'vice president']
    EXECUTIVE = ['chief', 'Executive', 'CEO', 'CIO', 'CSO', 'founder']

    # Ordered from lowest to highest: later assignments overwrite earlier ones.
    levels = [
        ('Senior', SENIOR),
        ('Director', DIRECTOR),
        ('VP', VP),
        ('Executive', EXECUTIVE),
    ]
    patterns = {
        label: '|'.join(re.escape(keyword) for keyword in keywords)
        for label, keywords in levels
    }

    # 'CTO' is a substring of 'direCTOr', so it has to be matched as a whole
    # word. A word boundary (rather than a trailing space or an exact-match
    # special case) also catches 'Interim CTO', 'CTO, Acme' and 'cto'.
    patterns['Executive'] += r'|\bCTO\b'

    # Create the column explicitly (object dtype, all missing) instead of
    # relying on .loc to create it as a side effect of the first match.
    if 'Seniority' not in df.columns:
        df['Seniority'] = pd.Series(index=df.index, dtype=object)

    titles = df['Title']
    for label, _ in levels:
        matches = titles.str.contains(patterns[label], na=False, case=False, regex=True)
        df.loc[matches, 'Seniority'] = label

    # default is 'Entry'
    df['Seniority'] = df['Seniority'].fillna('Entry')

    return df

def first_name(unformatted_name):
    """Return the first-name part of ``unformatted_name``.

    The name is parsed with ``HumanName`` and its capitalisation is
    normalised with ``capitalize(force=True)`` before the first name is read.
    """
    parsed = HumanName(unformatted_name)
    parsed.capitalize(force=True)
    given = parsed.first
    return given

def last_name(unformatted_name):
    """Return the last-name part of ``unformatted_name``.

    The name is parsed with ``HumanName`` and its capitalisation is
    normalised with ``capitalize(force=True)`` before the last name is read.
    """
    parsed = HumanName(unformatted_name)
    parsed.capitalize(force=True)
    surname = parsed.last
    return surname

def title_processing(df):
    """Drop rows whose job title is not of interest (e.g. sales and academic titles).

    Matching is case-insensitive. Rows with a missing title are kept.

    Args:
        df: DataFrame with a string 'Title' column.

    Returns:
        ``df`` restricted to the rows whose title contains none of the
        excluded keywords.
    """
    # Keywords matched as plain substrings anywhere in the title.
    titles_dont_care = ['talent', 'sales', 'commercial', 'marketing', 'people',
                        'clinical', 'communication', 'Administrative', 'administration',
                        'Human Resources', 'Consultant', 'Contractor',
                        'self employed', 'patient', 'nurse', 'events',
                        'business', 'assistant', 'professor', 'scholar', 'student',
                        'fellow', 'candidate', 'postdoc', 'post doc',
                        'field application scientist']

    # Keywords that need word boundaries: as bare substrings, 'HR' also hits
    # 'cHRomatography' and 'trial' also hits 'indusTRIAL', which would wrongly
    # drop those titles. 'trial' keeps an open right edge so 'trials' matches.
    bounded_patterns = [r'\bHR\b', r'\btrial']

    # escape the literal keywords, then join everything with the regex pipe
    escaped = [re.escape(s) for s in titles_dont_care]
    search_pattern = '|'.join(escaped + bounded_patterns)

    # keep only rows whose title matches none of the excluded keywords
    unwanted = df.Title.str.contains(search_pattern, na=False, case=False, regex=True)
    return df[~unwanted]