In [1]:
from functools import partial
import numpy as np
import pandas as pd
import re
import constants

pd.set_option('display.max_columns', None)

In [2]:
raw_file = '../data/raw.csv'
results_file = '../data/results.csv'

In [3]:
df = pd.read_csv(raw_file)

In [4]:
df.columns = constants.columns

In [5]:
def gender_map(row):
    new_gender = row['gender'].lower()

    if (new_gender.startswith('prefer not')):
        new_gender = 'undisclosed'

    return new_gender

city_regex = re.compile('(.*),.*')
def get_city(row):
    result = city_regex.match(row['location'])
    if result:
        return result.group(1).strip()
    return "?"

country_regex = re.compile('.*,(.*)')
def get_country(row):
    result = country_regex.match(row['location'])
    if result:
        return result.group(1).strip()
    return "?"

df['gender'] = df.apply(gender_map, axis=1)
df['city_of_origin'] = df.apply(get_city, axis=1)
df['country_of_origin'] = df.apply(get_country, axis=1)

In [6]:
def get_valid_coop_salaries(row):
    salaries = np.array(row[['coop_salary_1', 'coop_salary_2', 'coop_salary_3', 
                'coop_salary_4', 'coop_salary_5', 'coop_salary_6']], dtype=np.float32)
    return [max(0, min(100000, x)) for x in salaries]

def compute_coop_avg(row):
    return np.mean(get_valid_coop_salaries(row))

def compute_coop_median(row):
    return np.median(get_valid_coop_salaries(row))

df['coop_salary_avg'] = df.apply(compute_coop_avg, axis=1)
df['coop_salary_median'] = df.apply(compute_coop_median, axis=1)

In [7]:
def get_valid_term_grade_avgs(row):
    avgs = np.array(row[['term_avg_1a', 'term_avg_1b', 'term_avg_2a', 'term_avg_2b'
                'term_avg_3a', 'term_avg_3b', 'term_avg_4a']], dtype=np.float32)
    return [max(0, min(100, x)) for x in avgs]

def compute_culm_grade_avg(row):
    return np.mean(get_valid_term_grade_avgs(row))

def compute_culm_grade_median(row):
    return np.median(get_valid_term_grade_avgs(row))

df['culm_grade_avg'] = df.apply(compute_culm_grade_avg, axis=1)
df['culm_grade_median'] = df.apply(compute_culm_grade_median, axis=1)

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self.loc[key]


In [8]:
def round_admission_avg(x):
    x = min(100, max(70, x))
    return int(round(x))

df['rounded_admission_avg'] = df['admission_avg'].apply(round_admission_avg)

In [9]:
"""
Processing comma-separated lists, e.g. hs_extras, uni_extras.
"""
def to_lower(row, colname):
    return row[colname].lower()

def replace_strs(row, colname, transforms=[]):
    tmp = row
    
    for src_str, dst_str in transforms:
        tmp[colname] = tmp[colname].replace(src_str, dst_str)
    return tmp[colname]

def split_and_count(df, colname, delimiter=","):
    return pd.DataFrame([ x.strip() for x in df[colname].str.split(delimiter, expand=True).values.flatten() if x is not None ]).groupby(0).size()


extra_transforms = [
    ('hack the north', 'hackathon'),
    ('team', '')
]
df['uni_extras_normed'] = df.apply(partial(to_lower, colname='uni_extras'), axis=1)
df['uni_extras_normed'] = df.apply(partial(replace_strs, colname='uni_extras_normed', transforms=extra_transforms), axis=1)


In [10]:
split_and_count(df, 'uni_extras', ',')

0
Blueprint                2
Engsoc                   2
Hack the North          10
Mock Interviews          2
Residence Ambassador     2
Resume Critiques         2
SE community            10
UW/UX                   10
Ultimate Frisbee         2
Volunteering             2
dtype: int64

In [11]:
split_and_count(df, 'uni_extras_normed', ',')

0
blueprint                2
engsoc                   2
hackathon               10
mock interviews          2
residence ambassador     2
resume critiques         2
se community            10
ultimate frisbee         2
uw/ux                   10
volunteering             2
dtype: int64

In [12]:
df.to_csv(results_file)