In [1]:
from functools import partial
import numpy as np
import pandas as pd
import re
import constants

from locale import *
# Switch numeric parsing to US conventions so locale.atof() accepts strings
# that use ',' as the thousands separator and '.' as the decimal point.
# NOTE(review): setlocale raises locale.Error when 'en_US.UTF-8' is not
# installed on the host.
setlocale(LC_NUMERIC, 'en_US.UTF-8')

# None removes the column limit, so displayed DataFrames show every column.
pd.set_option('display.max_columns', None)

def resilient_atof(x):
    """Parse a locale-formatted number, falling back to 0.0 for unusable input.

    x: a string interpreted by locale.atof under the current LC_NUMERIC
       locale, or a value that is already numeric.

    Returns a float. Empty/None values, NaN, and anything that cannot be
    parsed yield 0.0.
    """
    # Already-numeric cells are passed through unchanged: locale.atof only
    # understands strings, so they would otherwise fall into the error path
    # and be reported as 0.0. bool is excluded so True/False are not read as
    # 1.0/0.0.
    if isinstance(x, (int, float, np.integer, np.floating)) and not isinstance(x, bool):
        value = float(x)
        # NaN is the only float that differs from itself; treat it as missing.
        return 0.0 if value != value else value

    if not x:
        return 0.0

    try:
        return atof(x)
    except (ValueError, TypeError, AttributeError):
        # Best-effort parsing: bad text or an unsupported type counts as 0.0,
        # while KeyboardInterrupt/SystemExit are no longer swallowed.
        return 0.0

In [2]:
# raw_file is the CSV loaded into the DataFrame; results_file is where the
# processed DataFrame is written at the end of the notebook. Both are relative
# paths, so they resolve against the current working directory.
raw_file = '../data/raw.csv'
results_file = '../data/results.csv'

In [3]:
# Load the raw CSV and turn every missing cell into an empty string in one go.
df = pd.read_csv(raw_file).fillna('')

In [4]:
# Replace the CSV's header names with the names from the project's constants
# module. Assignment is positional, so constants.columns must have exactly one
# entry per CSV column, in the same order (pandas raises ValueError on a
# length mismatch).
df.columns = constants.columns

In [5]:
def gender_map(row):
    """Normalise the 'gender' answer of a row.

    row: mapping or Series whose 'gender' entry is a string.

    Returns the answer lower-cased; an answer that starts with 'prefer not'
    (after lower-casing) is collapsed to 'undisclosed'.
    """
    answer = row['gender'].lower()
    return 'undisclosed' if answer.startswith('prefer not') else answer

# Greedy capture: group 1 is everything before the LAST comma.
city_regex = re.compile('(.*),.*')
def get_city(row):
    """Return the part of row['location'] before its last comma, stripped.

    Returns "?" when the location does not match the pattern (for example
    when it contains no comma).
    """
    found = city_regex.match(row['location'])
    return found.group(1).strip() if found else "?"

# Greedy prefix: group 1 is everything after the LAST comma.
country_regex = re.compile('.*,(.*)')
def get_country(row):
    """Return the part of row['location'] after its last comma, stripped.

    Returns "?" when the location does not match the pattern (for example
    when it contains no comma).
    """
    found = country_regex.match(row['location'])
    return found.group(1).strip() if found else "?"

# Row-wise derivations: with axis=1, apply passes each row to the function.
# 'gender' is overwritten with its normalised form; the two *_of_origin
# columns are new and are both parsed from 'location'.
df['gender'] = df.apply(gender_map, axis=1)
df['city_of_origin'] = df.apply(get_city, axis=1)
df['country_of_origin'] = df.apply(get_country, axis=1)

In [6]:
def get_valid_coop_salaries(row):
    """Return the six coop_salary_* values of a row as clamped numbers.

    row: a DataFrame row (Series) containing 'coop_salary_1' through
         'coop_salary_6'.

    Each value is parsed with resilient_atof, stored as float32 and then
    clamped to the range [0, 100000]. Returns a list of six numbers.

    NOTE(review): entries that resilient_atof reports as 0.0 stay in the
    list, so they count toward any mean or median computed from it.
    """
    raw_salaries = row[['coop_salary_1', 'coop_salary_2', 'coop_salary_3',
                        'coop_salary_4', 'coop_salary_5', 'coop_salary_6']]
    # Materialise a list before handing it to numpy: on Python 3 map() returns
    # a lazy iterator, which np.array cannot convert to a float32 array.
    salaries = np.array([resilient_atof(value) for value in raw_salaries],
                        dtype=np.float32)
    return [max(0, min(100000, x)) for x in salaries]

def compute_coop_avg(row):
    """Arithmetic mean of the row's clamped co-op salaries."""
    salaries = get_valid_coop_salaries(row)
    return np.mean(salaries)

def compute_coop_median(row):
    """Median of the row's clamped co-op salaries."""
    salaries = get_valid_coop_salaries(row)
    return np.median(salaries)

# Add per-row summary columns for the coop_salary_1..6 columns. Each call
# re-parses the six salary cells of every row.
df['coop_salary_avg'] = df.apply(compute_coop_avg, axis=1)
df['coop_salary_median'] = df.apply(compute_coop_median, axis=1)

In [7]:
def get_valid_term_grade_avgs(row):
    """Return the seven term_avg_* values of a row as clamped numbers.

    row: a DataFrame row (Series) containing 'term_avg_1a' through
         'term_avg_4a'.

    Each value is parsed with resilient_atof, stored as float32 and then
    clamped to the range [0, 100]. Returns a list of seven numbers.

    NOTE(review): entries that resilient_atof reports as 0.0 stay in the
    list, so they count toward any mean or median computed from it.
    """
    raw_avgs = row[['term_avg_1a', 'term_avg_1b', 'term_avg_2a', 'term_avg_2b',
                    'term_avg_3a', 'term_avg_3b', 'term_avg_4a']]
    # Materialise a list before handing it to numpy: on Python 3 map() returns
    # a lazy iterator, which np.array cannot convert to a float32 array.
    avgs = np.array([resilient_atof(value) for value in raw_avgs],
                    dtype=np.float32)
    return [max(0, min(100, x)) for x in avgs]

def compute_culm_grade_avg(row):
    """Arithmetic mean of the row's clamped term averages."""
    term_avgs = get_valid_term_grade_avgs(row)
    return np.mean(term_avgs)

def compute_culm_grade_median(row):
    """Median of the row's clamped term averages."""
    term_avgs = get_valid_term_grade_avgs(row)
    return np.median(term_avgs)

# Add per-row summary columns for the term_avg_1a..4a columns. Each call
# re-parses the seven term-average cells of every row.
df['culm_grade_avg'] = df.apply(compute_culm_grade_avg, axis=1)
df['culm_grade_median'] = df.apply(compute_culm_grade_median, axis=1)

In [8]:
def round_admission_avg(x):
    """Clamp an admission average to [70, 100] and round it to an int.

    x: a numeric value. Values below 70 become 70 and values above 100
       become 100 before rounding.
    """
    floored = max(70, x)
    clamped = min(100, floored)
    return int(round(clamped))

df['rounded_admission_avg'] = df['admission_avg'].apply(round_admission_avg)

In [9]:
"""
Processing comma-separated lists, e.g. hs_extras, uni_extras.
"""
def to_lower(row, colname):
    """Return the string stored under `colname` in `row`, lower-cased."""
    value = row[colname]
    return value.lower()


""" Perform string replacements in a DataFrame column.

row: a row of a DataFrame provided when run within DataFrame.apply with axis=1
colname (str): the column to perform replacements in
transforms (list( tuple(string, string) )): a list of string replacements to perform, in order

"""
def replace_strs(row, colname, transforms=()):
    """Perform string replacements on one column value of a row.

    row: a row of a DataFrame provided when run within DataFrame.apply with axis=1
    colname (str): the column to perform replacements in
    transforms (iterable( tuple(string, string) )): (old, new) replacements to
        perform, in order; later replacements see the result of earlier ones

    Returns the transformed string. `row` itself is not modified.
    """
    # Work on a local value instead of writing back into `row`, so the
    # caller's row is never mutated as a side effect.
    text = row[colname]
    for src_str, dst_str in transforms:
        text = text.replace(src_str, dst_str)
    return text


def split_and_count(df, colname, delimiter=","):
    """Count how often each delimited item occurs in a string column.

    df: DataFrame holding the column
    colname (str): column whose values are delimiter-separated lists
    delimiter (str): separator between items (default ",")

    Every cell is split on `delimiter` and each item is stripped of
    surrounding whitespace. Empty items are kept and counted under ''.
    Missing values (None/NaN), whether present in the column or produced as
    padding by the expanded split, are ignored.

    Returns a Series mapping each distinct item to its number of occurrences,
    indexed by item in sorted order. The result is empty when there are no
    items.
    """
    pieces = df[colname].str.split(delimiter, expand=True).values.flatten()
    # Keep only real strings: padding and missing cells may appear as either
    # None or NaN, and neither of them has a .strip() method.
    tokens = [piece.strip() for piece in pieces if isinstance(piece, str)]
    # Naming the column 0 explicitly keeps groupby(0) valid even when
    # `tokens` is empty.
    return pd.DataFrame({0: tokens}).groupby(0).size()


# Ordered (old, new) substring replacements applied to the lower-cased extras.
# NOTE(review): these are plain substring replacements, so ('team', '') also
# removes "team" from inside longer words.
extra_transforms = [
    ('hack the north', 'hackathon'),
    ('team', '')
]
# Build 'uni_extras_normed' in two passes: lower-case the raw 'uni_extras'
# text first, then run the replacements on that lower-cased copy. The raw
# 'uni_extras' column is left untouched.
df['uni_extras_normed'] = df.apply(partial(to_lower, colname='uni_extras'), axis=1)
df['uni_extras_normed'] = df.apply(partial(replace_strs, colname='uni_extras_normed', transforms=extra_transforms), axis=1)


In [10]:
# Item counts for the raw column, shown for comparison with the normalised one.
split_and_count(df, 'uni_extras', ',')

0
                                                35
ACM                                              1
Academic Rep                                     1
Archery Club                                     1
Badminton                                        6
Badminton Club                                   2
Badminton club                                   1
Basketball                                       4
Basketball intramural                            1
Blueprint                                        2
CRO for election)\nHackathons                    1
Canadian Undergraduate Technology Conference     1
Christian Fellowship                             1
Coaching high school robotics team               1
DDR club                                         1
Dance                                            2
Dodge-ball Intramural                            1
Dodgeball                                        3
Drama                                            1
Eng Soc                      

In [11]:
# Item counts after lower-casing and applying the replacements.
split_and_count(df, 'uni_extras_normed', ',')

0
                                                35
academic rep                                     1
acm                                              1
ain't nobody got time for that                   1
archery club                                     1
badminton                                        7
badminton club                                   3
basketball                                       5
basketball intramural                            1
basketball intramurals                           1
bit of robotics club                             1
blueprint                                        2
canadian undergraduate technology conference     1
christian fellowship                             1
club                                             1
coaching high school robotics                    1
combat                                           1
cro for election)\nhackathons                    1
dance                                            4
ddr club                     

In [12]:
# Persist the processed frame. With the default arguments the row index is
# written as an extra, unnamed first column.
df.to_csv(results_file)