In [1]:
import numpy as np
import pandas as pd

# Lift pandas' column display limit (None = no limit) so that wide frames are
# shown with every column instead of being truncated.
pd.set_option('display.max_columns', None)

In [2]:
# CSV locations, relative to the directory the notebook is run from:
# raw_file is read into `df` below; results_file is where `df` is written at the end.
raw_file = '../data/raw.csv'
results_file = '../data/results.csv'

In [3]:
# Load the raw CSV into the working DataFrame used by every later cell.
df = pd.read_csv(raw_file)

In [4]:
# Replace the header read from the CSV with short snake_case names (73 in total).
# The assignment is positional: the i-th name below labels the i-th column of the
# file, so the order here must match the column order of raw_file exactly.
# pandas raises ValueError if the number of names differs from the number of
# columns, but it cannot detect names that are merely in the wrong order.
# NOTE(review): presumably the original headers are the full survey questions — confirm.
df.columns =  ['timestamp', 'email', 'gender', 'location', 'is_international', 'ethnicity', 
         # Background
         'parents_edu', 'parents_technical', 'family_income', 'admission_avg', 'hs_extras',
         # Coding
         'code_start_age', 'fav_lang', 'text_editor', 'work_os', 'phone', 'num_hackathons', 'side_proj', 
         # Jobs (one name/salary/location triple per co-op term, 1 through 6)
         'coop_name_1', 'coop_salary_1', 'coop_loc_1',
         'coop_name_2', 'coop_salary_2', 'coop_loc_2',
         'coop_name_3', 'coop_salary_3', 'coop_loc_3',
         'coop_name_4', 'coop_salary_4', 'coop_loc_4',
         'coop_name_5', 'coop_salary_5', 'coop_loc_5',
         'coop_name_6', 'coop_salary_6', 'coop_loc_6',
         'fav_coop', 'ft_status',
         # Lifestyle
         'exercise', 'cooking', 'sleep_time', 
         # School (one average/attendance pair per academic term, 1a through 4a)
         'uni_extras', 'is_se_orig', 'fav_course', 'least_fav_course', 'num_courses_failed', 
         'term_avg_1a', 'class_attendance_1a',
         'term_avg_1b', 'class_attendance_1b',
         'term_avg_2a', 'class_attendance_2a',
         'term_avg_2b', 'class_attendance_2b',
         'term_avg_3a', 'class_attendance_3a',
         'term_avg_3b', 'class_attendance_3b',
         'term_avg_4a', 'class_attendance_4a',
         'hardest_term', 'easiest_term', 'preferred_program', 'soft_eng_rating',
         # Future
         'preferred_company_size', 'work_loc', 'se_friendships', 'career_motives', 'grad_school', 'num_years_soft_eng',
         'preferred_tech_discipline', 'school_debt', 'se_advice'
        ]

In [5]:
def get_valid_coop_salaries(row):
    """Return the strictly positive co-op salaries found in ``row``.

    Args:
        row: A mapping-like record (e.g. one DataFrame row as a Series) that
            holds the columns ``coop_salary_1`` through ``coop_salary_6``.

    Returns:
        A list of ``np.float32`` values, in term order, containing only the
        salaries greater than zero. Zero, negative and NaN entries are
        dropped, so the list may be empty.

    Raises:
        ValueError: If a salary cell cannot be converted to a float.
    """
    salary_cols = ['coop_salary_%d' % term for term in range(1, 7)]
    amounts = np.asarray(row[salary_cols], dtype=np.float32)

    valid = []
    for amount in amounts:
        # NaN compares False against 0, so missing terms are skipped here too.
        if amount > 0:
            valid.append(amount)
    return valid

def compute_coop_avg(row):
    """Return the mean of the valid co-op salaries in ``row``.

    "Valid" is whatever ``get_valid_coop_salaries(row)`` returns.

    Args:
        row: One record holding the ``coop_salary_N`` columns.

    Returns:
        The arithmetic mean of the valid salaries, or NaN when there are none.
    """
    salaries = get_valid_coop_salaries(row)
    # np.mean([]) also evaluates to NaN, but only after emitting a
    # "Mean of empty slice" RuntimeWarning for every such row; return NaN
    # explicitly so rows without salaries are handled quietly.
    if len(salaries) == 0:
        return np.nan
    return np.mean(salaries)

def compute_coop_median(row):
    """Return the median of the valid co-op salaries in ``row``.

    "Valid" is whatever ``get_valid_coop_salaries(row)`` returns.

    Args:
        row: One record holding the ``coop_salary_N`` columns.

    Returns:
        The median of the valid salaries, or NaN when there are none.
    """
    salaries = get_valid_coop_salaries(row)
    # np.median([]) also evaluates to NaN, but emits RuntimeWarnings on the
    # way; return NaN explicitly so rows without salaries are handled quietly.
    if len(salaries) == 0:
        return np.nan
    return np.median(salaries)

# Add per-row summary columns. axis=1 makes apply() hand each row to the helper,
# which reads that row's coop_salary_1..6 values.
df['coop_salary_avg'] = df.apply(compute_coop_avg, axis=1)
df['coop_salary_median'] = df.apply(compute_coop_median, axis=1)

In [6]:
def filter_admission_avg(x):
    """Round an admission average to a whole number, with a floor of 70.

    Args:
        x: The admission average as a number, or a missing value (NaN/None).

    Returns:
        70 for any value less than or equal to 70; otherwise ``x`` rounded to
        the nearest integer with the built-in ``round()`` (which, in
        Python 3, sends exact halves to the nearest even integer). Missing
        values are returned as NaN.
    """
    # A missing value fails the `<= 70` test and would then hit
    # int(round(nan)), which raises ValueError and aborts the whole .apply().
    # Pass it through as NaN instead.
    if pd.isna(x):
        return np.nan
    if x <= 70:
        return 70
    return int(round(x))

# Derived column: each admission average passed through filter_admission_avg
# (values <= 70 become 70, everything else is rounded to an integer).
df['rounded_admission_avg'] = df['admission_avg'].apply(filter_admission_avg)

In [7]:
# Write the renamed frame plus the derived columns to results_file.
# to_csv() is called with its defaults, so the row index is written as an
# extra, unnamed first column.
# NOTE(review): every column is written, including 'email' — confirm that
# exporting it is intended before sharing the results file.
df.to_csv(results_file)