In [1]:
from Bucket import Bucket
import os
import json

%matplotlib inline

import matplotlib
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Directory holding the per-column '<name>_buckets.json' files. File names are
# appended to this string by plain concatenation, so the trailing slash is required.
bucket_path = '../private/result-buckets/'
# JSON inputs opened by the grade-vs-salary and grade-vs-attendance cells.
grade_vs_salary = '../private/grade-vs-salary.json'
grade_vs_attendance = '../private/grade-vs-attendance.json'
# Term labels; render_grade_vs_attendance indexes this list to build its plot title.
TERM = ['1a', '1b', '2a', '2b', '3a', '3b', '4a']

In [3]:
def process_file(filename):
    """Load one bucket file from `bucket_path` and hand it to `render_file`.

    `filename` is the bare file name inside `bucket_path`. The bucket name
    given to `Bucket.create_from_bucket` is `filename` with the
    '_buckets.json' suffix text removed.
    """
    with open(bucket_path + filename, 'r') as handle:
        payload = json.loads(handle.read())
        column_name = filename.replace('_buckets.json', '')
        bucket = Bucket.create_from_bucket(column_name, payload)
        render_file(bucket)

In [4]:
def render_file(b):
    """Plot one line of mean salaries for every plottable entry of a bucket.

    `b` is iterated for its keys. An entry that is a dict containing a
    'message' key is skipped (presumably a placeholder without data --
    confirm against Bucket). Every other entry must expose `.salaries`, an
    iterable of objects with a `.mean()` method; one point is plotted per
    element, at x = 1, 2, ..., len(salaries).

    The figure is created but not shown; the trailing `plt.show()` is
    deliberately left commented out, as in the original cell.
    """
    fig = plt.figure(figsize=(18, 16))

    ax1 = fig.add_subplot(211)
    cols = []
    for val in b:
        entry = b[val]
        if isinstance(entry, dict) and 'message' in entry:
            continue
        cols.append(val)

        salaries = [s.mean() for s in entry.salaries]
        # Derive the x positions from the data instead of hard-coding 1..6,
        # so a series with a different number of points (e.g. 7 for term
        # data) no longer needs a manual edit.
        ax1.plot(list(range(1, len(salaries) + 1)), salaries)
    ax1.legend(cols, loc='upper left')
#    plt.show()

In [5]:
# Walk every entry of the bucket directory, skipping dotfiles (names that
# start with '.'). The process_file call below is commented out, so this
# loop currently has no effect beyond listing the directory.
for i in os.listdir(bucket_path):
    if i.startswith('.'):
        continue
#    process_file(i)

In [6]:
def _ordinal_label(term):
    """Return `term` as text with its English ordinal suffix (1st, 2nd, 3rd, 4th, 11th)."""
    label = str(term)
    try:
        number = int(label)
    except ValueError:
        # Non-numeric terms keep the suffix the title always used.
        return label + 'th'
    if number % 100 in (11, 12, 13):
        return label + 'th'
    return label + {1: 'st', 2: 'nd', 3: 'rd'}.get(number % 10, 'th')


def render_grade_vs_salary(data, term, axis='cumulative_avg'):
    """Creates a scatter plot of the data of a term.

    `data` is a mapping whose values are lists of items; each item must
    provide the `axis` key and a 'result_salary' key.
    `term` is the co-op number (int or str); it is used for the printed
    header and the plot title only.
    `axis` represents the x axis that's plotted against salary.

    Prints the term and the Pearson correlation coefficient of x against
    salary, then shows the plot.

    NOTE(review): the title text always says "Cumulative average", even when
    a different `axis` is passed.
    """
    x = []
    y = []
    for key in data:
        for item in data[key]:
            x.append(item[axis])
            y.append(item['result_salary'])
    plt.xlim(60, 100)
    plt.scatter(x, y)
    # str() so an int term works here just as it does in the title.
    print('Term: ' + str(term))
    print(np.corrcoef(x, y)[0, 1])
    plt.title('Cumulative average vs Salary, ' + _ordinal_label(term) + ' coop')
    plt.show()

In [7]:
# Load the grade-vs-salary JSON and step through terms 1..len(data).
# The commented-out call looks each term up under its string key ('1', '2', ...).
# That render call is disabled, so the loop body is a no-op.
with open(grade_vs_salary, 'r') as f:
    data = json.loads(f.read())
    for term in range(1, len(data)+1):
        pass
#        render_grade_vs_salary(data[str(term)], str(term))

In [8]:
from scipy.interpolate import interp1d
import math

In [9]:
def render_grade_vs_attendance(data, term):
    """Plot a smoothed grade distribution for each attendance rating.

    `data` maps the rating strings '1'..'4' to lists of grades; a missing
    rating is treated as empty. The literal entry 'exchange' and grades
    below 60 are ignored.
    `term` is an index into TERM, used for the plot title.

    For every rating with at least 5 counted grades, the grades are
    histogrammed into 6 equal bins over 60-100, converted to percentages,
    and drawn as a cubic interpolation.
    """
    bin_count = 6
    bin_width = 40.0 / bin_count
    legend = []
    legend_values = {
        '4': 'Attends almost all classes',
        '3': 'Attends most classes',
        '2': 'Sometimes attends class',
        '1': 'Rarely attends class'
    }
    # NOTE(review): bins are 40/6 wide but are plotted at 60, 67, ..., 95
    # (step 7); kept as in the original -- confirm this is intended.
    x = list(range(60, 100, 7))
    xnew = np.linspace(63, 100, num=40)
    for i in range(1, 5):
        rating = data.get(str(i), [])
        grades = [0] * bin_count
        for g in rating:
            # Check the 'exchange' marker first: ordering a str against an
            # int raises TypeError on Python 3.
            if g == 'exchange' or g < 60:
                continue
            index = int(math.floor((float(g) - 60) / bin_width))
            # A grade of exactly 100 computes index 6, one past the last
            # bin; fold it into the top bin instead of raising IndexError.
            grades[min(index, bin_count - 1)] += 1
        total = sum(grades)
        # Too few samples to draw a meaningful curve.
        if total < 5:
            continue
        legend.append(legend_values[str(i)])
        percentages = [float(count) / total * 100 for count in grades]

        f = interp1d(x, percentages, kind='cubic', fill_value='extrapolate')
        plt.plot(xnew, f(xnew))

    plt.xlim(60, 100)
    plt.ylim(0, 60)
    plt.legend(legend)
    plt.title('Attendance vs Term Average, ' + TERM[term])
    plt.show()

In [10]:
# Load the grade-vs-attendance JSON and step through the TERM labels.
# The commented-out call looks data up by term label and passes the label's
# index, which render_grade_vs_attendance uses to index TERM.
# That render call is disabled, so the loop body is a no-op.
with open(grade_vs_attendance, 'r') as f:
    data = json.loads(f.read())
    for i, t in enumerate(TERM):
        pass
#        render_grade_vs_attendance(data[t], i)

## Performing an ANOVA of specific data sets.

In [11]:
import scipy.stats as stats
from collections import defaultdict

In [12]:
# Example usage of one-way anova
# Five independent sample groups of unequal size (10, 8, 7, 8 and 6 values).
# NOTE(review): these numbers look like the worked example from the
# scipy.stats.f_oneway documentation -- confirm the source.
tillamook = [0.0571, 0.0813, 0.0831, 0.0976, 0.0817, 0.0859, 0.0735, 0.0659, 0.0923, 0.0836]
newport = [0.0873, 0.0662, 0.0672, 0.0819, 0.0749, 0.0649, 0.0835, 0.0725]
petersburg = [0.0974, 0.1352, 0.0817, 0.1016, 0.0968, 0.1064, 0.105]
magadan = [0.1033, 0.0915, 0.0781, 0.0685, 0.0677, 0.0697, 0.0764, 0.0689]
tvarminne = [0.0703, 0.1026, 0.0956, 0.0973, 0.1039, 0.1045]
# Bare last expression: the notebook displays the returned F statistic and p-value.
stats.f_oneway(tillamook, newport, petersburg, magadan, tvarminne)

F_onewayResult(statistic=7.121019471642447, pvalue=0.0002812242314534544)

### Coop Correlations
- Technical background
  - Coding start age
  - Hackathons
  - Side projects
- Background
  - Parental education
  - Admission average
  
### Academic Correlations:
- SE Rating: TODO: do we really want this?
- Admission average
- Parental education

In [13]:
def correlate_code_start_age(content, dv):
    """Compare a dependent variable between early and late coding starters.

    dv: dependent variable. `content[dv]` must map every age string from
    '9' to '19' to a list of values; a missing age raises KeyError.

    Ages 9-15 are pooled into the first group and ages 16-19 into the
    second. Prints the one-way ANOVA and Kruskal-Wallis results, the group
    means, the group standard deviations (np.std), and the group sizes.
    """
    by_age = content[dv]

    younger = []
    for age in range(9, 16):
        younger.extend(by_age[str(age)])

    older = []
    for age in range(16, 20):
        older.extend(by_age[str(age)])

    print('CALCULATING ' + dv.upper())
    print('Ages <=15, >15')
    print(stats.f_oneway(younger, older))
    print(stats.kruskal(younger, older))
    print(np.mean(younger), np.mean(older))
    print(np.std(younger), np.std(older))
    print('[Count] Younger than 16: ' + str(len(younger)) + ', At least 16: ' + str(len(older)))
    print('')

In [32]:
def extract_range(content, ranges):
    """Pool the value lists of `content` into one list per numeric range.

    `content` maps keys convertible with int() to lists of values.
    `ranges` is a sequence of (low, high) pairs; a key belongs to a range
    when low < int(key) <= high (low exclusive, high inclusive).

    Prints each range followed by the number of pooled values, and returns
    the pooled lists in the order of `ranges`.
    """
    result = []
    for r in ranges:
        low, high = r[0], r[1]
        current = []

        for val in content:
            if low < int(val) <= high:
                current.extend(content[val])
        # %-formatting prints the same text on Python 2 and 3; the former
        # `print r, len(current)` statement is a SyntaxError on Python 3.
        print('%s %d' % (r, len(current)))
        result.append(current)
    return result

In [61]:
def extract_groups(content, groups):
    """Pool the value lists of `content` into one list per label group.

    `content` maps string labels to lists of values.
    `groups` is a sequence of label collections; a key belongs to a group
    when the key, stripped of surrounding whitespace, is in that collection.

    Prints each group followed by the number of pooled values, and returns
    the pooled lists in the order of `groups`.
    """
    result = []
    for r in groups:
        current = []

        for val in content:
            if val.strip() in r:
                current.extend(content[val])
        # %-formatting prints the same text on Python 2 and 3; the former
        # `print r, len(current)` statement is a SyntaxError on Python 3.
        print('%s %d' % (r, len(current)))
        result.append(current)
    return result

In [57]:
def dict_to_lol(content):
    """Return the values of `content` as a list ("lol" = list of lists).

    The values appear in the dict's own iteration order, and the returned
    list holds the original value objects, not copies.
    """
    return [content[key] for key in content]

In [35]:
def load_file(col_name):
    """Read and parse `<bucket_path><col_name>_buckets.json`, returning the decoded JSON."""
    bucket_file = bucket_path + col_name + '_buckets.json'
    with open(bucket_file, 'r') as handle:
        raw_text = handle.read()
        return json.loads(raw_text)

In [36]:
# Code start age
# Run the early-vs-late starter comparison for grades first, then salaries.
content = load_file('code_start_age')
for dv_name in ('first_half_grades', 'second_half_grades',
                'first_half_salaries', 'second_half_salaries'):
    correlate_code_start_age(content, dv_name)


CALCULATING FIRST_HALF_GRADES
Ages <=15, >15
F_onewayResult(statistic=1.010900093484944, pvalue=0.31693467394518826)
KruskalResult(statistic=1.5483371696092163, pvalue=0.21338108763307484)
(82.17986842105265, 80.65575471698114)
(8.456075056164886, 7.189968404438046)
[Count] Younger than 16: 57, At least 16: 53

CALCULATING SECOND_HALF_GRADES
Ages <=15, >15
F_onewayResult(statistic=0.1914648959030756, pvalue=0.6625739819257515)
KruskalResult(statistic=0.7318345398624917, pvalue=0.392289188305433)
(82.51912280701755, 82.00761006289306)
(6.83026278969861, 5.128680460015708)
[Count] Younger than 16: 57, At least 16: 53

CALCULATING FIRST_HALF_SALARIES
Ages <=15, >15
F_onewayResult(statistic=0.06361994154330908, pvalue=0.8013427318791513)
KruskalResult(statistic=0.07565116830994835, pvalue=0.7832796880126466)
(4392.2269220238095, 4302.318388888889)
(1753.9082864550207, 1948.2961059310712)
[Count] Younger than 16: 56, At least 16: 54

CALCULATING SECOND_HALF_SALARIES
Ages <=15, >15
F_onewayR

In [37]:
# Side projects TODO
# For each metric: mean and std of the first two groups, then a one-way ANOVA
# and a Kruskal-Wallis test across ALL groups.
# NOTE(review): only groups[0] and groups[1] are summarised, and their
# identity depends on the dict order returned by dict_to_lol, while the tests
# use every group -- label or extend the summary before drawing conclusions.
content = load_file('side_proj')

metric_groups = []
for metric in ('first_half_salaries', 'first_half_grades',
               'second_half_salaries', 'second_half_grades'):
    if metric_groups:
        print('')  # blank line between sections, none after the last
    groups = dict_to_lol(content[metric])
    metric_groups.append(groups)
    print(np.mean(groups[0]), np.mean(groups[1]))
    print(np.std(groups[0]), np.std(groups[1]))
    print(stats.f_oneway(*groups))
    print(stats.kruskal(*groups))

# Keep the short names the original cell left in the kernel namespace.
fhs, fhg, shs, shg = metric_groups

(3952.7395185185187, 5185.204311111112)
(1581.5814869176006, 2506.6551798061537)
F_onewayResult(statistic=1.0587634091330083, pvalue=0.38755765604710224)
KruskalResult(statistic=6.149723557467257, pvalue=0.2919216336846051)

(83.84009259259258, 81.16333333333334)
(6.473969368909215, 8.963631456552017)
F_onewayResult(statistic=0.9565170983552662, pvalue=0.44803479836100524)
KruskalResult(statistic=3.7482511908965583, pvalue=0.5862001268550607)

(8312.672404938272, 9008.167533333333)
(2212.9625380333537, 1856.8817586829546)
F_onewayResult(statistic=0.6554010066955458, pvalue=0.6580272706344233)
KruskalResult(statistic=3.432691802755236, pvalue=0.6335957839023879)

(83.93222222222222, 81.90777777777778)
(5.312219472138232, 8.37182060527538)
F_onewayResult(statistic=1.6103971954323175, pvalue=0.163678077618645)
KruskalResult(statistic=6.2696012299067805, pvalue=0.28086239343933733)


In [27]:
# Hackathons TODO
# For each metric: mean and std of the first two groups, then a one-way ANOVA
# and a Kruskal-Wallis test across ALL groups.
# NOTE(review): only groups[0] and groups[1] are summarised, and their
# identity depends on the dict order returned by dict_to_lol, while the tests
# use every group -- label or extend the summary before drawing conclusions.
content = load_file('num_hackathons')

metric_groups = []
for metric in ('first_half_salaries', 'first_half_grades',
               'second_half_salaries', 'second_half_grades'):
    if metric_groups:
        print('')  # blank line between sections, none after the last
    groups = dict_to_lol(content[metric])
    metric_groups.append(groups)
    print(np.mean(groups[0]), np.mean(groups[1]))
    print(np.std(groups[0]), np.std(groups[1]))
    print(stats.f_oneway(*groups))
    print(stats.kruskal(*groups))

# Keep the short names the original cell left in the kernel namespace.
fhs, fhg, shs, shg = metric_groups

(5537.157107692307, 3620.4732982456135)
(1882.6430325294552, 1359.6290448299071)
F_onewayResult(statistic=3.523430535729713, pvalue=0.017518806488534242)
KruskalResult(statistic=10.161950966399605, pvalue=0.017238487854583925)

(81.71826923076922, 82.07631578947368)
(7.143384808595196, 7.125515464660723)
F_onewayResult(statistic=0.11326726272171668, pvalue=0.9521727330904038)
KruskalResult(statistic=0.6034201066423299, pvalue=0.8956489736424222)

(11055.822387179487, 7693.5659087719305)
(3122.9386666187784, 2630.5840371388413)
F_onewayResult(statistic=4.392995396528476, pvalue=0.005875469398369296)
KruskalResult(statistic=10.735585667253625, pvalue=0.01324505781310595)

(84.04974358974358, 83.309649122807)
(6.652069591396626, 4.538179800346942)
F_onewayResult(statistic=0.9493675473510874, pvalue=0.4195786519865531)
KruskalResult(statistic=3.0762206619316683, pvalue=0.38002209300644973)


In [20]:
# Parents technical
# For each metric: mean and std of the first two groups, then a one-way ANOVA
# and a Kruskal-Wallis test across all groups.
# NOTE(review): which answer is groups[0] and which is groups[1] depends on
# the dict order returned by dict_to_lol -- the printed pairs are unlabelled.
content = load_file('parents_technical')

metric_groups = []
for metric in ('first_half_salaries', 'first_half_grades',
               'second_half_salaries', 'second_half_grades'):
    if metric_groups:
        print('')  # blank line between sections, none after the last
    groups = dict_to_lol(content[metric])
    metric_groups.append(groups)
    print(np.mean(groups[0]), np.mean(groups[1]))
    print(np.std(groups[0]), np.std(groups[1]))
    print(stats.f_oneway(*groups))
    print(stats.kruskal(*groups))

# Keep the short names the original cell left in the kernel namespace.
fhs, fhg, shs, shg = metric_groups

(4760.610567307693, 3966.183352542372)
(1946.2807229306638, 1668.2688214650123)
F_onewayResult(statistic=5.264325195780167, pvalue=0.02368278260248313)
KruskalResult(statistic=6.000394497171199, pvalue=0.014302679946007606)

(81.45649038461539, 81.34779661016948)
(7.677325885665871, 8.068495177214539)
F_onewayResult(statistic=0.005154052497167302, pvalue=0.9428992179194144)
KruskalResult(statistic=0.013970123763356521, pvalue=0.9059129312695484)

(9294.685413836476, 8480.799020903954)
(2494.7104044422586, 2972.730569932684)
F_onewayResult(statistic=2.3898903518209593, pvalue=0.12499440610039)
KruskalResult(statistic=3.2531050131119477, pvalue=0.07128829325986479)

(82.73439102564103, 81.92324858757063)
(5.965646417830094, 6.110647929556666)
F_onewayResult(statistic=0.4889934458226031, pvalue=0.4858662004008151)
KruskalResult(statistic=0.5500847681882083, pvalue=0.4582830599695533)


In [64]:
# Parents education
# Survey answers are pooled into three levels: below a bachelor's degree,
# bachelor's, and graduate/professional. For each metric the cell prints the
# per-level mean and std, then a one-way ANOVA and a Kruskal-Wallis test.
content = load_file('parents_edu')

education_levels = [
    ['Postsecondary diploma (e.g. college)', 'Some postsecondary', 'High school', 'Did not complete high school'],
    ["Bachelor's"],
    ["Master's/Professional (MD, MBA, etc)", "Doctorate"]
]

metric_groups = []
for metric in ('first_half_salaries', 'first_half_grades',
               'second_half_salaries', 'second_half_grades'):
    if metric_groups:
        print('')  # blank line between sections, none after the last
    # extract_groups prints each level and its sample count as it pools.
    groups = extract_groups(content[metric], education_levels)
    metric_groups.append(groups)
    print(np.mean(groups[0]), np.mean(groups[1]), np.mean(groups[2]))
    print(np.std(groups[0]), np.std(groups[1]), np.std(groups[2]))
    print(stats.f_oneway(*groups))
    print(stats.kruskal(*groups))

# Keep the short names the original cell left in the kernel namespace.
fhs, fhg, shs, shg = metric_groups

['Postsecondary diploma (e.g. college)', 'Some postsecondary', 'High school', 'Did not complete high school'] 17
["Bachelor's"] 46
["Master's/Professional (MD, MBA, etc)", 'Doctorate'] 48
(4479.299049019608, 4452.612265942029, 4178.923317361111)
(1830.2882535991687, 1970.2009664951063, 1713.6643670386345)
F_onewayResult(statistic=0.3095866416428993, pvalue=0.7343991657991147)
KruskalResult(statistic=0.7932489343638668, pvalue=0.6725865565378386)

['Postsecondary diploma (e.g. college)', 'Some postsecondary', 'High school', 'Did not complete high school'] 18
["Bachelor's"] 46
["Master's/Professional (MD, MBA, etc)", 'Doctorate'] 47
(79.4673611111111, 80.42891304347825, 83.08755319148936)
(9.545717679796347, 8.098610097004801, 6.530707284065745)
F_onewayResult(statistic=1.9815719068522817, pvalue=0.14283289280692127)
KruskalResult(statistic=2.5221067260048957, pvalue=0.2833553931550314)

['Postsecondary diploma (e.g. college)', 'Some postsecondary', 'High school', 'Did not complete high 

In [65]:
# Admission average
# Students are split at an admission average of 94: (0, 94] versus (94, 100].
# For each metric the cell prints the mean and std of both groups, then a
# one-way ANOVA and a Kruskal-Wallis test.
content = load_file('admission_avg')

admission_ranges = [[0, 94], [94, 100]]

metric_groups = []
for metric in ('first_half_salaries', 'first_half_grades',
               'second_half_salaries', 'second_half_grades'):
    if metric_groups:
        print('')  # blank line between sections, none after the last
    # extract_range prints each range and its sample count as it pools.
    groups = extract_range(content[metric], admission_ranges)
    metric_groups.append(groups)
    print(np.mean(groups[0]), np.mean(groups[1]))
    print(np.std(groups[0]), np.std(groups[1]))
    print(stats.f_oneway(*groups))
    print(stats.kruskal(*groups))

# Keep the short names the original cell left in the kernel namespace.
fhs, fhg, shs, shg = metric_groups

[0, 94] 72
[94, 100] 38
(4203.096323148148, 4579.904281578946)
(1873.276216211977, 1792.6590046352624)
F_onewayResult(statistic=1.0176857198121207, pvalue=0.3153234299214731)
KruskalResult(statistic=1.0115925799284102, pvalue=0.3145216082591609)

[0, 94] 73
[94, 100] 37
(80.34551369863013, 83.31675675675676)
(8.025915568465763, 7.263009829173479)
F_onewayResult(statistic=3.5183706179728587, pvalue=0.06339250374700438)
KruskalResult(statistic=2.9832679537477396, pvalue=0.08412925032984256)

[0, 94] 73
[94, 100] 38
(8396.941535616439, 9783.372378947368)
(2677.529843323969, 2800.156744533341)
F_onewayResult(statistic=6.375338741265559, pvalue=0.013010819996977728)
KruskalResult(statistic=5.07545163822339, pvalue=0.02426694937242192)

[0, 94] 73
[94, 100] 37
(81.28920091324201, 84.3439189189189)
(6.194241225643408, 5.300754298075038)
F_onewayResult(statistic=6.44326699011021, pvalue=0.012563097576486857)
KruskalResult(statistic=5.979542037880528, pvalue=0.014472759768020671)


### Gender rating
Check whether the program rating differs between male and female students.

In [24]:
# Compare program ratings between the 'Male' and 'Female' groups: print both
# sample sizes, then the one-way ANOVA and Kruskal-Wallis results.
values = defaultdict(list)  # not used anywhere in this cell
with open('../private/gender_rating.json', 'r') as f:
    content = json.loads(f.read())
    male_ratings = content['Male']
    female_ratings = content['Female']
    print(len(male_ratings), len(female_ratings))
    print(stats.f_oneway(male_ratings, female_ratings))
    print(stats.kruskal(male_ratings, female_ratings))

(93, 19)
F_onewayResult(statistic=0.14766472350605012, pvalue=0.7015189429260772)
KruskalResult(statistic=0.00029500832952058517, pvalue=0.986296362944981)


