In [2]:
from Bucket import Bucket
import os
import json

%matplotlib inline

import matplotlib
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Directory holding the per-question `<name>_buckets.json` result files.
bucket_path = '../private/result-buckets/'
# JSON inputs for the grade-vs-salary and grade-vs-attendance plots.
grade_vs_salary = '../private/grade-vs-salary.json'
grade_vs_attendance = '../private/grade-vs-attendance.json'
# Term labels; looked up by position when building the attendance plot title.
TERM = ['1a', '1b', '2a', '2b', '3a', '3b', '4a']

In [4]:
def process_file(filename):
    """Read one bucket JSON file from `bucket_path` and render it.

    The name handed to `Bucket.create_from_bucket` is `filename` with the
    `_buckets.json` suffix removed; the resulting bucket is passed straight
    to `render_file`.
    """
    bucket_name = filename.replace('_buckets.json', '')
    with open(bucket_path + filename, 'r') as handle:
        raw_buckets = json.load(handle)
        bucket = Bucket.create_from_bucket(bucket_name, raw_buckets)
        render_file(bucket)

In [5]:
def render_file(b):
    """Draw one line per group of a bucket on a shared axis.

    `b` is iterated for its keys. Entries that are plain dicts carrying a
    'message' key are skipped. For every other entry, the `.mean()` of each
    element of `entry.salaries` is plotted against its 1-based position
    (presumably the co-op term number -- confirm against `Bucket`).

    The figure is created but not shown; the caller decides when to call
    `plt.show()`.
    """
    fig = plt.figure(figsize=(18, 16))

    ax1 = fig.add_subplot(211)
    cols = []
    for val in b:
        entry = b[val]
        if type(entry) == dict and 'message' in entry:
            continue
        cols.append(val)

        salaries = [s.mean() for s in entry.salaries]
        # Derive the x positions from the data instead of hard-coding six
        # points, so a group with a different number of entries still plots.
        ax1.plot(list(range(1, len(salaries) + 1)), salaries)
    ax1.legend(cols, loc='upper left')
#    plt.show()

In [6]:
# Walk every entry in the bucket directory, skipping hidden files (names
# starting with '.'). The rendering call is currently disabled, so this loop
# has no visible effect; uncomment the last line to plot each bucket file.
for i in os.listdir(bucket_path):
    if i.startswith('.'):
        continue
#    process_file(i)

In [7]:
def render_grade_vs_salary(data, term, axis='cumulative_avg'):
    """Creates a scatter plot of the data of a term.

    `axis` represents the x axis that's plotted against salary.

    data: dict whose values are lists of records; each record must provide
        `axis` and 'result_salary'.
    term: co-op term number, as an int or a string; used for the printed
        label and the plot title.

    Prints the term and the Pearson correlation coefficient between the two
    plotted series, then shows the figure.
    """
    x = []
    y = []
    for salary in data:
        for item in data[salary]:
            x.append(item[axis])
            y.append(item['result_salary'])

    # Accept ints as well as strings: plain concatenation below would raise
    # TypeError for an int.
    term_label = str(term)

    # Pick the English ordinal suffix (1st, 2nd, 3rd, 4th, 11th, ...).
    # Non-numeric labels keep the previous 'th' behaviour.
    try:
        number = int(term_label)
    except ValueError:
        suffix = 'th'
    else:
        if 10 <= number % 100 <= 20:
            suffix = 'th'
        else:
            suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(number % 10, 'th')

    plt.xlim(60, 100)
    plt.scatter(x, y)
    print('Term: ' + term_label)
    print(np.corrcoef(x, y)[0, 1])
    plt.title('Cumulative average vs Salary, ' + term_label + suffix + ' coop')
    plt.show()

In [8]:
# Load the grade-vs-salary data set and step through term numbers 1..len(data).
# The plotting call is disabled (the loop body is `pass`); restore the
# commented line in its place to draw one scatter plot per term.
with open(grade_vs_salary, 'r') as f:
    data = json.loads(f.read())
    for term in range(1, len(data)+1):
        pass
#        render_grade_vs_salary(data[str(term)], str(term))

In [9]:
from scipy.interpolate import interp1d
import math

In [10]:
def render_grade_vs_attendance(data, term):
    """Plot the distribution of grades for each attendance rating.

    For every attendance rating '1'..'4' present in `data`, grades of at
    least 60 are counted into six equal-width bins spanning 60-100, turned
    into percentages of that rating's total, smoothed with a cubic
    interpolation and drawn as one line. Ratings with fewer than 5 usable
    grades are left out of both the plot and the legend.

    data: dict mapping a rating key ('1'..'4') to a list of grades. An entry
        equal to the string 'exchange' (or None) is ignored.
    term: index into the module-level `TERM` list, used for the title.
    """
    legend = []
    legend_values = {
        '4': 'Attends almost all classes',
        '3': 'Attends most classes',
        '2': 'Sometimes attends class',
        '1': 'Rarely attends class'
    }
    n_bins = 6
    bin_width = 40.0 / n_bins
    # One sample position per bin for the interpolator; loop-invariant.
    x = list(range(60, 100, 7))
    xnew = np.linspace(63, 100, num=40)

    for i in range(1, 5):
        rating = data.get(str(i), [])
        grades = [0] * n_bins
        for g in rating:
            # Filter the non-numeric markers before any numeric comparison:
            # ordering a str against an int raises TypeError on Python 3.
            if g is None or g == 'exchange':
                continue
            value = float(g)
            if value < 60:
                continue
            # Clamp so a grade of exactly 100 falls into the last bin rather
            # than indexing one past the end of `grades`.
            index = min(int(math.floor((value - 60) / bin_width)), n_bins - 1)
            grades[index] += 1
        total = sum(grades)
        if total < 5:
            continue
        legend.append(legend_values[str(i)])
        # Convert raw counts to percentages of this rating's total.
        grades = [float(count) / total * 100 for count in grades]

        f = interp1d(x, grades, kind='cubic', fill_value='extrapolate')
        plt.plot(xnew, f(xnew))

    plt.xlim(60, 100)
    plt.ylim(0, 60)
    plt.legend(legend)
    plt.title('Attendance vs Term Average, ' + TERM[term])
    plt.show()

In [11]:
# Load the grade-vs-attendance data set and step through the term labels in
# TERM. The plotting call is disabled (the loop body is `pass`); restore the
# commented line in its place to draw one plot per term.
with open(grade_vs_attendance, 'r') as f:
    data = json.loads(f.read())
    for i, t in enumerate(TERM):
        pass
#        render_grade_vs_attendance(data[t], i)

## Performing an ANOVA of specific data sets.

In [12]:
import scipy.stats as stats
from collections import defaultdict

### Coop Correlations
- Technical background
  - Coding start age
  - Hackathons
  - Side projects
- Background
  - Parental education
  - Admission average
  
### Academic Correlations:
- Admission average
- Parental education

In [13]:
def correlate_code_start_age(content, dv):
    """Compare a dependent variable between early and late coding starters.

    content: dict mapping a dependent-variable name to a dict keyed by
        starting age as a string ('9'..'19'), each holding a list of numbers.
    dv: dependent variable, i.e. the key of `content` to analyse.

    Observations are pooled into two groups -- ages 9-15 and ages 16-19 --
    and the following are printed: one-way ANOVA, Kruskal-Wallis H-test,
    group means, group standard deviations and group sizes.
    Returns None.
    """
    shs = content[dv]

    # Age buckets that received no answers may be absent from the JSON, so
    # treat a missing key as an empty list instead of raising KeyError.
    fh = []
    for age in range(9, 16):
        fh.extend(shs.get(str(age), []))
    sh = []
    for age in range(16, 20):
        sh.extend(shs.get(str(age), []))

    print('CALCULATING ' + dv.upper())
    print('Ages <=15, >15')
    print(stats.f_oneway(fh, sh))
    print(stats.kruskal(fh, sh))
    print(np.mean(fh), np.mean(sh))
    print(np.std(fh), np.std(sh))
    print('[Count] Younger than 16: ' + str(len(fh)) + ', At least 16: ' + str(len(sh)))
    print('')

In [14]:
def extract_range(content, ranges):
    """Pool the values of `content` by numeric key range.

    content: dict whose keys are integer-like strings and whose values are
        lists.
    ranges: list of `[low, high]` pairs. A key belongs to a pair when
        `low < int(key) <= high` (lower bound exclusive, upper inclusive).

    Prints each range together with the number of pooled values, and
    returns one pooled list per range, in the order of `ranges`.
    """
    result = []
    for r in ranges:
        low, high = r[0], r[1]
        current = []

        for val in content:
            if low < int(val) <= high:
                current.extend(content[val])
        # Formatted through a single string so the line is valid, and prints
        # the same text, on both Python 2 and Python 3.
        print('{0} {1}'.format(r, len(current)))
        result.append(current)
    return result

In [15]:
def extract_groups(content, groups):
    """Pool the values of `content` by groups of key labels.

    content: dict mapping a label string to a list of values.
    groups: list of label lists. A key of `content` belongs to a group when
        the key, with surrounding whitespace stripped, is one of its labels.

    Prints each group together with the number of pooled values, and
    returns one pooled list per group, in the order of `groups`.
    """
    result = []
    for r in groups:
        current = []

        for val in content:
            if val.strip() in r:
                current.extend(content[val])
        # Formatted through a single string so the line is valid, and prints
        # the same text, on both Python 2 and Python 3.
        print('{0} {1}'.format(r, len(current)))
        result.append(current)
    return result

In [16]:
def dict_to_lol(content):
    """Return the values of `content` as a list (of lists), in the dict's
    own iteration order."""
    return [content[key] for key in content]

In [17]:
def load_file(col_name):
    """Parse and return `<bucket_path><col_name>_buckets.json`."""
    path = bucket_path + col_name + '_buckets.json'
    with open(path, 'r') as handle:
        return json.load(handle)

In [18]:
# Code start age
# Compare early (<=15) and late (>15) coding starters on grades and salaries,
# separately for the first-half and second-half measures.
content = load_file('code_start_age')
correlate_code_start_age(content, 'first_half_grades')
correlate_code_start_age(content, 'second_half_grades')

correlate_code_start_age(content, 'first_half_salaries')
correlate_code_start_age(content, 'second_half_salaries')


CALCULATING FIRST_HALF_GRADES
Ages <=15, >15
F_onewayResult(statistic=1.010900093484944, pvalue=0.31693467394518826)
KruskalResult(statistic=1.5483371696092163, pvalue=0.21338108763307484)
(82.17986842105265, 80.65575471698114)
(8.456075056164886, 7.189968404438046)
[Count] Younger than 16: 57, At least 16: 53

CALCULATING SECOND_HALF_GRADES
Ages <=15, >15
F_onewayResult(statistic=0.1914648959030756, pvalue=0.6625739819257515)
KruskalResult(statistic=0.7318345398624917, pvalue=0.392289188305433)
(82.51912280701755, 82.00761006289306)
(6.83026278969861, 5.128680460015708)
[Count] Younger than 16: 57, At least 16: 53

CALCULATING FIRST_HALF_SALARIES
Ages <=15, >15
F_onewayResult(statistic=0.06361994154330908, pvalue=0.8013427318791513)
KruskalResult(statistic=0.07565116830994835, pvalue=0.7832796880126466)
(4392.2269220238095, 4302.318388888889)
(1753.9082864550207, 1948.2961059310712)
[Count] Younger than 16: 56, At least 16: 54

CALCULATING SECOND_HALF_SALARIES
Ages <=15, >15
F_onewayR

In [24]:
# Side projects
# Each section prints group means, group standard deviations, a one-way
# ANOVA and a Kruskal-Wallis H-test for one dependent variable.
content = load_file('side_proj') 

# First-half salaries: no side-project time vs. any side-project time.
fhs = extract_groups(content['first_half_salaries'], [
    ['0h / month'],
    ['1-10h / month', '10-20h / month', '20-30h / month', '30-40h / month', '40h+ / month']
])

print(np.mean(fhs[0]), np.mean(fhs[1]))
print(np.std(fhs[0]), np.std(fhs[1]))
print(stats.f_oneway(*fhs))
print(stats.kruskal(*fhs))
print('')

# First-half grades: three groups (none, 1-10h, more than 10h).
fhg = extract_groups(content['first_half_grades'], [
    ['0h / month'],
    ['1-10h / month'],
    ['10-20h / month', '20-30h / month', '30-40h / month', '40h+ / month']
])
print(np.mean(fhg[0]), np.mean(fhg[1]), np.mean(fhg[2]))
print(np.std(fhg[0]), np.std(fhg[1]), np.std(fhg[2]))
print(stats.f_oneway(*fhg))
print(stats.kruskal(*fhg))
print('')

# Second-half salaries: same two groups as the first-half salaries.
shs = extract_groups(content['second_half_salaries'], [
    ['0h / month'],
    ['1-10h / month','10-20h / month', '20-30h / month', '30-40h / month', '40h+ / month']
])
print(np.mean(shs[0]), np.mean(shs[1]))
print(np.std(shs[0]), np.std(shs[1]))
print(stats.f_oneway(*shs))
print(stats.kruskal(*shs))
print('')

# Second-half grades: same three groups as the first-half grades.
shg = extract_groups(content['second_half_grades'], [
    ['0h / month'],
    ['1-10h / month'],
    ['10-20h / month', '20-30h / month', '30-40h / month', '40h+ / month']
])
print(np.mean(shg[0]), np.mean(shg[1]), np.mean(shg[2]))
print(np.std(shg[0]), np.std(shg[1]), np.std(shg[2]))
print(stats.f_oneway(*shg))
print(stats.kruskal(*shg))

['0h / month'] 27
['1-10h / month', '10-20h / month', '20-30h / month', '30-40h / month', '40h+ / month'] 84
(3952.7395185185187, 4462.292860714287)
(1581.5814869176006, 1907.9451359046275)
F_onewayResult(statistic=1.548979120266919, pvalue=0.2159556287795129)
KruskalResult(statistic=1.9564640245627767, pvalue=0.16189199004805502)

['0h / month'] 27
['1-10h / month'] 62
['10-20h / month', '20-30h / month', '30-40h / month', '40h+ / month'] 22
(83.84009259259258, 80.5620564516129, 80.76034090909091)
(6.473969368909215, 7.640668308627091, 9.39472317119171)
F_onewayResult(statistic=1.72090565020105, pvalue=0.18377314058825098)
KruskalResult(statistic=2.9729396182831156, pvalue=0.22616967000664423)

['0h / month'] 27
['1-10h / month', '10-20h / month', '20-30h / month', '30-40h / month', '40h+ / month'] 85
(8312.672404938272, 9041.68604980392)
(2212.9625380333537, 2923.7764185483215)
F_onewayResult(statistic=1.394807334370658, pvalue=0.2401421882949864)
KruskalResult(statistic=1.3768929187

In [23]:
# Hackathons
# Respondents are pooled into three groups by hackathon count: '0', '1-5',
# and '6-10' together with '>10'. Each section prints group means, group
# standard deviations, a one-way ANOVA and a Kruskal-Wallis H-test.
content = load_file('num_hackathons')

# First-half salaries.
fhs = extract_groups(content['first_half_salaries'], [
    ['0'],
    ['1-5'],
    ['6-10', '>10']
])
print(np.mean(fhs[0]), np.mean(fhs[1]), np.mean(fhs[2]))
print(np.std(fhs[0]), np.std(fhs[1]), np.std(fhs[2]))
print(stats.f_oneway(*fhs))
print(stats.kruskal(*fhs))
print('')

# First-half grades.
fhg = extract_groups(content['first_half_grades'], [
    ['0'],
    ['1-5'],
    ['6-10', '>10']
])
print(np.mean(fhg[0]), np.mean(fhg[1]), np.mean(fhg[2]))
print(np.std(fhg[0]), np.std(fhg[1]), np.std(fhg[2]))
print(stats.f_oneway(*fhg))
print(stats.kruskal(*fhg))
print('')

# Second-half salaries.
shs = extract_groups(content['second_half_salaries'], [
    ['0'],
    ['1-5'],
    ['6-10', '>10']
])
print(np.mean(shs[0]), np.mean(shs[1]), np.mean(shs[2]))
print(np.std(shs[0]), np.std(shs[1]), np.std(shs[2]))
print(stats.f_oneway(*shs))
print(stats.kruskal(*shs))
print('')

# Second-half grades.
shg = extract_groups(content['second_half_grades'], [
    ['0'],
    ['1-5'],
    ['6-10', '>10']
])
print(np.mean(shg[0]), np.mean(shg[1]), np.mean(shg[2]))
print(np.std(shg[0]), np.std(shg[1]), np.std(shg[2]))
print(stats.f_oneway(*shg))
print(stats.kruskal(*shg))

['0'] 19
['1-5'] 75
['6-10', '>10'] 17
(3620.4732982456135, 4252.69152088889, 5518.571209803921)
(1359.6290448299071, 1803.1864200846428, 1964.6652629771568)
F_onewayResult(statistic=5.331253533698959, pvalue=0.006193736566950103)
KruskalResult(statistic=10.156523600772944, pvalue=0.006230729861060211)

['0'] 19
['1-5'] 75
['6-10', '>10'] 17
(82.07631578947368, 81.2573, 81.26529411764706)
(7.125515464660723, 8.328196911296784, 6.548842167667825)
F_onewayResult(statistic=0.08243048041098244, pvalue=0.9209333340143617)
KruskalResult(statistic=0.14700770959234732, pvalue=0.9291325641679629)

['0'] 19
['1-5'] 76
['6-10', '>10'] 17
(7693.5659087719305, 8727.544521929825, 10794.960778431372)
(2630.5840371388413, 2596.9572242659133, 2816.624717205018)
F_onewayResult(statistic=6.355339673580858, pvalue=0.002450928956351126)
KruskalResult(statistic=10.526986990715704, pvalue=0.005177186611975562)

['0'] 19
['1-5'] 75
['6-10', '>10'] 17
(83.309649122807, 81.90248888888888, 82.9464705882353)
(4.5

In [28]:
# Parents technical
# Each dependent variable is split by the answer keys found in the file.
# NOTE(review): dict_to_lol returns groups in the dict's iteration order, so
# which answer is index 0 and which is index 1 is not visible here -- confirm
# against the JSON before interpreting the direction of a difference.
content = load_file('parents_technical')

# First-half salaries (also prints the two group sizes).
fhs = dict_to_lol(content['first_half_salaries'])
print(np.size(fhs[0]), np.size(fhs[1]))
print(np.mean(fhs[0]), np.mean(fhs[1]))
print(np.std(fhs[0]), np.std(fhs[1]))
print(stats.f_oneway(*fhs))
print(stats.kruskal(*fhs))
print('')

# First-half grades.
fhg = dict_to_lol(content['first_half_grades'])
print(np.mean(fhg[0]), np.mean(fhg[1]))
print(np.std(fhg[0]), np.std(fhg[1]))
print(stats.f_oneway(*fhg))
print(stats.kruskal(*fhg))
print('')

# Second-half salaries.
shs = dict_to_lol(content['second_half_salaries'])
print(np.mean(shs[0]), np.mean(shs[1]))
print(np.std(shs[0]), np.std(shs[1]))
print(stats.f_oneway(*shs))
print(stats.kruskal(*shs))
print('')

# Second-half grades.
shg = dict_to_lol(content['second_half_grades'])
print(np.mean(shg[0]), np.mean(shg[1]))
print(np.std(shg[0]), np.std(shg[1]))
print(stats.f_oneway(*shg))
print(stats.kruskal(*shg))

(52, 59)
(4760.610567307693, 3966.183352542372)
(1946.2807229306638, 1668.2688214650123)
F_onewayResult(statistic=5.264325195780167, pvalue=0.02368278260248313)
KruskalResult(statistic=6.000394497171199, pvalue=0.014302679946007606)

(81.45649038461539, 81.34779661016948)
(7.677325885665871, 8.068495177214539)
F_onewayResult(statistic=0.005154052497167302, pvalue=0.9428992179194144)
KruskalResult(statistic=0.013970123763356521, pvalue=0.9059129312695484)

(9294.685413836476, 8480.799020903954)
(2494.7104044422586, 2972.730569932684)
F_onewayResult(statistic=2.3898903518209593, pvalue=0.12499440610039)
KruskalResult(statistic=3.2531050131119477, pvalue=0.07128829325986479)

(82.73439102564103, 81.92324858757063)
(5.965646417830094, 6.110647929556666)
F_onewayResult(statistic=0.4889934458226031, pvalue=0.4858662004008151)
KruskalResult(statistic=0.5500847681882083, pvalue=0.4582830599695533)


In [84]:
# Parents education
# Respondents are pooled into three groups: below a bachelor's degree,
# bachelor's, and master's/professional or doctorate. Each section prints
# group means, group standard deviations, a one-way ANOVA and a
# Kruskal-Wallis H-test.
content = load_file('parents_edu')

# First-half salaries.
fhs = extract_groups(content['first_half_salaries'], [
    ['Postsecondary diploma (e.g. college)', 'Some postsecondary', 'High school', 'Did not complete high school'],
    ["Bachelor's"],
    ["Master\'s/Professional (MD, MBA, etc)", "Doctorate"]
])
print(np.mean(fhs[0]), np.mean(fhs[1]), np.mean(fhs[2]))
print(np.std(fhs[0]), np.std(fhs[1]),  np.std(fhs[2]))
print(stats.f_oneway(*fhs))
print(stats.kruskal(*fhs))
print('')

# First-half grades.
fhg = extract_groups(content['first_half_grades'], [
    ['Postsecondary diploma (e.g. college)', 'Some postsecondary', 'High school', 'Did not complete high school'],
    ["Bachelor's"],
    ["Master\'s/Professional (MD, MBA, etc)", "Doctorate"]
])
print(np.mean(fhg[0]), np.mean(fhg[1]), np.mean(fhg[2]))
print(np.std(fhg[0]), np.std(fhg[1]), np.std(fhg[2]))
print(stats.f_oneway(*fhg))
print(stats.kruskal(*fhg))
print('')

# Second-half salaries.
shs = extract_groups(content['second_half_salaries'], [
    ['Postsecondary diploma (e.g. college)', 'Some postsecondary', 'High school', 'Did not complete high school'],
    ["Bachelor's"],
    ["Master\'s/Professional (MD, MBA, etc)", "Doctorate"]
])
print(np.mean(shs[0]), np.mean(shs[1]), np.mean(shs[2]))
print(np.std(shs[0]), np.std(shs[1]), np.std(shs[2]))
print(stats.f_oneway(*shs))
print(stats.kruskal(*shs))
print('')

# Second-half grades.
shg = extract_groups(content['second_half_grades'], [
    ['Postsecondary diploma (e.g. college)', 'Some postsecondary', 'High school', 'Did not complete high school'],
    ["Bachelor's"],
    ["Master\'s/Professional (MD, MBA, etc)", "Doctorate"]
])
print(np.mean(shg[0]), np.mean(shg[1]), np.mean(shg[2]))
print(np.std(shg[0]), np.std(shg[1]), np.std(shg[2]))
print(stats.f_oneway(*shg))
print(stats.kruskal(*shg))

['Postsecondary diploma (e.g. college)', 'Some postsecondary', 'High school', 'Did not complete high school'] 17
["Bachelor's"] 46
["Master's/Professional (MD, MBA, etc)", 'Doctorate'] 48
(4479.299049019608, 4452.612265942029, 4178.923317361111)
(1830.2882535991687, 1970.2009664951063, 1713.6643670386345)
F_onewayResult(statistic=0.3095866416428993, pvalue=0.7343991657991147)
KruskalResult(statistic=0.7932489343638668, pvalue=0.6725865565378386)

['Postsecondary diploma (e.g. college)', 'Some postsecondary', 'High school', 'Did not complete high school'] 18
["Bachelor's"] 46
["Master's/Professional (MD, MBA, etc)", 'Doctorate'] 47
(79.4673611111111, 80.42891304347825, 83.08755319148936)
(9.545717679796347, 8.098610097004801, 6.530707284065745)
F_onewayResult(statistic=1.9815719068522817, pvalue=0.14283289280692127)
KruskalResult(statistic=2.5221067260048957, pvalue=0.2833553931550314)

['Postsecondary diploma (e.g. college)', 'Some postsecondary', 'High school', 'Did not complete high 

In [26]:
# Admission average
# Respondents are split at 93: keys in (0, 93] vs. keys in (93, 100]
# (extract_range treats the lower bound as exclusive and the upper bound as
# inclusive). Each section prints group means, group standard deviations, a
# one-way ANOVA and a Kruskal-Wallis H-test.
content = load_file('admission_avg')

# First-half salaries.
fhs = extract_range(content['first_half_salaries'], [[0, 93], [93, 100]])
print(np.mean(fhs[0]), np.mean(fhs[1]))
print(np.std(fhs[0]), np.std(fhs[1]))
print(stats.f_oneway(*fhs))
print(stats.kruskal(*fhs))
print('')

# First-half grades.
fhg = extract_range(content['first_half_grades'], [[0, 93], [93, 100]])
print(np.mean(fhg[0]), np.mean(fhg[1]))
print(np.std(fhg[0]), np.std(fhg[1]))
print(stats.f_oneway(*fhg))
print(stats.kruskal(*fhg))
print('')

# Second-half salaries.
shs = extract_range(content['second_half_salaries'], [[0, 93], [93, 100]])
print(np.mean(shs[0]), np.mean(shs[1]))
print(np.std(shs[0]), np.std(shs[1]))
print(stats.f_oneway(*shs))
print(stats.kruskal(*shs))
print('')

# Second-half grades.
shg = extract_range(content['second_half_grades'], [[0, 93], [93, 100]])
print(np.mean(shg[0]), np.mean(shg[1]))
print(np.std(shg[0]), np.std(shg[1]))
print(stats.f_oneway(*shg))
print(stats.kruskal(*shg))

[0, 93] 51
[93, 100] 59
(4211.434541176472, 4438.578582485876)
(1995.5814934344485, 1716.3133779664329)
F_onewayResult(statistic=0.4044197620063245, pvalue=0.5261608361410424)
KruskalResult(statistic=1.0567878800865342, pvalue=0.30394893996268757)

[0, 93] 52
[93, 100] 58
(79.05538461538461, 83.39762931034481)
(8.26628075062932, 6.94893511474733)
F_onewayResult(statistic=8.787175587911126, pvalue=0.0037332380864465563)
KruskalResult(statistic=7.552410492611803, pvalue=0.005992985418552315)

[0, 93] 52
[93, 100] 59
(7968.710664102564, 9667.32081299435)
(2740.9781890478916, 2600.253990945104)
F_onewayResult(statistic=11.008891883553293, pvalue=0.0012330560023470739)
KruskalResult(statistic=9.570687712238962, pvalue=0.0019770870620199714)

[0, 93] 52
[93, 100] 58
(80.09695512820514, 84.3068103448276)
(6.399592065334114, 5.007035837864098)
F_onewayResult(statistic=14.644049407153998, pvalue=0.00021790949363636796)
KruskalResult(statistic=12.330997418434194, pvalue=0.0004454983053349405)


In [89]:
# Software engineering rating
# Respondents are split into rating keys in (0, 4] vs. keys in (4, 5]
# (extract_range treats the lower bound as exclusive and the upper bound as
# inclusive). Each section prints group means, group standard deviations, a
# one-way ANOVA and a Kruskal-Wallis H-test.
content = load_file('soft_eng_rating')

# First-half salaries.
fhs = extract_range(content['first_half_salaries'], [[0, 4], [4, 5]])
print(np.mean(fhs[0]), np.mean(fhs[1]))
print(np.std(fhs[0]), np.std(fhs[1]))
print(stats.f_oneway(*fhs))
print(stats.kruskal(*fhs))
print('')

# First-half grades.
fhg = extract_range(content['first_half_grades'], [[0, 4], [4, 5]])
print(np.mean(fhg[0]), np.mean(fhg[1]))
print(np.std(fhg[0]), np.std(fhg[1]))
print(stats.f_oneway(*fhg))
print(stats.kruskal(*fhg))
print('')

# Second-half salaries.
shs = extract_range(content['second_half_salaries'], [[0, 4], [4, 5]])
print(np.mean(shs[0]), np.mean(shs[1]))
print(np.std(shs[0]), np.std(shs[1]))
print(stats.f_oneway(*shs))
print(stats.kruskal(*shs))
print('')

# Second-half grades.
shg = extract_range(content['second_half_grades'], [[0, 4], [4, 5]])
print(np.mean(shg[0]), np.mean(shg[1]))
print(np.std(shg[0]), np.std(shg[1]))
print(stats.f_oneway(*shg))
print(stats.kruskal(*shg))

[0, 4] 54
[4, 5] 57
(4076.068500617284, 4586.822250292397)
(1864.5370430932933, 1795.020452697656)
F_onewayResult(statistic=2.1230706402097024, pvalue=0.14797025481904466)
KruskalResult(statistic=3.029434075190794, pvalue=0.08176653150909657)

[0, 4] 54
[4, 5] 57
(81.57606481481481, 81.23070175438596)
(8.341367621482998, 7.428808636191595)
F_onewayResult(statistic=0.052226633281149015, pvalue=0.8196612910819212)
KruskalResult(statistic=0.12118002153698207, pvalue=0.727758168421494)

[0, 4] 54
[4, 5] 58
(8385.703241975307, 9313.060243103448)
(2709.558380220688, 2783.0562593794652)
F_onewayResult(statistic=3.128105926929079, pvalue=0.07972501644366575)
KruskalResult(statistic=2.1276410676716586, pvalue=0.1446632347120044)

[0, 4] 54
[4, 5] 57
(81.92256172839505, 82.66388888888889)
(6.182444719291466, 5.912535625082419)
F_onewayResult(statistic=0.4094740099987712, pvalue=0.5235793463668785)
KruskalResult(statistic=0.5659124874696142, pvalue=0.45188777217628184)


### Gender rating
Check if the program rating is similar between male and female students.

In [86]:
# NOTE(review): `values` is assigned here but never read in this cell.
values = defaultdict(list)
# Compare the ratings stored under the 'Male' and 'Female' keys: print the
# two group sizes, then a one-way ANOVA and a Kruskal-Wallis H-test.
with open('../private/gender_rating.json', 'r') as f:
    content = json.loads(f.read())
    print(len(content['Male']), len(content['Female']))
    print(stats.f_oneway(content['Male'], content['Female']))
    print(stats.kruskal(content['Male'], content['Female']))

(93, 19)
F_onewayResult(statistic=0.14766472350605012, pvalue=0.7015189429260772)
KruskalResult(statistic=0.00029500832952058517, pvalue=0.986296362944981)
