In [7]:
from Bucket import Bucket
import os
import json

%matplotlib inline

import matplotlib
import numpy as np
import matplotlib.pyplot as plt

In [8]:
# Directory holding the per-column '<name>_buckets.json' files.  The trailing
# slash matters: file names are appended to it by plain string concatenation.
bucket_path = '../private/result-buckets/'
# JSON inputs for the grade-vs-salary and grade-vs-attendance plots.
grade_vs_salary = '../private/grade-vs-salary.json'
grade_vs_attendance = '../private/grade-vs-attendance.json'
# Term labels; the attendance plot looks its title label up here by index.
TERM = ['1a', '1b', '2a', '2b', '3a', '3b', '4a']

In [9]:
def process_file(filename):
    """Load one bucket file from `bucket_path` and render it.

    `filename` is the bare file name (e.g. 'side_proj_buckets.json').  The
    '_buckets.json' suffix is stripped to obtain the name handed to
    `Bucket.create_from_bucket`, and the resulting bucket is passed to
    `render_file`.
    """
    bucket_name = filename.replace('_buckets.json', '')
    with open(bucket_path + filename, 'r') as handle:
        payload = json.load(handle)
        bucket = Bucket.create_from_bucket(bucket_name, payload)
        render_file(bucket)

In [10]:
def render_file(b):
    """Plot the mean salary per term for every group in a bucket.

    `b` is iterated to obtain group names, and `b[name]` is expected to expose
    a `salaries` sequence whose items have a `.mean()` method (one item per
    term).  Entries that are a dict containing a 'message' key are skipped
    and left out of the legend.

    The x positions are numbered 1..N from the number of terms actually
    present, so adding a term to the data does not require editing this
    function.
    """
    fig = plt.figure(figsize=(18, 16))

    ax1 = fig.add_subplot(211)
    cols = []
    for val in b:
        group = b[val]
        # NOTE(review): a dict with a 'message' key looks like a placeholder
        # for a group that has no data to plot -- confirm against Bucket.
        if isinstance(group, dict) and 'message' in group:
            continue
        cols.append(val)

        salaries = [s.mean() for s in group.salaries]
        # Derive the term numbers from the data instead of hard-coding 1..6.
        ax1.plot(list(range(1, len(salaries) + 1)), salaries)
    ax1.legend(cols, loc='upper left')
#    plt.show()

In [11]:
# Walk the bucket directory, skipping hidden entries (names starting with '.').
# The per-file render call below is commented out, so this cell currently
# produces no plots.
for i in os.listdir(bucket_path):
    if i.startswith('.'):
        continue
#    process_file(i)

In [12]:
def _ordinal(n):
    """Return `n` with its English ordinal suffix (1 -> '1st', 12 -> '12th')."""
    try:
        number = int(n)
    except (TypeError, ValueError):
        # Non-numeric labels keep the plain 'th' suffix.
        return str(n) + 'th'
    if 10 <= abs(number) % 100 <= 20:
        suffix = 'th'
    else:
        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(abs(number) % 10, 'th')
    return str(number) + suffix


def render_grade_vs_salary(data, term, axis='cumulative_avg'):
    """Creates a scatter plot of the data of a term.

    `data` is a mapping whose values are lists of records; each record must
    provide `axis` and 'result_salary'.  All records are pooled into a single
    scatter plot and the Pearson correlation between the two columns is
    printed.

    `term` is the co-op term number (int or numeric string); it is used in the
    printed header and, with the correct ordinal suffix, in the title.

    `axis` represents the x axis that's plotted against salary.
    """
    x = []
    y = []
    for salary in data:
        for item in data[salary]:
            x.append(item[axis])
            y.append(item['result_salary'])
    # Keep the original wording for the default axis; otherwise name the
    # column that is actually plotted.
    x_label = 'Cumulative average' if axis == 'cumulative_avg' else axis
    plt.xlim(60, 100)
    plt.scatter(x, y)
    plt.xlabel(x_label)
    plt.ylabel('Salary')
    print('Term: ' + str(term))
    print(np.corrcoef(x, y)[0, 1])
    plt.title(x_label + ' vs Salary, ' + _ordinal(term) + ' coop')
    plt.show()

In [13]:
# Load the grade-vs-salary JSON into `data`.  The loop counts 1..len(data) and
# the (commented-out) render call looks each count up as a string key; with
# the call disabled, only the load has any effect.
with open(grade_vs_salary, 'r') as f:
    data = json.loads(f.read())
    for term in range(1, len(data)+1):
        pass
#        render_grade_vs_salary(data[str(term)], str(term))

In [14]:
from scipy.interpolate import interp1d
import math

In [15]:
def render_grade_vs_attendance(data, term):
    """Plot the grade distribution for each attendance rating of one term.

    `data` maps an attendance rating ('1'..'4') to a list of grades.  Grades
    from 60 to 100 are counted into six equal-width bins, converted to
    percentages of the rating's total, and drawn as a cubic-interpolated
    curve.  Ratings with fewer than five usable grades are left out.

    Entries that cannot be read as a number (such as 'exchange') and grades
    below 60 are ignored.  A grade of exactly 100 (or above) is counted in
    the top bin instead of overflowing the bin list.

    `term` is an index into the module-level `TERM` list, used for the title.
    """
    legend_values = {
        '4': 'Attends almost all classes',
        '3': 'Attends most classes',
        '2': 'Sometimes attends class',
        '1': 'Rarely attends class'
    }
    low = 60.0
    n_bins = 6
    bin_width = 40.0 / n_bins

    # NOTE(review): the curve is anchored every 7 points from 60 although the
    # bins are 40/6 wide; kept so existing plots do not change.
    x = list(range(60, 100, 7))
    xnew = np.linspace(63, 100, num=40)

    legend = []
    for i in range(1, 5):
        rating = data.get(str(i), [])
        grades = [0] * n_bins
        for g in rating:
            # Convert first so text markers are skipped before any numeric
            # comparison (comparing str with int raises on Python 3).
            try:
                value = float(g)
            except (TypeError, ValueError):
                continue
            # Written as "not >=" so NaN is skipped together with low grades.
            if not value >= low:
                continue
            # Normalize to a bin index in 0..5; clamp so 100 lands in the top bin.
            index = min(int(math.floor((value - low) / bin_width)), n_bins - 1)
            grades[index] += 1
        total = sum(grades)
        if total < 5:
            continue
        legend.append(legend_values[str(i)])
        percentages = [float(count) / total * 100 for count in grades]

        f = interp1d(x, percentages, kind='cubic', fill_value='extrapolate')
        plt.plot(xnew, f(xnew))

    plt.xlim(60, 100)
    plt.ylim(0, 60)
    plt.legend(legend)
    plt.title('Attendance vs Term Average, ' + TERM[term])
    plt.show()

In [16]:
# Load the grade-vs-attendance JSON into `data`.  The loop pairs each TERM
# label with its index for the (commented-out) render call; with the call
# disabled, only the load has any effect.
with open(grade_vs_attendance, 'r') as f:
    data = json.loads(f.read())
    for i, t in enumerate(TERM):
        pass
#        render_grade_vs_attendance(data[t], i)

## Performing an ANOVA of specific data sets.

In [17]:
import scipy.stats as stats
from collections import defaultdict

In [18]:
# Example usage of one-way anova
# Five independent samples of unequal size are passed as separate positional
# arguments; f_oneway returns the F statistic and its p-value.
tillamook = [0.0571, 0.0813, 0.0831, 0.0976, 0.0817, 0.0859, 0.0735, 0.0659, 0.0923, 0.0836]
newport = [0.0873, 0.0662, 0.0672, 0.0819, 0.0749, 0.0649, 0.0835, 0.0725]
petersburg = [0.0974, 0.1352, 0.0817, 0.1016, 0.0968, 0.1064, 0.105]
magadan = [0.1033, 0.0915, 0.0781, 0.0685, 0.0677, 0.0697, 0.0764, 0.0689]
tvarminne = [0.0703, 0.1026, 0.0956, 0.0973, 0.1039, 0.1045]
stats.f_oneway(tillamook, newport, petersburg, magadan, tvarminne)

F_onewayResult(statistic=7.121019471642447, pvalue=0.0002812242314534544)

### Coop Correlations
- Technical background
  - Coding start age
  - Hackathons
  - Side projects
- Background
  - Parental education
  - Admission average
  
### Academic Correlations:
- SE Rating: TODO: do we really want this?
- Admission average
- Parental education

In [130]:
def correlate_code_start_age(content, dv):
    """Compare a dependent variable between early and late coding starters.

    `content[dv]` must map every age from '9' to '19' (as strings) to a list
    of observations.  Ages 9-15 are pooled into one group and ages 16-19 into
    the other; a one-way ANOVA, a Kruskal-Wallis test, the group means, the
    group standard deviations and the group sizes are printed.

    dv: dependent variable.
    """
    by_age = content[dv]

    def pooled(ages):
        # Concatenate the observations of the given ages, in age order.
        merged = []
        for age in ages:
            merged.extend(by_age[str(age)])
        return merged

    younger = pooled(range(9, 16))
    older = pooled(range(16, 20))

    print('CALCULATING ' + dv.upper())
    print('Ages <=15, >15')
    print(stats.f_oneway(younger, older))
    print(stats.kruskal(younger, older))
    print(np.mean(younger), np.mean(older))
    print(np.std(younger), np.std(older))
    print('[Count] Younger than 16: ' + str(len(younger)) + ', At least 16: ' + str(len(older)))
    print('')

In [126]:
def extract_range(content, ranges):
    """Pool the values of `content` whose keys fall inside each range.

    `content` maps keys convertible with int() to lists of observations.
    `ranges` is a sequence of [low, high] pairs; a key belongs to a range when
    low <= int(key) < high (the upper bound is exclusive).

    Returns one pooled list per range, in the order of `ranges`, and prints
    each range followed by the size of its pooled list.
    """
    result = []
    for r in ranges:
        low, high = r[0], r[1]
        current = []

        for val in content:
            if low <= int(val) < high:
                current.extend(content[val])
        # Formatted explicitly so the line reads '[0, 94] 51' on both
        # Python 2 and Python 3 (the bare print statement only parses on 2).
        print('%s %d' % (r, len(current)))
        result.append(current)
    return result

In [127]:
def dict_to_lol(content):
    """Return the values of `content` as a list of lists ("lol").

    Values appear in the mapping's own iteration order; the keys are dropped.
    """
    return [content[key] for key in content]

In [128]:
def load_file(col_name):
    """Read and parse '<col_name>_buckets.json' from `bucket_path`.

    Returns the decoded JSON document.
    """
    path = bucket_path + col_name + '_buckets.json'
    with open(path, 'r') as handle:
        return json.load(handle)

In [132]:
# Code start age
# Run the two-group comparison for each of the four dependent variables
# stored in the 'code_start_age' bucket file: grades first, then salaries.
content = load_file('code_start_age')
correlate_code_start_age(content, 'first_half_grades')
correlate_code_start_age(content, 'second_half_grades')

correlate_code_start_age(content, 'first_half_salaries')
correlate_code_start_age(content, 'second_half_salaries')


CALCULATING FIRST_HALF_GRADES
Ages <=15, >15
F_onewayResult(statistic=1.010900093484944, pvalue=0.31693467394518826)
KruskalResult(statistic=1.5483371696092163, pvalue=0.21338108763307484)
(82.17986842105265, 80.65575471698114)
(8.456075056164886, 7.189968404438046)
[Count] Younger than 16: 57, At least 16: 53

CALCULATING SECOND_HALF_GRADES
Ages <=15, >15
F_onewayResult(statistic=0.1914648959030756, pvalue=0.6625739819257515)
KruskalResult(statistic=0.7318345398624917, pvalue=0.392289188305433)
(82.51912280701755, 82.00761006289306)
(6.83026278969861, 5.128680460015708)
[Count] Younger than 16: 57, At least 16: 53

CALCULATING FIRST_HALF_SALARIES
Ages <=15, >15
F_onewayResult(statistic=0.08568229859946307, pvalue=0.7703013445704169)
KruskalResult(statistic=0.22596240651787708, pvalue=0.6345339402758079)
(3986.7442261904766, 3906.2435802469136)
(1264.1047995872339, 1581.5371290516941)
[Count] Younger than 16: 56, At least 16: 54

CALCULATING SECOND_HALF_SALARIES
Ages <=15, >15
F_oneway

In [119]:
# Side projects
# For each dependent variable: print mean and std of the first two groups,
# then a one-way ANOVA and a Kruskal-Wallis test across ALL groups.
# NOTE(review): dict_to_lol returns groups in dict iteration order, so
# indices 0 and 1 are not tied to particular bucket keys.
content = load_file('side_proj')

# First-half salaries
fhs = dict_to_lol(content['first_half_salaries'])
print(np.mean(fhs[0]), np.mean(fhs[1]))
print(np.std(fhs[0]), np.std(fhs[1]))
print(stats.f_oneway(*fhs))
print(stats.kruskal(*fhs))
print('')

# First-half grades
fhg = dict_to_lol(content['first_half_grades'])
print(np.mean(fhg[0]), np.mean(fhg[1]))
print(np.std(fhg[0]), np.std(fhg[1]))
print(stats.f_oneway(*fhg))
print(stats.kruskal(*fhg))
print('')

# Second-half salaries
shs = dict_to_lol(content['second_half_salaries'])
print(np.mean(shs[0]), np.mean(shs[1]))
print(np.std(shs[0]), np.std(shs[1]))
print(stats.f_oneway(*shs))
print(stats.kruskal(*shs))
print('')

# Second-half grades
shg = dict_to_lol(content['second_half_grades'])
print(np.mean(shg[0]), np.mean(shg[1]))
print(np.std(shg[0]), np.std(shg[1]))
print(stats.f_oneway(*shg))
print(stats.kruskal(*shg))

(3615.7308641975314, 4610.277777777778)
(1185.6037611056288, 1693.9948857182262)
F_onewayResult(statistic=0.9415827703065925, pvalue=0.4573854860846802)
KruskalResult(statistic=6.230421753546121, pvalue=0.2844392104471866)

(83.84009259259258, 81.16333333333334)
(6.473969368909215, 8.963631456552017)
F_onewayResult(statistic=0.9565170983552662, pvalue=0.44803479836100524)
KruskalResult(statistic=3.7482511908965583, pvalue=0.5862001268550607)

(6870.883950617284, 7858.888888888888)
(1681.994175901661, 1188.5762828892864)
F_onewayResult(statistic=0.5085365613663394, pvalue=0.769255679878339)
KruskalResult(statistic=3.013832504628562, pvalue=0.6978531470620631)

(83.93222222222222, 81.90777777777778)
(5.312219472138232, 8.37182060527538)
F_onewayResult(statistic=1.6103971954323175, pvalue=0.163678077618645)
KruskalResult(statistic=6.2696012299067805, pvalue=0.28086239343933733)


In [120]:
# Hackathons
# For each dependent variable: print mean and std of the first two groups,
# then a one-way ANOVA and a Kruskal-Wallis test across ALL groups.
# NOTE(review): dict_to_lol returns groups in dict iteration order, so
# indices 0 and 1 are not tied to particular bucket keys.
content = load_file('num_hackathons')

# First-half salaries
fhs = dict_to_lol(content['first_half_salaries'])
print(np.mean(fhs[0]), np.mean(fhs[1]))
print(np.std(fhs[0]), np.std(fhs[1]))
print(stats.f_oneway(*fhs))
print(stats.kruskal(*fhs))
print('')

# First-half grades
fhg = dict_to_lol(content['first_half_grades'])
print(np.mean(fhg[0]), np.mean(fhg[1]))
print(np.std(fhg[0]), np.std(fhg[1]))
print(stats.f_oneway(*fhg))
print(stats.kruskal(*fhg))
print('')

# Second-half salaries
shs = dict_to_lol(content['second_half_salaries'])
print(np.mean(shs[0]), np.mean(shs[1]))
print(np.std(shs[0]), np.std(shs[1]))
print(stats.f_oneway(*shs))
print(stats.kruskal(*shs))
print('')

# Second-half grades
shg = dict_to_lol(content['second_half_grades'])
print(np.mean(shg[0]), np.mean(shg[1]))
print(np.std(shg[0]), np.std(shg[1]))
print(stats.f_oneway(*shg))
print(stats.kruskal(*shg))

(4632.01282051282, 3373.0957894736844)
(1296.419765006918, 957.5526735319156)
F_onewayResult(statistic=2.506760798498526, pvalue=0.06288672821394259)
KruskalResult(statistic=8.783849143796026, pvalue=0.03230714918406985)

(81.71826923076922, 82.07631578947368)
(7.143384808595196, 7.125515464660723)
F_onewayResult(statistic=0.11326726272171668, pvalue=0.9521727330904038)
KruskalResult(statistic=0.6034201066423299, pvalue=0.8956489736424222)

(8795.205128205127, 6381.473684210527)
(2171.952977322529, 1766.0317936240856)
F_onewayResult(statistic=4.283424704037926, pvalue=0.006738054837611619)
KruskalResult(statistic=11.005831545891786, pvalue=0.011694383960050781)

(84.04974358974358, 83.309649122807)
(6.652069591396626, 4.538179800346942)
F_onewayResult(statistic=0.9493675473510874, pvalue=0.4195786519865531)
KruskalResult(statistic=3.0762206619316683, pvalue=0.38002209300644973)


In [121]:
# Parents technical
# For each dependent variable: print mean and std of the first two groups,
# then a one-way ANOVA and a Kruskal-Wallis test across ALL groups.
# NOTE(review): dict_to_lol returns groups in dict iteration order, so
# indices 0 and 1 are not tied to particular bucket keys.
content = load_file('parents_technical')

# First-half salaries
fhs = dict_to_lol(content['first_half_salaries'])
print(np.mean(fhs[0]), np.mean(fhs[1]))
print(np.std(fhs[0]), np.std(fhs[1]))
print(stats.f_oneway(*fhs))
print(stats.kruskal(*fhs))
print('')

# First-half grades
fhg = dict_to_lol(content['first_half_grades'])
print(np.mean(fhg[0]), np.mean(fhg[1]))
print(np.std(fhg[0]), np.std(fhg[1]))
print(stats.f_oneway(*fhg))
print(stats.kruskal(*fhg))
print('')

# Second-half salaries
shs = dict_to_lol(content['second_half_salaries'])
print(np.mean(shs[0]), np.mean(shs[1]))
print(np.std(shs[0]), np.std(shs[1]))
print(stats.f_oneway(*shs))
print(stats.kruskal(*shs))
print('')

# Second-half grades
shg = dict_to_lol(content['second_half_grades'])
print(np.mean(shg[0]), np.mean(shg[1]))
print(np.std(shg[0]), np.std(shg[1]))
print(stats.f_oneway(*shg))
print(stats.kruskal(*shg))

(4255.979679487179, 3663.568700564971)
(1502.0227964014102, 1289.9550198988884)
F_onewayResult(statistic=4.9065507267591055, pvalue=0.028834944994848576)
KruskalResult(statistic=6.6542809920597525, pvalue=0.009891787762800896)

(81.45649038461539, 81.34779661016948)
(7.677325885665871, 8.068495177214539)
F_onewayResult(statistic=0.005154052497167302, pvalue=0.9428992179194144)
KruskalResult(statistic=0.013970123763356521, pvalue=0.9059129312695484)

(7484.114339622641, 6917.788700564972)
(1911.8336862420126, 2097.141312020768)
F_onewayResult(statistic=2.173413609905968, pvalue=0.1432704639811906)
KruskalResult(statistic=2.384952059240671, pvalue=0.12250865064461987)

(82.73439102564103, 81.92324858757063)
(5.965646417830094, 6.110647929556666)
F_onewayResult(statistic=0.4889934458226031, pvalue=0.4858662004008151)
KruskalResult(statistic=0.5500847681882083, pvalue=0.4582830599695533)


In [122]:
# Parents education
# For each dependent variable: print mean and std of the first two groups,
# then a one-way ANOVA and a Kruskal-Wallis test across ALL groups.
# NOTE(review): dict_to_lol returns groups in dict iteration order, so
# indices 0 and 1 are not tied to particular bucket keys.
content = load_file('parents_edu')

# First-half salaries
fhs = dict_to_lol(content['first_half_salaries'])
print(np.mean(fhs[0]), np.mean(fhs[1]))
print(np.std(fhs[0]), np.std(fhs[1]))
print(stats.f_oneway(*fhs))
print(stats.kruskal(*fhs))
print('')

# First-half grades
fhg = dict_to_lol(content['first_half_grades'])
print(np.mean(fhg[0]), np.mean(fhg[1]))
print(np.std(fhg[0]), np.std(fhg[1]))
print(stats.f_oneway(*fhg))
print(stats.kruskal(*fhg))
print('')

# Second-half salaries (unlike the other sections, no mean/std is printed here)
shs = dict_to_lol(content['second_half_salaries'])
print(stats.f_oneway(*shs))
print(stats.kruskal(*shs))
print('')

# Second-half grades
shg = dict_to_lol(content['second_half_grades'])
print(np.mean(shg[0]), np.mean(shg[1]))
print(np.std(shg[0]), np.std(shg[1]))
print(stats.f_oneway(*shg))
print(stats.kruskal(*shg))

(4049.8804347826085, 3383.3333333333335)
(1588.0441707460125, 1183.3333333333335)
F_onewayResult(statistic=0.5863607672541553, pvalue=0.7405205291347083)
KruskalResult(statistic=3.62106117635981, pvalue=0.7277995425334841)

(80.42891304347825, 73.39)
(8.098610097004801, 2.3900000000000006)
F_onewayResult(statistic=2.610725163143527, pvalue=0.021318894194874562)
KruskalResult(statistic=12.483238262698958, pvalue=0.05201687058114773)

F_onewayResult(statistic=0.6902055731622354, pvalue=0.6579686693497949)
KruskalResult(statistic=4.8895293125454105, pvalue=0.5580580444365144)

(82.67916666666666, 78.63333333333333)
(5.2938403358249975, 2.633333333333333)
F_onewayResult(statistic=3.1048017392748046, pvalue=0.00773600821736511)
KruskalResult(statistic=12.343921851567439, pvalue=0.054721685232419125)


In [123]:
# Admission average (one group per admission-average bucket)
# NOTE(review): this cell was previously labelled "SE friendships", but the
# file it loads is 'admission_avg'.
# For each dependent variable: print mean and std of the first two groups,
# then a one-way ANOVA and a Kruskal-Wallis test across ALL groups.
# dict_to_lol returns groups in dict iteration order, so indices 0 and 1 are
# not tied to particular bucket keys.
content = load_file('admission_avg')

# First-half salaries
fhs = dict_to_lol(content['first_half_salaries'])
print(np.mean(fhs[0]), np.mean(fhs[1]))
print(np.std(fhs[0]), np.std(fhs[1]))
print(stats.f_oneway(*fhs))
print(stats.kruskal(*fhs))
print('')

# First-half grades
fhg = dict_to_lol(content['first_half_grades'])
print(np.mean(fhg[0]), np.mean(fhg[1]))
print(np.std(fhg[0]), np.std(fhg[1]))
print(stats.f_oneway(*fhg))
print(stats.kruskal(*fhg))
print('')

# Second-half salaries
shs = dict_to_lol(content['second_half_salaries'])
print(np.mean(shs[0]), np.mean(shs[1]))
print(np.std(shs[0]), np.std(shs[1]))
print(stats.f_oneway(*shs))
print(stats.kruskal(*shs))
print('')

# Second-half grades
shg = dict_to_lol(content['second_half_grades'])
print(np.mean(shg[0]), np.mean(shg[1]))
print(np.std(shg[0]), np.std(shg[1]))
print(stats.f_oneway(*shg))
print(stats.kruskal(*shg))

(6332.5, 3691.256875)
(1364.166666666667, 1181.0246381352913)
F_onewayResult(statistic=1.0407788593029401, pvalue=0.42102765889064725)
KruskalResult(statistic=13.672569746301434, pvalue=0.4743788298160553)

(90.85, 79.74049999999998)
(5.850000000000001, 6.736065505916639)
F_onewayResult(statistic=2.398202263158802, pvalue=0.006516373416096676)
KruskalResult(statistic=21.34227787865015, pvalue=0.09317740838261462)

(9753.333333333332, 7233.958333333333)
(123.33333333333303, 1819.7620680668667)
F_onewayResult(statistic=1.7543616464569423, pvalue=0.05720677098173708)
KruskalResult(statistic=23.475235047677188, pvalue=0.05296296798875364)

(87.53333333333333, 82.56922222222221)
(5.533333333333331, 5.151097361212595)
F_onewayResult(statistic=2.0390032764310417, pvalue=0.02246902718186436)
KruskalResult(statistic=20.170370352257944, pvalue=0.12486089086195323)


In [124]:
# Admission average, split into two groups at 94: keys in [0, 94) versus
# keys in [94, 100).  extract_range treats the upper bound as exclusive, so a
# key of exactly 100 would land in neither group.
# Index 0 is the lower range and index 1 the upper range (order of `ranges`).
content = load_file('admission_avg')

# First-half salaries
fhs = extract_range(content['first_half_salaries'], [[0, 94], [94, 100]])
print(np.mean(fhs[0]), np.mean(fhs[1]))
print(np.std(fhs[0]), np.std(fhs[1]))
print(stats.f_oneway(*fhs))
print(stats.kruskal(*fhs))
print('')

# First-half grades
fhg = extract_range(content['first_half_grades'], [[0, 94], [94, 100]])
print(np.mean(fhg[0]), np.mean(fhg[1]))
print(np.std(fhg[0]), np.std(fhg[1]))
print(stats.f_oneway(*fhg))
print(stats.kruskal(*fhg))
print('')

# Second-half salaries
shs = extract_range(content['second_half_salaries'], [[0, 94], [94, 100]])
print(np.mean(shs[0]), np.mean(shs[1]))
print(np.std(shs[0]), np.std(shs[1]))
print(stats.f_oneway(*shs))
print(stats.kruskal(*shs))
print('')

# Second-half grades
shg = extract_range(content['second_half_grades'], [[0, 94], [94, 100]])
print(np.mean(shg[0]), np.mean(shg[1]))
print(np.std(shg[0]), np.std(shg[1]))
print(stats.f_oneway(*shg))
print(stats.kruskal(*shg))

[0, 94] 51
[94, 100] 59
(3883.749673202615, 3987.857570621469)
(1632.6845064020872, 1227.6354363618036)
F_onewayResult(statistic=0.14239515416614343, pvalue=0.7066519289139976)
KruskalResult(statistic=0.797689857242885, pvalue=0.37178496296839825)

[0, 94] 52
[94, 100] 58
(79.05538461538461, 83.39762931034481)
(8.26628075062932, 6.94893511474733)
F_onewayResult(statistic=8.787175587911126, pvalue=0.0037332380864465563)
KruskalResult(statistic=7.552410492611803, pvalue=0.005992985418552315)

[0, 94] 52
[94, 100] 59
(6580.73576923077, 7734.169491525424)
(2119.7696416118415, 1798.8869297323522)
F_onewayResult(statistic=9.440237535514806, pvalue=0.0026814510824896094)
KruskalResult(statistic=10.27828584818024, pvalue=0.0013460494212840111)

[0, 94] 52
[94, 100] 58
(80.09695512820514, 84.3068103448276)
(6.399592065334114, 5.007035837864098)
F_onewayResult(statistic=14.644049407153998, pvalue=0.00021790949363636796)
KruskalResult(statistic=12.330997418434194, pvalue=0.0004454983053349405)


### Gender rating
Check whether the program rating is similar between male and female students.

In [23]:
# Compare the 'Male' and 'Female' rating lists: print the two group sizes,
# then a one-way ANOVA and a Kruskal-Wallis test between them.
# NOTE(review): `values` is assigned but not used anywhere in this cell.
values = defaultdict(list)
with open('../private/gender_rating.json', 'r') as f:
    content = json.loads(f.read())
    print(len(content['Male']), len(content['Female']))
    print(stats.f_oneway(content['Male'], content['Female']))
    print(stats.kruskal(content['Male'], content['Female']))

(93, 19)
F_onewayResult(statistic=0.14766472350605012, pvalue=0.7015189429260772)
KruskalResult(statistic=0.00029500832952058517, pvalue=0.986296362944981)
