In [7]:
from Bucket import Bucket
import os
import json

%matplotlib inline

import matplotlib
import numpy as np
import matplotlib.pyplot as plt

In [8]:
# Directory holding the '<name>_buckets.json' files. The trailing slash matters:
# file paths are built by plain string concatenation (bucket_path + filename).
bucket_path = '../private/result-buckets/'
# JSON inputs opened by the grade-vs-salary and grade-vs-attendance cells below.
grade_vs_salary = '../private/grade-vs-salary.json'
grade_vs_attendance = '../private/grade-vs-attendance.json'
# Term labels; render_grade_vs_attendance looks one up by position for its plot title.
TERM = ['1a', '1b', '2a', '2b', '3a', '3b', '4a']

In [9]:
def process_file(filename):
    """Load one bucket file from `bucket_path` and render it.

    `filename` is the name of a file inside `bucket_path`. Its
    '_buckets.json' suffix is stripped to form the name passed to
    `Bucket.create_from_bucket`, and the resulting bucket is handed to
    `render_file`.
    """
    # Parse into a separate name rather than rebinding the file handle, and
    # close the file before the rendering step runs.
    with open(bucket_path + filename, 'r') as f:
        raw_bucket = json.load(f)
    bucket = Bucket.create_from_bucket(filename.replace('_buckets.json', ''), raw_bucket)
    render_file(bucket)

In [10]:
def render_file(b):
    """Plot one line per group of bucket `b`.

    For every key `val` of `b`, the mean of each entry in `b[val].salaries`
    is plotted against its 1-based position. Entries of `b` that are dicts
    containing a 'message' key are skipped and do not appear in the legend.
    """
    fig = plt.figure(figsize=(18, 16))

    ax1 = fig.add_subplot(211)
    cols = []
    for val in b:
        group = b[val]
        # Dict entries carrying a 'message' key have no `salaries` to plot.
        if isinstance(group, dict) and 'message' in group:
            continue
        cols.append(val)

        salaries = [s.mean() for s in group.salaries]
        # Derive the x positions from the data so a series with more (or
        # fewer) than six points still plots instead of raising a shape error.
        ax1.plot(list(range(1, len(salaries) + 1)), salaries)
    ax1.legend(cols, loc='upper left')
    # plt.show() is left disabled, as in the original cell.
#    plt.show()

In [11]:
# Walk every entry in the bucket directory, skipping hidden files (names that
# start with '.'). The per-file rendering call is commented out, so this loop
# currently does nothing beyond listing the directory.
for i in os.listdir(bucket_path):
    if i.startswith('.'):
        continue
#    process_file(i)

In [12]:
def _ordinal_suffix(term):
    """Return the English ordinal suffix for `term` ('st', 'nd', 'rd' or 'th').

    Falls back to 'th' when `term` cannot be read as an integer.
    """
    try:
        number = abs(int(term))
    except (TypeError, ValueError):
        return 'th'
    # 11, 12 and 13 take 'th' even though they end in 1, 2 and 3.
    if 11 <= number % 100 <= 13:
        return 'th'
    return {1: 'st', 2: 'nd', 3: 'rd'}.get(number % 10, 'th')


def render_grade_vs_salary(data, term, axis='cumulative_avg'):
    """Creates a scatter plot of the data of a term.

    `data` maps each key to a list of records; every record must provide
    `axis` and 'result_salary'.
    `term` is the co-op number used in the printed header and the title; it
    may be an int or a string.
    `axis` represents the x axis that's plotted against salary.

    Prints the term and the correlation coefficient between the two series,
    then shows the figure.
    """
    x = []
    y = []
    for salary in data:
        for item in data[salary]:
            x.append(item[axis])
            y.append(item['result_salary'])
    plt.xlim(60, 100)
    plt.scatter(x, y)
    # str() keeps this consistent with the title, which already accepted ints.
    print('Term: ' + str(term))
    print(np.corrcoef(x, y)[0, 1])
    # Name the x axis that was actually plotted; the default keeps its
    # original wording.
    if axis == 'cumulative_avg':
        axis_label = 'Cumulative average'
    else:
        axis_label = str(axis).replace('_', ' ').capitalize()
    plt.title(axis_label + ' vs Salary, ' + str(term) + _ordinal_suffix(term) + ' coop')
    plt.show()

In [13]:
# Load the grade-vs-salary JSON. The plotting call is commented out, so the
# loop below currently does nothing for each term number.
# NOTE(review): the disabled call indexes `data` by str(term), i.e. it expects
# keys '1'..'N' -- confirm against the data file before re-enabling.
with open(grade_vs_salary, 'r') as f:
    data = json.loads(f.read())
    for term in range(1, len(data)+1):
        pass
#        render_grade_vs_salary(data[str(term)], str(term))

In [14]:
from scipy.interpolate import interp1d
import math

In [15]:
def render_grade_vs_attendance(data, term):
    """Plot the distribution of grades for each attendance rating.

    `data` maps an attendance rating ('1' to '4') to a list of grades. Entries
    equal to 'exchange' and grades below 60 are ignored. The remaining grades
    are counted into six equal-width bins over 60-100, converted to
    percentages and drawn as a cubic-interpolated curve per rating.
    Ratings with fewer than 5 usable grades are left out of the plot.

    `term` is an index into the module-level `TERM` list, used for the title.
    """
    legend = []
    legend_values = {
        '4': 'Attends almost all classes',
        '3': 'Attends most classes',
        '2': 'Sometimes attends class',
        '1': 'Rarely attends class'
    }
    num_bins = 6
    bin_width = 40.0 / num_bins
    # x position assigned to each bin when interpolating.
    # NOTE(review): these are spaced 7 apart while the bins are 40/6 wide;
    # kept as in the original -- confirm whether that is intended.
    bin_x = list(range(60, 100, 7))
    xnew = np.linspace(63, 100, num=40)

    for i in range(1, 5):
        rating = data.get(str(i), [])
        grades = [0] * num_bins
        for g in rating:
            # Check the sentinel first: ordering a str against an int raises
            # TypeError on Python 3.
            if g == 'exchange':
                continue
            grade = float(g)
            if grade < 60:
                continue
            # Map 60-100 onto bin indices 0..5. Clamp so a grade of exactly
            # 100 lands in the last bin instead of indexing past the end.
            index = min(int(math.floor((grade - 60) / bin_width)), num_bins - 1)
            grades[index] += 1
        total = sum(grades)
        if total < 5:
            continue
        legend.append(legend_values[str(i)])
        percentages = [float(count) / total * 100 for count in grades]

        # xnew extends beyond the last bin position, hence 'extrapolate'.
        f = interp1d(bin_x, percentages, kind='cubic', fill_value='extrapolate')
        plt.plot(xnew, f(xnew))

    plt.xlim(60, 100)
    plt.ylim(0, 60)
    plt.legend(legend)
    plt.title('Attendance vs Term Average, ' + TERM[term])
    plt.show()

In [16]:
# Load the grade-vs-attendance JSON. The plotting call is commented out, so
# the loop below currently does nothing for each term label.
# NOTE(review): the disabled call indexes `data` by the TERM label and passes
# the label's position as `term` -- confirm the data file is keyed that way.
with open(grade_vs_attendance, 'r') as f:
    data = json.loads(f.read())
    for i, t in enumerate(TERM):
        pass
#        render_grade_vs_attendance(data[t], i)

## Performing an ANOVA of specific data sets.

In [17]:
import scipy.stats as stats
from collections import defaultdict

In [18]:
# Example usage of one-way anova
# Each list is one group of measurements; f_oneway tests the null hypothesis
# that all groups share the same population mean.
# NOTE(review): these values look like the sample data from the SciPy
# `f_oneway` documentation -- confirm the source.
tillamook = [0.0571, 0.0813, 0.0831, 0.0976, 0.0817, 0.0859, 0.0735, 0.0659, 0.0923, 0.0836]
newport = [0.0873, 0.0662, 0.0672, 0.0819, 0.0749, 0.0649, 0.0835, 0.0725]
petersburg = [0.0974, 0.1352, 0.0817, 0.1016, 0.0968, 0.1064, 0.105]
magadan = [0.1033, 0.0915, 0.0781, 0.0685, 0.0677, 0.0697, 0.0764, 0.0689]
tvarminne = [0.0703, 0.1026, 0.0956, 0.0973, 0.1039, 0.1045]
stats.f_oneway(tillamook, newport, petersburg, magadan, tvarminne)

F_onewayResult(statistic=7.121019471642447, pvalue=0.0002812242314534544)

### Coop Correlations
- Technical background
  - Coding start age
  - Hackathons
  - Side projects
- Background
  - Parental education
  - Admission average
  
### Academic Correlations:
- SE Rating: TODO: do we really want this?
- Admission average
- Parental education

In [68]:
def correlate_code_start_age(content, dv):
    """Test whether `dv` differs between coding-start-age groups.

    dv: dependent variable. `content[dv]` must map every age from '9' to '19'
    (string keys) to a list of values.

    Ages are pooled into four groups (<=14, 15, 16, >=17) and then into two
    halves (<=15, >15). For each grouping a one-way ANOVA and a Kruskal-Wallis
    result are printed, followed by the number of samples in each half.
    """
    by_age = content[dv]

    def pooled(first_age, last_age):
        # Concatenate, in ascending age order, the samples of every age in
        # the inclusive range [first_age, last_age].
        samples = []
        for age in range(first_age, last_age + 1):
            samples.extend(by_age[str(age)])
        return samples

    up_to_14 = pooled(9, 14)
    age_15 = pooled(15, 15)
    age_16 = pooled(16, 16)
    from_17 = pooled(17, 19)
    four_groups = (up_to_14, age_15, age_16, from_17)

    print('CALCULATING ' + dv.upper())
    print('Ages <=14, 15, 16, >=17')
    print(stats.f_oneway(*four_groups))
    print(stats.kruskal(*four_groups))

    younger = up_to_14 + age_15
    older = age_16 + from_17
    print('Ages <=15, >15')
    print(stats.f_oneway(younger, older))
    print(stats.kruskal(younger, older))
    print('[Count] Younger than 16: ' + str(len(younger)) + ', At least 16: ' + str(len(older)))
    print('')

In [77]:
def dict_to_lol(content):
    """Return the values of `content` as a list, in key-iteration order.

    When the values are themselves lists this yields a list of lists, which
    can be unpacked into functions that take one sample per positional
    argument.
    """
    return [content[key] for key in content]

In [78]:
def load_file(col_name):
    """Parse and return the JSON bucket file for `col_name`.

    The file read is `bucket_path` + `col_name` + '_buckets.json'.
    """
    path = bucket_path + col_name + '_buckets.json'
    with open(path, 'r') as bucket_file:
        return json.load(bucket_file)

In [90]:
# Code start age
# Run the age-group comparison twice on the same bucket file: once for
# second-half salaries and once for second-half grades.
content = load_file('code_start_age')
correlate_code_start_age(content, 'second_half_salaries')
correlate_code_start_age(content, 'second_half_grades')

CALCULATING SECOND_HALF_SALARIES
Ages <=14, 15, 16, >=17
F_onewayResult(statistic=0.651020949293906, pvalue=0.5840682412071629)
KruskalResult(statistic=4.905301761281646, pvalue=0.17886420598961159)
Ages <=15, >15
F_onewayResult(statistic=0.1975193610367772, pvalue=0.6576367099139557)
KruskalResult(statistic=0.05603908591947665, pvalue=0.8128695485264505)
[Count] Younger than 16: 56, At least 16: 52

CALCULATING SECOND_HALF_GRADES
Ages <=14, 15, 16, >=17
F_onewayResult(statistic=0.38409784022826854, pvalue=0.7646772747357937)
KruskalResult(statistic=2.428543112121674, pvalue=0.48834344874301083)
Ages <=15, >15
F_onewayResult(statistic=0.842538187110741, pvalue=0.3607956813835519)
KruskalResult(statistic=1.6182546680236425, pvalue=0.20333532102776145)
[Count] Younger than 16: 56, At least 16: 50



In [93]:
# Side projects
# One-way ANOVA and Kruskal-Wallis across the 'side_proj' groups, first on
# second-half salaries, then on second-half grades.
content = load_file('side_proj')
shs = dict_to_lol(content['second_half_salaries'])
print(stats.f_oneway(*shs))
print(stats.kruskal(*shs))
print('')

shg = dict_to_lol(content['second_half_grades'])
print(stats.f_oneway(*shg))
print(stats.kruskal(*shg))

F_onewayResult(statistic=0.5330144898815782, pvalue=0.7508250054358696)
KruskalResult(statistic=3.564623597620794, pvalue=0.6136326097920375)

F_onewayResult(statistic=0.8508273464409961, pvalue=0.5170357273156141)
KruskalResult(statistic=2.5078206100674865, pvalue=0.7753168138567612)


In [None]:
# Hackathons
# One-way ANOVA and Kruskal-Wallis across the 'num_hackathons' groups, first
# on second-half salaries, then on second-half grades.
content = load_file('num_hackathons')
shs = dict_to_lol(content['second_half_salaries'])
print(stats.f_oneway(*shs))
print(stats.kruskal(*shs))
print('')

shg = dict_to_lol(content['second_half_grades'])
print(stats.f_oneway(*shg))
print(stats.kruskal(*shg))

In [94]:
# Parents technical
# One-way ANOVA and Kruskal-Wallis across the 'parents_technical' groups,
# first on second-half salaries, then on second-half grades.
content = load_file('parents_technical')
shs = dict_to_lol(content['second_half_salaries'])
print(stats.f_oneway(*shs))
print(stats.kruskal(*shs))
print('')

shg = dict_to_lol(content['second_half_grades'])
print(stats.f_oneway(*shg))
print(stats.kruskal(*shg))

F_onewayResult(statistic=4.779510936674631, pvalue=0.03097907828600782)
KruskalResult(statistic=6.217274498136696, pvalue=0.012650971561287876)

F_onewayResult(statistic=0.06210205830644868, pvalue=0.8036907080493545)
KruskalResult(statistic=0.0035186391638502056, pvalue=0.9526987301729297)


In [95]:
# Parents education
# One-way ANOVA and Kruskal-Wallis across the 'parents_edu' groups, first on
# second-half salaries, then on second-half grades.
content = load_file('parents_edu')
shs = dict_to_lol(content['second_half_salaries'])
print(stats.f_oneway(*shs))
print(stats.kruskal(*shs))
print('')

shg = dict_to_lol(content['second_half_grades'])
print(stats.f_oneway(*shg))
print(stats.kruskal(*shg))

F_onewayResult(statistic=0.7466149229842779, pvalue=0.6134498037919203)
KruskalResult(statistic=5.183134960813293, pvalue=0.5205485688773377)

F_onewayResult(statistic=2.879294386152679, pvalue=0.012482010110334469)
KruskalResult(statistic=12.57935218028971, pvalue=0.050224039893708994)


In [103]:
# Admission average
# One-way ANOVA and Kruskal-Wallis across the 'admission_avg' groups, first
# on second-half salaries, then on second-half grades.
content = load_file('admission_avg')
shs = dict_to_lol(content['second_half_salaries'])
print(stats.f_oneway(*shs))
print(stats.kruskal(*shs))
print('')

shg = dict_to_lol(content['second_half_grades'])
print(stats.f_oneway(*shg))
print(stats.kruskal(*shg))

F_onewayResult(statistic=1.6544081093108292, pvalue=0.07916474561227647)
KruskalResult(statistic=22.341168008322903, pvalue=0.07187076009913813)

F_onewayResult(statistic=2.34363464759891, pvalue=0.008148324937284606)
KruskalResult(statistic=21.26712007544074, pvalue=0.09498055460336366)


### Gender rating
Check if the program rating is similar between male and female students

In [23]:
# NOTE(review): `values` is not used anywhere in this cell -- confirm nothing
# later depends on it before removing.
values = defaultdict(list)
# Compare the 'Male' and 'Female' lists from the gender rating file: print the
# two sample sizes, then the one-way ANOVA and Kruskal-Wallis results.
with open('../private/gender_rating.json', 'r') as f:
    content = json.loads(f.read())
    print(len(content['Male']), len(content['Female']))
    print(stats.f_oneway(content['Male'], content['Female']))
    print(stats.kruskal(content['Male'], content['Female']))

(93, 19)
F_onewayResult(statistic=0.14766472350605012, pvalue=0.7015189429260772)
KruskalResult(statistic=0.00029500832952058517, pvalue=0.986296362944981)
