# Correlations

This file contains all of the correlations that we want to calculate. This means that we need at least 2 columns to create a result.

Most of these correlations have to do with coop salary, or term average. We use these as metrics of student success because they're numeric, making them easier to process. Other properties are very subjective. Salary and grades are not the most indicative of how successful a student is, but with the existing data, it's the best indication we have.

In [10]:
from IPython.display import display

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math

from Bucket import Bucket
from Distribution import Distribution

pd.set_option('display.max_columns', None)
plt.style.use('ggplot')

# Show matplotlib plots in this notebook
%matplotlib inline
# Setting plot parameters
from pylab import rcParams
params = {
    'figure.figsize': (8, 8),
    'legend.fontsize': 15
}
rcParams.update(params)

In [11]:
def isnan(a):
    return a != a

In [12]:
df = pd.read_csv('../private/results-04-10.csv')
COOP = ['1', '2', '3', '4', '5', '6']
TERM = ['1a', '1b', '2a', '2b', '3a', '3b', '4a']
ROWS_DIR = '../private/rows/'
SALARY_COL = ['coop_salary_' + i + '.csv' for i in COOP]
GRADE_COL = ['term_avg_' + i + '.csv' for i in TERM]

In [31]:
def correlate_columns(df, col_name):
    # Get unique values (some rows have multiple values)
    col_values = np.array([])
    for i in df[col_name]:
        if isnan(i):
            print 'Skipping ', i
            continue
        val = str(i)
        if ',' in val:
            col_values = np.append(col_values, val.split(','))
        else:
            col_values = np.append(col_values, val)
            
    buckets = {}
    for col_val in col_values:
        salaries = {}
        grades = {}
        for i in TERM:
            grades[i] = np.array([])
        for i in COOP:
            salaries[i] = np.array([])
        
        for i in range(0, df.shape[0]):
            val = df[col_name][i]
            if isnan(val):
                continue
            if col_val not in str(val):
                continue
                
            for t in TERM:
                avg = df['term_avg_' + t][i]
                if avg == 'exchange':
                    continue
                if isnan(avg):
                    continue
                grades[t] = np.append(grades[t], avg)
                
            for c in COOP:
                salary = df['coop_salary_' + c][i]
                if type(salary) == float and math.isnan(salary):
                    continue
                if ',' in salary:
                    salary = salary.replace(',', '')
                salaries[c] = np.append(salaries[c], salary)
                
        buckets[col_val] = Bucket(col_name, col_val, 
                                 [Distribution(grades[i].astype(float)) for i in TERM],
                                 [Distribution(salaries[i].astype(float)) for i in COOP])
    return buckets

In [32]:
# From https://github.com/se2018/class-profile/tree/master/analyses
to_correlate = [
    'gender',
    'is_international',
    'parents_edu',
    'parents_technical',
    'admission_avg',
    'code_start_age',
    'fav_lang',
    'num_hackathons',
    'side_proj',
    'exercise',
    'cooking',
    'bedtime'
]

for i in to_correlate:
    correlate_columns(df, i)

Skipping  nan
Skipping  nan
Skipping  nan


KeyError: 'bedtime'