# Import Libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import altair as alt
# Select Altair's 'notebook' renderer for chart output.
alt.renderers.enable('notebook')

import matplotlib.pyplot as plt
import warnings
import folium
# Suppress every warning raised from this point on.
warnings.filterwarnings('ignore')

In [None]:
filename = '../data/processed/CriticalPath_Data_EM_Confidential_lessNoise.csv'

# Read the processed CSV and discard its 'Unnamed: 0' column
# (presumably an index saved along with the data -- TODO confirm).
data = pd.read_csv(filename)
data = data.drop(columns=['Unnamed: 0'])

# Codes 70 and 80 keep dedicated labels; every other status value found in
# the data is collapsed into "Applied".
mapper = {70: "Accepted - not Enrolled", 80: "Enrolled"}
for status in data['Admission_status'].unique():
    # setdefault leaves the 70/80 entries untouched and only adds new codes
    mapper.setdefault(status, "Applied")

# Replace the raw status codes with their human-readable labels.
data['Admission_status'] = data['Admission_status'].map(mapper)

# Evaluate Wages vs. Test Scores

## Parental Wages

In [None]:
# For every parental financial column, draw one regression plot per test
# score, with points and fits coloured by admission status.
parent_columns = ['Father_Wages', 'Mther_Wages', 'Net_worth_parents_investments',
                  'Parent_income_AGI', 'Parent_cash', 'Net_worth_parents_bus']
score_columns = ('MAXSATVerbalMath', 'ACTComposite')

for col in parent_columns:
    # Clip the y-axis to the 10th-90th percentile of the column (NaNs ignored)
    # so extreme values do not flatten the plot.
    y_low, y_high = np.nanpercentile(data[col], [10, 90])

    for score in score_columns:
        sns.lmplot(x=score, y=col, data=data, hue='Admission_status',
                   fit_reg=True, size=5, scatter_kws={"alpha":0.2})
        plt.ylim(y_low, y_high)

## Student Wages

In [None]:
# Regression plots of student financial columns against each test score,
# restricted to rows where the 'Enrolled' column equals 80.
# NOTE(review): elsewhere in this notebook 80 is an 'Admission_status' code
# (relabelled to "Enrolled"); confirm that an 'Enrolled' column holding 80
# really exists, otherwise this filter raises KeyError or selects nothing.
enrolled_rows = data[data['Enrolled']==80]
student_columns = ['Student_Wages', 'Student_income_AGI', 'Net_worth_students_investments']

for col in student_columns:
    # Axis limits come from the full column (all rows, NaNs ignored), not
    # only from the filtered subset: central 95% of the values.
    y_low, y_high = np.nanpercentile(data[col], [2.5, 97.5])

    for score in ('MAXSATVerbalMath', 'ACTComposite'):
        sns.lmplot(x=score, y=col, data=enrolled_rows, fit_reg=True, size=5)
        plt.ylim(y_low, y_high)

# Test scores by School of Science, Liberal Arts, Business

In [None]:
# Overlay the MAXSATVerbalMath distribution of each college on one figure.
f, axes = plt.subplots(figsize=(10,6))

# (college code, colour, legend label, histogram alpha) -- drawn in this order
school_styles = [
    ('SD', 'skyblue', 'School of Science', 0.5),
    ('AD', 'red', 'School of Liberal Arts', 0.4),
    ('BD', 'yellow', 'School of Business', 0.3),
]

for code, colour, school, alpha in school_styles:
    # Scores of this college only, with missing values removed
    scores = data.loc[data['CollegeCode']==code, 'MAXSATVerbalMath'].dropna()
    sns.distplot(scores, color=colour, label=school, hist_kws={"alpha":alpha})

plt.legend(loc='best');

## What percentage of students report test scores?

In [None]:
# Percentage of rows in each college that have an SAT score on record,
# shown as a horizontal bar chart.
#
# Labels are looked up from each college code instead of being listed by
# hand: Series.unique() yields codes in order of first appearance, so a fixed
# label list can attach a school name to the wrong bar, and it fails outright
# when the data does not contain exactly three codes. The code -> school
# pairing is the one used for the score-distribution plot in this notebook.
college_names = {'AD': 'School of Liberal Arts',
                 'BD': 'School of Business',
                 'SD': 'School of Science'}

# A row counts once if either SAT column is filled in. Adding the two
# non-null counts would count a student with both scores twice and could
# push the percentage above 100.
has_sat = data[['MAXSATVerbalMath', 'SAT_combined']].notna().any(axis=1)

# groupby returns the codes sorted and skips rows whose CollegeCode is
# missing (a missing code would otherwise give an empty group and a division
# by zero). The mean of a boolean Series is the fraction of True values.
pct_by_code = 100 * has_sat.groupby(data['CollegeCode']).mean()

reported = pct_by_code.tolist()
source = pd.DataFrame({
    # Unknown codes fall back to the code itself so no bar is left unlabeled
    "Department": [college_names.get(code, str(code)) for code in pct_by_code.index],
    "Percentage Reported": reported,
})

alt.Chart(source).mark_bar().encode(
    x='Percentage Reported:Q',
    y="Department:O"
).properties(height=300,width=200,title="SAT Reporting by Department")

In [None]:
# Percentage of rows in each college that have an ACT composite score on
# record, shown as a horizontal bar chart.
#
# Labels are looked up from each college code instead of being listed by
# hand: Series.unique() yields codes in order of first appearance, so a fixed
# label list can attach a school name to the wrong bar, and it fails outright
# when the data does not contain exactly three codes. The code -> school
# pairing is the one used for the score-distribution plot in this notebook.
college_names = {'AD': 'School of Liberal Arts',
                 'BD': 'School of Business',
                 'SD': 'School of Science'}

# groupby returns the codes sorted and skips rows whose CollegeCode is
# missing (a missing code would otherwise give an empty group and a division
# by zero). The mean of a boolean Series is the fraction of True values.
has_act = data['ACTComposite'].notna()
pct_by_code = 100 * has_act.groupby(data['CollegeCode']).mean()

reported = pct_by_code.tolist()
source = pd.DataFrame({
    # Unknown codes fall back to the code itself so no bar is left unlabeled
    "Department": [college_names.get(code, str(code)) for code in pct_by_code.index],
    "Percentage Reported": reported,
})

alt.Chart(source).mark_bar().encode(
    x='Percentage Reported:Q',
    y="Department:O"
).properties(height=300,width=200,title="ACT Reporting by Department")