### Test Score Analysis

This notebook takes an in-depth look at students' test scores to see whether they reveal any useful patterns.

Import necessary libraries.

In [None]:
# Numerical and tabular data handling.
import numpy as np
import pandas as pd
# Plotting libraries.
import seaborn as sns
import altair as alt
# Select Altair's 'notebook' renderer for displaying charts.
alt.renderers.enable('notebook')

import matplotlib.pyplot as plt
import warnings
import folium
# Silence every warning for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings('ignore')

import sys
# Put the project's visualization source directory first on the import path so the
# local `visualize` module can be imported on the next line.
sys.path.insert(0, '../src/visualization/')
import visualize as vis

Read in the .csv file as a DataFrame.

In [None]:
# Location of the processed dataset, relative to this notebook.
filename = '../data/processed/CriticalPath_Data_EM_Confidential_lessNoise.csv'

# Load the CSV, then discard the 'Unnamed: 0' column
# (presumably a row index written out when the file was saved -- TODO confirm).
df = (
    pd.read_csv(filename)
      .drop(columns=['Unnamed: 0'])
)

The following visualizations try to determine whether there is a correlation between wages and SAT/ACT scores.

Scatter plots: parental financial information vs. `MAXSATVerbalMath` and `ACTComposite`.

In [None]:
# Parental financial columns, each plotted against both test-score columns.
parent_finance_cols = ['Father_Wages','Mther_Wages','Net_worth_parents_investments',
                       'Parent_income_AGI','Parent_cash','Net_worth_parents_bus']

for col in parent_finance_cols:
    # Restrict the y-axis to the 10th-90th percentile range of the column (NaNs ignored)
    # so that extreme values do not flatten the rest of the scatter.
    y_low = np.nanpercentile(df[col], 10)
    y_high = np.nanpercentile(df[col], 90)

    # One regression scatter per test score, coloured by admission status.
    for score in ('MAXSATVerbalMath', 'ACTComposite'):
        sns.lmplot(data=df, x=score, y=col, hue='Admission_status',
                   size=5, fit_reg=True, scatter_kws={"alpha":0.2})
        plt.ylim(y_low, y_high)

Scatter plots: student financial information vs. `MAXSATVerbalMath` and `ACTComposite`.

In [None]:
# Student financial columns, each plotted against both test-score columns.
student_finance_cols = ['Student_Wages','Student_income_AGI','Net_worth_students_investments']

for col in student_finance_cols:
    # Restrict the y-axis to the central 95% of the column's values (NaNs ignored).
    y_low = np.nanpercentile(df[col], 2.5)
    y_high = np.nanpercentile(df[col], 97.5)

    # One regression scatter per test score, coloured by admission status.
    for score in ('MAXSATVerbalMath', 'ACTComposite'):
        sns.lmplot(data=df, x=score, y=col, hue='Admission_status',
                   size=5, fit_reg=True)
        plt.ylim(y_low, y_high)

Now plot the distribution of test scores for each school (Science, Business, and Liberal Arts).

In [None]:
def grade_dist(exam):
    """Overlay the distribution of one exam score for the three schools.

    Parameters
    ----------
    exam : str
        Name of the score column in the global ``df`` to plot.

    Returns
    -------
    tuple
        The ``(figure, axes)`` pair created by ``plt.subplots``.
    """
    f, axes = plt.subplots(figsize=(10,6))

    # (CollegeCode, colour, legend label, histogram alpha), listed in draw order.
    schools = [
        ('SD', 'skyblue', 'School of Science', 0.5),
        ('AD', 'red', 'School of Liberal Arts', 0.4),
        ('BD', 'yellow', 'School of Business', 0.3),
    ]
    for code, colour, label, alpha in schools:
        # Scores for this school only, with missing values removed before plotting.
        scores = df.loc[df['CollegeCode'] == code, exam].dropna()
        sns.distplot(scores, color=colour, label=label, hist_kws={"alpha": alpha})

    plt.legend(loc='best')
    plt.ylabel('Kernel Density Estimate')
    plt.title(exam + ' by College Code')
    return f, axes

# Draw one figure per test; the loop produces no cell output value to display.
for exam_col in ('MAXSATVerbalMath', 'ACTComposite'):
    grade_dist(exam_col)

Let's find out what percentage of students in each department report their test scores.

In [None]:
def score_report(reported_scores,test):
    """Build a horizontal bar chart of score-reporting rates per department.

    Parameters
    ----------
    reported_scores : sequence of float
        One value per department, in the fixed order Liberal Arts, Business,
        Science (the order of the labels below). Values are shown with the
        '%' axis format.
    test : str
        Test name used as the prefix of the chart title.

    Returns
    -------
    altair.Chart
        The configured bar chart.
    """
    departments = ['School of Liberal Arts','School of Business',
                   'School of Science']
    source = pd.DataFrame({"Department": departments,
                           "Percentage Reported": reported_scores})

    # Axis definitions: percentage on x, department names (no ticks, no title) on y.
    x_axis = alt.X('Percentage Reported:Q',axis=alt.Axis(format='%',title='Reported'))
    y_axis = alt.Y("Department:O",axis=alt.Axis(title='',ticks=False))

    bars = alt.Chart(source).mark_bar(color='green')
    chart = bars.encode(x=x_axis, y=y_axis, color="Department:O")
    return chart.properties(height=300,width=200,title=f"{test} Reporting by Department")

In [None]:
# College codes in the same order as the department labels hard-coded in
# score_report: Liberal Arts ('AD'), Business ('BD'), Science ('SD').
# Iterating over df['CollegeCode'].unique() instead would follow row order in the
# data, which can mislabel the bars and breaks on a missing college code.
dept_codes = ['AD', 'BD', 'SD']

# A student has reported an SAT score when EITHER SAT column is filled in.
# Counting rows (rather than adding the two columns' non-null counts) avoids
# double-counting students with both values and keeps the rate at or below 100%.
sat_reported = df[['MAXSATVerbalMath', 'SAT_combined']].notna().any(axis=1)
act_reported = df['ACTComposite'].notna()

# Fraction of each department's rows with a reported score
# (NaN, rather than a ZeroDivisionError, if a department has no rows).
sat_scores = [sat_reported[df['CollegeCode'] == dept].mean() for dept in dept_codes]
act_scores = [act_reported[df['CollegeCode'] == dept].mean() for dept in dept_codes]


(score_report(sat_scores, "SAT")) | (score_report(act_scores,"ACT"))

Is there an anti-correlation between being an athlete and submitting SAT scores?

Contrary to stereotypes, there actually **shouldn't** be, due to NCAA Clearinghouse rules.

In [None]:
# Keep only rows whose Recruited_athlete value is not missing.
# NOTE(review): this assumes the field is populated only for recruited athletes -- confirm.
ath_df = df[df['Recruited_athlete'].notna()]

# College codes in the same order as the department labels hard-coded in
# score_report: Liberal Arts ('AD'), Business ('BD'), Science ('SD').
# Iterating over ath_df['CollegeCode'].unique() instead would follow row order in
# the data, which can mislabel the bars and breaks on a missing college code.
dept_codes = ['AD', 'BD', 'SD']

# An athlete has reported an SAT score when EITHER SAT column is filled in.
# Counting rows (rather than adding the two columns' non-null counts) avoids
# double-counting athletes with both values and keeps the rate at or below 100%.
sat_reported = ath_df[['MAXSATVerbalMath', 'SAT_combined']].notna().any(axis=1)
act_reported = ath_df['ACTComposite'].notna()

# Fraction of each department's athlete rows with a reported score
# (NaN, rather than a ZeroDivisionError, if a department has no athlete rows).
sat_scores = [sat_reported[ath_df['CollegeCode'] == dept].mean() for dept in dept_codes]
act_scores = [act_reported[ath_df['CollegeCode'] == dept].mean() for dept in dept_codes]


(score_report(sat_scores, "SAT")) | (score_report(act_scores,"ACT"))