### Distance Penetration

Which schools have the best draw from distant areas?

Import Libraries

In [1]:
# Third-party analysis and plotting libraries.
import numpy as np
import pandas as pd
import altair as alt
import matplotlib.pyplot as plt
import seaborn as sns
# Select Altair's 'notebook' renderer for chart output.
alt.renderers.enable('notebook')

# Suppress every warning for the rest of the session (this also hides deprecation notices).
import warnings
warnings.filterwarnings('ignore')

# Put the project's visualization source directory first on the import path
# so that the local `visualize` module can be imported.
import sys
sys.path.insert(0,'../src/visualization/')
import visualize as vis

# Optional tqdm progress-bar integration for pandas (currently disabled).
# from tqdm import tqdm_notebook
# tqdm_notebook().pandas()

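Assign each student a score based on their distance to the school they chose, 
the freshman class size at that school, and the population of the county in which the school is located. Here "Ccbnm" is short for "College chosen by non-matrics"; for students who enrolled at Siena, Siena itself is treated as the chosen college.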

$$
S = Dist\_to\_Ccbnm \frac{1}{Ccbnm\_Fresh\_Class} \frac{1}{College\_County\_population} \frac{1}{\# Students\_choosing\_Ccbnm}
$$


In [2]:
# Load the processed admissions data and, for every student who enrolled,
# overwrite the "college chosen" fields with Siena's own values so that Siena
# can be compared against the colleges chosen by non-matriculants.

# Values substituted for enrolled students.
siena_name = 'SIENA COLLEGE'
siena_fresh_enroll = 714      # presumably Siena's freshman class size -- TODO confirm source/year
siena_county_code = 'NY001'   # presumably the code of Siena's home county -- TODO confirm

df = pd.read_csv('../data/processed/CriticalPath_Data_EM_Confidential_lessNoise.csv').drop(columns='Unnamed: 0')

# Population used as Siena's "college town" population: the mean of
# County_perm_res_pop over rows whose County_perm_res matches the code above
# (Series.mean skips missing values).
siena_collegetown_pop = df.loc[
    df['County_perm_res'] == siena_county_code, 'County_perm_res_pop'
].mean()

# Boolean mask of enrolled students. astype(bool) follows Python truthiness,
# which is what the previous row-wise `if row['Enrolled']` test relied on.
is_enrolled = df['Enrolled'].astype(bool)

# Vectorized replacements: keep each existing value unless the student enrolled,
# in which case substitute the Siena value.
df['College_chosen_by_non-matrics'] = df['College_chosen_by_non-matrics'].mask(is_enrolled, siena_name)
df['Dist_to_Ccbnm'] = df['Dist_to_Ccbnm'].mask(is_enrolled, df['Dist_to_Siena'])
df['CollegeTown_pop'] = df['CollegeTown_pop'].mask(is_enrolled, siena_collegetown_pop)
df['Fresh_enroll'] = df['Fresh_enroll'].mask(is_enrolled, siena_fresh_enroll)

# Shorter column name used by the rest of the notebook.
df = df.rename(columns={"College_chosen_by_non-matrics": "Ccbnm"})

In [3]:
# Distance penetration score WITH freshman enrollment factored in, aggregated
# per chosen college and drawn as a ranked bar chart.

min_students_siena = 15   # minimum students per college kept in the Siena-distance table
min_students_chart = 50   # a college must have MORE than this many students to be charted
top_n_schools = 50        # number of top-scoring colleges shown

# Per-student scores: distance divided by (freshman class size * county population).
# NOTE(review): the notebook's formula describes the *college's* county population,
# and CollegeTown_pop is prepared for that purpose, but the score divides by
# County_perm_res_pop (presumably the student's home county) -- confirm which is intended.
df['Distance_Penetration_Score'] = df.Dist_to_Ccbnm / (df.Fresh_enroll * df.County_perm_res_pop)
df['Distance_Penetration_Score_Siena'] = df.Dist_to_Siena / (df.Fresh_enroll * df.County_perm_res_pop)

# Normalize the score:

# df['Distance_Penetration_Score']=(df['Distance_Penetration_Score']-df['Distance_Penetration_Score'].mean())/df['Distance_Penetration_Score'].std()
# df['Distance_Penetration_Score_Siena']=(df['Distance_Penetration_Score_Siena']-df['Distance_Penetration_Score_Siena'].mean())/df['Distance_Penetration_Score_Siena'].std()

# Mean score and number of scored students for each chosen college.
distance_penetration = (
    df.groupby('Ccbnm')['Distance_Penetration_Score']
    .agg(['mean', 'count'])
    .rename(columns={'mean': 'Distance_Penetration_Score_Mean',
                     'count': 'Distance_Penetration_Score_Count'})
    .reset_index()
)

# Same aggregation for the score based on each student's distance to Siena.
distance_penetration_siena = (
    df.groupby('Ccbnm')['Distance_Penetration_Score_Siena']
    .agg(['mean', 'count'])
    .rename(columns={'mean': 'Distance_Penetration_Score_Mean_Siena',
                     'count': 'Distance_Penetration_Score_Count_Siena'})
    .reset_index()
)

# Keep only colleges with enough students in the Siena-distance table.
# NOTE(review): this filter used to appear twice verbatim; the second copy was a
# no-op and may have been meant for `distance_penetration` -- confirm.
distance_penetration_siena = distance_penetration_siena[
    distance_penetration_siena['Distance_Penetration_Score_Count_Siena'] >= min_students_siena
].copy()

# Colleges with enough students, ranked by mean score, top N only.
chart_data = (
    distance_penetration[
        distance_penetration['Distance_Penetration_Score_Count'] > min_students_chart
    ]
    .sort_values("Distance_Penetration_Score_Mean", ascending=False)
    .head(top_n_schools)
)

chart = alt.Chart(chart_data).mark_bar().encode(
    y=alt.Y(
        'Ccbnm:N',
        axis=alt.Axis(title=''),
        sort=alt.EncodingSortField(
            field='Distance_Penetration_Score_Mean',
            op='sum',
            order='descending'
        )
    ),
    x=alt.X('Distance_Penetration_Score_Mean:Q', axis=alt.Axis(title='Score')),
    color=alt.condition(
        alt.datum.Ccbnm == 'SIENA COLLEGE',  # If the college is Siena this test returns True,
        alt.value('gold'),                   # which sets the bar gold.
        alt.value('green')                   # And if it's not true it sets the bar green.
    )
).properties(
    # The title states what the code does: Fresh_enroll IS part of this score and
    # the chart keeps colleges with more than `min_students_chart` students.
    title='Distance Penetration Score: Freshmen Enrollment Factored in '
          '(Schools With More Than {} Students)'.format(min_students_chart)
)

In [4]:
# Distance penetration score WITHOUT freshman enrollment, aggregated per chosen
# college and drawn as a ranked bar chart.

min_students_siena = 15   # minimum students per college kept in the Siena-distance table
min_students_chart = 50   # a college must have MORE than this many students to be charted
top_n_schools = 30        # number of top-scoring colleges shown

# Per-student scores: distance divided by county population only.
# NOTE(review): the notebook's formula describes the *college's* county population,
# and CollegeTown_pop is prepared for that purpose, but the score divides by
# County_perm_res_pop (presumably the student's home county) -- confirm which is intended.
df['Distance_Penetration_Score'] = df.Dist_to_Ccbnm / (df.County_perm_res_pop)
df['Distance_Penetration_Score_Siena'] = df.Dist_to_Siena / (df.County_perm_res_pop)

# Normalize the score:

# df['Distance_Penetration_Score']=(df['Distance_Penetration_Score']-df['Distance_Penetration_Score'].mean())/df['Distance_Penetration_Score'].std()
# df['Distance_Penetration_Score_Siena']=(df['Distance_Penetration_Score_Siena']-df['Distance_Penetration_Score_Siena'].mean())/df['Distance_Penetration_Score_Siena'].std()

# Mean score and number of scored students for each chosen college.
distance_penetration = (
    df.groupby('Ccbnm')['Distance_Penetration_Score']
    .agg(['mean', 'count'])
    .rename(columns={'mean': 'Distance_Penetration_Score_Mean',
                     'count': 'Distance_Penetration_Score_Count'})
    .reset_index()
)

# Same aggregation for the score based on each student's distance to Siena.
distance_penetration_siena = (
    df.groupby('Ccbnm')['Distance_Penetration_Score_Siena']
    .agg(['mean', 'count'])
    .rename(columns={'mean': 'Distance_Penetration_Score_Mean_Siena',
                     'count': 'Distance_Penetration_Score_Count_Siena'})
    .reset_index()
)

# Keep only colleges with enough students in the Siena-distance table.
# NOTE(review): this filter used to appear twice verbatim; the second copy was a
# no-op and may have been meant for `distance_penetration` -- confirm.
distance_penetration_siena = distance_penetration_siena[
    distance_penetration_siena['Distance_Penetration_Score_Count_Siena'] >= min_students_siena
].copy()

# Colleges with enough students, ranked by mean score, top N only.
chart_data = (
    distance_penetration[
        distance_penetration['Distance_Penetration_Score_Count'] > min_students_chart
    ]
    .sort_values("Distance_Penetration_Score_Mean", ascending=False)
    .head(top_n_schools)
)

chart2 = alt.Chart(chart_data).mark_bar().encode(
    y=alt.Y(
        'Ccbnm:N',
        axis=alt.Axis(title=''),
        sort=alt.EncodingSortField(
            field='Distance_Penetration_Score_Mean',
            op='sum',
            order='descending'
        )
    ),
    x=alt.X('Distance_Penetration_Score_Mean:Q', axis=alt.Axis(title='Score')),
    color=alt.condition(
        alt.datum.Ccbnm == 'SIENA COLLEGE',  # If the college is Siena this test returns True,
        alt.value('gold'),                   # which sets the bar gold.
        alt.value('green')                   # And if it's not true it sets the bar green.
    )
).properties(
    # The title states what the code does: Fresh_enroll is NOT part of this score and
    # the chart keeps colleges with more than `min_students_chart` students.
    title='Distance Penetration Score: Freshmen Enrollment NOT Factored in '
          '(Schools With More Than {} Students)'.format(min_students_chart)
)