### Dist to College Relationships

The goal of this notebook is to identify any relationships between the distance to the school a student chooses and two student attributes: their Academic Rating and their household income.

Import necessary libraries.

In [None]:
import numpy as np
import pandas as pd
import sys

import altair as alt
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so it also hides pandas/seaborn
# deprecation and chained-assignment warnings raised by later cells.
warnings.filterwarnings('ignore')

# Put '../src/visualization/' at the front of the import search path before
# importing `visualize` (presumably the project-local module that lives there
# -- confirm).
sys.path.insert(0,'../src/visualization/')
import visualize as vis

Read in the .csv file with HEOP and AMC 'noise' removed.

In [None]:
# Location of the processed dataset, relative to the notebook.
csv_path = '../data/processed/CriticalPath_Data_EM_Confidential_lessNoise.csv'

# Load the file, then discard the 'Unnamed: 0' column
# (presumably an index written out when the CSV was saved -- confirm).
df = pd.read_csv(csv_path)
df = df.drop(columns=['Unnamed: 0'])

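Re-organize the `df['HD_Academic_Rating']` column: keep only the `AR…` rating labels (anything else becomes missing) and add a companion `HD_Academic_Rating_Numeric` column holding each rating's digit, with `ARX` mapped to 0.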

In [None]:
# Collect the "AR<digit>..." labels once. Labels containing 'X' are skipped here
# and 'ARX' is appended explicitly so it is always present in the lookup.
standard_ratings = [
    rating for rating in df['HD_Academic_Rating'].unique()
    if 'AR' in str(rating) and 'X' not in str(rating)
]
ratings_names = standard_ratings + ['ARX']
# The third character of each label is its digit (e.g. 'AR1B' -> '1'); 'ARX' -> '0'.
ratings_numbers = [rating[2] for rating in standard_ratings] + ['0']

# Labels that are not in `ratings_names` map to NaN. A plain `astype(int)` cannot
# hold NaN (and with errors='ignore' silently leaves the digit strings untouched),
# so convert with `to_numeric` and store the result as nullable integers.
df['HD_Academic_Rating_Numeric'] = pd.to_numeric(
    df['HD_Academic_Rating'].map(dict(zip(ratings_names, ratings_numbers))),
    errors='coerce',
).astype('Int64')

# Mapping each kept label onto itself blanks out (NaN) every other value.
df['HD_Academic_Rating'] = df['HD_Academic_Rating'].map(dict(zip(ratings_names, ratings_names)))

Create a data source for plotting distance to schools vs. income and academic rating.

In [None]:
# Take an explicit copy so the column assignments below modify `source` itself
# rather than a possible view of `df` (this is what triggers pandas'
# SettingWithCopyWarning).
source = df[['HD_Academic_Rating','HD_Academic_Rating_Numeric','Dist_to_Siena',
             'Dist_to_Ccbnm','Enrolled','Parent_income_AGI']].copy()

# Pick the distance row by row: `Dist_to_Siena` for enrolled students,
# `Dist_to_Ccbnm` for everyone else. Selecting directly avoids using 1 as a
# temporary placeholder, which would overwrite a genuine distance of exactly 1.
# Non-enrolled rows with no `Dist_to_Ccbnm` stay NaN instead of borrowing the
# Siena distance while being labelled "Other".
is_enrolled = source['Enrolled'].eq(True)
source['Distance_to_School'] = source['Dist_to_Siena'].where(is_enrolled, source['Dist_to_Ccbnm'])

# Label used as the hue in the plots.
source['School Attended'] = source['Enrolled'].map({True:"Siena",False:"Other"})

Stripplot of Academic Rating vs. Distance to School.

In [None]:
fig, ax = plt.subplots(figsize=(12,6))

# Fix the category order so the rating axis reads AR1, AR1B, AR2, ... instead of
# following whatever order the ratings happen to appear in the data.
rating_order = sorted(source['HD_Academic_Rating'].dropna().unique())

# Draw on the Axes created above rather than relying on the implicit current Axes.
sns.stripplot(data=source, y='HD_Academic_Rating', x='Distance_to_School', hue="School Attended",
              order=rating_order, jitter=True, dodge=True, size=3, ax=ax)

# Limit the distance axis to 0-3000; points beyond that are not shown.
ax.set_xlim(0, 3000)
ax.set_xlabel("Distance to School (mi)")
ax.set_ylabel("Academic Rating");

Boxplot of the same Academic Rating vs. Distance to School data, with outliers hidden.

In [None]:
fig, ax = plt.subplots(figsize=(12, 6))

# Ratings to include, in display order.
box_order = ['AR1', 'AR1B', 'AR2', 'AR3', 'AR4', 'AR5']

sns.boxplot(
    data=source,
    x='Distance_to_School',
    y='HD_Academic_Rating',
    hue="School Attended",
    order=box_order,
    showfliers=False,  # do not draw outlier markers
    ax=ax,
)
ax.set_xlabel("Distance to School (mi)")
ax.set_ylabel("Academic Rating");

Parents' income vs. distance to school.

In [None]:
f, axes = plt.subplots(figsize=(12,6))

# Tick formatter that renders income values as whole dollars (e.g. $250000).
fmt = '$%.0f'
tick = mtick.FormatStrFormatter(fmt)

sns.scatterplot(
    data=source,
    x='Parent_income_AGI',
    y='Distance_to_School',
    hue='School Attended',
    alpha=0.3,
    ax=axes,
)

# Income axis: dollar-formatted ticks, limited to 0-1,000,000.
axes.xaxis.set_major_formatter(tick)
axes.set_xlim(0, 1000000)

axes.set_ylabel("Distance to School (mi)")
axes.set_xlabel("Parents' Adjusted Gross Income");