## Import Packages

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import altair as alt
import matplotlib.pyplot as plt
import warnings
import folium
import sys
warnings.filterwarnings('ignore')

In [None]:
sys.path.insert(0, '../src/visualization/')
import visualize as vis

## Read in File

In [None]:
# Load the processed applicant table and discard the "Unnamed: 0" column
# that comes back from the CSV, then preview the first rows.
filename = '../data/processed/CriticalPath_Data_EM_Confidential_lessNoise.csv'

data = pd.read_csv(filename)
data = data.drop(columns=["Unnamed: 0"])
data.head()

### KDE of High School GPA

#### All Applicants

In [None]:
# Keep applicants with a recorded HS GPA strictly between 5 and 400.
# The conditions are combined into a single mask so the filter no longer
# depends on pandas re-aligning boolean keys built from the unfiltered frame.
# NOTE(review): the upper bound is 400 although the axis label says
# "out of 100" -- confirm the intended cutoff.
hs_gpa = data['HS_GPA']
all_hs_gpa = data[hs_gpa.notnull() & (hs_gpa > 5) & (hs_gpa < 400)]

vis.kde_w_mean(all_hs_gpa['HS_GPA'], 20, "# of Students")

plt.xlabel("High School GPA (out of 100)")
plt.title("KDE of High School GPA: Applied");

#### Enrolled

In [None]:
# Same GPA plot restricted to rows with Admission_status == 80
# (presumably the "enrolled" status code -- confirm against the data dictionary).
enrolled_gpa = all_hs_gpa.loc[all_hs_gpa['Admission_status'] == 80, 'HS_GPA']
vis.kde_w_mean(enrolled_gpa, 20, "# of Students")

plt.xlabel("High School GPA (out of 100)")
plt.title("KDE of High School GPA: Enrolled");

### KDE of HS Percentile Rankings

In [None]:
# Keep applicants with a recorded, non-negative HS percentile rank.
# Both conditions are applied as one mask rather than chaining boolean
# keys that were built from the unfiltered frame.
percentile_rank = data['HS_Percentile_rank']
all_hs_percentile = data[percentile_rank.notnull() & (percentile_rank >= 0)]
vis.kde_w_mean(all_hs_percentile['HS_Percentile_rank'], 100, "# of Students")
plt.xlabel("Percentile Rank in HS Class")
plt.title("Distribution of HS Percentile Rankings");

#### Enrolled

In [None]:
# Percentile-rank distribution restricted to rows with Admission_status == 80.
# The title now says "Enrolled" so this figure can be told apart from the
# all-applicants figure, matching the GPA and class-size plots.
enrolled_percentile = all_hs_percentile.loc[
    all_hs_percentile['Admission_status'] == 80, 'HS_Percentile_rank'
]
vis.kde_w_mean(enrolled_percentile, 100, "# of Students")
plt.xlabel("Percentile Rank in HS Class")
plt.title("Distribution of HS Percentile Rankings: Enrolled");

### KDE of High School Class Size

In [None]:
# Distribution of high-school class sizes over every applicant that has one
# recorded. The y label set here replaces whatever vis.kde_w_mean applied,
# and the x axis is clipped to 0-1400.
hs_size = data[data['HS_Class_size'].notnull()]
vis.kde_w_mean(hs_size['HS_Class_size'], 100, "# of Students")
plt.ylabel("Probability Density Function")
# Title typo fixed ("Aplicants" -> "Applicants").
plt.title("Distribution of Class Sizes: All Applicants");
plt.xlim(0, 1400);

#### Enrolled

In [None]:
# Class-size distribution for rows with Admission_status == 80 only.
# hs_size is rebuilt here so the cell can run without the previous one.
has_class_size = data['HS_Class_size'].notnull()
hs_size = data[has_class_size]
enrolled_class_size = hs_size.loc[hs_size['Admission_status'] == 80, 'HS_Class_size']
vis.kde_w_mean(enrolled_class_size, 100, "# of Students")
plt.ylabel("Probability Density Function")
plt.title("Distribution of Class Sizes: Enrolled");
plt.xlim(0, 1400);

### KDE of ACT Scores

#### Non-Enrolled

In [None]:
# ACT composite distribution for applicants whose Enrolled flag equals False.
# Only rows with a recorded ACT composite are kept.
act_scores = data[data['ACTComposite'].notnull()]
not_enrolled = act_scores['Enrolled'].eq(False)
vis.kde_w_mean(act_scores.loc[not_enrolled, 'ACTComposite'], 25, '# of Students')
plt.title("Non-Enrolled");

#### Enrolled

In [None]:
# ACT composite distribution for rows with Admission_status == 80.
# Relies on act_scores from the previous cell.
enrolled_act = act_scores.loc[act_scores['Admission_status'] == 80, 'ACTComposite']
vis.kde_w_mean(enrolled_act, 20, '# of Students')
plt.title("Enrolled");

### KDEs of SAT Scores

#### New Format

In [None]:
# One figure per column whose name contains "NEW", plotting the
# non-null values of that column.
new_format_columns = [name for name in data.columns if "NEW" in name]
for column in new_format_columns:
    plt.figure()
    vis.kde_w_mean(data[column].dropna(), 20, "# of Students")

#### Enrolled

In [None]:
# One figure per column whose name contains "NEW", restricted to enrolled
# students. The previous filter kept rows where Enrolled was merely non-null,
# which does not select enrolled students; Enrolled is used directly as a
# boolean mask elsewhere in this notebook, so it is used as the mask here too.
for column in data.columns:
    if "NEW" in column:
        plt.figure()
        # Drop rows without a score first, then keep only enrolled students.
        df = data[[column, 'Enrolled']].dropna(subset=[column])
        vis.kde_w_mean(df.loc[df['Enrolled'], column], 20, "# of Students")

#### Old Format

In [None]:
# One figure per column whose name contains "SAT" but not "NEW",
# plotting the non-null values of that column.
for column in data.columns:
    if "SAT" not in column or "NEW" in column:
        continue
    plt.figure()
    vis.kde_w_mean(data[column].dropna(), 20, "# of Students")

### Student Enrollment by Major

In [None]:
# Build a long-format table with two rows per major: one holding the number
# of applicants and one holding the number of enrolled students.
#
# Counts use exact matches (value_counts). The earlier str.count(major)
# treated each major name as a regular expression and counted substring
# hits, so a major whose name is contained in another major's name was
# over-counted and names with regex metacharacters were mis-parsed.
majors = data['Major'].dropna().unique()
n_majors = len(majors)

num_majors_applied = (
    data['Major'].value_counts().reindex(majors, fill_value=0).tolist()
)
# data['Enrolled'] is used directly as a boolean row mask.
num_majors_enrolled = (
    data.loc[data['Enrolled'], 'Major']
    .value_counts()
    .reindex(majors, fill_value=0)
    .tolist()
)

# First half of the rows are the "Applied" counts, second half "Enrolled".
majorsDF = pd.DataFrame({
    "Major": list(majors) * 2,
    "Num Students": num_majors_applied + num_majors_enrolled,
    "Enrolled": [False] * n_majors + [True] * n_majors,
})

mapper = {True: "Enrolled", False: "Applied"}

majorsDF['Enrolled'] = majorsDF['Enrolled'].map(mapper)

enrolled = majorsDF['Num Students'][majorsDF['Enrolled'] != 'Applied']
applied = majorsDF['Num Students'][majorsDF['Enrolled'] == 'Applied']

# The enrolled/applied ratio is stored on the "Enrolled" rows only;
# the "Applied" rows get NaN.
percentage_enrolled = np.array(enrolled) / np.array(applied)
majorsDF['Percentage Enrolled'] = [np.nan] * len(applied) + list(percentage_enrolled)

# Bar chart of student counts per major, coloured by Applied/Enrolled and
# ordered by descending total count.
alt.Chart(majorsDF).mark_bar().encode(
    y='Num Students:Q',
    color="Enrolled",
    x=alt.X(
        "Major:O",
        sort=alt.EncodingSortField(
            field='Num Students',
            op='sum',
            order='descending'
        )
    )
).properties(height=200)

#### ERROR BARS

#### Percentage of Students Enrolled by Major

In [None]:
# Bar chart of the enrolled/applied ratio per major, with majors ordered by
# descending summed ratio.
by_percentage_desc = alt.EncodingSortField(
    field="Percentage Enrolled",
    op="sum",
    order="descending",
)
major_axis = alt.X("Major:O", sort=by_percentage_desc)

alt.Chart(majorsDF).mark_bar().encode(
    x=major_axis,
    y='Percentage Enrolled:Q',
).properties(height=200)

### By State (excluding NY)

In [None]:
# Number of students per state of permanent residence, excluding NY.
#
# The list of states comes from rows whose International_student value is
# 'US'; the counts are taken over every row of `data`, as before.
# Counts are exact matches (value_counts) instead of str.count, which
# interprets each state as a regex and counts substring occurrences.
us_states = data.loc[data['International_student'] == 'US', 'State_perm_res'].unique()

# Missing states show up as non-string values in unique(); skip those.
states = [state for state in us_states if isinstance(state, str) and state != 'NY']

students_per_state = data['State_perm_res'].value_counts()
state_counts = students_per_state.reindex(states, fill_value=0).tolist()

statesDF = pd.DataFrame({"State": states, "Num Students": state_counts})
statesDF = statesDF.sort_values("Num Students", ascending=False).reset_index()

#### Top Ten States outside of NY

In [None]:
# Horizontal bar chart of the first ten rows of statesDF, with states
# ordered by descending student count.
by_students_desc = alt.EncodingSortField(
    field="Num Students",
    op="sum",
    order="descending",
)
top_ten_states = statesDF.iloc[:10]

alt.Chart(top_ten_states).mark_bar().encode(
    y=alt.Y("State:O", sort=by_students_desc),
    x='Num Students:Q',
).properties(height=300)

#### Choropleth Map: All States excluding NY

In [None]:
# Choropleth of statesDF ("Num Students" per "State") drawn over the
# state outlines in us-states.json, matched on each feature's id.
# NOTE(review): the built-in 'Mapbox Bright' tileset appears to have been
# dropped from newer folium releases -- confirm against the installed version.
geo_data1 = '../data/processed/us-states.json'

m = folium.Map(location=[42.65, -73.75], zoom_start=5, tiles='Mapbox Bright')

state_layer = folium.Choropleth(
    name='choropleth',
    geo_data=geo_data1,
    key_on='feature.id',
    data=statesDF,
    columns=['State', 'Num Students'],
    fill_color='YlOrRd',
    fill_opacity=0.7,
    nan_fill_opacity=0,
    line_opacity=0.2,
    legend_name='Admission by State',
    highlight=True,
)
state_layer.add_to(m)

layer_control = folium.LayerControl()
layer_control.add_to(m)

m

#### Choropleth Map: Excluding NY, CT, MA, NJ

In [None]:
# Choropleth of student counts per state with CT, MA and NJ left out
# (NY is already absent from statesDF).
#
# The three states are excluded by name. The earlier version dropped the
# three rows with the highest counts, which only matches this section's
# heading if CT, MA and NJ happen to be the top three.
# A stray `folium.Map()` call that built and discarded a map was removed.
geo_data1 = '../data/processed/us-states.json'

m = folium.Map(
    location=[32.65, -93.75],
    tiles='Mapbox Bright',
    zoom_start=5,
    width='75%',
    height='75%'
)

excluded_states = ['CT', 'MA', 'NJ']
distant_statesDF = statesDF[~statesDF['State'].isin(excluded_states)]

folium.Choropleth(
    geo_data=geo_data1,
    nan_fill_opacity=0,
    name='choropleth',
    data=distant_statesDF,
    columns=['State', 'Num Students'],
    key_on='feature.id',
    fill_color='YlOrRd',
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name='Applications by State',
    highlight=True
).add_to(m)

folium.LayerControl().add_to(m)

m

#### Choropleth Map: Non-Athletes, Excluding NY, CT, MA, NJ

In [None]:
# Per-state student counts restricted to rows that satisfy all three
# conditions below, for every state except NY, MA, NJ and CT.
#
# Each slicer column is True for rows that should be KEPT, so the column
# names read opposite to their contents:
#   "Athlete"               -> Recruited_athlete is null
#   "International_student" -> International_student == 'US'
#   "In-State"              -> State_perm_res != 'NY'
slicer = pd.DataFrame({"Athlete": data['Recruited_athlete'].isnull(),
                       'International_student': data['International_student'] == 'US',
                       'In-State': data['State_perm_res'] != 'NY'})

# The row mask does not depend on the state, so compute it once rather
# than on every loop iteration.
keep_rows = slicer.all(axis='columns')

# Exact-match counts; str.count would treat each state as a regex and
# count substring occurrences.
kept_counts = data.loc[keep_rows, 'State_perm_res'].value_counts()

skipped_states = {'NY', 'MA', 'NJ', 'CT'}
states_non_athletes = [
    state for state in data['State_perm_res'].unique()
    if isinstance(state, str) and state not in skipped_states
]
state_counts_non_athletes = kept_counts.reindex(states_non_athletes, fill_value=0).tolist()

non_athlethesDF = pd.DataFrame({"State": states_non_athletes, "Num Students": state_counts_non_athletes})

In [None]:
# Choropleth of non_athlethesDF ("Num Students" per "State") drawn over the
# state outlines in us-states.json, matched on each feature's id.
# NOTE(review): the built-in 'Mapbox Bright' tileset appears to have been
# dropped from newer folium releases -- confirm against the installed version.
geo_data1 = '../data/processed/us-states.json'

m = folium.Map(location=[42.65, -73.75], zoom_start=5, tiles='Mapbox Bright')

non_athlete_layer = folium.Choropleth(
    name='choropleth',
    geo_data=geo_data1,
    key_on='feature.id',
    data=non_athlethesDF,
    columns=['State', 'Num Students'],
    fill_color='YlOrRd',
    fill_opacity=0.7,
    nan_fill_opacity=0,
    line_opacity=0.2,
    legend_name='Admission by State',
    highlight=True,
)
non_athlete_layer.add_to(m)

layer_control = folium.LayerControl()
layer_control.add_to(m)

m

### By City

In [None]:
# Per (city, state) table with the number of enrolled students, the number
# of applicants, and the enrolled share in percent. Counts are the non-null
# Unique_student_ID values in each group; only groups present in both
# tables survive the index-on-index merge.
city_keys = ["City_perm_res", "State_perm_res"]

enrolled_by_city = (
    data[data['Enrolled']]
    .groupby(city_keys)
    .count()
    .sort_values("Unique_student_ID", ascending=False)[['Unique_student_ID']]
    .rename(columns={"Unique_student_ID": "Enrolled"})
)

applied_by_city = (
    data
    .groupby(city_keys)
    .count()
    .sort_values("Unique_student_ID", ascending=False)[['Unique_student_ID']]
    .rename(columns={"Unique_student_ID": "Applied"})
)

citiesDF = enrolled_by_city.merge(applied_by_city, left_index=True, right_index=True)

citiesDF['PercentEnrolled'] = citiesDF['Enrolled'] / citiesDF['Applied'] * 100

# Turn the (city, state) index back into ordinary columns and order the
# table by enrolled count, largest first.
citiesDF = citiesDF.reset_index()
citiesDF = citiesDF.sort_values("Enrolled", ascending=False)

#### Top Cities By Enrollment Total

In [None]:
# Horizontal bar chart of the first ten rows of citiesDF, with cities
# ordered by descending enrolled count.
by_enrolled_desc = alt.EncodingSortField(
    field='Enrolled',
    op="sum",
    order="descending",
)
top_ten_cities = citiesDF.iloc[:10]

alt.Chart(top_ten_cities).mark_bar().encode(
    y=alt.Y('City_perm_res:O', sort=by_enrolled_desc),
    x='Enrolled:Q',
).properties(height=400)

#### Top Cities outside of NY by Enrollment

In [None]:
# Horizontal bar chart of the first ten citiesDF rows whose state is not NY,
# with cities ordered by descending enrolled count.
by_enrolled_desc = alt.EncodingSortField(
    field='Enrolled',
    op="sum",
    order="descending",
)
outside_ny = citiesDF["State_perm_res"] != 'NY'
top_ten_outside_ny = citiesDF[outside_ny][:10]

alt.Chart(top_ten_outside_ny).mark_bar().encode(
    y=alt.Y('City_perm_res:O', sort=by_enrolled_desc),
    x='Enrolled:Q',
).properties(height=400)

#### Top/Bottom 15 Cities by Enrollment Percentage (min 20 applied)

In [None]:
# Cities with more than 20 applicants, ranked by enrolled percentage; the
# chart shows the 15 highest and the 15 lowest.
# NOTE(review): the section heading says "min 20 applied" but the filter is
# strictly greater than 20 -- confirm which is intended.
ranked_cities = citiesDF[citiesDF['Applied'] > 20].sort_values("PercentEnrolled", ascending=False)

# Take the bottom 15 from what is left after the top 15, so that a city can
# never appear twice when fewer than 30 cities pass the filter.
top_cities = ranked_cities.iloc[:15]
bottom_cities = ranked_cities.iloc[15:].iloc[-15:]
top15_bottom15_enroll = pd.concat([top_cities, bottom_cities])


alt.Chart(top15_bottom15_enroll).mark_bar().encode(
    x='PercentEnrolled:Q',
    y=alt.Y(
        'City_perm_res:O',
        sort=alt.EncodingSortField(
            field='PercentEnrolled',
            op="sum",
            order="descending"
        )
    )
).properties(height=400)

### Applied v. Attended

In [None]:
# Scatter of applicants against enrolled students, one point per citiesDF row.
f, axes = plt.subplots(figsize=(10, 6))
sns.scatterplot(x="Applied", y="Enrolled", data=citiesDF);
plt.title("Applied v. Attended");

### By County

In [None]:
# Per (county, state) table with the number of enrolled students, the number
# of applicants, and the enrolled share in percent. Counts are the non-null
# Unique_student_ID values in each group.
countiesDF = data[data['Enrolled']].groupby(["County_perm_res",
  "State_perm_res"]).count().sort_values("Unique_student_ID",
  ascending=False)[['Unique_student_ID']].rename(columns = {"Unique_student_ID":"Enrolled"})

countiesDF = countiesDF.merge(
    data.groupby(["County_perm_res","State_perm_res"]).count().sort_values("Unique_student_ID",
    ascending=False)[['Unique_student_ID']].rename(columns = {"Unique_student_ID":"Applied"}),
    left_index=True, right_index=True)

countiesDF['PercentEnrolled'] = countiesDF['Enrolled']/countiesDF['Applied'] * 100

countiesDF.reset_index(inplace=True)
countiesDF = countiesDF.sort_values("Enrolled",ascending=False)


## convert county codes to names

# Build lookup keys of the form <State><3-digit county code>: keep the last
# three digits of the numeric FIPS value, zero-pad them, and prefix the
# State column. (A leftover expression statement whose result was discarded
# has been removed, and the float floor arithmetic replaced by a modulo.)
# NOTE(review): assumes County_perm_res uses this same key format -- codes
# without a match become NaN after the map below.
fips_codes = pd.read_csv('../data/processed/FIPS.csv')
fips_codes['FIPS'] = (fips_codes['FIPS'] % 1000).astype(int).astype(str).str.zfill(3)
fips_codes['FIPS'] = fips_codes.State + fips_codes.FIPS

mapper = fips_codes.set_index('FIPS')['Name'].to_dict()
countiesDF['County_perm_res'] = countiesDF['County_perm_res'].map(mapper)

#### Top Counties by Enrollment Total

In [None]:
# Horizontal bar chart of the first ten rows of countiesDF, with counties
# ordered by descending enrolled count.
by_enrolled_desc = alt.EncodingSortField(
    field='Enrolled',
    op="sum",
    order="descending",
)
top_ten_counties = countiesDF.iloc[:10]

alt.Chart(top_ten_counties).mark_bar().encode(
    y=alt.Y('County_perm_res:O', sort=by_enrolled_desc),
    x='Enrolled:Q',
).properties(height=400)

#### Top Counties outside of NY by Enrollment Total

In [None]:
# Horizontal bar chart of the first ten countiesDF rows whose state is not
# NY, with counties ordered by descending enrolled count.
by_enrolled_desc = alt.EncodingSortField(
    field='Enrolled',
    op="sum",
    order="descending",
)
outside_ny = countiesDF["State_perm_res"] != 'NY'
top_ten_outside_ny = countiesDF[outside_ny][:10]

alt.Chart(top_ten_outside_ny).mark_bar().encode(
    y=alt.Y('County_perm_res:O', sort=by_enrolled_desc),
    x='Enrolled:Q',
).properties(height=400)

#### Top/Bottom 15 Counties by Enrollment Percentage (min 20 applicants)

In [None]:
# Counties with more than 20 applicants, ranked by enrolled percentage; the
# chart shows the 15 highest and the 15 lowest.
# NOTE(review): the section heading says "min 20 applicants" but the filter
# is strictly greater than 20 -- confirm which is intended.
ranked_counties = countiesDF[countiesDF['Applied'] > 20].sort_values("PercentEnrolled", ascending=False)

# Take the bottom 15 from what is left after the top 15, so that a county
# can never appear twice when fewer than 30 counties pass the filter.
top_counties = ranked_counties.iloc[:15]
bottom_counties = ranked_counties.iloc[15:].iloc[-15:]
top15_bottom15_enroll = pd.concat([top_counties, bottom_counties])


alt.Chart(top15_bottom15_enroll).mark_bar().encode(
    x='PercentEnrolled:Q',
    y=alt.Y(
        'County_perm_res:O',
        sort=alt.EncodingSortField(
            field='PercentEnrolled',
            op="sum",
            order="descending"
        )
    )
).properties(height=400)

#### Applied v. Enrolled

In [None]:
# Scatter of applicants against enrolled students, one point per countiesDF row.
f, axes = plt.subplots(figsize=(10, 6))
sns.scatterplot(x="Applied", y="Enrolled", data=countiesDF);
plt.title("Applied v. Attended");

In [None]:
# data.groupby("City_perm_res").count()[['Unique_student_ID','Enrolled']].sort_values("Unique_student_ID",ascending=False)

### By Ethnicity and Income

In [None]:
# Per-ethnicity table: mean Parent_income_AGI joined (outer, on the
# Ethnicity index) with per-ethnicity non-null counts of Enrolled and
# Unique_student_ID.
# NOTE(review): the "Enrolled" column here is a count of non-null values,
# not the number of enrolled students -- confirm that is what is wanted.
mean_income = data[['Ethnicity', 'Parent_income_AGI']].groupby("Ethnicity").mean()
group_counts = data[["Ethnicity", "Enrolled", "Unique_student_ID"]].groupby("Ethnicity").count()

ethnicity_AGIDF = pd.merge(
    left=mean_income,
    right=group_counts,
    how='outer',
    left_index=True,
    right_index=True,
)
ethnicity_AGIDF = ethnicity_AGIDF.rename(
    columns={"Unique_student_ID": "TotApplicants",
             "Parent_income_AGI": "AvgParent_income_AGI"}
)
ethnicity_AGIDF = ethnicity_AGIDF.reset_index()
# Removes the row labelled 7 after the reset.
# NOTE(review): this is positional; which ethnicity it removes depends on
# the data -- confirm it is the intended category.
ethnicity_AGIDF = ethnicity_AGIDF.drop(7)

#### Race v. Family Income

In [None]:
# Horizontal bar chart of average parent AGI per ethnicity, with
# ethnicities ordered by descending average.
by_income_desc = alt.EncodingSortField(
    field='AvgParent_income_AGI',
    op="sum",
    order="descending",
)

alt.Chart(ethnicity_AGIDF).mark_bar().encode(
    y=alt.Y('Ethnicity:O', sort=by_income_desc),
    x='AvgParent_income_AGI:Q',
).properties(height=400)

In [None]:
# Box plot of parent AGI per ethnicity, leaving out rows whose Ethnicity is
# 'Unknown'; the x axis is clipped to 0-500000.
f, axes = plt.subplots(figsize=(10, 7))
known_ethnicity = data[data['Ethnicity'] != 'Unknown']
sns.boxplot(x="Parent_income_AGI", y="Ethnicity", data=known_ethnicity)
plt.xlim(0, 500000);