## Import Libraries and Load in Files

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import altair as alt
import matplotlib.pyplot as plt
import warnings
import folium
warnings.filterwarnings('ignore')

In [None]:
# Read the processed critical-path CSV (path is relative to this notebook)
# into the `data` frame that the cells below slice from.
filename = '../data/processed/CriticalPath_Data_EM_Confidential_lessNoise.csv'
data = pd.read_csv(filepath_or_buffer=filename)

In [None]:
# Read the college-code workbook; later cells use its 'CEEB Code' and 'Name'
# columns. Note that `filename` is rebound here to the workbook path.
filename = '../data/processed/collegeCEEBS.xlsx'
college_codes = pd.read_excel(io=filename)

## Apply College Codes to data
College CEEB codes come from the Liaison [Master College Code List](https://help.liaisonedu.com/WebAdMIT_Help_Center/Documents_and_Reference_Guides/Master_College_Code_List).


In [None]:
# Build the lookup {'Name': {CEEB code (float): college name}} that later cells
# use as `mapper['Name']` to decode 'College_chosen_by_non-matrics'.
# Rows without a CEEB code are dropped first. The filter is computed once, and
# the index is set directly instead of through a chained assignment onto a
# temporary filtered copy.
# NOTE(review): codes are cast to float, presumably to match the dtype of the
# code column in `data` — confirm.
coded_colleges = college_codes.dropna(subset=['CEEB Code'])
mapper = coded_colleges.set_index(
    coded_colleges['CEEB Code'].astype(float)
)[['Name']].to_dict()

In [None]:
# Build a long table with one row per (chosen college, major) and a
# 'Num Students' column holding the count of non-null Unique_student_ID values.
#
# .copy() detaches the column subset from `data`, so the assignment below is
# not a write to a slice of the original frame (SettingWithCopy hazard).
college_by_major = data[
    ['College_chosen_by_non-matrics', 'Major', 'Unique_student_ID']
].copy()

# Replace each code with the college name from the lookup; codes that are not
# in the lookup map to NaN and those rows are dropped.
college_by_major['College_chosen_by_non-matrics'] = (
    college_by_major['College_chosen_by_non-matrics'].map(mapper['Name'])
)
college_by_major = college_by_major.dropna(
    subset=['College_chosen_by_non-matrics']
)

college_by_major = (
    college_by_major
    .groupby(["College_chosen_by_non-matrics", "Major"])
    .count()
    .rename(columns={"Unique_student_ID": "Num Students"})
    .reset_index()
)

## Breakdown of students choosing UAlbany over Siena by major

In [None]:
# Bar chart of student counts per major, restricted to rows whose chosen
# college is 'UNIVERSITY AT ALBANY'; majors are ordered by descending count.
ualbany_rows = college_by_major[
    college_by_major['College_chosen_by_non-matrics'] == 'UNIVERSITY AT ALBANY'
]
ualbany_major_order = alt.EncodingSortField(
    field='Num Students', op="sum", order="descending"
)

alt.Chart(ualbany_rows).mark_bar().encode(
    x='Num Students:Q',
    y=alt.Y('Major:O', sort=ualbany_major_order),
).properties(height=400)

## Top 30 Schools chosen by students over Siena College

In [None]:
# Bar chart of the 30 colleges with the most students, summed across majors.
#
# Only 'Num Students' is aggregated. Calling .sum() on the whole grouped frame
# would also "sum" the string 'Major' column, which newer pandas versions do by
# concatenating the strings, leaving a junk column in the chart data.
college_totals = (
    college_by_major
    .groupby("College_chosen_by_non-matrics")["Num Students"]
    .sum()
    .reset_index()
    .sort_values("Num Students", ascending=False)
    .iloc[:30]
)

alt.Chart(college_totals).mark_bar().encode(
    x='Num Students:Q',
    y=alt.Y(
        'College_chosen_by_non-matrics:O',
        sort=alt.EncodingSortField(
            field='Num Students',
            op="sum",
            order="descending",
        ),
    ),
).properties(height=400, width=400)

## Breakdown of students going to other schools by major.

In [None]:
# Bar chart of the 30 majors with the most students, summed across colleges.
#
# Only 'Num Students' is aggregated. Calling .sum() on the whole grouped frame
# would also "sum" the string college-name column, which newer pandas versions
# do by concatenating the strings, leaving a junk column in the chart data.
major_totals = (
    college_by_major
    .groupby("Major")["Num Students"]
    .sum()
    .reset_index()
    .sort_values("Num Students", ascending=False)
    .iloc[:30]
)

alt.Chart(major_totals).mark_bar().encode(
    x='Num Students:Q',
    y=alt.Y(
        'Major:O',
        sort=alt.EncodingSortField(
            field='Num Students',
            op="sum",
            order="descending",
        ),
    ),
).properties(height=400, width=400)

## By intended school at Siena (SOS, SOB, or SOA), which colleges do students choose over Siena?

In [None]:
# Build a long table with one row per (chosen college, School) and a
# 'Num Students' column holding the count of non-null Unique_student_ID values.
#
# .copy() detaches the column subset from `data`, so the assignment below is
# not a write to a slice of the original frame (SettingWithCopy hazard).
college_by_school = data[
    ['College_chosen_by_non-matrics', 'CollegeCode', 'Unique_student_ID']
].copy()

# Replace each code with the college name from the lookup; codes that are not
# in the lookup map to NaN and those rows are dropped.
college_by_school['College_chosen_by_non-matrics'] = (
    college_by_school['College_chosen_by_non-matrics'].map(mapper['Name'])
)
college_by_school = college_by_school.dropna(
    subset=['College_chosen_by_non-matrics']
)

college_by_school = (
    college_by_school
    .groupby(["College_chosen_by_non-matrics", "CollegeCode"])
    .count()
    .rename(columns={"Unique_student_ID": "Num Students"})
    .reset_index()
    .rename(columns={"CollegeCode": "School"})
)

# Expand the two-letter codes into display labels. Any code other than
# AD/BD/SD maps to NaN.
college_by_school['School'] = college_by_school['School'].map(
    {"AD": "School of Art", "BD": "School of Business", "SD": "School of Science"}
)

In [None]:
# Stacked bar chart: for the `num_colleges` most-chosen colleges, show student
# counts split by School, with each segment labelled by its count.
num_colleges = 30
height = 500
width = 300

# Rank colleges by total students. Only 'Num Students' is summed, so the string
# 'Major' column is never aggregated (newer pandas would concatenate it).
top_choices = (
    college_by_major
    .groupby("College_chosen_by_non-matrics")["Num Students"]
    .sum()
    .sort_values(ascending=False)
    .iloc[:num_colleges]
    .index.values
)

# Select the rows for the top colleges once and share them between both layers.
# isin() tolerates a top college that has no row in college_by_school, where a
# label-based .loc lookup would raise KeyError.
top_school_counts = college_by_school[
    college_by_school["College_chosen_by_non-matrics"].isin(top_choices)
].reset_index(drop=True)

bars = alt.Chart(top_school_counts).mark_bar().encode(
    x=alt.X('Num Students:Q', stack='zero'),
    y=alt.Y('College_chosen_by_non-matrics:O', axis=alt.Axis(title='')),
    color=alt.Color('School'),
).properties(height=height, width=width, title="Colleges Chosen by intended School at Siena")

# Text layer uses the same stacked x encoding so each label sits on its segment.
text = alt.Chart(top_school_counts).mark_text(
    dx=-6, dy=3, color='white'
).encode(
    x=alt.X('Num Students:Q', stack='zero'),
    y=alt.Y('College_chosen_by_non-matrics:O'),
    detail='School:O',
    text=alt.Text('Num Students:Q', format='.0f'),
).properties(height=height, width=width)

bars + text

## How does this look by major?

In [None]:
# num_colleges = 30

# top_choices = college_by_major.groupby("College_chosen_by_non-matrics").sum().sort_values("Num Students",
#                                                         ascending=False).iloc[:num_colleges].index.values

# bars = alt.Chart(college_by_major.set_index("College_chosen_by_non-matrics").loc[top_choices].reset_index()).mark_bar().encode(
#     x=alt.X('Num Students:Q', stack='zero'),
#     y=alt.Y('College_chosen_by_non-matrics:O'),
#     color=alt.Color('Major')
# )

# bars

## Where do undeclared liberal arts majors go?

In [None]:
# Bar chart of the 30 colleges most often chosen by students whose Major is
# 'UNAR'.
#
# Only 'Num Students' is aggregated. Calling .sum() on the whole grouped frame
# would also "sum" the string 'Major' column, which newer pandas versions do by
# concatenating the strings, leaving a junk column in the chart data.
unar_totals = (
    college_by_major[college_by_major['Major'] == 'UNAR']
    .groupby("College_chosen_by_non-matrics")["Num Students"]
    .sum()
    .reset_index()
    .sort_values("Num Students", ascending=False)
    .iloc[:30]
)

alt.Chart(unar_totals).mark_bar().encode(
    x='Num Students:Q',
    y=alt.Y(
        'College_chosen_by_non-matrics:O',
        sort=alt.EncodingSortField(
            field='Num Students',
            op="sum",
            order="descending",
        ),
    ),
).properties(height=400, width=400, title='Colleges Chosen by UNAR Majors')