In [20]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [21]:
# Read the raw CSV into `df`, the frame every later cell reads and reassigns.
df = pd.read_csv(filepath_or_buffer='../data/raw/CriticalPath_Data_EM_Confidential.csv')

Make `df['Enrolled']` a True/False value to eliminate NaNs: a raw value of 80 becomes True and everything else (including NaN) becomes False.

In [22]:
# True only where the raw value equals 80; every other value, including NaN,
# becomes False. `.eq` produces a genuine bool column in one step instead of
# relying on `fillna` to downcast the object-dtype result of `map`, which is
# deprecated pandas behaviour and leaves the dtype version-dependent.
df['Enrolled'] = df['Enrolled'].eq(80)

Eliminate all HEOP and Albany Med applicants from `df['Application_Type']` in order to lessen noise

In [23]:
# Keep every row whose application type is neither 'AM' nor 'HE' (rows with a
# missing type are kept as well). `.copy()` makes the result an independent
# frame, so column assignments in later cells are not made on a filtered slice
# of the original (which triggers pandas' SettingWithCopyWarning).
df = df[~df['Application_Type'].isin(['AM', 'HE'])].copy()

In [24]:
# Save the filtered frame (row index included, per the to_csv default).
df.to_csv(path_or_buf='../data/processed/CriticalPath_Data_EM_Confidential_lessNoise.csv')

Let's assume that every NaN value in the interest columns (e.g. `df['IntsStudyAbroad']`) means "no". The question was on the application, and if applicants were truly interested, they would have taken the time to select yes.

In [25]:
# Convert every "Ints*" interest column to a boolean flag: a present answer is
# True and a missing answer (NaN) is False. Using `notna()` makes the result
# independent of row order; pairing `unique()` with [False, True] would assign
# the labels by whichever value happened to appear first in the column.
# NOTE(review): assumes these columns hold only an affirmative marker or NaN;
# confirm there are no explicit "no" codes.
ints_cols = [col for col in df.columns if col.startswith("Ints")]
for col in ints_cols:
    df[col] = df[col].notna()

Identify the columns we MUST get rid of, for either of these reasons: 
* they do not contain enough information to be useful
* they hold data that is ONLY obtained after the enrollment decision has been given to the college (e.g. college_chosen_by_non-matrics, work_study, reason_student_withdrew_app, etc.)

In [26]:
# Build `flagged`, the list of columns to drop. Only columns with fewer than
# 5000 non-null values are examined; such a column is flagged when it has fewer
# than 200 non-null values, or when the enrolled share of its non-null rows is
# above 90% or below 10%.
enrolled_mask = df['Enrolled'].astype(bool)  # computed once, reused per column
flagged = []
for col in df.columns:
    if col == 'Enrolled':
        # Never flag the target itself; on a frame with fewer than 5000 rows it
        # would otherwise be judged by its own enrollment rate.
        continue
    has_value = df[col].notna()
    col_len = int(has_value.sum())
    if col_len >= 5000:
        continue
    if col_len < 200:
        # Also covers all-null columns, so the ratio below never divides by 0.
        flagged.append(col)
        continue
    # Share of this column's non-null rows that belong to enrolled applicants.
    percent_enrolled_col = int((has_value & enrolled_mask).sum()) / col_len
    if percent_enrolled_col > 0.9 or percent_enrolled_col < 0.1:
        flagged.append(col)

In [27]:
# Remove the flagged columns and the FA-intent column in a single call.
# De-duplicating the labels first keeps this working when the FA-intent column
# is itself in `flagged`; dropping it a second time would raise KeyError.
cols_to_drop = list(dict.fromkeys([*flagged, 'Indicated_intent_to_apply_for_FA']))
df = df.drop(columns=cols_to_drop)

There are some columns that repeat information from other columns.
* HS_Numeric_Rank, class size, and percentile rank for example
* SAT data can probably be concatenated into one column, and the writing section can be dropped (since most colleges didn't seem to look at it anyway)

In [28]:
# Each new SAT column is the row-wise maximum of the two source columns listed
# for it.
sat_sources = {
    'MAX_SAT_combined': ['MAXSATVerbalMath', 'SAT_combined'],
    'MAX_SAT_math': ['NEWSATMath', 'SAT_math'],
    'MAX_SAT_verbal-reading': ['NEWSATVerbal', 'SAT_reading'],
}
for new_col, source_cols in sat_sources.items():
    df[new_col] = df[source_cols].max(axis=1)

# Remove the original SAT columns, including the writing score and
# NEWSATVerbalMath, which are not folded into any of the new columns.
raw_sat_cols = [
    'MAXSATVerbalMath', 'NEWSATVerbal', 'NEWSATMath', 'NEWSATVerbalMath',
    'SAT_combined', 'SAT_reading', 'SAT_math', 'SAT_writing',
]
df = df.drop(columns=raw_sat_cols)

# Remove the numeric high-school rank column.
df = df.drop(columns=['HS_Numeric_rank'])

In [29]:
# True only where the raw value is "TOPT"; every other value, including NaN,
# becomes False. `.eq` produces a genuine bool column in one step instead of
# relying on `fillna` to downcast the object-dtype result of `map`, which is
# deprecated pandas behaviour and leaves the dtype version-dependent.
df['Test_Optional'] = df['Test_Optional'].eq("TOPT")

### List of columns to potentially ignore:
* Initial_inquiry_date
* Initial_visit_date
* Withdrawal_date
* WeightatAcpt
* TotalWeight
* Admission_date $\to$ could be useful
* COA_BUDGET
* DOB
* COA $\to$ this DOES look useful
* FT_Tuition_Fees ---- this is directly correlated with year_of_entry
* Net_worth_students_bus
* Net_worth_students_investments
* Student_cash
* ANYTHING with "date"
* REEVAL_status

In [30]:
# Date-valued columns to remove.
dates = [
    'Initial_inquiry_date', 'Initial_visit_date',
    'Admission_application_date', 'Admission_date',
    'Withdrawal_date',
    'First_ISIR_Date', 'Last_ISIR_Date',
    'Last_award_letter_date',
    'REEVAL_status_date',
    'FAFSA_Complete_Date',
]

# Remaining columns to remove.
other_droppables = [
    'WeightatAcpt', 'TotalWeight',
    'COA_BUDGET',
    'DOB',
    'FT_Tuition_Fees',
    'Net_worth_students_bus', 'Net_worth_students_investments',
    'Student_cash',
    'REEVAL_status',
]

# Drop the two groups one after the other, dates first.
for column_group in (dates, other_droppables):
    df = df.drop(columns=column_group)

One hot encode the following columns:

* Ethnicity
* Application Type
* Dorm/Commuter

In [35]:
# Columns to replace with one 0/1 indicator column per distinct value.
hot_encs = ['Application_Type', 'Dorm_or_commuter_student']

for col in hot_encs:
    # Indicator columns are named after the raw values (no prefix is added).
    indicator_cols = pd.get_dummies(df[col], dtype=int)
    # Attach the indicators by row index, then remove the encoded source column.
    with_indicators = df.merge(
        indicator_cols,
        how='outer',
        left_index=True,
        right_index=True,
    )
    df = with_indicators.drop(columns=[col])

## Others that are droppable 

In [36]:
# Remove three further columns that are not carried into the output dataset.
df = df.drop(
    ['ADMT_DEC_CODE', 'Athletic_based_inst_aid', 'Recruited_athlete'],
    axis='columns',
)

Write this new data set to a new file.

In [37]:
# Save the cleaned frame (row index included, per the to_csv default).
df.to_csv(path_or_buf="../data/interim/CriticalPath_Data_EM_Confidential_second_order.csv")