# Student Retention Predictor
> Predicting college dropout risk using U.S. Department of Education data

## Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import shap

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

## Load and Preview Data

In [7]:
# Load the College Scorecard CSV. low_memory=False makes pandas parse the file
# in a single pass instead of in chunks, which avoids mixed-dtype inference on
# very wide files.
df = pd.read_csv('../data/college_scorecard_apr2025.csv', low_memory=False)

# A bare `df.shape` in the middle of a cell is evaluated and thrown away (only
# the last expression of a cell is rendered), so print it explicitly.
print(df.shape)

# Last expression of the cell: rendered by the notebook as a preview table.
df.head()

Unnamed: 0,UNITID,OPEID,OPEID6,INSTNM,CITY,STABBR,ZIP,ACCREDAGENCY,INSTURL,NPCURL,...,COUNT_WNE_MALE1_P11,GT_THRESHOLD_P11,MD_EARN_WNE_INC1_P11,MD_EARN_WNE_INC2_P11,MD_EARN_WNE_INC3_P11,MD_EARN_WNE_INDEP0_P11,MD_EARN_WNE_INDEP1_P11,MD_EARN_WNE_MALE0_P11,MD_EARN_WNE_MALE1_P11,SCORECARD_SECTOR
0,100654,100200.0,1002.0,Alabama A & M University,Normal,AL,35762,Southern Association of Colleges and Schools C...,www.aamu.edu/,www.aamu.edu/admissions-aid/tuition-fees/net-p...,...,777.0,0.625,36650.0,41070.0,47016.0,38892.0,41738.0,38167.0,40250.0,4
1,100663,105200.0,1052.0,University of Alabama at Birmingham,Birmingham,AL,35294-0110,Southern Association of Colleges and Schools C...,https://www.uab.edu/,https://tcc.ruffalonl.com/University of Alabam...,...,1157.0,0.7588,47182.0,51896.0,54368.0,50488.0,51505.0,46559.0,59181.0,4
2,100690,2503400.0,25034.0,Amridge University,Montgomery,AL,36117-3553,Southern Association of Colleges and Schools C...,https://www.amridgeuniversity.edu/,https://www2.amridgeuniversity.edu:9091/,...,67.0,0.5986,35752.0,41007.0,,,38467.0,32654.0,49435.0,5
3,100706,105500.0,1055.0,University of Alabama in Huntsville,Huntsville,AL,35899,Southern Association of Colleges and Schools C...,www.uah.edu/,finaid.uah.edu/,...,802.0,0.781,51208.0,62219.0,62577.0,55920.0,60221.0,47787.0,67454.0,4
4,100724,100500.0,1005.0,Alabama State University,Montgomery,AL,36104-0271,Southern Association of Colleges and Schools C...,www.alasu.edu/,www.alasu.edu/cost-aid/tuition-costs/net-price...,...,1049.0,0.5378,32844.0,36932.0,37966.0,34294.0,31797.0,32303.0,36964.0,4


## Column Selection and Cleaning

In [None]:
# Columns kept for the analysis: four institutional attributes plus the
# retention rate.
cols = [
    # Percentage of undergraduates who receive a Pell Grant
    'PCTPELL',
    # Admission rate
    'ADM_RATE',
    # Region (IPEDS)
    'REGION',
    # Locale of institution
    'LOCALE',
    # Target variable: first-time, full-time student retention rate at
    # four-year institutions
    'RET_FT4',
]

# Narrow the frame to the selected columns first, then discard every row that
# has a missing value in any of them.
df = df[cols]
df = df.dropna()

## Create Binary Target

In [None]:
# Binary label derived from the retention rate:
#   1 (success) when RET_FT4 is strictly greater than 0.70
#   0 (failure) otherwise
df['TARGET'] = df['RET_FT4'].gt(0.70).astype(int)

## Convert and Encode Categorical Data

In [None]:
# group LOCALE values into broader categories
def simplify_locale(val):
    """Collapse a detailed locale code into a broad category label.

    Codes 11-13 map to 'City', 21-23 to 'Suburb', 31-33 to 'Town' and
    41-43 to 'Rural'. Any other value maps to 'Unknown'.
    """
    code_groups = (
        ((11, 12, 13), 'City'),
        ((21, 22, 23), 'Suburb'),
        ((31, 32, 33), 'Town'),
        ((41, 42, 43), 'Rural'),
    )
    # Membership is tested with `in`, so values that compare equal to a code
    # (e.g. the float 11.0) are matched as well.
    for codes, label in code_groups:
        if val in codes:
            return label
    return 'Unknown'
    
# apply the function to the LOCALE column
df['LOCALE_GROUPED'] = df['LOCALE'].apply(simplify_locale)

# convert categorical variables to numeric 0 or 1
# drop first category to avoid multicollinearity
# dtype=int is passed explicitly: pandas >= 2.0 produces boolean dummy columns
# by default, which would not match the "numeric 0 or 1" intent stated above
df = pd.get_dummies(
    df,
    columns=['REGION', 'LOCALE_GROUPED'],
    drop_first=True,
    dtype=int,
)