# Import dependencies and data

In [1]:
import pandas as pd
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt

# Python SQL toolkit and Object Relational Mapper
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func
import psycopg2
from config import db_password

from sklearn.model_selection import train_test_split
from imblearn.ensemble import EasyEnsembleClassifier
from imblearn.metrics import classification_report_imbalanced

from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix

In [2]:
#Load data
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/University_Salary"
engine = create_engine(db_string)

df = pd.read_sql_table("college_statistics",
                            con=engine)
df.head()

Unnamed: 0,Rank,University_Name,State,Region,Division,Early_Career_Pay,Mid_Career_Pay,Make_World_Better_Percent,Stem_Percent,Type,...,Black_Diversity_Percent,Hispanic_Diversity_Percent,Native_Hawaiian_Pacific_Islander_Diversity_Percent,Two_Or_More_Races_Diversity_Percent,Total_Minority_Diversity_Percent,Non-Resident_Foreign_Diversity_Percent,Unknown_Diversity_Percent,White_Diversity_Percent,Women_Diversity_Percent,Men_Diversity_Percent
0,1,Auburn University,Alabama,South,East South Central,54400,104500,51.0,31,Public,...,0.07,0.02,0.0,0.0,0.13,0.06,0.01,0.81,0.49,0.51
1,1,University Alaska Fairbanks,Alaska,West,Pacific,59100,101800,54.0,20,Public,...,0.02,0.05,0.0,0.04,0.24,0.03,0.28,0.45,0.58,0.42
2,1,Embry Riddle Aeronautical University Prescott,Arizona,West,Mountain,65600,117900,59.0,43,Private,...,0.02,0.06,0.01,0.09,0.23,0.1,0.1,0.57,0.23,0.77
3,1,University Arkansas,Arkansas,South,West South Central,52500,98000,49.0,18,Public,...,0.05,0.06,0.0,0.03,0.18,0.06,0.01,0.76,0.51,0.49
4,1,Harvey Mudd College,California,West,Pacific,88800,158200,55.0,85,Private,...,0.02,0.1,0.0,0.06,0.39,0.13,0.05,0.44,0.46,0.54


# Data cleaning and feature engineering

In [3]:
#Check column types and nulls
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 907 entries, 0 to 906
Data columns (total 29 columns):
 #   Column                                              Non-Null Count  Dtype  
---  ------                                              --------------  -----  
 0   Rank                                                907 non-null    int64  
 1   University_Name                                     907 non-null    object 
 2   State                                               907 non-null    object 
 3   Region                                              907 non-null    object 
 4   Division                                            907 non-null    object 
 5   Early_Career_Pay                                    907 non-null    int64  
 6   Mid_Career_Pay                                      907 non-null    int64  
 7   Make_World_Better_Percent                           878 non-null    float64
 8   Stem_Percent                                        907 non-null    int64  
 9  

In [4]:
#Replace nulls with 0
df.fillna(0, inplace=True)

#Drop unnecessary columns and recheck column types and nulls
df = df.drop(columns=["University_Name","Rank","Mid_Career_Pay","Degree_Length","State","Region"])

#Convert the target column values to low and medium/high income
def income(pay):
    if pay < 45000:
        return "Low"
    else:
        return "Medium/High"    

df["Early_Career_Pay"] = df.apply(lambda row: income(row["Early_Career_Pay"]), axis=1)

#Convert features to integers
df_binary = pd.get_dummies(df, columns=["Division","Type"])

#Recheck column types and nulls
df_binary.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 907 entries, 0 to 906
Data columns (total 32 columns):
 #   Column                                              Non-Null Count  Dtype  
---  ------                                              --------------  -----  
 0   Early_Career_Pay                                    907 non-null    object 
 1   Make_World_Better_Percent                           907 non-null    float64
 2   Stem_Percent                                        907 non-null    int64  
 3   Room_And_Board                                      907 non-null    float64
 4   In_State_Tuition                                    907 non-null    int64  
 5   In_State_Total                                      907 non-null    int64  
 6   Out_Of_State_Tuition                                907 non-null    int64  
 7   Out_Of_State_Total                                  907 non-null    int64  
 8   Total_Enrollment                                    907 non-null    float64
 9  

# Create features and target and split into training and testing

In [5]:
#Create features and target
X = df_binary.drop(columns="Early_Career_Pay")
y = df_binary["Early_Career_Pay"]

y.value_counts()

Medium/High    719
Low            188
Name: Early_Career_Pay, dtype: int64

In [6]:
#Split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, train_size=0.75, random_state=1,
                                                   shuffle=True, stratify=None)

# Easy Ensemble AdaBoost Classifier

In [7]:
# Train the EasyEnsembleClassifier
clf = EasyEnsembleClassifier(n_estimators=100,base_estimator=None,warm_start=False,
                             sampling_strategy=1,replacement=False, n_jobs=None,random_state=1,
                            verbose=0).fit(X_train, y_train)



In [8]:
# Calculate the balanced accuracy score for Easy Ensemble
y_pred = clf.predict(X_test)
balanced_accuracy_score(y_test, y_pred)



0.8746273790415042

In [9]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[ 48,   1],
       [ 41, 137]], dtype=int64)

In [10]:
# Print the imbalanced classification report for Easy Ensemble
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

        Low       0.54      0.98      0.77      0.70      0.87      0.77        49
Medium/High       0.99      0.77      0.98      0.87      0.87      0.74       178

avg / total       0.89      0.81      0.93      0.83      0.87      0.74       227

