In [3]:
%matplotlib inline
import pandas as pd
import numpy as np

In [4]:
import ipywidgets as widgets
from IPython.display import display
import statsmodels
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer, SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, VarianceThreshold
from sklearn.linear_model import Ridge, Lasso
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer
from sklearn.metrics import mean_squared_error, explained_variance_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.decomposition import PCA
from functools import reduce
from matplotlib import pyplot as plt
import seaborn as sns
import featuretools as ft

In [58]:
df = pd.read_csv("data/cleaned_df.csv", index_col=["UnitID"])

In [6]:
revenues_distribution = df.iloc[:,1:7] #Revenues distribution (percentages of total revenues to each area)
revenues_dollar_amount = df.iloc[:,7:13]
expenses_distribution = df.iloc[:,13:21] #Expenses as a distribution
expenses_dollar_amount = df.iloc[:, 21:28]

expenses_dollar_amount.head(3)

Unnamed: 0_level_0,Instruction_Expenses_As_Dollar_Amount,Research_Expenses_As_Dollar_Amount,Public_Service_Expenses_As_Dollar_Amount,Academic_Support_Expenses_As_Dollar_Amount,Student_Service_Expenses_As_Dollar_Amount,Institutional_Support_Expenses_As_Dollar_Amount,Other_Core_Expenses_As_Dollar_Amount
UnitID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
180203.0,2115359.76,218830.32,510604.08,218830.32,583547.52,1167095.04,2625963.84
138558.0,21084472.08,0.0,1506033.72,7530168.6,4518101.16,9538213.56,6526146.12
126182.0,18934322.61,0.0,1071754.11,2500759.59,6430524.66,5358770.55,1786256.85


## Staff diversity and Enrollment Diversity as principle components

In [7]:
staff_total = df.iloc[:,29:58] 
staff_total.head() 
staff_diversity_distribution = df.iloc[:, 58:87]
staff_diversity_distribution
enrollment_diversity = df.iloc[:,87:100]

enrollment_diversity.head()

Unnamed: 0_level_0,Percent of full-time first-time undergraduates awarded any financial aid (SFA1819),Percent of full-time first-time undergraduates awarded federal state local or institutional grant aid (SFA1819),Percent of total enrollment that are American Indian or Alaska Native (DRVEF2013_RV),Percent of total enrollment that are Asian/Native Hawaiian/Pacific Islander (DRVEF2013_RV),Percent of total enrollment that are Asian (DRVEF2013_RV),Percent of total enrollment that are Native Hawaiian or Other Pacific Islander (DRVEF2013_RV),Percent of total enrollment that are Black or African American (DRVEF2013_RV),Percent of total enrollment that are Hispanic/Latino (DRVEF2013_RV),Percent of total enrollment that are White (DRVEF2013_RV),Percent of total enrollment that are Race/ethnicity unknown (DRVEF2013_RV),Percent of total enrollment that are Nonresident Alien (DRVEF2013_RV),Percent of total enrollment that are two or more races (DRVEF2013_RV),Percent of total enrollment that are women (DRVEF2013_RV)
UnitID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
180203.0,85.0,82.0,88.0,0.0,0.0,0.0,0.0,1.0,10.0,0.0,0.0,0.0,57.0
138558.0,91.0,86.0,0.0,1.0,1.0,0.0,12.0,6.0,78.0,0.0,2.0,1.0,54.0
126182.0,98.0,92.0,1.0,2.0,1.0,0.0,7.0,26.0,56.0,5.0,0.0,3.0,56.0
100654.0,90.0,87.0,0.0,1.0,1.0,0.0,92.0,1.0,5.0,1.0,0.0,0.0,55.0
100724.0,95.0,89.0,0.0,0.0,0.0,0.0,91.0,1.0,3.0,1.0,2.0,1.0,61.0


# Age as principle component

In [8]:
age_df = df.iloc[:, 100:107]
age_distribution = df.iloc[:, 107:113]

# ACT/SAT Scores a principle component

In [9]:
test_scores = df.iloc[:,115:118]
test_scores.head()

Unnamed: 0_level_0,SAT Critical Reading 75th percentile score (IC2012_RV),SAT Math 75th percentile score (IC2012_RV),ACT Composite 75th percentile score (IC2012_RV)
UnitID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
180203.0,608.0,548.0,25.4
138558.0,420.0,410.0,17.0
126182.0,530.0,550.0,23.0
100654.0,455.0,460.0,19.0
100724.0,460.0,480.0,19.0


# Pre Processing and Modeling Graduation Rates

In [59]:
black_grad_rates_df = df.loc[:,:].dropna(subset=[df.columns[-1]])

x1 = df.loc[:,:].drop([df.columns[-1], df.columns[-2]], axis=1)
y1 = df[df.columns[-2]]
x2 = black_grad_rates_df.drop(df.columns[-2:], axis=1)
y2 = black_grad_rates_df.loc[:,df.columns[-2]]

In [11]:
x_train, x_test, y_train, y_test = train_test_split(x1,y1, test_size=0.15, random_state=42)

In [12]:
pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("classifier", Lasso(alpha=0.1))   
])

In [13]:
pipeline.fit(x_train, y_train)

Pipeline(steps=[('scaler', StandardScaler()), ('classifier', Lasso(alpha=0.1))])

In [14]:
y_pred = pipeline.predict(x_test)

In [15]:
mean_squared_error(y_test, y_pred)

208.0299669688023

In [16]:
#visualizing a few of the coeficients
names = list(x1.columns)
lasso_coef = pipeline.named_steps['classifier'].coef_
coef_dict = dict(zip(names, lasso_coef))

In [17]:
#_ = plt.plot(range(stop-start), lasso_coef[start:stop])
#_ = plt.xticks(range(stop-start), names[start:stop], rotation=90)
#_ = plt.ylabel("Coeficients")
#plt.show()

In [18]:
sorted_coefs = sorted(coef_dict.items(), key=lambda item: item[1])
important = sorted_coefs[:5] + sorted_coefs[-6:]
important

[('Government_Grants', -2.956014951635693),
 ('Student_Service_Expenses', -1.9143288187384138),
 ('Parent/child indicator - Finance (FLAGS2019)_Partial child record - reports revenues/expenses. Assets/liabilties reported with parent',
  -1.832919107696392),
 ('Percent of total enrollment that are Black or African American (DRVEF2013_RV)',
  -1.7524508614301495),
 ('Grand total (EF2013B_RV  Undergraduate  Age 25 and over total)',
  -1.6792271950924624),
 ('Total women (EF2013B_RV  Undergraduate  Age under 25 total)',
  1.6108417324538622),
 ('Total price for out-of-state students living on campus 2012-13 (DRVIC2012_RV)',
  1.7375755276655354),
 ('Research_Expenses', 1.9353767017443706),
 ('SAT Math 75th percentile score (IC2012_RV)', 2.6937049159452395),
 ('ACT Composite 75th percentile score (IC2012_RV)', 2.907155236538547),
 ('Total women (EF2013B_RV  Undergraduate  Age under 25 total)_as_percentage',
  3.6155865684824224)]

In an Initial Model top 5 Negative Coefficients Were:

<ol>
    <li>Government_Grants</li>
    <li>Student_Service_Expense</li>
    <li>Parent/Child indicator</li>
    <li>Percent of total enrollment that are Black or African American</li>
    <li>Grand Total Undergraduate Age 25 and over</li>
</ol>
Top 5 Positive Coefficients
<ol>
    <li>Grand Total Undergraduate Age Under 25</li>
    <li>Research_Expenses</li>
    <li>SAT Math</li>
    <li>ACT Composite</li>
    <li>Total Women Uner 25</li>
</ol>
       

## Feature Engineering #1 Adding Principle Components

In [19]:
# First scale x2 using standard scaler

standard_scaler = StandardScaler()
x1_scaled = standard_scaler.fit_transform(x1)

In [20]:
pca_transform = PCA(n_components=1)

In [21]:
x1_df = pd.DataFrame(x1_scaled, columns=x1.columns)

x1_df["Revenues_Dist_PC"] = pca_transform.fit_transform(x1_df[revenues_distribution.columns])

x1_df["Revenues_Dollar_Amount_PC"] = pca_transform.fit_transform(x1_df[revenues_dollar_amount.columns])

x1_df["Expenses_Dollar_Amount_PC"] = pca_transform.fit_transform(x1_df[expenses_dollar_amount.columns])

x1_df["Expenses_Dist_PC"] = pca_transform.fit_transform(x1_df[expenses_distribution.columns])

x1_df["Staff_Total_PC"] = pca_transform.fit_transform(x1_df[staff_total.columns])

x1_df["Staff_Dist_PC"] = pca_transform.fit_transform(x1_df[staff_diversity_distribution.columns])

x1_df["Age_Total_PC"] = pca_transform.fit_transform(x1_df[age_df.columns])

x1_df["Age_Dist_PC"] = pca_transform.fit_transform(x1_df[age_distribution.columns])

x1_df["Enrollment_Diversity_PC"] = pca_transform.fit_transform(x1_df[enrollment_diversity.columns])

x1_df["Test_Scores_PC"] = pca_transform.fit_transform(x1_df[test_scores.columns])

# modeling with new Principle Components

In [22]:
x_train2, x_test2, y_train2, y_test2 = train_test_split(x1_df, y1, test_size=0.15, random_state=42)

In [23]:
lasso2 = Lasso(alpha=0.1)

In [24]:
lasso2.fit(x_train2, y_train2)

Lasso(alpha=0.1)

In [25]:
y_pred2 = lasso2.predict(x_test2)

In [26]:
mean_squared_error(y_test2, y_pred2)

208.25304727049996

## Removing Collinearity
No notable difference with Principle components than without. This may because our features are highly related. We can try to reduce the amount of collinearity between features to improve model performance

In [27]:
# Removing collinearity with heat map
cor_matrix = x1_df.corr().abs()

upper_tri = cor_matrix.where(np.triu(np.ones(cor_matrix.shape),k=1).astype(bool))

to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > 0.95)]

#drop the first half of these columns with correlation coefficients > 0.95
x1_df_without_corr = x1_df.drop(to_drop[:22], axis=1)

### Retrying Lasso Regression with less correlated features

In [28]:
x_train3, x_test3, y_train3, y_test3 = train_test_split(x1_df, y1, test_size=0.15, random_state=42)

In [29]:
lasso3 = Lasso(alpha=0.1)

In [30]:
lasso3.fit(x_train3, y_train3)

Lasso(alpha=0.1)

In [31]:
y_pred3 = lasso3.predict(x_test3)

In [32]:
mean_squared_error(y_pred3, y_test3)

208.25304727049996

### Still no noteable difference
Now we can try feature engineering with feature tools

In [62]:
x1_fs = x1.reset_index()

In [61]:
y1_fs = pd.DataFrame(y1).reset_index()

In [63]:
x1_fs

Unnamed: 0,UnitID,Core_Revenues,Tuition_And_Fees,Government_Grants,Private_Gifts,Investment_Return,Sales_And_Services,Other_Revenues,Tuition_And_Fees_As_Dollar_Amount,Government_Grants_As_Dollar_Amount,...,State abbreviation (HD2018)_Texas,State abbreviation (HD2018)_Utah,State abbreviation (HD2018)_Vermont,State abbreviation (HD2018)_Virgin Islands,State abbreviation (HD2018)_Virginia,State abbreviation (HD2018)_Washington,State abbreviation (HD2018)_West Virginia,State abbreviation (HD2018)_Wisconsin,State abbreviation (HD2018)_Wyoming,State abbreviation (HD2018)_isMissing
0,180203.0,7078959.0,2.0,60.0,7.0,1.0,0.0,29.0,1.415792e+05,4.247375e+06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,138558.0,70491879.0,17.0,17.0,2.0,0.0,0.0,28.0,1.198362e+07,1.198362e+07,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,126182.0,52099218.0,34.0,51.0,5.0,1.0,0.0,9.0,1.771373e+07,2.657060e+07,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,100654.0,143112797.0,33.0,30.0,2.0,0.0,0.0,6.0,4.722722e+07,4.293384e+07,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,100724.0,134993142.0,30.0,25.0,2.0,1.0,0.0,7.0,4.049794e+07,3.374829e+07,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2310,224679.0,14868051.0,59.0,35.0,0.0,0.0,0.0,6.0,8.772150e+06,5.203818e+06,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2311,494597.0,12628252.2,66.0,7.8,22.6,2.0,0.0,1.6,7.669841e+06,3.255452e+06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2312,262165.0,27659104.2,79.4,15.0,0.4,0.8,0.0,4.2,2.635699e+07,8.050718e+05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2313,494603.0,12628252.2,66.0,7.8,22.6,2.0,0.0,1.6,7.669841e+06,3.255452e+06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [None]:
## Feature Engineering Adding Principle Components

# First scale x2 using standard scaler

standard_scaler = StandardScaler()

In [None]:
x2_scaled = standard_scaler.fit_transform(x2)

In [None]:
pca_transform = PCA(n_components=1)

In [None]:
x2_df = pd.DataFrame(x2_scaled, columns=x2.columns)

x2_df["Revenues_Dist_PC"] = pca_transform.fit_transform(x2_df[revenues_distribution.columns])

x2_df["Revenues_Dollar_Amount_PC"] = pca_transform.fit_transform(x2_df[revenues_dollar_amount.columns])

x2_df["Expenses_Dollar_Amount_PC"] = pca_transform.fit_transform(x2_df[expenses_dollar_amount.columns])

x2_df["Expenses_Dist_PC"] = pca_transform.fit_transform(x2_df[expenses_distribution.columns])

x2_df["Staff_Total_PC"] = pca_transform.fit_transform(x2_df[staff_total.columns])

x2_df["Staff_Dist_PC"] = pca_transform.fit_transform(x2_df[staff_diversity_distribution.columns])

x2_df["Age_Total_PC"] = pca_transform.fit_transform(x2_df[age_df.columns])

x2_df["Age_Dist_PC"] = pca_transform.fit_transform(x2_df[age_distribution.columns])

x2_df["Enrollment_Diversity_PC"] = pca_transform.fit_transform(x2_df[enrollment_diversity.columns])

x2_df["Test_Scores_PC"] = pca_transform.fit_transform(x2_df[test_scores.columns])

# modeling with new Principle Components