In [1]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as sm
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

In [66]:
df = pd.read_csv('newdata.csv')
df = df.fillna(df.mean())

In [67]:
df.columns

Index(['act_avg', 'city', 'zip', 'percent_receiving_aid', 'cost_after_aid',
       'state', 'rankingSortRank', 'hs_gpa_avg', 'businessRepScore', 'tuition',
       'engineeringRepScore', 'displayName', 'institutionalControl',
       'institution_name', 'branches', 'region', 'admission_rate',
       'ug_enrollment', 'percent_white', 'percent_black', 'percent_hispanic',
       'percent_asian', 'percent_aian', 'percent_nhpi', 'percent_twoormore',
       'percent_nra', 'percent_unknown', 'percent_parttime', 'avg_cost',
       'instante_tuition', 'outstate_tuition', 'tuition_revenue_per',
       'instructional_expenditure_per', 'avg_faculty_salary',
       'ft_faculty_rate', 'avg_net_price', 'number_titleIV', 'sat_avg'],
      dtype='object')

In [68]:
df['rankingSortRank'] = df['rankingSortRank'].replace(-1, 300)

In [69]:
bins = [0, 75, 150, 225, float('inf')]
labels = ['high', 'med_high', 'med_low', 'low']
df['categories'] = pd.cut(df['rankingSortRank'], bins=bins, labels=labels)

In [70]:
X = df[['act_avg', 'percent_receiving_aid', 'cost_after_aid',
        'hs_gpa_avg', 'businessRepScore', 'tuition',
       'engineeringRepScore','branches', 'region', 'admission_rate',
       'ug_enrollment', 'percent_white', 'percent_black', 'percent_hispanic',
       'percent_asian', 'percent_aian', 'percent_nhpi', 'percent_twoormore',
       'percent_nra', 'percent_unknown', 'percent_parttime', 'avg_cost',
       'instante_tuition', 'outstate_tuition', 'tuition_revenue_per',
       'instructional_expenditure_per', 'avg_faculty_salary',
       'ft_faculty_rate', 'avg_net_price', 'number_titleIV', 'sat_avg']]

In [71]:
model = sm.ols(formula = 'rankingSortRank~' + '+'.join(X.columns),data = df).fit()
model.summary()

0,1,2,3
Dep. Variable:,rankingSortRank,R-squared:,0.894
Model:,OLS,Adj. R-squared:,0.881
Method:,Least Squares,F-statistic:,72.63
Date:,"Sun, 05 Mar 2023",Prob (F-statistic):,3.63e-112
Time:,20:30:23,Log-Likelihood:,-1464.0
No. Observations:,300,AIC:,2992.0
Df Residuals:,268,BIC:,3110.0
Df Model:,31,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-9885.2509,2.52e+04,-0.393,0.695,-5.95e+04,3.97e+04
act_avg,-3.7605,1.803,-2.086,0.038,-7.309,-0.212
percent_receiving_aid,0.2265,0.249,0.911,0.363,-0.263,0.716
cost_after_aid,0.0008,0.001,1.580,0.115,-0.000,0.002
hs_gpa_avg,-38.3910,13.057,-2.940,0.004,-64.099,-12.683
businessRepScore,-21.4875,5.852,-3.672,0.000,-33.010,-9.965
tuition,0.0009,0.001,0.857,0.392,-0.001,0.003
engineeringRepScore,4.3522,5.223,0.833,0.405,-5.932,14.636
branches,-1.7699,1.304,-1.358,0.176,-4.336,0.797

0,1,2,3
Omnibus:,8.964,Durbin-Watson:,1.495
Prob(Omnibus):,0.011,Jarque-Bera (JB):,8.93
Skew:,0.384,Prob(JB):,0.0115
Kurtosis:,3.354,Cond. No.,3400000000.0


In [72]:
prediction = model.predict(df)

In [73]:
bins = [-float('inf'), 75, 150, 225, float('inf')]
labels = ['high', 'med_high', 'med_low', 'low']
categories = pd.cut(prediction, bins=bins, labels=labels)
df['pred_category'] = categories

In [80]:
num_matches = df['categories'].eq(df['pred_category']).value_counts(normalize=True)[True] * len(df)
num_matches/len(df)

0.82

### Splitting into Train and Test

In [81]:
test = df.sample(n=50, random_state=1)
train = df.drop(test.index)

In [82]:
model1 = sm.ols(formula = 'rankingSortRank~' + '+'.join(X.columns),data = train).fit()
model1.summary()

0,1,2,3
Dep. Variable:,rankingSortRank,R-squared:,0.896
Model:,OLS,Adj. R-squared:,0.881
Method:,Least Squares,F-statistic:,60.41
Date:,"Sun, 05 Mar 2023",Prob (F-statistic):,5.4699999999999995e-90
Time:,21:02:36,Log-Likelihood:,-1219.5
No. Observations:,250,AIC:,2503.0
Df Residuals:,218,BIC:,2616.0
Df Model:,31,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2.129e+04,2.9e+04,0.735,0.463,-3.58e+04,7.84e+04
act_avg,-2.9711,2.026,-1.466,0.144,-6.964,1.022
percent_receiving_aid,0.1447,0.275,0.526,0.600,-0.398,0.687
cost_after_aid,0.0006,0.001,1.018,0.310,-0.001,0.002
hs_gpa_avg,-33.4451,14.271,-2.344,0.020,-61.571,-5.319
businessRepScore,-18.0375,6.467,-2.789,0.006,-30.783,-5.292
tuition,0.0008,0.001,0.732,0.465,-0.001,0.003
engineeringRepScore,5.9757,5.825,1.026,0.306,-5.504,17.455
branches,-4.9690,2.011,-2.471,0.014,-8.933,-1.005

0,1,2,3
Omnibus:,7.094,Durbin-Watson:,1.49
Prob(Omnibus):,0.029,Jarque-Bera (JB):,6.851
Skew:,0.384,Prob(JB):,0.0325
Kurtosis:,3.263,Cond. No.,3510000000.0


In [83]:
y_pred = model.predict(test)

In [84]:
bins = [-float('inf'), 75, 150, 225, float('inf')]
labels = ['high', 'med_high', 'med_low', 'low']
categories = pd.cut(y_pred, bins=bins, labels=labels)
test['pred_category'] = categories

In [85]:
matches = test['categories'].eq(test['pred_category']).value_counts(normalize=True)[True] * len(df)
matches/len(df)

0.78