In [1]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as sm
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv("newdata.csv")

In [3]:
df.head()

Unnamed: 0,act_avg,city,zip,percent_receiving_aid,cost_after_aid,state,rankingSortRank,hs_gpa_avg,businessRepScore,tuition,...,avg_cost,instante_tuition,outstate_tuition,tuition_revenue_per,instructional_expenditure_per,avg_faculty_salary,ft_faculty_rate,avg_net_price,number_titleIV,sat_avg
0,32.0,Princeton,8544,60.0,16793.0,NJ,1,3.9,,47140,...,64390.0,47140.0,47140.0,13216.0,53658.0,17519.0,0.849,10027.0,346.0,1500.0
1,32.0,Cambridge,2138,55.0,16338.0,MA,2,4.0,,48949,...,66900.0,48949.0,48949.0,31930.0,43823.0,19741.0,0.8589,14327.0,350.0,1523.0
2,32.0,Chicago,60637,42.0,27767.0,IL,3,4.0,,54825,...,72717.0,56034.0,56034.0,28508.0,89151.0,18115.0,0.8239,25455.0,494.0,1524.0
3,32.0,New Haven,6520,50.0,18385.0,CT,3,,,51400,...,68950.0,51400.0,51400.0,17934.0,120956.0,17517.0,0.7459,18627.0,387.0,1520.0
4,32.0,New York,10027,48.0,21041.0,NY,5,,,57208,...,71972.0,57208.0,57208.0,37294.0,89742.0,18713.0,0.4688,24231.0,526.0,1522.0


In [4]:
df.columns

Index(['act_avg', 'city', 'zip', 'percent_receiving_aid', 'cost_after_aid',
       'state', 'rankingSortRank', 'hs_gpa_avg', 'businessRepScore', 'tuition',
       'engineeringRepScore', 'displayName', 'institutionalControl',
       'institution_name', 'branches', 'region', 'admission_rate',
       'ug_enrollment', 'percent_white', 'percent_black', 'percent_hispanic',
       'percent_asian', 'percent_aian', 'percent_nhpi', 'percent_twoormore',
       'percent_nra', 'percent_unknown', 'percent_parttime', 'avg_cost',
       'instante_tuition', 'outstate_tuition', 'tuition_revenue_per',
       'instructional_expenditure_per', 'avg_faculty_salary',
       'ft_faculty_rate', 'avg_net_price', 'number_titleIV', 'sat_avg'],
      dtype='object')

In [6]:
pred_high = pd.read_csv("predicted_h_newdata.csv")
pred_high_230 = pd.read_csv("predicted_h_230rows_newdata.csv")

In [7]:
pred_high = pred_high.fillna(pred_high.mean())
pred_high_230 = pred_high_230.fillna(pred_high_230.mean())

In [8]:
# selecting columns in int and float type for loocv
preds = pred_high[['act_avg', 'sat_avg','percent_receiving_aid',
       'cost_after_aid', 'hs_gpa_avg','businessRepScore', 'tuition',
       'engineeringRepScore','branches', 'admission_rate', 
       'ug_enrollment', 'percent_white', 'percent_black', 'percent_hispanic',
       'percent_asian', 'percent_aian', 'percent_nhpi', 'percent_twoormore',
       'percent_nra', 'percent_unknown', 'percent_parttime', 'avg_cost',
       'instante_tuition', 'outstate_tuition', 'tuition_revenue_per',
       'instructional_expenditure_per', 'avg_faculty_salary',
       'ft_faculty_rate', 'avg_net_price', 'number_titleIV', 'sat_avg']]
target = pred_high[['rankingSortRank']]

In [9]:
# use loocv to predict and get accuracy
loocv = LeaveOneOut()
loocv.get_n_splits(preds)

model = LogisticRegression()

true = []
predicted = []

for train_index, test_index in loocv.split(preds):

    X_train = preds.loc[train_index]
    X_test = preds.loc[test_index]
    y_train = target.loc[train_index]
    y_test = target.loc[test_index]
    
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    true.append(y_test['rankingSortRank'].values[0])
    predicted.append(y_pred[0])
    
print(true)
print("-----")
print(predicted)
accuracy = accuracy_score(true, predicted)

print("Accuracy:", accuracy) # accuracy score for specific rank predicting, will do classification accuracy later

[1, 2, 3, 3, 5, 5, 5, 8, 9, 10, 11, 11, 11, 14, 14, 14, 18, 18, 20, 21, 21, 21, 25, 25, 27, 28, 29, 30, 32, 34, 34, 37, 37, 40, 42, 42, 46, 46, 46, 56, 56, 56, 61, 81, 87, 87, 94, 97, 103, 103, 110, 124, 124, 133, 140, 151, 156, 159, 165, 171, 176, 176, 181, 187, 187, 192, 192, 198, 198, 202, 202, 202, 202, 207, 207, 207, 207, 216, 216, 216, 216, 216, 223, 223, 223, 223, 223, 223, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]
-----
[3, 14, 14, 10, 5, 5, 5, 5, 11, 3, 14, 18, 14, 25, 11, 3, 46, 3, 25, 11, 18, 21, 20, 140, 11, 25, 32, 28, 46, 42, 18, 46, 11, 46, 94, 46, 27, 42, 56, 87, 46, 207, 37, 42, 42, -1, 40, 25, 87, 124, 87, 156, 103, -1, 46, 176, -1, 192, 103, -1, -1, -1, 87, -1, -1, -1, 198, -1, 216, -1, -1, 181, 124, -1, -1, 159, -1, 176, -1, -1, -1, -1, -1, 103, -1, -1, -1, -1, 22

In [23]:
loocv = LeaveOneOut()
loocv.get_n_splits(preds)

model = LogisticRegression()

true = []
predicted = []

high_df = []
med_high_df = []
high_predicted = []
med_high_predicted = []

for train_index, test_index in loocv.split(preds):

    X_train = preds.loc[train_index]
    X_test = preds.loc[test_index]
    y_train = target.loc[train_index]
    y_test = target.loc[test_index]
    
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    true.append(y_test['rankingSortRank'].values[0])
    predicted.append(y_pred[0])
    
    #classifying into high and medium high
    
    if y_test['rankingSortRank'].values[0] < 30:
        high_df.append(test_index[0])
    else:
        med_high_df.append(test_index[0])
    if y_pred[0] < 50:
        high_predicted.append(test_index[0])
    else:
        med_high_predicted.append(test_index[0])

print(high_df)
print("-----")
print(med_high_df)
print("prediction below-----------------")
print(high_predicted)
print("-----")
print(med_high_predicted)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149]
-----
[27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87]
prediction below-----------------
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 35, 36, 37, 40, 42, 43, 44, 45, 46, 47, 53, 54, 56, 59, 60, 61, 63, 64, 65, 67, 69, 70, 73, 74, 76, 78, 79, 80, 81, 82, 84, 85, 86, 87, 89, 90, 91, 93, 94, 95, 96, 98, 99, 102, 103, 104, 105, 106

In [20]:
correct_h = 0
for i in high_df:
    for j in high_predicted:
        if i ==j:
            correct_h = correct_h + 1 
correct_h          

69

In [21]:
correct_m = 0
for i in med_high_df:
    for j in med_high_predicted:
        if i ==j:
            correct_m = correct_m + 1 
correct_m

21

In [22]:
# accuracy rate for classifying
(correct_h +correct_m)/pred_high.shape[0]

0.6

In [24]:
# selecting columns in int and float type for loocv but using the dataset that drops -1 rankings
preds = pred_high_230[['act_avg', 'sat_avg','percent_receiving_aid',
       'cost_after_aid', 'hs_gpa_avg','businessRepScore', 'tuition',
       'engineeringRepScore','branches', 'admission_rate', 
       'ug_enrollment', 'percent_white', 'percent_black', 'percent_hispanic',
       'percent_asian', 'percent_aian', 'percent_nhpi', 'percent_twoormore',
       'percent_nra', 'percent_unknown', 'percent_parttime', 'avg_cost',
       'instante_tuition', 'outstate_tuition', 'tuition_revenue_per',
       'instructional_expenditure_per', 'avg_faculty_salary',
       'ft_faculty_rate', 'avg_net_price', 'number_titleIV', 'sat_avg']]
target = pred_high_230[['rankingSortRank']]

In [25]:
# use loocv to predict and get accuracy
loocv = LeaveOneOut()
loocv.get_n_splits(preds)

model = LogisticRegression()

true = []
predicted = []

for train_index, test_index in loocv.split(preds):

    X_train = preds.loc[train_index]
    X_test = preds.loc[test_index]
    y_train = target.loc[train_index]
    y_test = target.loc[test_index]
    
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    true.append(y_test['rankingSortRank'].values[0])
    predicted.append(y_pred[0])
    
print(true)
print("-----")
print(predicted)
accuracy = accuracy_score(true, predicted)

print("Accuracy:", accuracy) # accuracy score for specific rank predicting, will do classification accuracy later

[1, 2, 3, 3, 5, 5, 5, 8, 9, 10, 11, 11, 11, 14, 14, 14, 18, 18, 20, 21, 21, 21, 25, 25, 27, 28, 29, 30, 32, 34, 34, 37, 37, 40, 42, 42, 46, 46, 46, 56, 56, 56, 61, 81, 81, 87, 94, 97, 103, 110, 124, 140, 145, 159, 165]
-----
[3, 14, 14, 10, 8, 5, 3, 5, 21, 3, 14, 18, 18, 25, 11, 1, 46, 3, 11, 11, 30, 21, 11, 140, 11, 25, 32, 25, 46, 94, 11, 46, 11, 94, 94, 124, 29, 42, 56, 42, 61, 145, 37, 103, 56, 42, 40, 25, 42, 81, 165, 46, 18, 81, 124]
Accuracy: 0.03636363636363636


In [26]:
loocv = LeaveOneOut()
loocv.get_n_splits(preds)

model = LogisticRegression()

true = []
predicted = []

high_df = []
med_high_df = []
high_predicted = []
med_high_predicted = []

for train_index, test_index in loocv.split(preds):

    X_train = preds.loc[train_index]
    X_test = preds.loc[test_index]
    y_train = target.loc[train_index]
    y_test = target.loc[test_index]
    
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    true.append(y_test['rankingSortRank'].values[0])
    predicted.append(y_pred[0])
    
    #classifying into high and medium high
    
    if y_test['rankingSortRank'].values[0] < 30:
        high_df.append(test_index[0])
    else:
        med_high_df.append(test_index[0])
    if y_pred[0] < 50:
        high_predicted.append(test_index[0])
    else:
        med_high_predicted.append(test_index[0])

print(high_df)
print("-----")
print(med_high_df)
print("prediction below-----------------")
print(high_predicted)
print("-----")
print(med_high_predicted)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26]
-----
[27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54]
prediction below-----------------
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 24, 25, 26, 27, 28, 30, 31, 32, 36, 37, 39, 42, 45, 46, 47, 48, 51, 52]
-----
[23, 29, 33, 34, 35, 38, 40, 41, 43, 44, 49, 50, 53, 54]


In [27]:
correct_h = 0
for i in high_df:
    for j in high_predicted:
        if i ==j:
            correct_h = correct_h + 1 
correct_h          

26

In [28]:
correct_m = 0
for i in med_high_df:
    for j in med_high_predicted:
        if i ==j:
            correct_m = correct_m + 1 
correct_m

13

In [30]:
# accuracy rate for classifying
(correct_h +correct_m)/pred_high_230.shape[0]

0.7090909090909091

In [33]:
predicted_high = pred_high.iloc[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 24, 25, 26, 27, 28, 30, 31, 32, 36, 37, 39, 42, 45, 46, 47, 48, 51, 52]]
predicted_high.to_csv("predicted_high_submodel_230.csv")
predicted_med_high = pred_high.iloc[[23, 29, 33, 34, 35, 38, 40, 41, 43, 44, 49, 50, 53, 54]]
predicted_med_high.to_csv("predicted_medium_high_submodel_230.csv")