# Making our own college rankings

## Libraries
We need a few libraries installed. Let us check to make sure they are installed first:

In [3]:
!pip install openpyxl pandas numpy 

Defaulting to user installation because normal site-packages is not writeable


Now we must import those libraries

In [4]:
import numpy as np

import scipy.stats as stats

import pandas as pd

# Display settings: show at most 50 rows, and never hide columns.
pd.options.display.max_rows = 50
pd.options.display.max_columns = None

# Turn off pandas' chained-assignment warning for the whole notebook.
pd.set_option('mode.chained_assignment', None)

## Read in an excel file that is uploaded to the current directory
If you have a csv file, you do `data = pd.read_csv("college_rankings.csv")`

In [5]:
# Load the spreadsheet from the current working directory into a DataFrame.
data = pd.read_excel(io="college_rankings_data.xlsx")

### Exploring the data

In [6]:
# Display the DataFrame; long frames are truncated according to display.max_rows.
data

Unnamed: 0,name,urban_rural,aapi_serving,acceptance_rate,average_tuition_1yr,sat_avg,teaching_spent_per_student,carnegie_classification,completion_rate_4yr,cost_roomboard_offcampus,cost_roomboard_oncampus,percent_degrees_business,percent_degrees_comm,percent_degrees_compsci,percent_degrees_eng,percent_degrees_health,percent_degrees_socsci,endowment,avg_faculty_salary_mo,completion_rate_6yrs,hispanic_serving,historically_black,latitude,longitude,median_debt_at_grad,median_debt_at_withdraw,sat_avg_math,sat_avg_reading,men_only,net_price,net_price_parents_under_30k,num_undergrads,online_only,percent_fulltime_faculty,ugs_percent_anyloans,ugs_percent_asian,ugs_percent_black,ugs_percent_hipi,ugs_percent_hispanic,ugs_percent_intl,ugs_percent_multirace,ugs_percent_over25,ugs_percent_pellgrant,ugs_percent_white,ugs_percent_women,geo_region,state,zip_code
0,Alabama A & M University,1,0,0.9175,2353.0,939.0,5546.0,18,0.0797,9240.0,9240.0,0.1589,0.0000,0.0554,0.1196,0.0000,0.0250,,7709.0,0.2974,0,1,34.783368,-86.568502,31000.0,10221.0,455.0,470.0,0,14990.0,1431.0,5271.0,0,0.7110,0.7361,0.0019,0.9072,0.0011,0.0091,0.0070,0.0114,0.0383,0.7019,0.0140,0.6033,5,AL,35762
1,University of Alabama at Birmingham,1,0,0.7366,24495.0,1234.0,14983.0,15,0.4023,12307.0,12307.0,0.2008,0.0377,0.0160,0.0635,0.2214,0.0335,537349307.0,11049.0,0.6340,0,0,33.505697,-86.799345,22250.0,9500.0,608.0,620.0,0,16953.0,14554.0,13328.0,0,0.7754,0.4798,0.0678,0.2414,0.0003,0.0565,0.0230,0.0446,0.1996,0.3512,0.5586,0.6110,5,AL,35294
2,University of Alabama in Huntsville,1,0,0.8257,23917.0,1319.0,8488.0,16,0.2696,10400.0,10400.0,0.1809,0.0113,0.0743,0.2875,0.1562,0.0225,77250279.0,9688.0,0.5768,0,0,34.724557,-86.640449,21450.0,9500.0,695.0,669.0,0,15860.0,131.0,7785.0,0,0.6434,0.3976,0.0347,0.0879,0.0014,0.0566,0.0218,0.0361,0.1543,0.2536,0.7161,0.4190,5,AL,35899
3,Alabama State University,1,0,0.9690,21866.0,946.0,9346.0,19,0.1648,7320.0,6050.0,0.1098,0.0941,0.0686,0.0020,0.1431,0.0157,94536751.0,7221.0,0.3276,0,1,32.364317,-86.295677,31000.0,10489.0,470.0,480.0,0,13650.0,13531.0,3750.0,0,0.6501,0.8232,0.0037,0.9275,0.0008,0.0099,0.0147,0.0120,0.0691,0.7627,0.0163,0.6309,5,AL,36104
4,The University of Alabama,1,0,0.8268,29872.0,1261.0,9983.0,15,0.5020,13636.0,13636.0,0.2824,0.1060,0.0129,0.1301,0.0854,0.0394,832842409.0,10291.0,0.7110,0,0,33.211875,-87.545978,23072.0,9500.0,605.0,605.0,0,22597.0,19322.0,31900.0,0,0.7604,0.3802,0.0119,0.1008,0.0012,0.0505,0.0235,0.0352,0.0828,0.1772,0.7683,0.5549,5,AL,35487
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1994,Mechon L'hoyroa,2,0,,,,44299.0,24,,,,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,,2367.0,,0,0,41.117029,-74.058177,,,,,0,,,55.0,0,,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0909,0.8000,1.0000,0.0000,2,NY,10952
1995,Bais Medrash Mayan Hatorah,1,0,0.6667,1665.0,,4706.0,24,,14560.0,2200.0,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,,1118.0,,0,0,40.107414,-74.204332,,,,,0,13498.0,,46.0,0,,0.0000,0.0000,0.0000,0.0000,0.0000,0.0435,0.0000,0.0435,0.1515,0.9565,0.0000,2,NJ,8701
1996,Purdue University Northwest,2,0,0.2857,17323.0,1074.0,9425.0,18,0.1844,7800.0,7765.0,0.1563,0.0384,0.0384,0.0777,0.3930,0.0237,30008886.0,8324.0,0.3778,0,0,41.584324,-87.474236,21207.0,9500.0,535.0,535.0,0,11250.0,6911.0,7546.0,0,0.9971,0.3518,0.0298,0.0999,0.0007,0.2120,0.0219,0.0284,0.2387,0.3291,0.5914,0.5510,3,IN,46323
1997,Yeshiva Kollel Tifereth Elizer,1,0,0.7143,19643.0,,4119.0,24,,14560.0,4600.0,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,,2615.0,,0,0,40.637096,-73.992171,,,,,0,8283.0,8283.0,128.0,0,0.9167,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0078,0.8925,1.0000,0.0000,2,NY,11219


In [7]:
# List every column label available in the dataset.
data.columns

Index(['name', 'urban_rural', 'aapi_serving', 'acceptance_rate',
       'average_tuition_1yr', 'sat_avg', 'teaching_spent_per_student',
       'carnegie_classification', 'completion_rate_4yr',
       'cost_roomboard_offcampus', 'cost_roomboard_oncampus',
       'percent_degrees_business', 'percent_degrees_comm',
       'percent_degrees_compsci', 'percent_degrees_eng',
       'percent_degrees_health', 'percent_degrees_socsci', 'endowment',
       'avg_faculty_salary_mo', 'completion_rate_6yrs', 'hispanic_serving',
       'historically_black', 'latitude', 'longitude', 'median_debt_at_grad',
       'median_debt_at_withdraw', 'sat_avg_math', 'sat_avg_reading',
       'men_only', 'net_price', 'net_price_parents_under_30k',
       'num_undergrads', 'online_only', 'percent_fulltime_faculty',
       'ugs_percent_anyloans', 'ugs_percent_asian', 'ugs_percent_black',
       'ugs_percent_hipi', 'ugs_percent_hispanic', 'ugs_percent_intl',
       'ugs_percent_multirace', 'ugs_percent_over25', 'ugs_p

In [8]:
# Show 3 randomly chosen rows; no random_state is given, so the rows differ on each run.
data.sample(3)

Unnamed: 0,name,urban_rural,aapi_serving,acceptance_rate,average_tuition_1yr,sat_avg,teaching_spent_per_student,carnegie_classification,completion_rate_4yr,cost_roomboard_offcampus,cost_roomboard_oncampus,percent_degrees_business,percent_degrees_comm,percent_degrees_compsci,percent_degrees_eng,percent_degrees_health,percent_degrees_socsci,endowment,avg_faculty_salary_mo,completion_rate_6yrs,hispanic_serving,historically_black,latitude,longitude,median_debt_at_grad,median_debt_at_withdraw,sat_avg_math,sat_avg_reading,men_only,net_price,net_price_parents_under_30k,num_undergrads,online_only,percent_fulltime_faculty,ugs_percent_anyloans,ugs_percent_asian,ugs_percent_black,ugs_percent_hipi,ugs_percent_hispanic,ugs_percent_intl,ugs_percent_multirace,ugs_percent_over25,ugs_percent_pellgrant,ugs_percent_white,ugs_percent_women,geo_region,state,zip_code
652,Regis College,2,0,0.7926,5936.0,,9295.0,26,0.6194,16000.0,15790.0,0.0261,0.0114,0.0,0.0,0.8203,0.0131,41835527.0,8203.0,0.6716,0,0,42.351291,-71.310561,25250.0,10500.0,,,0,30408.0,23281.0,1328.0,0,0.3108,0.6797,0.0324,0.1054,0.0,0.1363,0.0075,0.0143,0.1886,0.2727,0.4292,0.8208,1,MA,2493
370,Monmouth College,3,0,0.6667,49178.0,1099.0,15889.0,21,0.5436,5500.0,9330.0,0.2041,0.0612,0.0051,0.0,0.0,0.1327,122619245.0,6812.0,0.6179,0,0,40.914824,-90.63731,27000.0,6500.0,530.0,550.0,0,17061.0,13231.0,900.0,0,0.7838,0.7177,0.0089,0.0922,0.0,0.1011,0.0344,0.0289,0.011,0.3665,0.6733,0.5,3,IL,61462
957,Manhattan School of Music,1,0,0.3554,68686.0,,17878.0,30,0.8519,16195.0,16195.0,0.0,0.0,0.0,0.0,0.0,0.0,27018007.0,8207.0,0.8765,0,0,40.812312,-73.961334,26672.0,12000.0,,,0,54902.0,33759.0,518.0,0,0.2946,0.3432,0.0483,0.0656,0.0019,0.0328,0.3185,0.056,0.0347,0.1292,0.4305,0.4942,2,NY,10027


In any view of a dataframe, add `.T` to transpose it: this flips the rows and columns, which makes a wide table easier to read.

data.sample(3).T

## Columns to input
For lab, change these so they include all the columns you are interested in. We also want to keep the name of the school. Note the spacing. You can put it all on one line, or split it into multiple lines. If you split it into multiple lines, be careful about the spacing/indentation.

In [None]:
# Columns that feed the ranking. 'name' is kept so every row stays identifiable.
input_columns = [
    'name',
    'sat_avg',
    'average_tuition_1yr',
    'cost_roomboard_offcampus',
    'completion_rate_6yrs',
]

## Cleaning up the data, removing rows where there is blank data for certain columns

If we do a search query for a particular school, sometimes we see `NaN`, which is how python and pandas represent blank or no data. Datasets are often incomplete! San Francisco Art Institute did not report the average SAT score. 

In [None]:
# Look up a single school by its exact name to inspect its row.
data.query("name == 'San Francisco Art Institute'")

In [None]:
# Same lookup, transposed with .T so the columns are listed vertically.
data.query("name == 'San Francisco Art Institute'").T

## Dropping schools that have NaN values for the columns we care about (the `input_columns`)

We want to keep our existing `data` variable, so we will save this as a new variable called `data_cleaned`. We will then select only the input_columns and save that again as `data_cleaned`:

In [None]:
# Build the working table in one pass:
#   1. drop every school that is missing a value in any of the input columns,
#   2. keep only the input columns,
#   3. take an explicit copy, so that the columns added to data_cleaned in later
#      cells are written to an independent DataFrame and never to (a view of) `data`.
data_cleaned = (
    data
    .dropna(subset=input_columns)
    .loc[:, input_columns]
    .copy()
)
data_cleaned

## Getting to the rankings
### Normalizing data
Normalizing is a process that puts all columns on the same scale, so that the difference between the largest and the smallest value is comparable from one column to the next. Some columns in the dataset are percentages, on a scale between 0 and 1, like `completion_rate_6yrs`. Others like `average_tuition_1yr` can go from 0 to over 70,000. If we had a model that was 50% each of these two variables, and we just multiplied the data by 0.5, then `average_tuition_1yr` would have a much bigger impact.

So the most straightforward way to deal with this is to convert each column to be a fraction of the largest value in the dataset. If we are normalizing tuition, then the school with the highest tuition would be 1.0, and every other school would fall somewhere between 0.0 and 1.0 (a school would only be exactly 0.0 if its tuition were 0). We do that by dividing each value by the largest one (or the `max`imum). You can see an example of this here:

In [None]:
# Scale SAT averages to a fraction of the largest value in the column.
# Series.max() is used instead of Python's builtin max(): it is vectorized and
# skips missing values, whereas builtin max() gives order-dependent results
# when a NaN is present.
data_cleaned['norm_sat_avg'] = data_cleaned['sat_avg'] / data_cleaned['sat_avg'].max()

# Show the normalized column next to the original for comparison.
data_cleaned[['norm_sat_avg', 'sat_avg']]

### Z-scores

There is a better way of normalizing with Z-scores, where the average (or mean) is converted to 0.0, then most scores are between -1.0 and +1.0, but extreme outliers can get to +/- 3 or 4. You don't need to know much more than that for now. Thankfully, scipy has a function in stats that calculates it for you:

In [None]:
# Preview the z-scores for one column; the result is only displayed, not stored.
stats.zscore(data_cleaned['sat_avg'])

The code below runs through each column in our input columns, then if it isn't the name column, it creates a new column that begins with `z_` and then the original column name. We could do it manually, but this saves time.

In [None]:
# Add a "z_<column>" z-score column for every input column except the school name.
for column in input_columns:
    if column == 'name':
        # The name is a label, not a number, so there is nothing to standardize.
        continue
    z_name = f"z_{column}"
    print(f"Calculating z-scores for column {column}, saving as {z_name}")
    data_cleaned[z_name] = stats.zscore(data_cleaned[column])

If we look at a histogram of `sat_avg` and `z_sat_avg`, they have the same shape or distribution. But the first goes from 700 to 1600, with the peak around 1100, while the second goes from -3 to +3, with the peak around 0.

In [None]:
# Histogram of the raw SAT averages.
data_cleaned['sat_avg'].plot.hist()

In [None]:
# Histogram of the z-scored SAT averages, for comparison with the raw histogram.
data_cleaned['z_sat_avg'].plot.hist()

### Weighted rankings

So we need to define a weight for each column. The absolute values of all the weights (ignore any negative signs when adding) should add up to 1.0.

If you have something where the higher it is, the better it is (like completion rate or SAT average), then the weight should be positive. If you have something where the higher it is, the worse it is (like tuition or cost of living), then the weight should be negative.

In [None]:
# Weighted sum of the z-scores. Positive weights reward higher values,
# negative weights penalize them; the absolute weights add up to 1.0.
data_cleaned['z_ranking_final'] = (
    data_cleaned['z_sat_avg'] * 0.25
    + data_cleaned['z_average_tuition_1yr'] * -0.25
    + data_cleaned['z_cost_roomboard_offcampus'] * -0.2
    + data_cleaned['z_completion_rate_6yrs'] * 0.3
)

In [None]:
# Columns to show: the input columns plus the final score.
# This is built as a separate list rather than appended to input_columns:
# input_columns is also used to clean `data`, which has no 'z_ranking_final'
# column, so mutating it would make the cleaning cell fail when re-run.
# The filter keeps the score from appearing twice if input_columns already holds it.
ranking_columns = [col for col in input_columns if col != 'z_ranking_final'] + ['z_ranking_final']

# Sorted ascending, so the lowest final scores are listed first.
data_cleaned[ranking_columns].sort_values('z_ranking_final')

In [None]:
# Full table ordered from the highest final score to the lowest.
sorted_list = data_cleaned.sort_values(by='z_ranking_final', ascending=False)
sorted_list

## Save and export to Excel

In [None]:
# Write the ranked table to an Excel file in the current working directory.
sorted_list.to_excel(excel_writer="final_ranking.xlsx")