In [1]:
#imports
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures
import numpy as np
from sklearn.metrics import r2_score
import plotly.graph_objects as go

In [2]:
# read the college dataset from the CSV file into a dataframe
df_college = pd.read_csv(filepath_or_buffer='college_data.csv')

# preview the first three rows as the cell output
df_college.head(n=3)

Unnamed: 0,College,SAT,Acceptance Rate,Earnings 6 yrs after entry,Share First Generation,Retention Rate,Student Faculty Ratio,Mean Net Price,Diversity,Students with Loans,Faculty Salary
0,Abilene Christian University,1186.0,0.7011,45389.0,0.244523,0.7923,13.0,29300.0,0.3659,0.913365,8455.0
1,Abraham Baldwin Agricultural College,1026.0,0.7536,29825.0,0.344345,0.6515,19.0,7203.0,0.1983,0.712761,6436.0
2,Academy of Art University,,,31825.0,0.40837,0.6899,14.0,32736.0,0.7945,0.917342,9482.0


In [3]:
# select the Northeastern University row(s) with a boolean mask.
# A mask keeps the row even when one of its fields is missing and leaves the
# column dtypes untouched; .where(...).dropna() would NaN-fill every other row
# and then drop any row containing a NaN, including a matching one.
df_neu = df_college.loc[df_college['College'] == 'Northeastern University']
df_neu
# TODO - verify: acceptance rate for northeastern is .7981?

Unnamed: 0,College,SAT,Acceptance Rate,Earnings 6 yrs after entry,Share First Generation,Retention Rate,Student Faculty Ratio,Mean Net Price,Diversity,Students with Loans,Faculty Salary
879,Northeastern University,1395.0,0.7981,50014.0,0.331536,0.6179,8.0,22037.0,0.6667,0.93578,10284.0


In [4]:
def isolate_columns(df, cols):
    """ creates subset of given data, removes NaNs

    Rows with a missing value in any of the kept columns are dropped and the
    remaining rows are renumbered starting at 0.

    Arguments:
        df (pandas dataframe): original dataset
        cols (list of string): names of columns to keep in subset
    
    Returns:
        pandas dataframe with only the requested columns
    """
    # drop=True discards the old index instead of inserting it as a column, so
    # this also works when the index is named or when one of the kept columns
    # is itself called 'index'
    return df[cols].dropna().reset_index(drop=True)

In [5]:
def col_to_array(df, col, reshape=False):
    """ pulls one column out of a dataframe as a numpy array

    Arguments:
        df (pandas dataframe): dataframe holding the column
        col (str): name of the column
        reshape (boolean, default=False): when true, the values are returned
            as a single-column 2-D array instead of a flat array

    Returns:
        numpy array of the column's values
    """
    values = df[col].to_numpy()

    # flat array requested: nothing more to do
    if not reshape:
        return values

    # one row per entry, a single column
    n_rows = len(values)
    return values.reshape((n_rows, 1))

In [6]:
def coef_to_poly_str(coef, include_zero=False):
    """ creates a str of polynomial corresponding to coef   
    modified function from Day 16 Notes 

    Args:
        coef (np.array): coefficients of polynomial, lowest degree first
            (coef[i] is the multiplier of x^i)
        include_zero (bool, default=False): when true, terms whose
            coefficient is exactly zero are kept in the output
        
    Returns:
        poly_str (str): str of polynomial, prefixed with 'y = '; every term is
            printed with an explicit sign and one decimal place
    """
    
    # one 'signed-coefficient x^degree' entry per retained term; the loop
    # variable has its own name so it does not shadow the coef argument
    str_monomial_list = [
        f'{value:+.1f} x^{deg:d}'
        for deg, value in enumerate(coef)
        if include_zero or value != 0
    ]
        
    # handle case of all zero coef (or no coefficients at all)
    poly_str = ' '.join(str_monomial_list) if str_monomial_list else '0'
        
    return f'y = {poly_str}'

In [7]:
def fit_plot_poly(x, y, degree, cols=False, df=None):
    """ fits a polynomial of given degree to two columns of a dataframe
    modified function from Day 16 Notes

    Nothing is drawn here; the returned dictionary holds what a caller needs
    to draw and label the fitted curve.
    
    Arguments:
        x (str): name of the column used as the input variable
        y (str): name of the column used as the target variable
        degree (int): max degree of polynomial
        cols (boolean): unused; kept so existing calls keep working
        df (pandas dataframe, default=None): data to fit; when None the
            notebook-level df_college is used
    
    Returns:
        dictionary of resulting line information:
            'x_fine': (101, 1) array of evenly spaced x values spanning the data
            'y_pred_fine': fitted values at x_fine
            'str_poly_pred': fitted polynomial as a string
            'degree': the degree that was requested
            'r2': R^2 of the fit on the rows used for fitting
    """

    # fall back to the notebook-level dataframe so existing calls are unaffected
    if df is None:
        df = df_college

    # keep only the rows where both columns are present
    d = isolate_columns(df, [x, y])
    x_arr = col_to_array(d, x, True)
    y_arr = col_to_array(d, y)
    
    # project x to polynomial
    poly_project = PolynomialFeatures(degree=degree)
    X_poly = poly_project.fit_transform(x_arr)

    # fit polynomial regression by least squares.
    # Explicitly inverting X^T X squares the condition number, which breaks
    # down for raw powers of large-valued columns; instead scale each feature
    # column to unit length, solve with lstsq, and undo the scaling.
    col_scale = np.linalg.norm(X_poly, axis=0)
    col_scale[col_scale == 0] = 1.0  # leave an all-zero column as it is
    b_scaled = np.linalg.lstsq(X_poly / col_scale, y_arr, rcond=None)[0]
    # the transposes let the division line up with the coefficient axis for
    # both 1-D and 2-D targets
    b = (b_scaled.T / col_scale).T
    
    # predict y values (line of polynomial)
    x_fine = np.linspace(x_arr.min(), x_arr.max(), 101).reshape(-1, 1)
    # reuse the already-fitted projection rather than refitting it on the grid
    X_fine_poly = poly_project.transform(x_fine)
    y_pred_fine = np.matmul(X_fine_poly, b)
    
    # compute r2
    y_pred = np.matmul(X_poly, b)
    r2 = r2_score(y_true=y_arr, y_pred=y_pred)
    
    # get the predicted line
    str_poly_pred = coef_to_poly_str(b)
    
    # return dictionary of line information
    return {'x_fine':x_fine, 'y_pred_fine':y_pred_fine, 'str_poly_pred':str_poly_pred, 'degree':degree, 'r2':r2}

In [8]:
def scatter_plot_line(df, x, y, degree=1):
    """ shows a scatter plot of two columns with a fitted polynomial curve on top

    Arguments:
        df (pandas dataframe): dataframe containing data; must also have a
            'College' column, which is used as the hover text
        x (str): name of column to be x axis data
        y (str): name of column to be y axis data
        degree (int, default=1): degree of polynomial of regression line

    Returns:
        none (the figure is displayed)
    """

    # NOTE(review): `df` is not handed to fit_plot_poly here, so the curve is
    # only guaranteed to match the scatter when `df` is the same frame that
    # fit_plot_poly fits on — confirm.
    fit = fit_plot_poly(x, y, degree, cols=False)

    # raw observations, faded so overlapping points stay readable
    points = go.Scatter(
        x=df[x],
        y=df[y],
        mode='markers',
        marker=dict(opacity=0.4),
        hovertext=df['College'],
        hoverinfo='text',
        showlegend=False,
    )

    # fitted curve
    curve = go.Scatter(
        mode='lines',
        x=fit['x_fine'].flatten(),
        y=fit['y_pred_fine'],
        name='y',
        line=dict(color='#170e80'),
    )

    # three text lines stacked above the plot area: degree, R^2, equation
    header_lines = [
        ('degree: ' + str(degree), 1.18),
        ('R^2 = ' + str(round(fit['r2'], 4)), 1.13),
        (fit['str_poly_pred'], 1.08),
    ]
    annotations = [
        dict(text=text, xref='paper', y=y_pos, yref='paper',
             showarrow=False, font=dict(size=12))
        for text, y_pos in header_lines
    ]

    fig = go.Figure(data=[points, curve])

    # centered title, axis labels and the header annotations
    fig.update_layout(
        title=dict(text='College ' + y + ' vs. ' + x, x=0.5),
        xaxis_title=x,
        yaxis_title=y,
        annotations=annotations,
    )

    fig.show()

In [9]:
# columns to compare with retention rate: every column except the ones at
# positions 0 and 5
# NOTE(review): presumably positions 0 and 5 are 'College' and 'Retention Rate'
# — confirm against df_college.columns
cols = [name for position, name in enumerate(df_college.columns) if position not in (0, 5)]

# one degree-3 fit and figure per remaining column
for c in cols:
    scatter_plot_line(df_college, c, 'Retention Rate', degree=3)

# TODO - each plot may have its own best fitting degree