<img src="https://news.illinois.edu/files/6367/543635/116641.jpg" alt="University of Illinois" width="250"/>

# Project 22: UIUC GPA

## Team Members
* Yiping Li - [yipingl4@illinois.edu](mailto:yipingl4@illinois.edu)
* Leo Yang - [junjiey3@illinois.edu](mailto:junjiey3@illinois.edu)
* Shijie Sun - [shijies5@illinois.edu](mailto:shijies5@illinois.edu)
* Richwell Perez - [richwell@illinois.edu](mailto:richwell@illinois.edu)

## Problem Summary
The purpose of this project is to implement deep learning concepts and 
techniques on a real dataset: UIUC GPA. The general questions that will require the application of deep learning is predicting the GPA/grade distribution of UIUC courses in the future. The project will provide some visualization of the data and descriptive statistics, implement linear or logistic regression, and recurrent neural networks.

## License
Dataset is obtained from Professor Ulmschneider's uiuc-gpa-dataset. Project 
curated by Jared Canty (Summer 2022 Blackwell Program). All rights are reserved.


Dataset on UIUC GPA is available at
https://github.com/wadefagen/datasets/tree/master/gpa (“uiuc-gpa-dataset.csv”)



In [125]:
import numpy as np
import pandas as pd
import time
import random
import matplotlib
#%matplotlib notebook
import matplotlib.pyplot as plt
import scipy.stats
import matplotlib.offsetbox as offsetbox
from matplotlib.ticker import StrMethodFormatter

In [126]:
#for some reason, this needs to be in a separate cell
params={
    "font.size":15,
    "lines.linewidth":5,
}
plt.rcParams.update(params)

##**10/30 Milestone:**##

###**Dataset to pandas**

In [127]:
file_url = "https://raw.githubusercontent.com/wadefagen/datasets/master/gpa/uiuc-gpa-dataset.csv"

In [128]:
gpa_data = pd.read_csv(file_url, header=0)
gpa_data

Unnamed: 0,Year,Term,YearTerm,Subject,Number,Course Title,Sched Type,A+,A,A-,...,B-,C+,C,C-,D+,D,D-,F,W,Primary Instructor
0,2022,Spring,2022-sp,AAS,100,Intro Asian American Studies,LCD,6,13,0,...,1,0,3,0,1,1,0,0,0,"Lee, Sang S"
1,2022,Spring,2022-sp,AAS,100,Intro Asian American Studies,DIS,0,11,5,...,2,1,0,1,1,0,0,0,0,"Zheng, Reanne"
2,2022,Spring,2022-sp,AAS,100,Intro Asian American Studies,DIS,0,10,7,...,1,0,0,0,0,0,0,2,0,"Zheng, Reanne"
3,2022,Spring,2022-sp,AAS,100,Intro Asian American Studies,DIS,17,8,1,...,0,0,0,0,0,0,0,0,0,"Rosado-Torres, Alexander"
4,2022,Spring,2022-sp,AAS,100,Intro Asian American Studies,OD,0,8,4,...,2,1,0,0,0,0,1,3,1,"Wang, Yu"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64043,2010,Summer,2010-su,STAT,410,Statistics and Probability II,LEC,5,10,2,...,1,0,1,3,0,0,0,2,1,"Stepanov, Alexei G"
64044,2010,Summer,2010-su,STAT,440,Statistical Data Management,LEC,4,12,8,...,0,0,0,0,0,0,0,0,0,"Unger, David"
64045,2010,Summer,2010-su,TAM,212,Introductory Dynamics,LEC,0,1,3,...,7,5,1,1,0,2,0,1,0,"Morgan, William T"
64046,2010,Summer,2010-su,TAM,251,Introductory Solid Mechanics,LCD,1,2,2,...,0,3,3,2,0,0,1,1,0,"Ott-Monsivais, Stephanie"


## Get Average GPA and Remove Grade Features

In [129]:
def get_average_gpa(gpa_data):
    gpa_scale = {
        'A+' : 4.0,
        'A' : 4.0,
        'A-' : 3.67,
        'B+' : 3.33,
        'B' : 3.0,
        'B-' : 2.67,
        'C+' : 2.33,
        'C' : 2.0,
        'C-' : 1.67,
        'D+' : 1.33,
        'D' : 1.0,
        'D-' : 0.67,
        'F' : 0.0,
    } # defined from https://registrar.illinois.edu/courses-grades/explanation-of-grades/

    # calculate average GPA from letter grades
    avg_gpa = np.zeros(len(gpa_data))
    count = np.zeros(len(gpa_data))
    for l in gpa_scale:
        avg_gpa += gpa_data[l] * gpa_scale[l]
        count += gpa_data[l]

    avg_gpa /= count
    gpa_data['avg_gpa'] = avg_gpa

    # drop letter grades features
    gpa_data = gpa_data.drop(columns=gpa_scale.keys())
    
    return gpa_data

gpa_data = pd.read_csv(file_url, header=0)
gpa_data = get_average_gpa(gpa_data)
gpa_data


Unnamed: 0,Year,Term,YearTerm,Subject,Number,Course Title,Sched Type,W,Primary Instructor,avg_gpa
0,2022,Spring,2022-sp,AAS,100,Intro Asian American Studies,LCD,0,"Lee, Sang S",3.413793
1,2022,Spring,2022-sp,AAS,100,Intro Asian American Studies,DIS,0,"Zheng, Reanne",3.440400
2,2022,Spring,2022-sp,AAS,100,Intro Asian American Studies,DIS,0,"Zheng, Reanne",3.358519
3,2022,Spring,2022-sp,AAS,100,Intro Asian American Studies,DIS,0,"Rosado-Torres, Alexander",3.928571
4,2022,Spring,2022-sp,AAS,100,Intro Asian American Studies,OD,1,"Wang, Yu",2.921429
...,...,...,...,...,...,...,...,...,...,...
64043,2010,Summer,2010-su,STAT,410,Statistics and Probability II,LEC,1,"Stepanov, Alexei G",3.183226
64044,2010,Summer,2010-su,STAT,440,Statistical Data Management,LEC,0,"Unger, David",3.774643
64045,2010,Summer,2010-su,TAM,212,Introductory Dynamics,LEC,0,"Morgan, William T",2.595714
64046,2010,Summer,2010-su,TAM,251,Introductory Solid Mechanics,LCD,0,"Ott-Monsivais, Stephanie",2.603333


## Feature Selection and Preprocessing

In [130]:
from pandas.core.resample import g
from sklearn.preprocessing import OneHotEncoder

def get_one_hot_label_by_column(gpa_data, column_name):
    column = np.unique(gpa_data[column_name])
    for unique_value in np.unique(gpa_data[column_name]):
        label = np.array(gpa_data[column_name] == unique_value, dtype='uint8')
        gpa_data[unique_value] = label

    gpa_data = gpa_data.drop(columns=[column_name])


    return gpa_data

def feature_selection(gpa_data):
    # turning "term" into one-hot labels
    gpa_data = get_one_hot_label_by_column(gpa_data, 'Term')

    # turning "Subject" into one-hot labels
    gpa_data = get_one_hot_label_by_column(gpa_data, 'Subject')

    # turning "Number" into one-hot labels (100 -> 100-level, 450 -> 400-level)
    # Max course number is 798
    number_columns = gpa_data['Number'] 
    for number in range(1, 8):
        label = np.array((number_columns >= number * 100) & (number_columns < (number * 100 + 100)), dtype='uint8')
        gpa_data['{}00-level'.format(number)] = label

    gpa_data = gpa_data.drop(columns=['Number'])

    # fill nan value in "Sched Type" with unknown
    gpa_data['Sched Type'] = gpa_data['Sched Type'].replace(np.nan, 'Unknown Sched Type')

    # turning "Sched Type" into one-hot labels
    gpa_data = get_one_hot_label_by_column(gpa_data, 'Sched Type')

    # remove text columns (for now)
    gpa_data = gpa_data.drop(columns=['Course Title', 'Primary Instructor', 'YearTerm'])


    return gpa_data


gpa_data = pd.read_csv(file_url, header=0)
gpa_data = get_average_gpa(gpa_data)
gpa_data = feature_selection(gpa_data)
gpa_data


  import sys


Unnamed: 0,Year,W,avg_gpa,Fall,Spring,Summer,Winter,AAS,ABE,ACCY,...,OLB,OLC,ONL,Onl,PKG,PR,Q,SEM,ST,Unknown Sched Type
0,2022,0,3.413793,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2022,0,3.440400,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2022,0,3.358519,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2022,0,3.928571,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2022,1,2.921429,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64043,2010,1,3.183226,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
64044,2010,0,3.774643,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
64045,2010,0,2.595714,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
64046,2010,0,2.603333,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Train Test Split

In [131]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(gpa_data.drop(columns=['avg_gpa']), gpa_data['avg_gpa'].to_numpy(), test_size=0.3, random_state=42)
x_train

Unnamed: 0,Year,W,Fall,Spring,Summer,Winter,AAS,ABE,ACCY,ACE,...,OLB,OLC,ONL,Onl,PKG,PR,Q,SEM,ST,Unknown Sched Type
3563,2021,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
41631,2014,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
31004,2016,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
52681,2011,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3766,2021,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62570,2010,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
38158,2014,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
860,2022,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15795,2019,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Normalization

In [132]:
from sklearn import preprocessing

def normalization(x_train, x_test, y_train, y_test):
    # normalization only uses the data from x_train
    x_scale = preprocessing.StandardScaler().fit(x_train)

    columns = x_train.columns
    x_train = x_scale.transform(x_train)
    x_train = pd.DataFrame(x_train, columns=columns)

    columns = x_test.columns
    x_test = x_scale.transform(x_test)
    x_test = pd.DataFrame(x_test, columns=columns)

    # TODO:
    # maybe apply normalization on y

    return x_train, x_test, y_train, y_test

x_train, x_test, y_train, y_test = normalization(x_train, x_test, y_train, y_test)
x_train

Unnamed: 0,Year,W,Fall,Spring,Summer,Winter,AAS,ABE,ACCY,ACE,...,OLB,OLC,ONL,Onl,PKG,PR,Q,SEM,ST,Unknown Sched Type
0,1.460542,1.144758,1.022719,-0.862358,-0.292840,-0.076079,-0.05947,-0.049594,-0.207812,-0.129013,...,-0.054547,-0.158522,-0.354493,-0.013359,-0.024999,-0.039827,-0.124193,-0.00818,-0.024089,-0.431854
1,-0.441996,-0.339937,-0.977786,1.159612,-0.292840,-0.076079,-0.05947,-0.049594,-0.207812,-0.129013,...,-0.054547,-0.158522,-0.354493,-0.013359,-0.024999,-0.039827,-0.124193,-0.00818,-0.024089,-0.431854
2,0.101586,1.144758,-0.977786,1.159612,-0.292840,-0.076079,-0.05947,-0.049594,-0.207812,-0.129013,...,-0.054547,-0.158522,-0.354493,-0.013359,-0.024999,-0.039827,-0.124193,-0.00818,-0.024089,-0.431854
3,-1.257370,1.144758,1.022719,-0.862358,-0.292840,-0.076079,-0.05947,-0.049594,-0.207812,-0.129013,...,-0.054547,-0.158522,-0.354493,-0.013359,-0.024999,-0.039827,-0.124193,-0.00818,-0.024089,-0.431854
4,1.460542,-0.339937,1.022719,-0.862358,-0.292840,-0.076079,-0.05947,-0.049594,-0.207812,-0.129013,...,-0.054547,-0.158522,-0.354493,-0.013359,-0.024999,-0.039827,-0.124193,-0.00818,-0.024089,-0.431854
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44828,-1.529161,-0.339937,-0.977786,1.159612,-0.292840,-0.076079,-0.05947,-0.049594,-0.207812,-0.129013,...,-0.054547,-0.158522,-0.354493,-0.013359,-0.024999,-0.039827,-0.124193,-0.00818,-0.024089,-0.431854
44829,-0.441996,-0.339937,1.022719,-0.862358,-0.292840,-0.076079,-0.05947,-0.049594,-0.207812,-0.129013,...,-0.054547,-0.158522,-0.354493,-0.013359,-0.024999,-0.039827,-0.124193,-0.00818,-0.024089,-0.431854
44830,1.732333,-0.339937,-0.977786,1.159612,-0.292840,-0.076079,-0.05947,-0.049594,-0.207812,-0.129013,...,-0.054547,-0.158522,-0.354493,-0.013359,-0.024999,-0.039827,-0.124193,-0.00818,-0.024089,-0.431854
44831,0.916960,1.144758,-0.977786,1.159612,-0.292840,-0.076079,-0.05947,-0.049594,-0.207812,-0.129013,...,-0.054547,-0.158522,-0.354493,-0.013359,-0.024999,-0.039827,-0.124193,-0.00818,-0.024089,-0.431854


## Test Validation Split

In [133]:
# Train, Test, Val -> 0.7, 0.15, 0.15
x_test, x_val, y_test, y_val = train_test_split(x_test, y_test, test_size=0.5, random_state=42)
x_test

Unnamed: 0,Year,W,Fall,Spring,Summer,Winter,AAS,ABE,ACCY,ACE,...,OLB,OLC,ONL,Onl,PKG,PR,Q,SEM,ST,Unknown Sched Type
19116,1.460542,-0.339937,1.022719,-0.862358,-0.29284,-0.076079,-0.05947,-0.049594,-0.207812,-0.129013,...,-0.054547,-0.158522,-0.354493,-0.013359,-0.024999,-0.039827,-0.124193,-0.00818,-0.024089,-0.431854
5646,-1.529161,-0.339937,-0.977786,1.159612,-0.29284,-0.076079,-0.05947,-0.049594,-0.207812,-0.129013,...,-0.054547,-0.158522,-0.354493,-0.013359,-0.024999,-0.039827,-0.124193,-0.00818,-0.024089,-0.431854
1821,0.916960,1.144758,1.022719,-0.862358,-0.29284,-0.076079,-0.05947,-0.049594,-0.207812,-0.129013,...,-0.054547,-0.158522,-0.354493,-0.013359,-0.024999,-0.039827,-0.124193,-0.00818,-0.024089,-0.431854
13103,-0.713788,-0.339937,1.022719,-0.862358,-0.29284,-0.076079,-0.05947,-0.049594,-0.207812,-0.129013,...,-0.054547,-0.158522,-0.354493,-0.013359,-0.024999,-0.039827,-0.124193,-0.00818,-0.024089,-0.431854
7413,0.645168,-0.339937,1.022719,-0.862358,-0.29284,-0.076079,-0.05947,-0.049594,-0.207812,-0.129013,...,-0.054547,-0.158522,-0.354493,-0.013359,-0.024999,-0.039827,-0.124193,-0.00818,-0.024089,-0.431854
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11284,1.460542,-0.339937,-0.977786,1.159612,-0.29284,-0.076079,-0.05947,-0.049594,-0.207812,-0.129013,...,-0.054547,-0.158522,2.820929,-0.013359,-0.024999,-0.039827,-0.124193,-0.00818,-0.024089,-0.431854
11964,0.645168,-0.339937,-0.977786,1.159612,-0.29284,-0.076079,-0.05947,-0.049594,-0.207812,-0.129013,...,-0.054547,-0.158522,-0.354493,-0.013359,-0.024999,-0.039827,-0.124193,-0.00818,-0.024089,2.315599
5390,1.460542,-0.339937,-0.977786,1.159612,-0.29284,-0.076079,-0.05947,-0.049594,-0.207812,-0.129013,...,-0.054547,-0.158522,2.820929,-0.013359,-0.024999,-0.039827,-0.124193,-0.00818,-0.024089,-0.431854
860,1.460542,1.144758,1.022719,-0.862358,-0.29284,-0.076079,-0.05947,-0.049594,4.812038,-0.129013,...,-0.054547,-0.158522,-0.354493,-0.013359,-0.024999,-0.039827,-0.124193,-0.00818,-0.024089,-0.431854


## Linear Regression with Lasso

In [140]:
from sklearn import linear_model

def get_rmse(model, X, Y):
    try:
        Y_p = model.predict(X)
    except:
        Y_p = model(X).to_numpy()

    MSE = np.linalg.norm(Y - Y_p)
    return np.sqrt(MSE)

def linear_fitting(x_train, x_test, y_train, y_test):
    l2_lambdas = np.linspace(0.01, 20000, 10)
    best = None
    best_rmse = 10000
    for i, l2_lambda in enumerate(l2_lambdas):
        model = linear_model.Ridge(alpha=l2_lambda)
        model.fit(x_train, y_train)
        train_rmse = get_rmse(model, x_train, y_train)
        test_rmse = get_rmse(model, x_test, y_test)
        print('l2_lambda={}, train_rmse={},test_rmse={}'.format(l2_lambda, train_rmse, test_rmse))

        if test_rmse < best_rmse:
            best_rmse = test_rmse
            best = model

    return best

best_model = linear_fitting(x_train, x_test, y_train, y_test)


l2_lambda=0.01, train_rmse=8.25678521816731,test_rmse=5.595504251714886
l2_lambda=2222.2311111111117, train_rmse=8.260405829367883,test_rmse=5.599670133558477
l2_lambda=4444.452222222223, train_rmse=8.266534571428377,test_rmse=5.6035375901167495
l2_lambda=6666.673333333334, train_rmse=8.275064635390232,test_rmse=5.609170862201399
l2_lambda=8888.894444444446, train_rmse=8.28520628884881,test_rmse=5.615989628719289
l2_lambda=11111.115555555558, train_rmse=8.296433622325077,test_rmse=5.623607230474856
l2_lambda=13333.336666666668, train_rmse=8.308382579958485,test_rmse=5.63175690564347
l2_lambda=15555.55777777778, train_rmse=8.320793623571845,test_rmse=5.640249215726159
l2_lambda=17777.77888888889, train_rmse=8.333477758673958,test_rmse=5.648946729220733
l2_lambda=20000.0, train_rmse=8.34629530885489,test_rmse=5.657748221803006


In [141]:
from traitlets.traitlets import validate
predicted_y = best_model.predict(x_val)
validation = pd.DataFrame()
validation['Predicted Y'] = predicted_y
validation['Y'] = y_val
validation


Unnamed: 0,Predicted Y,Y
0,3.441533,3.545366
1,3.582435,3.455357
2,3.325249,3.310357
3,3.431266,2.833636
4,3.460289,3.750357
...,...,...
9603,3.360477,3.894545
9604,3.572770,3.626800
9605,3.208688,2.777593
9606,3.696984,3.800000
