In [None]:
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.svm import SVR
from sklearn.linear_model import Ridge
import sklearn.metrics
import pandas as pd
import csv
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import math
from scipy.stats import sem
# Use the ggplot style sheet for every matplotlib figure drawn below.
matplotlib.style.use('ggplot')

# Seed handed to KFold so the shuffled fold assignment is reproducible.
RANDOM_SEED = 10
# Number of cross-validation folds used for every model evaluation.
NUMBER_FOLDS = 10

In [None]:
def kfold_student(train_data, grade_label, k):
    """
    Split the features and outcomes into k shuffled cross-validation folds.

    Parameters
    ----------
    train_data: array
        feature rows; indexed with the integer index arrays produced by KFold
    grade_label: array
        outcomes aligned row-for-row with train_data
    k: int
        number of folds

    Returns
    -------
    folds: list of lists
        one entry per fold, laid out as
        [training data, testing data, training outcomes, testing outcomes]
    """
    # Shuffling is seeded with RANDOM_SEED so repeated runs give the same folds.
    splitter = KFold(n_splits=k, shuffle=True, random_state=RANDOM_SEED)
    return [
        [
            train_data[fit_rows],
            train_data[held_out_rows],
            grade_label[fit_rows],
            grade_label[held_out_rows],
        ]
        for fit_rows, held_out_rows in splitter.split(train_data)
    ]

def calc_classification_error(predictions, scores, test_grade_labels, outcome):
    """
    Compute the root mean squared error of the predictions for GPA outcomes.

    Parameters
    ----------
    predictions: array
        predicted outcome values
    scores: array or list
        not used here (scores were used for binary outcomes)
    test_grade_labels: array
        true outcome values
    outcome: str
        "GPA" or "longGPA"; any other value yields None

    Returns
    -------
    float or None
        the RMSE for a GPA outcome, None otherwise
    """
    if outcome not in ("GPA", "longGPA"):
        return None

    mean_squared = sklearn.metrics.mean_squared_error(test_grade_labels, predictions)
    return math.sqrt(mean_squared)

In [None]:
def logisticRegression(train_features, train_grade_labels, test_features):
    """
    Fit a logistic regression (lbfgs solver) and predict on the test features.

    Returns
    -------
    tuple
        (predicted labels, second column of predict_proba, fitted coefficients)
    """
    classifier = LogisticRegression(solver='lbfgs')
    classifier.fit(train_features, train_grade_labels)

    predicted_labels = classifier.predict(test_features)
    class_probabilities = classifier.predict_proba(test_features)
    # Only the second probability column is handed back as the score.
    return predicted_labels, class_probabilities[:, 1], classifier.coef_

def linearRegression(train_features, train_grade_labels, test_features):
    """
    Fit an ordinary least squares model and predict on the test features.

    Returns
    -------
    tuple
        (predictions, empty list in the scores slot, fitted coefficients)
    """
    model = LinearRegression()
    model.fit(train_features, train_grade_labels)
    predicted_values = model.predict(test_features)
    # Regression models have no scores, so an empty list fills that slot.
    return predicted_values, [], model.coef_

def SVM(train_features, train_grade_labels, test_features):
    """
    Fit a support vector classifier (gamma='auto') and predict on the test features.

    Returns
    -------
    tuple
        (predicted labels, decision_function values for the test features)
    """
    classifier = SVC(gamma = 'auto')
    classifier.fit(train_features, train_grade_labels)

    predicted_labels = classifier.predict(test_features)
    decision_values = classifier.decision_function(test_features)
    return predicted_labels, decision_values

def SVMR(train_features, train_grade_labels, test_features):
    """
    Fit a support vector regressor (gamma='auto') and predict on the test features.

    Returns
    -------
    tuple
        (predictions, empty list in the scores slot)
    """
    regressor = SVR(gamma = 'auto')
    regressor.fit(train_features, train_grade_labels)
    predicted_values = regressor.predict(test_features)
    # Unlike RR and linearRegression, no coefficients are returned here.
    return predicted_values, []

def RR(train_features, train_grade_labels, test_features):
    """
    Fit a ridge regression with default settings and predict on the test features.

    Returns
    -------
    tuple
        (predictions, empty list in the scores slot, fitted coefficients)
    """
    ridge_model = Ridge()
    ridge_model.fit(train_features, train_grade_labels)
    predicted_values = ridge_model.predict(test_features)
    return predicted_values, [], ridge_model.coef_

In [None]:
# Column groups shared by the feature sets understood by student_level.
_WEEK_TIME_COLUMNS = ['week 1', 'week 2', 'week 3', 'week 4', 'week 5',
                      'week 6', 'week 7', 'week 8', 'week 9', 'week 10']
_CATEGORY_CLICK_COLUMNS = ['assignments_clicks', 'discussion_topics_clicks',
                           'files_clicks', 'pages_clicks']
_BACKGROUND_COLUMNS = ["age", "sattotalscore", "hsgpa",
                       "gpacumulative", "istransfer", "ismale", "lowincomeflag",
                       "firstgenerationflag", "isurm"]

# Feature-set name -> ordered feature names (same order as the matrix columns).
_STUDENT_FEATURE_SETS = {
    "background": _BACKGROUND_COLUMNS,
    "week_time_cat_clicks": _WEEK_TIME_COLUMNS + _CATEGORY_CLICK_COLUMNS,
    "background+week_cat": (_WEEK_TIME_COLUMNS + _CATEGORY_CLICK_COLUMNS
                            + _BACKGROUND_COLUMNS),
}
_STUDENT_OUTCOMES = ("grade", "GPA", "longGPA")


def _load_student_class(class_name, types, outcome):
    """Read one class's CSV files and return (feature matrix, outcome vector)."""
    # Class names carry a two-character prefix (e.g. './') that is not part
    # of the file names under ./training_data/.
    prefix = './training_data/' + class_name[2:]

    if types == "background":
        data = pd.read_csv(prefix + "_background_training.csv")
        train_data = data[_BACKGROUND_COLUMNS].values.astype(float)
    else:
        data = pd.read_csv(prefix + "_weeks_time_training.csv")
        clicks = pd.read_csv(prefix + "_categories_training.csv")
        column_blocks = [
            data[_WEEK_TIME_COLUMNS].values.astype(float),
            clicks[_CATEGORY_CLICK_COLUMNS].values.astype(float),
        ]
        if types == "background+week_cat":
            background = pd.read_csv(prefix + "_background_training.csv")
            column_blocks.append(background[_BACKGROUND_COLUMNS].values.astype(float))
        train_data = np.column_stack(column_blocks)

    if outcome == "longGPA":
        long_gpa = pd.read_csv(prefix + "_longGPA.csv")
        grade_label = long_gpa["longGPA"].values.astype(float)
    else:
        # "grade" and "GPA" are columns of the first file read for this
        # feature set (background file for "background", weekly file otherwise).
        grade_label = data[outcome].values.astype(float)

    return train_data, grade_label


def student_level(classes, types, outcome):
    """
    Build per-fold train/test data pooled over all the given classes.

    Each class is split into NUMBER_FOLDS folds on its own; fold i of the
    result is the row-wise concatenation of fold i of every class.

    Parameters
    ----------
    classes: sequence of str
        class identifiers; the first two characters of each are dropped to
        form the file names read from ./training_data/
    types: str
        the type of feature set ("background", "week_time_cat_clicks" or
        "background+week_cat")
    outcome: str
        the outcome column ("grade", "GPA" or "longGPA")

    Returns
    -------
    train_features: dictionary
        keys are fold numbers and values are the train features data
    train_grade_labels: dictionary
        keys are fold numbers and values are the outcomes for training set
    test_features: dictionary
        keys are fold numbers and values are the test features data
    test_grade_labels: dictionary
        keys are fold numbers and values are the outcomes for testing set
    features: list
        list of features in the feature set

    Raises
    ------
    ValueError
        if types or outcome is not one of the supported values
    """
    # Fail early with a clear message; previously an unsupported value left
    # local variables unbound and surfaced as an UnboundLocalError.
    if types not in _STUDENT_FEATURE_SETS:
        raise ValueError(
            "unsupported feature set %r; expected one of %s"
            % (types, sorted(_STUDENT_FEATURE_SETS)))
    if outcome not in _STUDENT_OUTCOMES:
        raise ValueError(
            "unsupported outcome %r; expected one of %s"
            % (outcome, list(_STUDENT_OUTCOMES)))

    # Known up front, so it is returned even when classes is empty.
    features = list(_STUDENT_FEATURE_SETS[types])

    # fold number -> ([train X per class], [test X], [train y], [test y])
    pieces_by_fold = {}
    for class_name in classes:
        train_data, grade_label = _load_student_class(class_name, types, outcome)
        folds = kfold_student(train_data, grade_label, NUMBER_FOLDS)
        for fold, parts in enumerate(folds):
            buckets = pieces_by_fold.setdefault(fold, ([], [], [], []))
            for bucket, part in zip(buckets, parts):
                bucket.append(part)

    train_features = {}
    test_features = {}
    train_grade_labels = {}
    test_grade_labels = {}
    # Concatenate once per fold instead of re-copying the arrays per class.
    for fold, (train_x, test_x, train_y, test_y) in pieces_by_fold.items():
        train_features[fold] = np.concatenate(train_x, axis=0)
        test_features[fold] = np.concatenate(test_x, axis=0)
        train_grade_labels[fold] = np.concatenate(train_y, axis=0)
        test_grade_labels[fold] = np.concatenate(test_y, axis=0)

    return train_features, train_grade_labels, test_features, test_grade_labels, features

In [None]:
def graph_coefs(coefs, type_features, model):
    """
    Draw the averaged coefficients as a horizontal bar chart and save it to
    ./training_data/coef_results.pdf.

    Parameters
    ----------
    coefs: dict
        keys are feature set types and values are 2-D coefficient arrays;
        row 0 of every value is plotted, in dictionary order. The combined
        length has to line up with the 23 hard-coded bar labels below.
    type_features: dict
        accepted for interface compatibility; not used by this function
    model: str
        accepted for interface compatibility; not used (the title is fixed)
    """
    xvalues = ["week 1 time", "week 2 time", "week 3 time", "week 4 time",
               "week 5 time", "week 6 time", "week 7 time", "week 8 time",
               "week 9 time", "week 10 time", "assignments clicks",
               "discussion topics clicks", "files clicks", "pages clicks",
               "age", "SAT score", "high school GPA", "cumulative GPA",
               "is transfer", "is male", "low income", "first generation",
               "underrepresented minority"]

    yvalues = []
    for types in coefs:
        yvalues.extend(coefs[types][0])
    plt.barh(xvalues, yvalues)

    plt.title('Predictor Importance For Ridge Model Based on Background, Category Clicks and Time Spent on Page Per Week', fontsize = 8)
    plt.tick_params(labelsize = 7)
    plt.ylabel('Features', fontsize = 8)
    plt.xlabel('Predictor Importance', fontsize = 8)
    # The layout has to be adjusted before saving; calling tight_layout()
    # after savefig() cannot change the file that was already written.
    plt.tight_layout()
    plt.savefig('./training_data/coef_results.pdf', dpi=600, bbox_inches='tight')
    # Clear the current figure so later plots start from an empty canvas.
    plt.clf()

In [None]:
def get_result(level_type, classes, outcome, individual):
    """
    Cross-validate every model on every feature set and collect the mean RMSE.

    For each feature set the per-fold data comes from student_level; Ridge,
    SVR, Linear Regression and a mean-predicting base line are then scored
    with calc_classification_error and averaged over NUMBER_FOLDS folds.

    Parameters
    ----------
    level_type: str
        only "student" is supported
    classes: sequence of str
        class identifiers forwarded to student_level
    outcome: str
        "GPA" or "longGPA"; for any other value no model is evaluated and the
        initial (empty) result structure is returned
    individual: bool
        when False (and outcome is "GPA") the averaged Ridge coefficients of
        the 'background+week_cat' feature set are plotted via graph_coefs

    Returns
    -------
    gpa_outcomes: dict
        "Ridge", "SVR" and "Linear Regression" map to dictionaries whose keys
        are feature set types and values are mean RMSE; "Base Line" maps to
        the base-line mean RMSE of the last feature set evaluated

    Raises
    ------
    ValueError
        if level_type is not "student"
    """
    if level_type != "student":
        # Previously this fell through to a NameError on the unset fold data.
        raise ValueError("unsupported level_type %r; expected 'student'" % (level_type,))

    combined_type = 'background+week_cat'
    model_names = ("Ridge", "SVR", "Linear Regression", "Base Line")
    is_gpa_outcome = outcome == "GPA" or outcome == "longGPA"

    gpa_outcomes = {"SVR": dict(), "Ridge": dict(), "Linear Regression": dict(), "Base Line": 0.0}
    coefs = dict() #keys = type of feature set, values = average coefficients
    type_features = dict() #keys = type of feature set, values = features in the feature set

    for types in [combined_type, 'background', 'week_time_cat_clicks']:
        print(types, "student level:", outcome)
        train_features, train_grade_labels, test_features, test_grade_labels, features = student_level(classes, types, outcome)
        type_features[types] = features

        if is_gpa_outcome:
            coef_sum = np.zeros((1, len(features))) #used to add coefficients across all folds

            for model_name in model_names:
                total_rmse = 0

                for i in range(NUMBER_FOLDS):
                    te = test_features[i]
                    tr = train_features[i]
                    train_labels = np.ravel(train_grade_labels[i])

                    if model_name == "Ridge":
                        predictions, scores, coef = RR(tr, train_labels, te)
                        # Coefficients are only reported for the combined feature set.
                        if types == combined_type:
                            coef_sum += coef
                    elif model_name == "SVR":
                        predictions, scores = SVMR(tr, train_labels, te)
                    elif model_name == "Linear Regression":
                        predictions, scores, _ = linearRegression(tr, train_labels, te)
                    else:
                        # Base line: always predict the mean training outcome.
                        average_GPA = np.mean(train_grade_labels[i])
                        predictions = np.full((test_grade_labels[i].shape), average_GPA)
                        scores = []

                    total_rmse += calc_classification_error(predictions, scores, test_grade_labels[i], outcome)

                result = total_rmse/NUMBER_FOLDS

                if model_name == "Base Line":
                    print("Base Line:", result)
                    gpa_outcomes["Base Line"] = result
                else:
                    # Each model writes to its own slot. Linear Regression used
                    # to be stored under "SVR", overwriting the SVR score.
                    gpa_outcomes[model_name][types] = result

                if model_name == "Ridge" and types == combined_type:
                    coefs[types] = coef_sum/NUMBER_FOLDS

        if not individual and types == combined_type and outcome == "GPA":
            graph_coefs(coefs, type_features, "Ridge")

    return gpa_outcomes

In [None]:
def generate_course_comparison_data(classes):
    """
    Summarise model performance per course and for all courses pooled.

    For every course, and then once for the full list of classes, the models
    are evaluated with get_result on both "GPA" and "longGPA". Only the
    "background+week_cat" scores of the non-base-line models are kept.

    Returns
    -------
    courses: dict
        keys are course names (class identifier without its first two
        characters) plus "all courses"; values are a list of four values
        [min RMSE for course GPA, np.std of RMSE across models for course GPA,
        min RMSE for long term GPA, np.std of RMSE across models for long
        term GPA] (will be used to graph comparison graph)
    """
    feature_set = "background+week_cat"

    def summarize(class_subset):
        # Short-term results are requested before long-term ones.
        short_term = get_result("student", class_subset, "GPA", True)
        long_term = get_result("student", class_subset, "longGPA", True)

        model_names = [name for name in short_term if name != "Base Line"]
        short_rmse = np.array([short_term[name][feature_set] for name in model_names])
        long_rmse = np.array([long_term[name][feature_set] for name in model_names])

        return [np.min(short_rmse), np.std(short_rmse),
                np.min(long_rmse), np.std(long_rmse)]

    courses = dict()
    for c in classes:
        print(c)
        courses[c[2:]] = summarize([c])

    # Finally evaluate once with every course pooled together.
    courses["all courses"] = summarize(classes)
    return courses

In [None]:
def comparison_graph(courses):
    """
    Plot short term against long term GPA error per course as grouped bars
    and save the figure to ./training_data/comparison.pdf.

    Parameters
    ----------
    courses: dict
        keys are course names; from each value, index 0 is plotted as the
        short term bar and index 2 as the long term bar
    """
    course_names = list(courses.keys())
    short_GPA = [courses[name][0] for name in course_names]
    long_GPA = [courses[name][2] for name in course_names]

    frame = pd.DataFrame(
        {"Short term GPA": short_GPA, "Long term GPA": long_GPA},
        index=course_names,
    )
    frame.plot.bar()

    plt.title("Comparison of Accuracy Between Short Term and Long Term GPA", fontsize = 9)
    plt.tick_params(labelsize = 7)
    plt.xlabel("Courses")
    plt.ylabel("Root Mean Square Error")
    # Legend sits outside the axes, to the right of the bars.
    plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
    plt.tight_layout()
    plt.savefig('./training_data/comparison.pdf')
    plt.clf()

In [None]:
def save_data_mat(longGPA, shortGPA):
    """
    Used to generate the table

    Writes ./training_data/table.csv with one row per feature set plus a
    base line row. Every feature set cell holds a (minimum, np.std) tuple
    computed over the non-base-line models.

    Parameters
    ----------
    longGPA: dict
        results for the long term GPA outcome; every key other than
        "Base Line" maps to a dict of feature set type -> score
    shortGPA: dict
        results for the course GPA outcome, same layout
    """
    def collect(outcomes):
        # Gather the scores of every model, grouped by feature set type.
        grouped = {'background+week_cat': [], 'background': [], 'week_time_cat_clicks': []}
        for model, per_type in outcomes.items():
            if model == "Base Line":
                continue
            for feature_set, score in per_type.items():
                if feature_set in grouped:
                    grouped[feature_set].append(score)
        return grouped

    def summarize(values):
        return (np.min(values), np.std(values))

    long_scores = collect(longGPA)
    short_scores = collect(shortGPA)

    back_long = summarize(long_scores['background'])
    back_short = summarize(short_scores['background'])

    wc_long = summarize(long_scores['week_time_cat_clicks'])
    wc_short = summarize(short_scores['week_time_cat_clicks'])

    b_long = summarize(long_scores['background+week_cat'])
    b_short = summarize(short_scores['background+week_cat'])

    lines = [
        ["", "Course GPA", "Long Term GPA"],
        ["Activity Based", wc_short, wc_long],
        ["Background Based", back_short, back_long],
        ["Activity and Background Based", b_short, b_long],
        ["Base Line", shortGPA["Base Line"], longGPA["Base Line"]],
    ]

    file_name = "./training_data/table.csv"
    with open(file_name, 'w', newline='') as f:
        csv.writer(f, delimiter=',').writerows(lines)

In [None]:
# Class identifiers to evaluate. The loaders drop the first two characters
# ('./') of each entry when building file names under ./training_data/.
classes = np.array(['./16Fa CHEM 1P', './16S1 PHY 3A',
           './17Fa PUBHLTH 1', './17Fa PUBHLTH 2', './17S1 PHY 3A',
           './17Sp PUBHLTH 1', './17Sp PUBHLTH 2', './17Wi PUBHLTH 1',
           './17Wi PUBHLTH 2', './18S1 CHEM 1C',
           './18Wi BIO SCI 9B'])

# Per-course and pooled comparison of short term vs long term GPA error;
# comparison_graph saves the bar chart to ./training_data/comparison.pdf.
courses = generate_course_comparison_data(classes)
comparison_graph(courses)

# Pooled evaluation with individual=False; the "GPA" call additionally
# triggers the Ridge coefficient plot inside get_result.
shortGPA = get_result("student", classes, "GPA", False)
longGPA = get_result("student", classes, "longGPA", False)

# Write the summary table to ./training_data/table.csv.
save_data_mat(longGPA, shortGPA)
