In [397]:
import pandas as pd
import numpy as np

import seaborn as sns
sns.set()
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score

In [398]:
df = pd.read_csv('data/grades.csv')
df = df.drop(labels = 0, axis = 0) 

In [399]:
df.columns = df.columns.str.replace(' ', '_').str.lower()
#print(df.dtypes)
possible_grades = ['total_enrolled', 'a+', 'a', 'a-', 'b+', 'b', 'b-', 'c+', 'c', 'p', 'c-', 'd', 'f', 'withdrawal', 'inc/no_grade' ]

for i in possible_grades: #converting values from object to integer 
    df[i] = df[i].astype(int)
    
df['class_name'] = df['subject'].astype(str) + df['course_number'].astype(str)

def writing_intensive(class_num):
    if class_num[-1] == 'W':
        return True
    return False

def honors(class_num):
    if class_num[-1] == 'H':
        return True
    return False

def pure_number(class_num):
    if class_num[-1] == 'W' or class_num[-1] == 'H':
        class_num = class_num[:-1]
    return class_num

df['writing_intensive'] = df['course_number'].apply(writing_intensive)
df['honors'] = df['course_number'].apply(honors)
df['class_num'] = df['course_number'].apply(pure_number)

#print(df.dtypes)

print(df.shape)

(2668, 26)


In [400]:
score_1 = (df['a'] + df['a+']) * 4
score_2 = (df['a-']) * 3.7
score_3 = df['b+'] * 3.3
score_4 = df['b'] * 3
score_5 = df['b-'] * 2.7
score_6 = df['c+'] * 2.3
score_7 = (df['c'] + df['p']) * 2
score_8 = df['c-'] * 1.7
score_9 = df['d'] * 1
score_10 = (df['f'] + df['withdrawal'] + df['inc/no_grade']) * 0

df['actual_average'] = (score_1 + score_2 + score_3 + score_4 + score_5 + score_6 + score_7 + score_8 + score_9 + score_10)/df['total_enrolled']
df['reasonably_passing'] = np.where(df['actual_average'] > 2.5, 1 , 0)

save_as = 'data/improved_grades.csv'
df.to_csv(save_as, index=False)

#df.describe()

In [401]:
df.isnull().sum() 
df.duplicated().sum()
#no duplicate or null values 
df.columns

Index(['term', 'subject', 'course_number', 'course_description',
       'class_section', 'instructor', 'total_enrolled', 'a+', 'a', 'a-', 'b+',
       'b', 'b-', 'c+', 'c', 'p', 'c-', 'd', 'f', 'withdrawal', 'inc/no_grade',
       'average_gpa', 'class_name', 'writing_intensive', 'honors', 'class_num',
       'actual_average', 'reasonably_passing'],
      dtype='object')

In [402]:
df.reasonably_passing.value_counts(normalize = True)

1    0.703523
0    0.296477
Name: reasonably_passing, dtype: float64

In [403]:
df = pd.get_dummies(df, columns = ['subject', 'instructor'], drop_first = True)

In [404]:
selected_features = ['class_num', 'total_enrolled', 'class_num', 'writing_intensive', 'honors', 'subject_AFST',
       'subject_ANTH', 'subject_ARAB', 'subject_ARTH', 'subject_ARTS',
       'subject_ASTR', 'subject_BALA', 'subject_BIOL', 'subject_BUS',
       'subject_CERT', 'subject_CESL', 'subject_CHEM', 'subject_CHIN',
       'subject_CLAS', 'subject_CMAL', 'subject_CMLIT', 'subject_CO-OP',
       'subject_CSCI', 'subject_DANCE', 'subject_DATA', 'subject_DRAM',
       'subject_EAST', 'subject_ECON', 'subject_ECPCE', 'subject_ECPEL',
       'subject_ECPSE', 'subject_ECPSP', 'subject_EECE', 'subject_ENGL',
       'subject_ENSCI', 'subject_EURO', 'subject_FNES', 'subject_FREN',
       'subject_GEOL', 'subject_GERM', 'subject_GREEK', 'subject_GRKMD',
       'subject_HEBRW', 'subject_HIST', 'subject_HNRS', 'subject_HTH',
       'subject_ITAL', 'subject_JAZZ', 'subject_JEWST', 'subject_JPNS',
       'subject_KOR', 'subject_LABST', 'subject_LATIN', 'subject_LBSCI',
       'subject_LCD', 'subject_LIBR', 'subject_MAM', 'subject_MATH',
       'subject_MEDST', 'subject_MES', 'subject_MUSIC', 'subject_PERM',
       'subject_PHIL', 'subject_PHOTO', 'subject_PHYS', 'subject_PORT',
       'subject_PSCI', 'subject_PSYCH', 'subject_QNS', 'subject_RM',
       'subject_RUSS', 'subject_SEEK', 'subject_SEYS', 'subject_SEYSL',
       'subject_SOC', 'subject_SPAN', 'subject_STPER', 'subject_URBST',
       'subject_WGS']

In [405]:
X = df[selected_features]
y = df['reasonably_passing']

In [406]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=45)

In [407]:
print('Length of our Training data:', X_train.shape[0], '\nLength of our Testing data:', X_test.shape[0])

Length of our Training data: 2134 
Length of our Testing data: 534


In [408]:
model = LogisticRegression(max_iter = 3000)

In [409]:
model.fit(X_train, y_train)

LogisticRegression(max_iter=3000)

In [410]:
y_pred = model.predict(X_test)

In [411]:
y_pred_proba = model.predict_proba(X_test)[:,1]
y_pred_proba.round(2)

array([0.87, 0.28, 0.64, 0.94, 0.77, 0.79, 0.87, 0.77, 0.71, 0.75, 0.51,
       0.26, 0.74, 0.28, 0.91, 0.84, 0.84, 0.45, 0.82, 0.79, 0.34, 0.65,
       0.34, 0.9 , 0.9 , 0.77, 0.83, 0.83, 0.52, 0.83, 0.86, 0.85, 0.87,
       0.91, 0.77, 0.75, 0.9 , 0.98, 0.4 , 0.75, 0.52, 0.53, 0.9 , 0.84,
       0.37, 0.88, 0.85, 0.52, 0.26, 0.52, 0.77, 0.26, 0.84, 0.84, 0.63,
       0.77, 0.31, 0.79, 0.87, 0.83, 0.79, 0.67, 0.77, 0.76, 0.34, 0.66,
       0.97, 0.52, 0.52, 0.78, 0.72, 0.79, 0.85, 0.98, 0.94, 0.85, 0.83,
       0.63, 0.67, 0.91, 0.69, 0.61, 0.77, 0.72, 0.73, 0.85, 0.85, 0.63,
       0.64, 0.34, 0.61, 0.71, 0.52, 0.26, 0.93, 0.25, 0.92, 0.77, 0.76,
       0.84, 0.34, 0.87, 0.66, 0.94, 0.37, 0.34, 0.79, 0.85, 0.91, 0.78,
       0.79, 0.26, 0.78, 0.83, 0.52, 0.66, 0.77, 0.37, 0.86, 0.34, 0.52,
       0.84, 0.72, 0.25, 0.91, 0.87, 0.25, 0.93, 0.79, 0.65, 0.63, 0.98,
       0.94, 0.75, 0.78, 0.67, 0.83, 0.74, 0.72, 0.67, 0.98, 0.52, 0.67,
       0.79, 0.71, 0.43, 0.9 , 0.34, 0.67, 0.86, 0.

In [412]:
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.7546816479400749

In [413]:
precision = precision_score(y_test, y_pred)
precision

0.7675438596491229

In [414]:
recall = recall_score(y_test, y_pred)
recall

0.9333333333333333

In [415]:
f1 = f1_score(y_test, y_pred)
f1

0.8423586040914561

Some notes before leaving:

-Instructors are not included as a feature, even though it is arguable that it would be the MOST important feature to include - due to difficulty in creating and including dummy variables for each instructor
-Grade for reasonably_passing is arbitrary - I set it as 2.3 (which is equivalent to a C+ average).
