<a href="https://colab.research.google.com/github/rgilyard/predict-student-outcomes/blob/main/prelim_gaussian_process_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preliminary Results for a Gaussian Process Classifier for Student Failure and Dropout Prediction

## Constants

In [58]:
DATA_PATH = '/content/drive/MyDrive/COMP 542 Group Project/student_info_prelim_sample.csv'

## Mount Google Drive

In [59]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Libraries

In [60]:
import pandas as pd

## Load data

In [61]:
df = pd.read_csv(DATA_PATH)

In [62]:
print(df.head())

  code_module code_presentation  id_student gender                region  \
0         DDD             2013J      586536      M         London Region   
1         DDD             2013B     1906919      M  North Western Region   
2         BBB             2013J      607642      F   East Anglian Region   
3         DDD             2013B     1105628      F   East Anglian Region   
4         DDD             2014J      680846      M          North Region   

       highest_education imd_band age_band  num_of_prev_attempts  \
0  A Level or Equivalent    0-10%     0-35                     0   
1     Lower Than A Level    0-10%    35-55                     0   
2       HE Qualification   80-90%     0-35                     0   
3     Lower Than A Level  90-100%    35-55                     0   
4     Lower Than A Level   50-60%     0-35                     0   

   studied_credits disability final_result  
0               60          N    Withdrawn  
1               60          N         Fail  

In [63]:
# Drop id column
df.drop(['id_student'], axis=1, inplace=True)

## Preprocessing (These will be moved to a .py file later)

### Combine fail/withdraw and pass/distinction for binary prediction (at first)

In [64]:
df['fail'] = df['final_result'].apply(lambda x: 1 if x in ['Fail', 'Withdrawn'] else 0)

### Split into training and test sets

In [65]:
from sklearn.model_selection import train_test_split

X = df.drop(['final_result', 'fail'], axis=1)  # Dropping the target columns
y = df['fail']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=9001)

### Missing values

In [66]:
# Check how many values are missing
print(df.isnull().sum())

code_module              0
code_presentation        0
gender                   0
region                   0
highest_education        0
imd_band                36
age_band                 0
num_of_prev_attempts     0
studied_credits          0
disability               0
final_result             0
fail                     0
dtype: int64


In [67]:
# Since there are not too many missing values, we can take the mode for imd_band
df['imd_band'].fillna(df['imd_band'].mode()[0], inplace=True)
print(df.isnull().sum())

code_module             0
code_presentation       0
gender                  0
region                  0
highest_education       0
imd_band                0
age_band                0
num_of_prev_attempts    0
studied_credits         0
disability              0
final_result            0
fail                    0
dtype: int64


### Label encode binary features

In [68]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [69]:
# Lists for each type of feature to encode
# List of binary columns
binary = ['gender', 'disability']

# List of categorical columns
categorical = ['code_module', 'code_presentation', 'region']

# List of ordinal columns
ordinal = {'highest_education': [\
              'No Formal quals', \
              'Lower Than A Level', \
              'A Level or Equivalent', \
              'HE Qualification', \
              'Post Graduate Qualification'], \
           'imd_band': [\
              '0-10%', \
              '10-20', \
              '20-30%', \
              '30-40%', \
              '40-50%', \
              '50-60%', \
              '60-70%', \
              '70-80%', \
              '80-90%', \
              '90-100%'], \
           'age_band': [\
              '0-35', \
              '35-55', \
              '55<=']}

In [70]:
# Label encode binary features
label_encoder = LabelEncoder()

for feature in binary:
    df[feature] = label_encoder.fit_transform(df[feature])

In [71]:
# Label encode ordinal features in order
for feature, order in ordinal.items():
    label_encoder = LabelEncoder()
    label_encoder.fit(order)
    df[feature] = label_encoder.transform(df[feature])

In [72]:
# One hot encode categorical features
df = pd.get_dummies(df, columns=categorical)

In [73]:
# Because there a some categories with very few instances,
# make sure all the features are in both sets
# Add the missing columns filled with zeros
for col in X_train.columns:
    if col not in X_test.columns:
        X_test[col] = 0

# Reorder test columns to match the order in training set
X_test = X_test[X_train.columns]

# Normalize data

In [74]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

ValueError: ignored