<a href="https://colab.research.google.com/github/rgilyard/predict-student-outcomes/blob/main/feature_engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Feature Engineering for OULAD Dataset

## Mount Google Drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Paths to dataset

In [20]:
STUDENT_INFO_PATH = '/content/drive/MyDrive/COMP 542 Group Project/anonymisedData/studentInfo.csv'
COURSES_PATH = '/content/drive/MyDrive/COMP 542 Group Project/anonymisedData/courses.csv'
ASSESSMENTS_PATH = '/content/drive/MyDrive/COMP 542 Group Project/anonymisedData/assessments.csv'
STUDENT_ASSESSMENTS_PATH = '/content/drive/MyDrive/COMP 542 Group Project/anonymisedData/studentAssessment.csv'

## Libraries

In [3]:
import pandas as pd
import numpy as np

## Create a feature for early grades

### Create unique code for each course instance, add start month column for courses data

In [4]:
# Get the courses data
courses_df = pd.read_csv(COURSES_PATH)
courses_df.head()

Unnamed: 0,code_module,code_presentation,module_presentation_length
0,AAA,2013J,268
1,AAA,2014J,269
2,BBB,2013J,268
3,BBB,2014J,262
4,BBB,2013B,240


In [13]:
# Each module (course) had a code, and each presention (session) has a code
# I want a unique code for each course instance instead
courses_df['course_instance'] = courses_df['code_module'] + "_" + courses_df['code_presentation']

In [7]:
# B refers to February start time and J refers to an October start time
# I'll use these months to get the assessments that occur during the beginning
# of the course (to predict failure/dropout rates)
courses_df['course_instance_start'] = courses_df['course_semester'].apply(lambda x: 'feb' if x[-1] == 'J' else 'oct')

In [8]:
courses_df.head()

Unnamed: 0,code_module,code_presentation,module_presentation_length,course_semester,course_start
0,AAA,2013J,268,AAA_2013J,feb
1,AAA,2014J,269,AAA_2014J,feb
2,BBB,2013J,268,BBB_2013J,feb
3,BBB,2014J,262,BBB_2014J,feb
4,BBB,2013B,240,BBB_2013B,oct


### Get list of early assessments for each course instance (the assessment must take place in the first two months)
Courses are around 9 months long

In [10]:
# Get the assessments data
assessments_df = pd.read_csv(ASSESSMENTS_PATH)
assessments_df.head()

Unnamed: 0,code_module,code_presentation,id_assessment,assessment_type,date,weight
0,AAA,2013J,1752,TMA,19.0,10.0
1,AAA,2013J,1753,TMA,54.0,20.0
2,AAA,2013J,1754,TMA,117.0,20.0
3,AAA,2013J,1755,TMA,166.0,20.0
4,AAA,2013J,1756,TMA,215.0,30.0


In [11]:
# Get unique course instance for the assessments table as well
assessments_df['course_instance'] = assessments_df['code_module'] + "_" + assessments_df['code_presentation']

In [14]:
# Create a dictionary for each course instance,
# with a list of assessments that takes place in the first 90 days
course_instance_early_assessments = {}

# For each unique course instance
for course_instance in courses_df['course_instance'].unique():
  # Filter assessments within the 90-day range
  assessments_in_range = assessments_df[
      (assessments_df['course_instance'] == course_instance) &
      (assessments_df['date'] <= 90)
  ]

  # Get the list of assessment IDs
  assessment_list = assessments_in_range['id_assessment'].tolist()

  # Populate dictionary
  course_instance_early_assessments[course_instance] = assessment_list

### For each student, add course instance feature, add average early grades

In [18]:
# Get the student info data
student_info_df = pd.read_csv(STUDENT_INFO_PATH)
student_info_df.head()

Unnamed: 0,code_module,code_presentation,id_student,gender,region,highest_education,imd_band,age_band,num_of_prev_attempts,studied_credits,disability,final_result
0,AAA,2013J,11391,M,East Anglian Region,HE Qualification,90-100%,55<=,0,240,N,Pass
1,AAA,2013J,28400,F,Scotland,HE Qualification,20-30%,35-55,0,60,N,Pass
2,AAA,2013J,30268,F,North Western Region,A Level or Equivalent,30-40%,35-55,0,60,Y,Withdrawn
3,AAA,2013J,31604,F,South East Region,A Level or Equivalent,50-60%,35-55,0,60,N,Pass
4,AAA,2013J,32885,F,West Midlands Region,Lower Than A Level,50-60%,0-35,0,60,N,Pass


In [21]:
# Get the student assessment data
student_assessment_df = pd.read_csv(STUDENT_ASSESSMENTS_PATH)
student_assessment_df.head()

Unnamed: 0,id_assessment,id_student,date_submitted,is_banked,score
0,1752,11391,18,0,78.0
1,1752,28400,22,0,70.0
2,1752,31604,17,0,72.0
3,1752,32885,26,0,69.0
4,1752,38053,19,0,79.0


In [22]:
# Get unique course instance for the student info table
student_info_df['course_instance'] = student_info_df['code_module'] + "_" + student_info_df['code_presentation']

#### Add early grade weighted average for each student

In [None]:
# Add a early grade average column to the student info df
student_info_df['early_assessment_avg'] = np.nan