## Capstone - EDA

##### Reference - https://analyse.kmi.open.ac.uk/open_datasetb

In [1]:
# Import libraries

import pandas as pd
import numpy as np
import seaborn as sns #visualisation
import matplotlib.pyplot as plt #visualisation
%matplotlib inline 
sns.set(color_codes=True)


## Importing the Student Info CSV file

In [2]:
# read in file - studentInfo.csv
file_path = r"C:\Users\sinea\OneDrive\Documents OneDrive\06 - CCT Masters in DA\Capstone - 2023\uci-open-university-learning-analytics-dataset\studentInfo.csv"

studentInfo_df = pd.read_csv(file_path)

#print(studentInfo_df.head())

In [3]:
#studentInfo_df.info()

### Student Info - EDA

In [4]:
#studentInfo_df.head(3)

In [5]:
student_df2 = studentInfo_df

In [6]:
# interger coding of gender column
student_df2['gender'] = student_df2['gender'].map({'M':0,'F':1})

In [7]:
# drop excess columns
student_df2 = studentInfo_df.drop(['region', 'disability', 'imd_band'], axis=1)

In [8]:
# filter on course AAA
filtered_df = student_df2[student_df2['code_module'] == 'AAA']

In [9]:
# filter on semester '2013J'
filtered_df2 = filtered_df[filtered_df['code_presentation'] == '2013J']

In [10]:
# drop excess columns
filtered_df2 = filtered_df2.drop(['code_module', 'code_presentation', 'num_of_prev_attempts'], axis=1)

In [11]:
# add a column for tenure using randomint & numbers between 0 and 20 

# set the seed for random 
np.random.seed(42)

filtered_df2['tenure'] = np.random.randint(0, 20, filtered_df2.shape[0])

In [12]:
unique_values2 = filtered_df2.tenure.unique()

print(unique_values2)

[ 6 19 14 10  7 18  3  2  1 11  5  0 16  9 15  4  8 17 13 12]


In [13]:
# group the tenure column into bands
#bins = [0, 5, 10, 15, float('inf')]  # float('inf') represents positive infinity

#labels = ['0-5', '5-10', '11-15', '16 or more']

In [14]:
# pd.cut() to create a new column with the corresponding bins

#filtered_df2['tenure_band'] = pd.cut(filtered_df2['tenure'], bins=bins, labels=labels, right=False)

In [15]:
#filtered_df2.info()

In [16]:
# drop the tenure column
#filtered_df2 = filtered_df2.drop(['tenure'], axis=1)

In [17]:
filtered_df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 383 entries, 0 to 382
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id_student         383 non-null    int64 
 1   gender             383 non-null    int64 
 2   highest_education  383 non-null    object
 3   age_band           383 non-null    object
 4   studied_credits    383 non-null    int64 
 5   final_result       383 non-null    object
 6   tenure             383 non-null    int32 
dtypes: int32(1), int64(3), object(3)
memory usage: 22.4+ KB


In [18]:
# save the edited filtered_df2 to a csv file to reimport as a numpy array 
#filtered_df2.to_csv('filtered_df2_A2.csv', index=False)

## One-Hot Encoding of Categorical Data

1. Reference: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html

In [19]:
from sklearn.preprocessing import OneHotEncoder
from numpy import asarray

In [20]:
# read in the .csv file

data_df = pd.read_csv("filtered_df2_A2.csv")

In [21]:
data_df.head(5)

Unnamed: 0,id_student,gender,highest_education,age_band,studied_credits,final_result,tenure_band
0,11391,0,HE Qualification,55<=,240,Pass,5-10
1,28400,1,HE Qualification,35-55,60,Pass,16 or more
2,30268,1,A Level or Equivalent,35-55,60,Withdrawn,11-15
3,31604,1,A Level or Equivalent,35-55,60,Pass,11-15
4,32885,1,Lower Than A Level,0-35,60,Pass,5-10


In [22]:
# select the columns for one-hot encoding

columns_to_encode = ['highest_education', 'age_band', 'final_result', 'tenure_band']

In [23]:
# apply one-hot encoding using pd.get_dummies()

one_hot_encoded_df = pd.get_dummies(data_df, columns=columns_to_encode)

In [24]:
# concatenate the one-hot encoded DataFrame with the original DataFrame

data_df_encoded = pd.concat([data_df, one_hot_encoded_df], axis=1)

In [25]:
# remove the original columns that have been one-hot encoded

data_df_encoded.drop(columns=columns_to_encode, inplace=True)

In [26]:
# check the new df to make sure the one-hot encoding has worked 

#print(data_df_encoded)

data_df_encoded.head(3)

Unnamed: 0,id_student,gender,studied_credits,id_student.1,gender.1,studied_credits.1,highest_education_A Level or Equivalent,highest_education_HE Qualification,highest_education_Lower Than A Level,highest_education_Post Graduate Qualification,...,age_band_35-55,age_band_55<=,final_result_Distinction,final_result_Fail,final_result_Pass,final_result_Withdrawn,tenure_band_0-5,tenure_band_11-15,tenure_band_16 or more,tenure_band_5-10
0,11391,0,240,11391,0,240,0,1,0,0,...,0,1,0,0,1,0,0,0,0,1
1,28400,1,60,28400,1,60,0,1,0,0,...,1,0,0,0,1,0,0,0,1,0
2,30268,1,60,30268,1,60,1,0,0,0,...,1,0,0,0,0,1,0,1,0,0


In [27]:
# convert the dtype of the recently one-hot encoded columns to int64 from uint8 dtype

columns_to_convert = [
    'highest_education_A Level or Equivalent',
    'highest_education_HE Qualification',
    'highest_education_Lower Than A Level',
    'highest_education_Post Graduate Qualification',
    'age_band_0-35',
    'age_band_35-55',
    'age_band_55<=',
    'final_result_Distinction',
    'final_result_Fail',
    'final_result_Pass',
    'final_result_Withdrawn', 
    'tenure_band_0-5',
    'tenure_band_11-15',
    'tenure_band_16 or more',
    'tenure_band_5-10'
]

# Convert the selected columns to int64 dtype
data_df_encoded[columns_to_convert] = data_df_encoded[columns_to_convert].astype('int64')

In [28]:
#data_df_encoded.info()

## Algorithm 2 - Decision Tree 

#### Reference - https://scikit-learn.org/stable/modules/tree.html

Decision Tree was selected as the 2nd algorithm as it was the most used algorithm in the articles reviewed.

A classication decision tree was used as a counter point to the logistic regression used in the first algorithm.

In [43]:
# import the libraries for the algorithm

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score

# libraries for the confusion matrix
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# create tables
from tabulate import tabulate

In [48]:
# drop any duplicate columns that might be present
# and stop the code being turned into a numpy array

data_df_encoded = data_df_encoded.loc[:, ~data_df_encoded.columns.duplicated()]

In [49]:
# divide the dataset into features (X) and the target variable (y) using gender

X = data_df_encoded.drop(columns=['age_band_35-55'])
y = data_df_encoded['age_band_35-55']

In [50]:
# spliting the dataframe into Test and Train data for Algorithm 2

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [51]:
# creating a new model using Decision Tree Classifier

dt_model = DecisionTreeClassifier()

#### Cross Validaition

In [52]:
# perform 5-fold cross-validation and get the accuracy scores for each fold

cv_scores = cross_val_score(dt_model, X, y, cv=5)

In [53]:
# print the accuracy scores for each fold

print("Cross-Validation Scores:", cv_scores)

Cross-Validation Scores: [1. 1. 1. 1. 1.]


In [54]:
# calculate and print the average accuracy across all folds

average_accuracy = cv_scores.mean()

print("Average Accuracy:", average_accuracy)

Average Accuracy: 1.0


## References 

1. https://analyse.kmi.open.ac.uk/open_dataset, accessed 21 July 2023
2. https://stackoverflow.com/questions/58030352/csv-file-transpose-column-to-row-in-python, accessed on 25 July 2023
3. 