In [1]:
import pandas as pd
import umap
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.cluster import DBSCAN
import hdbscan

%matplotlib inline

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

In [3]:
final_df = pd.read_csv("../data/processed/final_df.csv")

In [4]:
final_df.set_index('id_student', inplace=True)

In [5]:
drop_final = ["press_mod","code_presentation", "code_module"]

def drop_col (df, list):
    df.drop(columns=(list), inplace=True)
    return df

In [6]:
drop_col(final_df, drop_final)

Unnamed: 0_level_0,age_band,gender,highest_education,n_assignments,mean_score,final_result,total_clicks,studied_credits,region
id_student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
11391,55<=,M,HE Qualification,5,82.000000,Pass,934,240,East Anglian Region
28400,35-55,F,HE Qualification,5,66.400000,Pass,1435,60,Scotland
31604,35-55,F,A Level or Equivalent,5,76.000000,Pass,2158,60,South East Region
32885,0-35,F,Lower Than A Level,5,54.400000,Pass,1034,60,West Midlands Region
38053,35-55,M,A Level or Equivalent,5,68.000000,Pass,2445,60,Wales
...,...,...,...,...,...,...,...,...,...
2620947,0-35,F,A Level or Equivalent,9,88.888889,Distinction,476,30,Scotland
2645731,35-55,F,Lower Than A Level,9,88.111111,Distinction,893,30,East Anglian Region
2648187,0-35,F,A Level or Equivalent,9,76.666667,Pass,312,30,South Region
2679821,35-55,F,Lower Than A Level,2,91.500000,Withdrawn,275,30,South East Region


In [7]:
final_df

Unnamed: 0_level_0,age_band,gender,highest_education,n_assignments,mean_score,final_result,total_clicks,studied_credits,region
id_student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
11391,55<=,M,HE Qualification,5,82.000000,Pass,934,240,East Anglian Region
28400,35-55,F,HE Qualification,5,66.400000,Pass,1435,60,Scotland
31604,35-55,F,A Level or Equivalent,5,76.000000,Pass,2158,60,South East Region
32885,0-35,F,Lower Than A Level,5,54.400000,Pass,1034,60,West Midlands Region
38053,35-55,M,A Level or Equivalent,5,68.000000,Pass,2445,60,Wales
...,...,...,...,...,...,...,...,...,...
2620947,0-35,F,A Level or Equivalent,9,88.888889,Distinction,476,30,Scotland
2645731,35-55,F,Lower Than A Level,9,88.111111,Distinction,893,30,East Anglian Region
2648187,0-35,F,A Level or Equivalent,9,76.666667,Pass,312,30,South Region
2679821,35-55,F,Lower Than A Level,2,91.500000,Withdrawn,275,30,South East Region


In [8]:
final_df.isnull().sum()

age_band              0
gender                0
highest_education     0
n_assignments         0
mean_score           19
final_result          0
total_clicks          0
studied_credits       0
region                0
dtype: int64

In [9]:
final_df[final_df.isnull().any(axis=1)]

Unnamed: 0_level_0,age_band,gender,highest_education,n_assignments,mean_score,final_result,total_clicks,studied_credits,region
id_student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
186780,35-55,F,A Level or Equivalent,0,,Withdrawn,30,120,North Western Region
549713,35-55,F,Lower Than A Level,0,,Withdrawn,75,60,West Midlands Region
554393,35-55,M,A Level or Equivalent,0,,Fail,21,60,West Midlands Region
606501,0-35,F,A Level or Equivalent,0,,Withdrawn,9,120,West Midlands Region
654422,0-35,F,No Formal quals,0,,Withdrawn,31,60,East Midlands Region
555297,0-35,M,Lower Than A Level,0,,Withdrawn,155,60,North Western Region
557247,0-35,F,A Level or Equivalent,0,,Withdrawn,576,60,Yorkshire Region
557247,0-35,F,A Level or Equivalent,0,,Withdrawn,576,60,Yorkshire Region
427248,35-55,F,A Level or Equivalent,0,,Withdrawn,317,120,North Western Region
676642,0-35,M,Lower Than A Level,0,,Withdrawn,62,60,South West Region


In [10]:
#replace nan with mean value
final_df["mean_score"].fillna(72.83, inplace = True)

In [11]:
#replace "0s" with mean
final_df['n_assignments'] = np.where((final_df.n_assignments == 0), 7, final_df.n_assignments)

In [12]:
final_df['mean_score'] = np.where((final_df.mean_score == 0.0), 72.83, final_df.mean_score)

## Preprocessing

### Numeric Features

In [13]:
num_df = pd.DataFrame(data=final_df, columns=["n_assignments", "mean_score", "total_clicks"])

In [14]:
num_df_log = np.log(num_df)


In [15]:
num_df_log

Unnamed: 0_level_0,n_assignments,mean_score,total_clicks
id_student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
11391,1.609438,4.406719,6.839476
28400,1.609438,4.195697,7.268920
31604,1.609438,4.330733,7.676937
32885,1.609438,3.996364,6.941190
38053,1.609438,4.219508,7.801800
...,...,...,...
2620947,2.197225,4.487387,6.165418
2645731,2.197225,4.478599,6.794587
2648187,2.197225,4.339467,5.743003
2679821,0.693147,4.516339,5.616771


### Categorical Features

In [16]:
final_df['final_result'] = final_df['final_result'].apply(lambda x: str(x).replace('Distinction','3'))
final_df['final_result'] = final_df['final_result'].apply(lambda x: str(x).replace('Pass','2'))
final_df['final_result'] = final_df['final_result'].apply(lambda x: str(x).replace('Fail','1'))
final_df['final_result'] = final_df['final_result'].apply(lambda x: str(x).replace('Withdrawn','0'))

In [17]:
final_df['highest_education'] = final_df['highest_education'].apply(lambda x: str(x).replace('Post Graduate Qualification','4'))
final_df['highest_education'] = final_df['highest_education'].apply(lambda x: str(x).replace('HE Qualification','3'))
final_df['highest_education'] = final_df['highest_education'].apply(lambda x: str(x).replace('A Level or Equivalent','2'))
final_df['highest_education'] = final_df['highest_education'].apply(lambda x: str(x).replace('Lower Than A Level','1'))
final_df['highest_education'] = final_df['highest_education'].apply(lambda x: str(x).replace('No Formal quals','0'))

In [18]:
final_df['age_band'] = final_df['age_band'].apply(lambda x: str(x).replace('55<=','3'))
final_df['age_band'] = final_df['age_band'].apply(lambda x: str(x).replace('35-55','2'))
final_df['age_band'] = final_df['age_band'].apply(lambda x: str(x).replace('0-35','1'))

In [19]:
final_df = pd.get_dummies(data=final_df, columns=['gender', 'region'])

In [20]:
final_df

Unnamed: 0_level_0,age_band,highest_education,n_assignments,mean_score,final_result,total_clicks,studied_credits,gender_F,gender_M,region_East Anglian Region,...,region_London Region,region_North Region,region_North Western Region,region_Scotland,region_South East Region,region_South Region,region_South West Region,region_Wales,region_West Midlands Region,region_Yorkshire Region
id_student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11391,3,3,5,82.000000,2,934,240,0,1,1,...,0,0,0,0,0,0,0,0,0,0
28400,2,3,5,66.400000,2,1435,60,1,0,0,...,0,0,0,1,0,0,0,0,0,0
31604,2,2,5,76.000000,2,2158,60,1,0,0,...,0,0,0,0,1,0,0,0,0,0
32885,1,1,5,54.400000,2,1034,60,1,0,0,...,0,0,0,0,0,0,0,0,1,0
38053,2,2,5,68.000000,2,2445,60,0,1,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2620947,1,2,9,88.888889,3,476,30,1,0,0,...,0,0,0,1,0,0,0,0,0,0
2645731,2,1,9,88.111111,3,893,30,1,0,1,...,0,0,0,0,0,0,0,0,0,0
2648187,1,2,9,76.666667,2,312,30,1,0,0,...,0,0,0,0,0,1,0,0,0,0
2679821,2,1,2,91.500000,0,275,30,1,0,0,...,0,0,0,0,1,0,0,0,0,0


In [21]:
drop_lg = ["n_assignments","mean_score", "total_clicks"]

drop_col(final_df, drop_lg)

Unnamed: 0_level_0,age_band,highest_education,final_result,studied_credits,gender_F,gender_M,region_East Anglian Region,region_East Midlands Region,region_Ireland,region_London Region,region_North Region,region_North Western Region,region_Scotland,region_South East Region,region_South Region,region_South West Region,region_Wales,region_West Midlands Region,region_Yorkshire Region
id_student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
11391,3,3,2,240,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0
28400,2,3,2,60,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0
31604,2,2,2,60,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0
32885,1,1,2,60,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0
38053,2,2,2,60,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2620947,1,2,3,30,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2645731,2,1,3,30,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0
2648187,1,2,2,30,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2679821,2,1,0,30,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0


In [22]:
pro_merged_df = num_df_log.merge(final_df, left_index=True, right_index=True)

In [23]:
pro_merged_df

Unnamed: 0_level_0,n_assignments,mean_score,total_clicks,age_band,highest_education,final_result,studied_credits,gender_F,gender_M,region_East Anglian Region,...,region_London Region,region_North Region,region_North Western Region,region_Scotland,region_South East Region,region_South Region,region_South West Region,region_Wales,region_West Midlands Region,region_Yorkshire Region
id_student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6516,1.609438,4.123903,7.934155,3,3,2,60,0,1,0,...,0,0,0,1,0,0,0,0,0,0
8462,1.945910,4.465908,6.486161,3,3,0,90,0,1,0,...,1,0,0,0,0,0,0,0,0,0
8462,1.945910,4.465908,6.486161,3,3,0,60,0,1,0,...,1,0,0,0,0,0,0,0,0,0
8462,1.945910,4.465908,6.486161,3,3,0,90,0,1,0,...,1,0,0,0,0,0,0,0,0,0
8462,1.945910,4.465908,6.486161,3,3,0,60,0,1,0,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2698535,2.079442,3.669951,8.352554,1,1,2,60,0,1,0,...,0,0,0,0,0,0,0,1,0,0
2698535,2.079442,3.669951,8.352554,1,1,0,60,0,1,0,...,0,0,0,0,0,0,0,1,0,0
2698535,2.079442,3.669951,8.352554,1,1,2,60,0,1,0,...,0,0,0,0,0,0,0,1,0,0
2698577,1.609438,4.165114,6.575076,2,1,1,60,1,0,0,...,0,0,0,0,0,0,0,1,0,0


# UMAP

In [24]:
pro_merged_df = pro_merged_df.to_numpy()

In [25]:
reducer = umap.UMAP(random_state=42)
reducer.fit(pro_merged_df.data)

failed. This is likely due to too small an eigengap. Consider
adding some noise or jitter to your data.

Falling back to random initialisation!


UMAP(a=None, angular_rp_forest=False, b=None,
     force_approximation_algorithm=False, init='spectral', learning_rate=1.0,
     local_connectivity=1.0, low_memory=False, metric='euclidean',
     metric_kwds=None, min_dist=0.1, n_components=2, n_epochs=None,
     n_neighbors=15, negative_sample_rate=5, output_metric='euclidean',
     output_metric_kwds=None, random_state=42, repulsion_strength=1.0,
     set_op_mix_ratio=1.0, spread=1.0, target_metric='categorical',
     target_metric_kwds=None, target_n_neighbors=-1, target_weight=0.5,
     transform_queue_size=4.0, transform_seed=42, unique=False, verbose=False)