In [1]:
import pandas as pd
import umap
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.cluster import DBSCAN
import hdbscan

%matplotlib inline

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

In [3]:
from sklearn.cluster import KMeans

In [4]:
# Load the processed dataset; the path is relative to the notebook's working directory.
final_df = pd.read_csv("../data/processed/final_df.csv")

In [5]:
# Promote the student identifier to the row index.
# The guard keeps this cell safe to re-run: once 'id_student' is the index it
# is no longer a column, so an unguarded second call would raise KeyError.
if final_df.index.name != 'id_student':
    final_df.set_index('id_student', inplace=True)

In [6]:
def drop_col(df, list):
    """Remove the given columns from ``df`` in place and hand the frame back.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; it is mutated, not copied.
    list : list-like of column labels
        Columns to remove. NOTE: this parameter name shadows the built-in
        ``list`` inside the function body.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, now without the listed columns.

    Raises
    ------
    KeyError
        If any requested label is not a column of ``df``.
    """
    df.drop(labels=list, axis="columns", inplace=True)
    return df

In [7]:
# Columns removed from final_df before preprocessing.
col_drop = [
    "press_mod",
    "region",
    "gender",
]

In [8]:
# Drop the columns named in col_drop. drop_col mutates final_df in place and
# returns it, so the trimmed frame is displayed as this cell's output.
# NOTE(review): re-running this cell raises KeyError because the columns are already gone.
drop_col(final_df, col_drop)

Unnamed: 0_level_0,age_band,highest_education,n_assignments,mean_score,final_result,total_clicks,studied_credits,code_module,code_presentation
id_student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
11391,55<=,HE Qualification,5,82.000000,Pass,934,240,AAA,2013J
28400,35-55,HE Qualification,5,66.400000,Pass,1435,60,AAA,2013J
31604,35-55,A Level or Equivalent,5,76.000000,Pass,2158,60,AAA,2013J
32885,0-35,Lower Than A Level,5,54.400000,Pass,1034,60,AAA,2013J
38053,35-55,A Level or Equivalent,5,68.000000,Pass,2445,60,AAA,2013J
...,...,...,...,...,...,...,...,...,...
2620947,0-35,A Level or Equivalent,9,88.888889,Distinction,476,30,GGG,2014J
2645731,35-55,Lower Than A Level,9,88.111111,Distinction,893,30,GGG,2014J
2648187,0-35,A Level or Equivalent,9,76.666667,Pass,312,30,GGG,2014J
2679821,35-55,Lower Than A Level,2,91.500000,Withdrawn,275,30,GGG,2014J


In [9]:
# Keep the frame's index labels (the student ids) as a plain Python list.
idx=final_df.index.tolist()

In [10]:
# Smallest student id present in the index (display only).
final_df.index.min()

6516

In [11]:
# Display the frame after the column drop (pandas truncates the middle rows).
final_df

Unnamed: 0_level_0,age_band,highest_education,n_assignments,mean_score,final_result,total_clicks,studied_credits,code_module,code_presentation
id_student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
11391,55<=,HE Qualification,5,82.000000,Pass,934,240,AAA,2013J
28400,35-55,HE Qualification,5,66.400000,Pass,1435,60,AAA,2013J
31604,35-55,A Level or Equivalent,5,76.000000,Pass,2158,60,AAA,2013J
32885,0-35,Lower Than A Level,5,54.400000,Pass,1034,60,AAA,2013J
38053,35-55,A Level or Equivalent,5,68.000000,Pass,2445,60,AAA,2013J
...,...,...,...,...,...,...,...,...,...
2620947,0-35,A Level or Equivalent,9,88.888889,Distinction,476,30,GGG,2014J
2645731,35-55,Lower Than A Level,9,88.111111,Distinction,893,30,GGG,2014J
2648187,0-35,A Level or Equivalent,9,76.666667,Pass,312,30,GGG,2014J
2679821,35-55,Lower Than A Level,2,91.500000,Withdrawn,275,30,GGG,2014J


In [12]:
# List the column names that remain after the drop.
final_df.columns

Index(['age_band', 'highest_education', 'n_assignments', 'mean_score',
       'final_result', 'total_clicks', 'studied_credits', 'code_module',
       'code_presentation'],
      dtype='object')

## Scaling numeric features

In [13]:
# Numeric columns fed to the impute-and-scale pipeline.
FEATS = [
    'n_assignments',
    'mean_score',
    'total_clicks',
    'studied_credits',
]


In [14]:
# Numeric preprocessing: fill missing values with the column mean, then
# standardize with StandardScaler.
transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]
)


In [15]:
# Route only the FEATS columns through the numeric pipeline. No `remainder`
# is given, so the default applies (the repr in the next cell shows
# remainder='drop'): every other column is omitted from the output.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', transformer, FEATS),
    ]
)

In [16]:
# Show the configured ColumnTransformer's repr; nothing is fitted in this cell.
preprocessor

ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('num',
                                 Pipeline(memory=None,
                                          steps=[('imputer',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='mean',
                                                                verbose=0)),
                                                 ('scaler',
                                                  StandardScaler(copy=True,
                                                                 with_mean=True,
                             

In [17]:
# Fit the preprocessor on final_df and wrap the transformed array in a DataFrame.
# NOTE(review): no index or column names are passed, so the result gets
# pandas' default integer labels instead of id_student / FEATS names —
# confirm this "blind" layout is intended before joining back to final_df.
blind_num_df = pd.DataFrame(preprocessor.fit_transform(final_df))

In [18]:
# Display the imputed and standardized numeric features.
blind_num_df

Unnamed: 0,0,1,2,3
0,-0.604870,0.589045,-0.409888,4.156988
1,-0.604870,-0.413900,-0.170668,-0.455321
2,-0.604870,0.203296,0.174555,-0.455321
3,-0.604870,-1.185397,-0.362140,-0.455321
4,-0.604870,-0.311034,0.311593,-0.455321
...,...,...,...,...
26716,0.275801,1.031940,-0.628577,-1.224039
26717,0.275801,0.981936,-0.429465,-1.224039
26718,0.275801,0.246157,-0.706884,-1.224039
26719,-1.265373,1.199812,-0.724551,-1.224039


## Encoding categorical features

In [19]:
# Distinct final_result labels present in the data (display only).
final_df.final_result.unique()

array(['Pass', 'Withdrawn', 'Fail', 'Distinction'], dtype=object)

In [20]:
# Recode final_result labels as digit strings in one pass:
# Distinction -> '4', Pass -> '3', Fail -> '2', Withdrawn -> '1'.
# Every value goes through str() first and the codes stay strings (not ints).
final_df['final_result'] = final_df['final_result'].apply(
    lambda outcome: str(outcome)
    .replace('Distinction', '4')
    .replace('Pass', '3')
    .replace('Fail', '2')
    .replace('Withdrawn', '1')
)

In [21]:
# Recode highest_education labels as digit strings in one pass, from
# 'Post Graduate Qualification' -> '5' down to 'No Formal quals' -> '1'.
# Replacements are substring-based and applied in the order written below;
# every value goes through str() first and the codes stay strings.
final_df['highest_education'] = final_df['highest_education'].apply(
    lambda level: str(level)
    .replace('Post Graduate Qualification', '5')
    .replace('HE Qualification', '4')
    .replace('A Level or Equivalent', '3')
    .replace('Lower Than A Level', '2')
    .replace('No Formal quals', '1')
)

In [22]:
# Recode age_band labels as digit strings in one pass:
# '55<=' -> '3', '35-55' -> '2', '0-35' -> '1'.
# Every value goes through str() first and the codes stay strings.
final_df['age_band'] = final_df['age_band'].apply(
    lambda band: str(band)
    .replace('55<=', '3')
    .replace('35-55', '2')
    .replace('0-35', '1')
)

In [23]:
# Distinct code_module labels present in the data (display only).
final_df.code_module.unique()

array(['AAA', 'FFF', 'BBB', 'DDD', 'GGG', 'CCC', 'EEE'], dtype=object)

In [24]:
# TODO: one-hot encode code_module (e.g. with pd.get_dummies)

In [25]:
# Distinct code_presentation labels present in the data (display only).
final_df.code_presentation.unique()

array(['2013J', '2014J', '2014B', '2013B'], dtype=object)

In [26]:
# Ordinal encoding of code_presentation in one pass:
# '2014J' -> '4', '2014B' -> '3', '2013J' -> '2', '2013B' -> '1'.
# Every value goes through str() first and the codes stay strings.
final_df['code_presentation'] = final_df['code_presentation'].apply(
    lambda term: str(term)
    .replace('2014J', '4')
    .replace('2014B', '3')
    .replace('2013J', '2')
    .replace('2013B', '1')
)

In [27]:
#final_all_num = final_df.merge(cat_dummies, left_index=True, right_index=True)

In [28]:
#final_all_num

In [29]:
#Dropping Num feats and dummies on final_df
#drop_col(final_df, FEATS)

In [30]:
#cat_dummies

#blind_num_df


In [31]:
#blind_num_df.to_csv('../data/processed/blind_num_df.csv', index=False)
#merge_final_cat.to_csv('../data/processed/cat_merge.csv', index=False)

# UMAP

In [32]:
#cat_feat = merge_final_cat.to_numpy()

In [33]:
#reducer = umap.UMAP(random_state=42)
#reducer.fit(cat_feat.data)

In [34]:
#embedding = reducer.transform(cat_feat.data)

In [35]:
#plt.scatter(embedding[:, 0], embedding[:, 1], cmap='Spectral', s=5)
#plt.gca().set_aspect('equal', 'datalim')
#plt.colorbar(boundaries=np.arange(11)-0.5).set_ticks(np.arange(10))
#plt.title('UMAP projection of OU Dataset', fontsize=24);