In [1]:
import pandas as pd
import umap
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.cluster import DBSCAN
import hdbscan

%matplotlib inline

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

In [3]:
final_df = pd.read_csv("../data/processed/final_df.csv")

In [4]:
final_df.head()

Unnamed: 0,id_student,age_band,gender,highest_education,n_assignments,mean_score,final_result,total_clicks,studied_credits,region,code_module,code_presentation,press_mod
0,11391,55<=,M,HE Qualification,5,82.0,Pass,934,240,East Anglian Region,AAA,2013J,2013J-AAA
1,28400,35-55,F,HE Qualification,5,66.4,Pass,1435,60,Scotland,AAA,2013J,2013J-AAA
2,31604,35-55,F,A Level or Equivalent,5,76.0,Pass,2158,60,South East Region,AAA,2013J,2013J-AAA
3,32885,0-35,F,Lower Than A Level,5,54.4,Pass,1034,60,West Midlands Region,AAA,2013J,2013J-AAA
4,38053,35-55,M,A Level or Equivalent,5,68.0,Pass,2445,60,Wales,AAA,2013J,2013J-AAA


In [5]:
final_df.dtypes

id_student             int64
age_band              object
gender                object
highest_education     object
n_assignments          int64
mean_score           float64
final_result          object
total_clicks           int64
studied_credits        int64
region                object
code_module           object
code_presentation     object
press_mod             object
dtype: object

In [7]:
#find null values
final_df[final_df.isnull().any(axis=1)]

Unnamed: 0,id_student,age_band,gender,highest_education,n_assignments,mean_score,final_result,total_clicks,studied_credits,region,code_module,code_presentation,press_mod
797,186780,35-55,F,A Level or Equivalent,0,,Withdrawn,30,120,North Western Region,BBB,2013B,2013B-BBB
1734,549713,35-55,F,Lower Than A Level,0,,Withdrawn,75,60,West Midlands Region,BBB,2013B,2013B-BBB
1843,554393,35-55,M,A Level or Equivalent,0,,Fail,21,60,West Midlands Region,BBB,2013B,2013B-BBB
4498,606501,0-35,F,A Level or Equivalent,0,,Withdrawn,9,120,West Midlands Region,BBB,2014B,2014B-BBB
6171,654422,0-35,F,No Formal quals,0,,Withdrawn,31,60,East Midlands Region,BBB,2014J,2014J-BBB
13730,555297,0-35,M,Lower Than A Level,0,,Withdrawn,155,60,North Western Region,DDD,2013B,2013B-DDD
13768,557247,0-35,F,A Level or Equivalent,0,,Withdrawn,576,60,Yorkshire Region,DDD,2013B,2013B-DDD
13769,557247,0-35,F,A Level or Equivalent,0,,Withdrawn,576,60,Yorkshire Region,DDD,2013J,2013J-DDD
14355,427248,35-55,F,A Level or Equivalent,0,,Withdrawn,317,120,North Western Region,DDD,2013J,2013J-DDD
17039,676642,0-35,M,Lower Than A Level,0,,Withdrawn,62,60,South West Region,DDD,2014J,2014J-DDD


In [8]:
final_df.isnull().sum()

id_student            0
age_band              0
gender                0
highest_education     0
n_assignments         0
mean_score           19
final_result          0
total_clicks          0
studied_credits       0
region                0
code_module           0
code_presentation     0
press_mod             0
dtype: int64

In [9]:
#replace nan with mean value
final_df["mean_score"].fillna(72.83, inplace = True)

In [10]:
final_df.describe()

Unnamed: 0,id_student,n_assignments,mean_score,total_clicks,studied_credits
count,26721.0,26721.0,26721.0,26721.0,26721.0
mean,708581.6,7.747315,72.837882,1792.429812,77.769357
std,554072.3,4.542077,15.554484,2094.3421,39.026739
min,6516.0,0.0,0.0,1.0,30.0
25%,505878.0,4.0,64.8,460.0,60.0
50%,589327.0,7.0,75.714286,1082.0,60.0
75%,642196.0,11.0,84.047619,2383.0,90.0
max,2698588.0,28.0,100.0,28615.0,630.0


In [11]:
#replace "0s" with mean
final_df['n_assignments'] = np.where((final_df.n_assignments == 0), 7, final_df.n_assignments)


In [12]:
final_df['mean_score'] = np.where((final_df.mean_score == 0.0), 72.83, final_df.mean_score)

In [13]:
final_df['clicks_per_asmt'] = final_df['total_clicks']/final_df['n_assignments']

In [14]:
num_res_df = pd.DataFrame(data=final_df, columns=["n_assignments", "mean_score", "clicks_per_asmt", "studied_credits","id_student", "final_result"])



In [15]:
final_df.isnull().sum()

id_student           0
age_band             0
gender               0
highest_education    0
n_assignments        0
mean_score           0
final_result         0
total_clicks         0
studied_credits      0
region               0
code_module          0
code_presentation    0
press_mod            0
clicks_per_asmt      0
dtype: int64

In [16]:
num_res_df

Unnamed: 0,n_assignments,mean_score,clicks_per_asmt,studied_credits,id_student,final_result
0,5,82.000000,186.800000,240,11391,Pass
1,5,66.400000,287.000000,60,28400,Pass
2,5,76.000000,431.600000,60,31604,Pass
3,5,54.400000,206.800000,60,32885,Pass
4,5,68.000000,489.000000,60,38053,Pass
...,...,...,...,...,...,...
26716,9,88.888889,52.888889,30,2620947,Distinction
26717,9,88.111111,99.222222,30,2645731,Distinction
26718,9,76.666667,34.666667,30,2648187,Pass
26719,2,91.500000,137.500000,30,2679821,Withdrawn


In [17]:
num_res_df['final_result'] = num_res_df['final_result'].apply(lambda x: str(x).replace('Distinction','4'))
num_res_df['final_result'] = num_res_df['final_result'].apply(lambda x: str(x).replace('Pass','3'))
num_res_df['final_result'] = num_res_df['final_result'].apply(lambda x: str(x).replace('Fail','2'))
num_res_df['final_result'] = num_res_df['final_result'].apply(lambda x: str(x).replace('Withdrawn','1'))

In [None]:
num_res_df

In [18]:
num_res_df.set_index(['id_student'], inplace=True)

In [19]:
num_res_df.dtypes

n_assignments        int64
mean_score         float64
clicks_per_asmt    float64
studied_credits      int64
final_result        object
dtype: object

In [20]:
num_res_df=num_res_df.astype(int)

In [24]:
num_res_df.dtypes

n_assignments      int64
mean_score         int64
clicks_per_asmt    int64
studied_credits    int64
final_result       int64
dtype: object

In [None]:
#num_res_df.to_csv('../data/processed/numeric_results_df.csv', index=False)

In [33]:
num_res_df.clicks_per_asmt.unique()

array([ 186,  287,  431, ..., 1290, 1196,  826])

In [25]:
num_res_log = np.log(num_res_df)

  """Entry point for launching an IPython kernel.


In [None]:
#to numpy

In [28]:
np_num_log = num_res_log.to_numpy()

# UMAP LOG

In [29]:
reducer = umap.UMAP(random_state=42)
reducer.fit(np_num_log.data)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [None]:
embedding = reducer.transform(np_num_log.data)


In [None]:
plt.scatter(embedding[:, 0], embedding[:, 1], cmap='Spectral', s=5)
plt.gca().set_aspect('equal', 'datalim')
plt.colorbar(boundaries=np.arange(11)-0.5).set_ticks(np.arange(10))
plt.title('UMAP projection of the numeric dataset', fontsize=24);

# UMAP StandardScaler Mean

In [None]:
transformer = \
Pipeline(steps=[('imputer', SimpleImputer(strategy='mean')), 
                ('scaler', StandardScaler())])


In [None]:
FEATS = ['n_assignments', 'mean_score','final_result', 'clicks_per_asmt', 'studied_credits']
preprocessor = \
ColumnTransformer(transformers=[('num', transformer, FEATS)])

In [None]:
preprocessor

In [None]:
blind = pd.DataFrame(data=preprocessor.fit_transform(num_res_df))

In [None]:
blind

In [None]:
blind_np = blind.to_numpy()

In [None]:
reducer_sclr = umap.UMAP(random_state=42)
reducer_sclr.fit(blind_np.data)

In [None]:
embedding_sclr = reducer_sclr.transform(blind_np.data)


In [None]:
plt.scatter(embedding_sclr[:, 0], embedding_sclr[:, 1], cmap='Spectral', s=5)
plt.gca().set_aspect('equal', 'datalim')
plt.colorbar(boundaries=np.arange(11)-0.5).set_ticks(np.arange(10))
plt.title('UMAP projection of the numeric dataset', fontsize=24);

# UMAP StandardScaler Median

In [None]:
transformer_med = \
Pipeline(steps=[('imputer', SimpleImputer(strategy='median')), 
                ('scaler', StandardScaler())])


In [None]:
preprocessor_med = \
ColumnTransformer(transformers=[('num', transformer, FEATS)])

In [None]:
preprocessor_med

In [None]:
blind_med = pd.DataFrame(data=preprocessor_med.fit_transform(num_res_df))

In [None]:
blind_med_np = blind_med.to_numpy()

In [None]:
reducer_med = umap.UMAP(random_state=42)
reducer_med.fit(blind_med_np.data)

In [None]:
embedding_med = reducer_med.transform(blind_med_np.data)


In [None]:
plt.scatter(embedding_med[:, 0], embedding_med[:, 1], cmap='Spectral', s=5)
plt.gca().set_aspect('equal', 'datalim')
plt.colorbar(boundaries=np.arange(11)-0.5).set_ticks(np.arange(10))
plt.title('UMAP projection of the numeric dataset', fontsize=24);