In [1]:
import numpy as np
import pandas as pd
import sklearn
import sklearn.preprocessing, sklearn.cluster, sklearn.metrics
import scipy.spatial
import matplotlib.pyplot as plt
import seaborn as sns

# Social, gender and study data from secondary school students


* school - student's school (binary: 'GP' - Gabriel Pereira or 'MS' - Mousinho da Silveira)
* sex - student's sex (binary: 'F' - female or 'M' - male)
* age - student's age (numeric: from 15 to 22)
* address - student's home address type (binary: 'U' - urban or 'R' - rural)
* famsize - family size (binary: 'LE3' - less or equal to 3 or 'GT3' - greater than 3)
* Pstatus - parent's cohabitation status (binary: 'T' - living together or 'A' - apart)
* Medu - mother's education (numeric: 0 - none, 1 - primary education (4th grade), 2 – 5th to 9th grade, 3 – secondary education or 4 – higher education)
* Fedu - father's education (numeric: 0 - none, 1 - primary education (4th grade), 2 – 5th to 9th grade, 3 – secondary education or 4 – higher education)
* Mjob - mother's job (nominal: 'teacher', 'health' care related, civil 'services' (e.g. administrative or police), 'at_home' or 'other')
* Fjob - father's job (nominal: 'teacher', 'health' care related, civil 'services' (e.g. administrative or police), 'at_home' or 'other')
* reason - reason to choose this school (nominal: close to 'home', school 'reputation', 'course' preference or 'other')
* guardian - student's guardian (nominal: 'mother', 'father' or 'other')
* traveltime - home to school travel time (numeric: 1 - <15 min., 2 - 15 to 30 min., 3 - 30 min. to 1 hour, or 4 - >1 hour)
* studytime - weekly study time (numeric: 1 - <2 hours, 2 - 2 to 5 hours, 3 - 5 to 10 hours, or 4 - >10 hours)
* failures - number of past class failures (numeric: n if 0<=n<3, else 3)
* schoolsup - extra educational support (binary: yes or no)
* famsup - family educational support (binary: yes or no)
* paid - extra paid classes within the course subject (Math or Portuguese) (binary: yes or no)
* activities - extra-curricular activities (binary: yes or no)
* nursery - attended nursery school (binary: yes or no)
* higher - wants to take higher education (binary: yes or no)
* internet - Internet access at home (binary: yes or no)
* romantic - with a romantic relationship (binary: yes or no)
* famrel - quality of family relationships (numeric: from 1 - very bad to 5 - excellent)
* freetime - free time after school (numeric: from 1 - very low to 5 - very high)
* goout - going out with friends (numeric: from 1 - very low to 5 - very high)
* Dalc - workday alcohol consumption (numeric: from 1 - very low to 5 - very high)
* Walc - weekend alcohol consumption (numeric: from 1 - very low to 5 - very high)
* health - current health status (numeric: from 1 - very bad to 5 - very good)
* absences - number of school absences (numeric: from 0 to 93; the maximum in this file is 32)



In [2]:
# Load the raw student records from 'student-por.csv' (path relative to the
# notebook's working directory) and preview the first rows.
df = pd.read_csv('student-por.csv')
df.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,4,0,11,11
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,2,9,11,11
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,6,12,13,12
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,0,14,14,14
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,0,11,13,13


### Chybějící hodnoty

In [3]:
# Number of missing values per column, largest count first.
missing_per_column = df.isna().sum()
missing_per_column.sort_values(ascending=False)

school        0
paid          0
G2            0
G1            0
absences      0
health        0
Walc          0
Dalc          0
goout         0
freetime      0
famrel        0
romantic      0
internet      0
higher        0
nursery       0
activities    0
famsup        0
sex           0
schoolsup     0
failures      0
studytime     0
traveltime    0
guardian      0
reason        0
Fjob          0
Mjob          0
Fedu          0
Medu          0
Pstatus       0
famsize       0
address       0
age           0
G3            0
dtype: int64

In [4]:
# Show the dtype of every column of the raw frame.
df.dtypes

school        object
sex           object
age            int64
address       object
famsize       object
Pstatus       object
Medu           int64
Fedu           int64
Mjob          object
Fjob          object
reason        object
guardian      object
traveltime     int64
studytime      int64
failures       int64
schoolsup     object
famsup        object
paid          object
activities    object
nursery       object
higher        object
internet      object
romantic      object
famrel         int64
freetime       int64
goout          int64
Dalc           int64
Walc           int64
health         int64
absences       int64
G1             int64
G2             int64
G3             int64
dtype: object

### Numerická data

In [5]:
# Summary statistics restricted to the numeric columns.
df.describe(include=np.number)

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
count,649.0,649.0,649.0,649.0,649.0,649.0,649.0,649.0,649.0,649.0,649.0,649.0,649.0,649.0,649.0,649.0
mean,16.744222,2.514638,2.306626,1.568567,1.930663,0.22188,3.930663,3.180277,3.1849,1.502311,2.280431,3.53621,3.659476,11.399076,11.570108,11.906009
std,1.218138,1.134552,1.099931,0.74866,0.82951,0.593235,0.955717,1.051093,1.175766,0.924834,1.28438,1.446259,4.640759,2.745265,2.913639,3.230656
min,15.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
25%,16.0,2.0,1.0,1.0,1.0,0.0,4.0,3.0,2.0,1.0,1.0,2.0,0.0,10.0,10.0,10.0
50%,17.0,2.0,2.0,1.0,2.0,0.0,4.0,3.0,3.0,1.0,2.0,4.0,2.0,11.0,11.0,12.0
75%,18.0,4.0,3.0,2.0,2.0,0.0,5.0,4.0,4.0,2.0,3.0,5.0,6.0,13.0,13.0,14.0
max,22.0,4.0,4.0,4.0,4.0,3.0,5.0,5.0,5.0,5.0,5.0,5.0,32.0,19.0,19.0,19.0


### Kategorická data

In [6]:
# Summary (count / unique / top / freq) of every non-numeric column.
df.describe(exclude=np.number)

Unnamed: 0,school,sex,address,famsize,Pstatus,Mjob,Fjob,reason,guardian,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic
count,649,649,649,649,649,649,649,649,649,649,649,649,649,649,649,649,649
unique,2,2,2,2,2,5,5,4,3,2,2,2,2,2,2,2,2
top,GP,F,U,GT3,T,other,other,course,mother,no,yes,no,no,yes,yes,yes,no
freq,423,383,452,457,569,258,367,285,455,581,398,610,334,521,580,498,410


In [7]:
# Frequency of each age in the dataset, most common first.
df['age'].value_counts()

17    179
16    177
18    140
15    112
19     32
20      6
21      2
22      1
Name: age, dtype: int64

In [8]:
# Frequency of each father's-education level, most common first.
df['Fedu'].value_counts()

2    209
1    174
3    131
4    128
0      7
Name: Fedu, dtype: int64

## Data Preprocessing
Budeme chtít zpracovávat sloupce, u kterých má rozdělení do clusterů smysl. V našem případě se zde nenacházejí žádné jedinečné hodnoty, které by byly pro každý záznam jiné, jako např. jméno studenta. Všechny atributy se dají použít pro rozdělení do skupin, takže se žádných nebudeme zbavovat. Samozřejmě bychom se mohli zbavit těch, které nás nezajímají nebo nám nepřijdou relevantní, ale v tomto případě to už ze zvědavosti necháme, jak to je, a budeme se soustředit na správný preprocessing dat.

In [9]:
# Start from an empty frame that shares the row index of the raw data;
# the encoded / scaled feature columns are added to it step by step.
row_index = df.index
df_encoded = pd.DataFrame(index=row_index)


### Začneme enkódováním kategorických dat

Nejprve binární data

In [10]:
# Two-category columns; each is mapped to 0.0 / 1.0.
binary_columns = [
    'school', 'sex', 'address', 'famsize', 'Pstatus', 'schoolsup', 'famsup',
    'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic',
]

# Guard against a column that is not actually binary: an ordinal code for
# three or more unordered categories would impose a meaningless distance.
assert (df[binary_columns].nunique() <= 2).all(), 'non-binary column in binary_columns'

# OrdinalEncoder handles every column independently, so a single fit over all
# binary columns gives the same codes as one encoder per column.
binary_encoded = sklearn.preprocessing.OrdinalEncoder().fit_transform(df[binary_columns])
for position, column in enumerate(binary_columns):
    # Plain column assignment keeps the cell safe to re-run.
    df_encoded[column] = binary_encoded[:, position]
df_encoded.head()

Unnamed: 0,school,sex,address,famsize,Pstatus,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic
0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
1,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
2,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
3,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
4,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0


Dále kategorická data, u kterých nám nezáleží na vzdálenosti, takže je převedeme na dummy proměnné.

In [11]:
# Nominal (unordered, more than two categories) columns are one-hot encoded.
nominal_columns = ['Mjob', 'Fjob', 'reason', 'guardian']

# One call produces the '<column>_<category>' indicator columns for all four
# attributes. The explicit dtype keeps them as 0/1 integers regardless of the
# pandas version's default dummy dtype.
nominal_dummies = pd.get_dummies(df[nominal_columns], dtype=int)

# Drop any indicator columns left over from a previous run of this cell before
# joining; a plain join would raise on overlapping column names.
df_encoded = (
    df_encoded
    .drop(columns=nominal_dummies.columns, errors='ignore')
    .join(nominal_dummies)
)
df_encoded.head()

Unnamed: 0,school,sex,address,famsize,Pstatus,schoolsup,famsup,paid,activities,nursery,...,Fjob_other,Fjob_services,Fjob_teacher,reason_course,reason_home,reason_other,reason_reputation,guardian_father,guardian_mother,guardian_other
0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0,0,1,1,0,0,0,0,1,0
1,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,1,0,0,1,0,0,0,1,0,0
2,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,...,1,0,0,0,0,1,0,0,1,0
3,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,0,1,0,0,1,0,0,0,1,0
4,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,1,0,0,0,1,0,0,1,0,0


Teď bude potřeba dobře škálovat číselná data.

In [18]:
# Numeric columns of the raw frame:
#   age, Medu, Fedu, traveltime, studytime, failures, famrel, freetime, goout,
#   Dalc, Walc, health, absences, G1, G2, G3
# Only `age` is rescaled in this cell; the result is stored as 'Age'.
age_scaled = sklearn.preprocessing.minmax_scale(df['age'])
df_encoded['Age'] = age_scaled

In [61]:
# NOTE(review): this shows the same numeric summary as the earlier
# `df.describe(include=np.number)` cell — looks like a leftover check;
# confirm whether it is still needed here.
df.describe()

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
count,649.0,649.0,649.0,649.0,649.0,649.0,649.0,649.0,649.0,649.0,649.0,649.0,649.0,649.0,649.0,649.0
mean,16.744222,2.514638,2.306626,1.568567,1.930663,0.22188,3.930663,3.180277,3.1849,1.502311,2.280431,3.53621,3.659476,11.399076,11.570108,11.906009
std,1.218138,1.134552,1.099931,0.74866,0.82951,0.593235,0.955717,1.051093,1.175766,0.924834,1.28438,1.446259,4.640759,2.745265,2.913639,3.230656
min,15.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
25%,16.0,2.0,1.0,1.0,1.0,0.0,4.0,3.0,2.0,1.0,1.0,2.0,0.0,10.0,10.0,10.0
50%,17.0,2.0,2.0,1.0,2.0,0.0,4.0,3.0,3.0,1.0,2.0,4.0,2.0,11.0,11.0,12.0
75%,18.0,4.0,3.0,2.0,2.0,0.0,5.0,4.0,4.0,2.0,3.0,5.0,6.0,13.0,13.0,14.0
max,22.0,4.0,4.0,4.0,4.0,3.0,5.0,5.0,5.0,5.0,5.0,5.0,32.0,19.0,19.0,19.0


## Clustering