In [1]:
import os, sys

import numpy as np
import pandas as pd

from os.path import join

from library import SBKMeans, ErrorChecker
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.cluster import KMeans

In [2]:
pwd = os.getcwd()
data_dir = join(pwd, 'data/')
need_normalization = False
need_standardization = True
need_pca = True
filename = 'turkiye-student-evaluation_R_Specific.csv'
datapath = join(data_dir, filename)
num_comps = 2
num_clusters = 3
num_iterations = 50

In [3]:
df = pd.read_csv(datapath, low_memory=False)

In [4]:
print(len(df))

5820


In [5]:
print(df.head())

   instr  class  nb.repeat  attendance  difficulty  Q1  Q2  Q3  Q4  Q5  ...  \
1      1      2          1           0           4   3   3   3   3   3  ...   
2      1      2          1           1           3   3   3   3   3   3  ...   
3      1      2          1           2           4   5   5   5   5   5  ...   
4      1      2          1           1           3   3   3   3   3   3  ...   
5      1      2          1           0           1   1   1   1   1   1  ...   

   Q19  Q20  Q21  Q22  Q23  Q24  Q25  Q26  Q27  Q28  
1    3    3    3    3    3    3    3    3    3    3  
2    3    3    3    3    3    3    3    3    3    3  
3    5    5    5    5    5    5    5    5    5    5  
4    3    3    3    3    3    3    3    3    3    3  
5    1    1    1    1    1    1    1    1    1    1  

[5 rows x 33 columns]


In [6]:
print(df.dtypes)

instr         int64
class         int64
nb.repeat     int64
attendance    int64
difficulty    int64
Q1            int64
Q2            int64
Q3            int64
Q4            int64
Q5            int64
Q6            int64
Q7            int64
Q8            int64
Q9            int64
Q10           int64
Q11           int64
Q12           int64
Q13           int64
Q14           int64
Q15           int64
Q16           int64
Q17           int64
Q18           int64
Q19           int64
Q20           int64
Q21           int64
Q22           int64
Q23           int64
Q24           int64
Q25           int64
Q26           int64
Q27           int64
Q28           int64
dtype: object


In [7]:
columns = df.columns.tolist()
cols = columns[1:]
# cols = ['Survived', 'Pclass', 'Age', 'SibSp', 'Fare']
# cols = columns[:-1]
cols = ['Passenger Count', 'Adjusted Passenger Count']
cols = columns[1:]
df = df[cols]
for col in cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')
df = df.fillna(0)
df = df.replace([-np.inf, np.inf], np.nan)
df = df.dropna()
df = df.loc[:, (df != 0).any(axis=0)]
print(len(df))
print(df.dtypes)

5820
class         int64
nb.repeat     int64
attendance    int64
difficulty    int64
Q1            int64
Q2            int64
Q3            int64
Q4            int64
Q5            int64
Q6            int64
Q7            int64
Q8            int64
Q9            int64
Q10           int64
Q11           int64
Q12           int64
Q13           int64
Q14           int64
Q15           int64
Q16           int64
Q17           int64
Q18           int64
Q19           int64
Q20           int64
Q21           int64
Q22           int64
Q23           int64
Q24           int64
Q25           int64
Q26           int64
Q27           int64
Q28           int64
dtype: object


In [8]:
df = df.sample(
    frac=1.0,
    random_state=1,
)
df = df.reset_index(drop=True)

In [9]:
if need_normalization is True:
    normalizer = StandardScaler()
    tmp = normalizer.fit_transform(df)
    df = pd.DataFrame(tmp, columns=df.columns)
elif need_standardization is True:
    scaler = MinMaxScaler()
    tmp = scaler.fit_transform(df)
    df = pd.DataFrame(tmp, columns=df.columns)

In [10]:
X = df
if need_pca is True:
    pca = PCA(
        n_components=num_comps,
        svd_solver='auto',
    )
    X = pca.fit_transform(X)
else:
    X = X.to_numpy()

In [11]:
for x in X:
    for a in x:
        if np.isinf(a):
            print(x)

In [12]:
data = X
print(data[:5])
print(data.shape)

[[-1.10503648  0.36752731]
 [-2.36774079 -0.73266403]
 [-1.09405395  0.32037828]
 [ 0.27859712 -0.45985762]
 [ 2.91372029 -0.46401632]]
(5820, 2)


In [13]:
sbkmeans = SBKMeans(
    n_clusters=num_clusters,
    n_iters=num_iterations,
)

In [14]:
centers = sbkmeans.fit(data)

Initial centers:
[[ 2.87186485 -0.48637943]
 [-1.04645225 -0.66964007]
 [ 0.85834787  0.23984265]]


In [15]:
print(sbkmeans.centers)

[[ 1.14791021 -0.4308732 ]
 [-1.10482511  0.08094426]
 [ 1.42712825  0.24767369]]


In [16]:
labels = sbkmeans.predict(data)

In [17]:
print(labels[:5])

[1, 1, 1, 0, 2]


In [18]:
error_checker = ErrorChecker(
    X=data,
    centers=centers,
    labels=labels,
)

In [19]:
dist_total = error_checker.potential_function()

In [20]:
print(dist_total)

72361.56180138634


In [21]:
kmeans = KMeans(
    n_clusters=num_clusters,
    max_iter=num_iterations,
    init='k-means++',
    algorithm='auto',
#     n_init=50,
)

In [22]:
labels = kmeans.fit_predict(data)

In [23]:
error_checker = ErrorChecker(
    data,
    kmeans.cluster_centers_,
    kmeans.labels_,
)

In [24]:
dist_total = error_checker.potential_function()

In [25]:
print(dist_total)

89530.08753283779
