In [1]:
PATH = '/home/g056122/FeatExt_Data_Clustering/final_github/collaborative_filtering/'    

In [2]:
DATAPATH = '/home/g056122/ISG_studienarbeit/Movielens_100K'

In [3]:
''' 
Load 'euclid', 'validate' and 'valres' module from the 'Module' folder 
'''
# import required libraries
import numpy as np
import pandas as pd
from kmodes.kmodes import KModes
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from validate import evaluation
from euclid import EuclideanDistance
from valres import result

In [4]:
# import users dataset
dfusers = pd.read_csv(f"{DATAPATH}/u.user", sep='|', header=None, engine='python', encoding='latin-1')
dfusers.columns = ['userId', 'age', 'gender', 'occupation', 'zipcode']

In [5]:
dfusers.shape

(943, 5)

In [6]:
# encode users dataset
categorylist=['age', 'gender', 'occupation', 'zipcode']
n_dfusers = pd.get_dummies(dfusers, columns=categorylist)

In [7]:
# drop userId feature from dataset
n_dfusers = n_dfusers.drop('userId', axis=1)

In [8]:
n_dfusers.shape

(943, 879)

In [9]:
# KMode clustering with optimal value 
num_clusters = 8
kmodes = KModes(n_clusters=num_clusters, init="Huang")
kmodes.fit_predict(n_dfusers)

array([0, 5, 6, 0, 5, 7, 3, 3, 2, 0, 5, 5, 1, 0, 5, 0, 0, 5, 0, 5, 6, 6,
       5, 5, 0, 0, 5, 6, 0, 2, 0, 5, 2, 5, 5, 5, 2, 5, 0, 0, 0, 3, 5, 0,
       0, 5, 0, 3, 5, 6, 1, 5, 0, 7, 0, 0, 0, 0, 1, 0, 0, 5, 0, 1, 5, 2,
       2, 2, 0, 0, 0, 5, 2, 0, 0, 2, 0, 3, 5, 5, 2, 0, 4, 7, 1, 3, 3, 5,
       5, 1, 0, 0, 7, 2, 3, 5, 0, 5, 2, 7, 2, 0, 2, 2, 0, 0, 0, 1, 4, 2,
       0, 0, 7, 0, 0, 0, 2, 3, 0, 5, 0, 5, 5, 2, 0, 5, 0, 5, 5, 0, 5, 4,
       0, 0, 2, 4, 1, 0, 2, 5, 0, 4, 0, 0, 0, 0, 5, 0, 5, 5, 5, 5, 2, 2,
       5, 1, 0, 1, 5, 0, 0, 0, 3, 0, 5, 1, 4, 4, 5, 5, 5, 0, 4, 5, 5, 0,
       0, 4, 0, 5, 7, 0, 0, 0, 5, 5, 1, 2, 0, 3, 3, 1, 2, 3, 0, 6, 0, 5,
       6, 0, 6, 5, 5, 5, 0, 5, 0, 0, 5, 0, 0, 5, 7, 5, 0, 0, 4, 3, 0, 0,
       2, 0, 5, 5, 5, 2, 7, 5, 5, 5, 0, 0, 0, 0, 1, 5, 3, 5, 0, 5, 5, 1,
       1, 0, 2, 2, 0, 2, 2, 7, 0, 0, 5, 1, 0, 5, 2, 5, 2, 5, 3, 5, 0, 5,
       7, 5, 0, 0, 5, 5, 0, 0, 5, 5, 0, 2, 5, 5, 0, 5, 5, 3, 0, 7, 0, 2,
       0, 0, 0, 0, 2, 5, 6, 0, 1, 5, 5, 7, 0, 5, 2,

In [10]:
# cluster centroids
centroids = kmodes.cluster_centroids_

In [11]:
centroids.dtype, centroids.shape

(dtype('uint8'), (8, 879))

In [12]:
# convert users dataframe to an array
n_dfusers = n_dfusers.values

In [13]:
# compute euclidean distance between a cluster centroid and each data point
euc = EuclideanDistance(n_dfusers, num_clusters, centroids)
eucres = euc.compute()

  val = (self.data[i][j] - self.centroid[k][j])**2


In [14]:
eucres.shape

(943, 8)

In [15]:
# convert userId feature to an array
val_u = np.array(dfusers.userId.values)
val_u = val_u.reshape(n_dfusers.shape[0],1)
val_u.shape


(943, 1)

In [16]:
# merge dataset
n_dfusers = np.concatenate((val_u, n_dfusers), axis=1)
n_dfusers, n_dfusers.shape

(array([[  1,   0,   0, ...,   0,   0,   0],
        [  2,   0,   0, ...,   0,   0,   0],
        [  3,   0,   0, ...,   0,   0,   0],
        ...,
        [941,   0,   0, ...,   0,   0,   0],
        [942,   0,   0, ...,   0,   0,   0],
        [943,   0,   0, ...,   0,   0,   0]]),
 (943, 880))

In [17]:
# merge dataset
n_dfusers = np.concatenate((n_dfusers, eucres), axis=1)
n_dfusers.shape

(943, 888)

In [18]:
# import ratings dataset
dfratings = pd.read_csv(f"{DATAPATH}/u.data", sep='\t', header=None, engine='python', encoding='latin-1')
dfratings.columns = ['userId', 'movieId', 'rating', 'timestamp']

In [19]:
# timestamp feature transformation
dfratings['timestamp'] = pd.to_datetime (dfratings['timestamp'], unit='s')
dfratings['year']=dfratings['timestamp'].dt.year

In [20]:
# scalerize ratings dataset
scaler = MinMaxScaler()
dfratings[['year', 'rating']]=scaler.fit_transform(dfratings[['year', 'rating']])

In [21]:
# drop timestamp feature
dfratings = dfratings.drop('timestamp', axis=1)

In [22]:
# split dataset for training and validation
n_dfratings = dfratings.drop('rating', axis=1).values
n_ratings = dfratings['rating'].values

In [23]:
# merge ratings and users dataset
list =[]
for m in range(n_dfratings.shape[0]):
    for n in range(n_dfusers.shape[0]):
        if (n_dfratings[m][0] == n_dfusers[n][0]):
            merge = np.concatenate((n_dfratings[m], n_dfusers[n]))
            list.append(merge)
list = np.array(list)
new_list = []
for k in range(list.shape[0]):
    val = np.delete(list[k], 0)
    new_list.append(val)
new_list = np.array(new_list)

In [24]:
# split dataset to X (features) and y (labels)
X = new_list
y = n_ratings

In [25]:
# kfold cross validation
K = evaluation( X, y, 5, 100)
cv = K.kfold()

Fold 1:
Fold 2:
Fold 3:
Fold 4:
Fold 5:


In [26]:
#split tuple 
classes, eucdis = cv
classes, eucdis

(array([[2.63, 3.21, 2.98, ..., 3.3 , 3.16, 3.35],
        [4.69, 3.61, 3.19, ..., 2.87, 2.17, 4.29],
        [2.93, 4.46, 3.66, ..., 4.01, 2.  , 1.96],
        [4.21, 3.9 , 3.28, ..., 3.1 , 2.94, 4.21],
        [4.38, 4.56, 3.82, ..., 4.29, 3.29, 3.69]]),
 array([[0.  , 0.5 , 0.5 , ..., 0.75, 0.5 , 0.25],
        [0.75, 1.  , 0.5 , ..., 0.25, 0.5 , 1.  ],
        [0.5 , 0.75, 1.  , ..., 0.  , 0.  , 0.  ],
        [0.5 , 0.25, 1.  , ..., 0.5 , 0.25, 0.5 ],
        [0.5 , 0.  , 0.25, ..., 0.75, 0.75, 0.5 ]]))

In [27]:
# metrics
ans = result(classes, eucdis)
Metrics = ans.validate()

Metric | Mean | Standard Deviation
RMSE 3.0077082140209965, 0.0030345198348923617
MAE 2.9418765000000002, 0.0041442845582801945
