In [1]:
# Machine-specific absolute path.
# NOTE(review): no cell visible in this notebook reads PATH — confirm whether it is still needed.
PATH = '/home/g056122/FeatExt_Data_Clustering/final_github/collaborative_filtering'

In [2]:
# Directory from which the `u.user` and `u.data` files are read further down.
# Machine-specific absolute path: change it when running on another host.
DATAPATH = '/home/g056122/ISG_studienarbeit/Movielens_100K'

In [3]:
''' 
Load 'validate' and 'valres' module available in 'Module' folder
'''
# import required libraries
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.cluster import OPTICS
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from validate import evaluation
from valres import result

In [4]:
# Load the user table: pipe-separated, no header row, latin-1 encoded.
user_columns = ['userId', 'age', 'gender', 'occupation', 'zipcode']
dfusers = pd.read_csv(
    f"{DATAPATH}/u.user",
    sep='|',
    header=None,
    names=user_columns,
    engine='python',
    encoding='latin-1',
)
# Keep an untouched copy; its userId column is re-attached after the
# feature columns have been transformed.
copydfuser = dfusers.copy()

In [5]:
# Preview the first rows of the raw user table.
dfusers.head()

Unnamed: 0,userId,age,gender,occupation,zipcode
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [6]:
# Number of distinct user IDs.
dfusers['userId'].nunique()

943

In [7]:
# Number of distinct age values.
dfusers['age'].nunique()

61

In [8]:
# Number of distinct gender values.
dfusers['gender'].nunique()

2

In [9]:
# Number of distinct occupations.
dfusers['occupation'].nunique()

21

In [10]:
# Number of distinct zip codes.
dfusers['zipcode'].nunique()

795

In [11]:
# Replace each categorical column with integer codes, one column at a time.
# The same encoder object is re-fitted per column, so after the loop it holds
# the classes of the last column in `categorylist`.
labelencoder = LabelEncoder()
categorylist = ['occupation', 'gender', 'zipcode']
for column in categorylist:
    dfusers[column] = labelencoder.fit_transform(dfusers[column])
# Alternative that is not used here: one-hot encoding via
# n_dfusers = pd.get_dummies(dfusers, columns=categorylist)

In [12]:
# Preview the user table after the categorical columns were label-encoded.
dfusers.head()

Unnamed: 0,userId,age,gender,occupation,zipcode
0,1,24,1,19,622
1,2,53,0,13,689
2,3,23,1,20,270
3,4,24,1,19,331
4,5,33,0,13,133


In [13]:
# Summary statistics of the label-encoded user table (before scaling).
dfusers.describe()

Unnamed: 0,userId,age,gender,occupation,zipcode
count,943.0,943.0,943.0,943.0,943.0
mean,472.0,34.051962,0.710498,10.83351,396.287381
std,272.364951,12.19274,0.453772,6.658875,225.636691
min,1.0,7.0,0.0,0.0,0.0
25%,236.5,25.0,0.0,4.0,202.5
50%,472.0,31.0,1.0,13.0,411.0
75%,707.5,43.0,1.0,18.0,587.5
max,943.0,73.0,1.0,20.0,794.0


In [14]:
# Min-max scale the four demographic columns; userId is left untouched.
minmaxindex = ['age', 'occupation', 'gender', 'zipcode']
scaler = MinMaxScaler()
scaler.fit(dfusers[minmaxindex])
dfusers[minmaxindex] = scaler.transform(dfusers[minmaxindex])

In [15]:
# Summary statistics after min-max scaling of age, gender, occupation and zipcode.
dfusers.describe()

Unnamed: 0,userId,age,gender,occupation,zipcode
count,943.0,943.0,943.0,943.0,943.0
mean,472.0,0.409878,0.710498,0.541676,0.499102
std,272.364951,0.184738,0.453772,0.332944,0.284177
min,1.0,0.0,0.0,0.0,0.0
25%,236.5,0.272727,0.0,0.2,0.255038
50%,472.0,0.363636,1.0,0.65,0.517632
75%,707.5,0.545455,1.0,0.9,0.739924
max,943.0,1.0,1.0,1.0,1.0


In [16]:
# Remove the userId column so only the four scaled demographic columns remain.
dfusers = dfusers.drop(columns=['userId'])
dfusers.head()

Unnamed: 0,age,gender,occupation,zipcode
0,0.257576,1.0,0.95,0.783375
1,0.69697,0.0,0.65,0.867758
2,0.242424,1.0,1.0,0.34005
3,0.257576,1.0,0.95,0.416877
4,0.393939,0.0,0.65,0.167506


In [17]:
# Summary statistics of the feature-only user table (userId removed).
dfusers.describe()

Unnamed: 0,age,gender,occupation,zipcode
count,943.0,943.0,943.0,943.0
mean,0.409878,0.710498,0.541676,0.499102
std,0.184738,0.453772,0.332944,0.284177
min,0.0,0.0,0.0,0.0
25%,0.272727,0.0,0.2,0.255038
50%,0.363636,1.0,0.65,0.517632
75%,0.545455,1.0,0.9,0.739924
max,1.0,1.0,1.0,1.0


In [18]:
# OPTICS model
# min_samples: neighbourhood size for a point to count as a core point.
# xi: minimum reachability steepness that marks a cluster boundary.
# min_cluster_size: given as a float, so it is interpreted as a fraction of the
# number of samples (see the scikit-learn OPTICS documentation).
optics_model = OPTICS(min_samples = 5, xi = 0.02, min_cluster_size = 0.003)

In [19]:
# Fit OPTICS on the user feature table; at this point `dfusers` holds only the
# scaled age, gender, occupation and zipcode columns.
optics_model.fit(dfusers)

In [20]:
# Sorted distinct labels assigned by OPTICS. scikit-learn uses the label -1 for
# samples that were not assigned to any cluster (noise), so -1 may be part of
# this array.
cluster = np.unique(optics_model.labels_)
print('Clusters: ', cluster)

Clusters:  [-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22
 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46
 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70
 71 72 73 74 75]


In [21]:
# Count the samples labelled -1, i.e. those OPTICS left unassigned (noise).
outliers = np.count_nonzero(optics_model.labels_ == -1)
print('Outliers: ', outliers)

Outliers:  418


In [22]:
# For every label in `cluster` (this includes the noise label -1 whenever it is
# present), take the smallest reachability distance among the samples that
# carry that label. The result is a 1-D array aligned with `cluster`.
labels = optics_model.labels_
reach_dis = optics_model.reachability_
cluster_representatives = np.array(
    [np.min(reach_dis[labels == label]) for label in cluster]
)

In [23]:
# Augment the user table with reachability-based features.
#
# `reach_dis` is each sample's OPTICS reachability distance. Every `cluster m`
# column is that distance minus the m-th entry of `cluster_representatives`;
# m is the position in that array, not the OPTICS label itself.
#
# Each assignment below already operates on the whole column at once, so a
# single pass over the representatives is enough. The former outer loop over
# row indices only repeated the identical assignments once per row.
dfusers['reach_dis'] = optics_model.reachability_
for m, n in enumerate(cluster_representatives):
    dfusers[f'cluster {m}'] = dfusers['reach_dis'] - n

In [24]:
# Preview the augmented table (demographics, reach_dis and the `cluster m` columns).
dfusers.head()

Unnamed: 0,age,gender,occupation,zipcode,reach_dis,cluster 0,cluster 1,cluster 2,cluster 3,cluster 4,...,cluster 67,cluster 68,cluster 69,cluster 70,cluster 71,cluster 72,cluster 73,cluster 74,cluster 75,cluster 76
0,0.257576,1.0,0.95,0.783375,inf,inf,inf,inf,inf,inf,...,inf,inf,inf,inf,inf,inf,inf,inf,inf,inf
1,0.69697,0.0,0.65,0.867758,0.161915,0.129561,0.135466,0.130335,0.116443,0.136703,...,0.040829,0.030531,0.040644,0.062714,0.100812,0.051608,0.032619,0.027614,0.066877,0.060742
2,0.242424,1.0,1.0,0.34005,0.067012,0.034658,0.040563,0.035432,0.02154,0.0418,...,-0.054074,-0.064372,-0.054259,-0.032189,0.005909,-0.043295,-0.062284,-0.067289,-0.028026,-0.034161
3,0.257576,1.0,0.95,0.416877,0.053018,0.020664,0.02657,0.021438,0.007546,0.027806,...,-0.068067,-0.078365,-0.068253,-0.046182,-0.008084,-0.057289,-0.076278,-0.081283,-0.04202,-0.048155
4,0.393939,0.0,0.65,0.167506,0.12308,0.090727,0.096632,0.0915,0.077608,0.097868,...,0.001995,-0.008303,0.001809,0.02388,0.061978,0.012773,-0.006215,-0.011221,0.028042,0.021907


In [25]:
# Treat infinite values as missing, then interpolate the gaps along the row
# order with pandas' default (linear) method. Missing values at the very top
# of a column have no preceding value and are left as NaN by this step.
dfusers = dfusers.replace([np.inf, -np.inf], np.nan).interpolate()

In [26]:
# Summary statistics after replacing infinities and interpolating; a `count`
# below the number of rows indicates NaN values that are still present.
dfusers.describe()

Unnamed: 0,age,gender,occupation,zipcode,reach_dis,cluster 0,cluster 1,cluster 2,cluster 3,cluster 4,...,cluster 67,cluster 68,cluster 69,cluster 70,cluster 71,cluster 72,cluster 73,cluster 74,cluster 75,cluster 76
count,943.0,943.0,943.0,943.0,942.0,942.0,942.0,942.0,942.0,942.0,...,942.0,942.0,942.0,942.0,942.0,942.0,942.0,942.0,942.0,942.0
mean,0.409878,0.710498,0.541676,0.499102,0.088678,0.056325,0.06223,0.057099,0.043207,0.063467,...,-0.032407,-0.042705,-0.032593,-0.010522,0.027576,-0.021629,-0.040617,-0.045622,-0.006359,-0.012494
std,0.184738,0.453772,0.332944,0.284177,0.046909,0.046909,0.046909,0.046909,0.046909,0.046909,...,0.046909,0.046909,0.046909,0.046909,0.046909,0.046909,0.046909,0.046909,0.046909,0.046909
min,0.0,0.0,0.0,0.0,0.02053,-0.011823,-0.005918,-0.011049,-0.024942,-0.004681,...,-0.100555,-0.110853,-0.100741,-0.07867,-0.040572,-0.089777,-0.108765,-0.11377,-0.074507,-0.080642
25%,0.272727,0.0,0.2,0.255038,0.060852,0.028498,0.034403,0.029272,0.01538,0.03564,...,-0.060233,-0.070531,-0.060419,-0.038348,-0.000251,-0.049455,-0.068444,-0.073449,-0.034186,-0.040321
50%,0.363636,1.0,0.65,0.517632,0.08452,0.052166,0.058072,0.05294,0.039048,0.059308,...,-0.036565,-0.046863,-0.036751,-0.01468,0.023418,-0.025787,-0.044776,-0.049781,-0.010518,-0.016653
75%,0.545455,1.0,0.9,0.739924,0.110614,0.078261,0.084166,0.079034,0.065142,0.085403,...,-0.010471,-0.020769,-0.010657,0.011414,0.049512,0.000307,-0.018681,-0.023686,0.015577,0.009442
max,1.0,1.0,1.0,1.0,1.0,0.967646,0.973552,0.96842,0.954528,0.974788,...,0.878915,0.868617,0.878729,0.9008,0.938898,0.889693,0.870704,0.865699,0.904962,0.898827


In [27]:
# Fill every remaining NaN with the mean of its own column.
column_means = dfusers.mean()
dfusers = dfusers.fillna(column_means)

In [28]:
# Preview the table after the remaining NaN values were filled with column means.
dfusers.head()

Unnamed: 0,age,gender,occupation,zipcode,reach_dis,cluster 0,cluster 1,cluster 2,cluster 3,cluster 4,...,cluster 67,cluster 68,cluster 69,cluster 70,cluster 71,cluster 72,cluster 73,cluster 74,cluster 75,cluster 76
0,0.257576,1.0,0.95,0.783375,0.088678,0.056325,0.06223,0.057099,0.043207,0.063467,...,-0.032407,-0.042705,-0.032593,-0.010522,0.027576,-0.021629,-0.040617,-0.045622,-0.006359,-0.012494
1,0.69697,0.0,0.65,0.867758,0.161915,0.129561,0.135466,0.130335,0.116443,0.136703,...,0.040829,0.030531,0.040644,0.062714,0.100812,0.051608,0.032619,0.027614,0.066877,0.060742
2,0.242424,1.0,1.0,0.34005,0.067012,0.034658,0.040563,0.035432,0.02154,0.0418,...,-0.054074,-0.064372,-0.054259,-0.032189,0.005909,-0.043295,-0.062284,-0.067289,-0.028026,-0.034161
3,0.257576,1.0,0.95,0.416877,0.053018,0.020664,0.02657,0.021438,0.007546,0.027806,...,-0.068067,-0.078365,-0.068253,-0.046182,-0.008084,-0.057289,-0.076278,-0.081283,-0.04202,-0.048155
4,0.393939,0.0,0.65,0.167506,0.12308,0.090727,0.096632,0.0915,0.077608,0.097868,...,0.001995,-0.008303,0.001809,0.02388,0.061978,0.012773,-0.006215,-0.011221,0.028042,0.021907


In [29]:
# Put the original userId column back in front of the engineered features.
# Both frames share the same row index, so the concat aligns them row by row.
user_ids = copydfuser['userId']
dfusers = pd.concat([user_ids, dfusers], axis=1)
dfusers.head()

Unnamed: 0,userId,age,gender,occupation,zipcode,reach_dis,cluster 0,cluster 1,cluster 2,cluster 3,...,cluster 67,cluster 68,cluster 69,cluster 70,cluster 71,cluster 72,cluster 73,cluster 74,cluster 75,cluster 76
0,1,0.257576,1.0,0.95,0.783375,0.088678,0.056325,0.06223,0.057099,0.043207,...,-0.032407,-0.042705,-0.032593,-0.010522,0.027576,-0.021629,-0.040617,-0.045622,-0.006359,-0.012494
1,2,0.69697,0.0,0.65,0.867758,0.161915,0.129561,0.135466,0.130335,0.116443,...,0.040829,0.030531,0.040644,0.062714,0.100812,0.051608,0.032619,0.027614,0.066877,0.060742
2,3,0.242424,1.0,1.0,0.34005,0.067012,0.034658,0.040563,0.035432,0.02154,...,-0.054074,-0.064372,-0.054259,-0.032189,0.005909,-0.043295,-0.062284,-0.067289,-0.028026,-0.034161
3,4,0.257576,1.0,0.95,0.416877,0.053018,0.020664,0.02657,0.021438,0.007546,...,-0.068067,-0.078365,-0.068253,-0.046182,-0.008084,-0.057289,-0.076278,-0.081283,-0.04202,-0.048155
4,5,0.393939,0.0,0.65,0.167506,0.12308,0.090727,0.096632,0.0915,0.077608,...,0.001995,-0.008303,0.001809,0.02388,0.061978,0.012773,-0.006215,-0.011221,0.028042,0.021907


In [30]:
# Load the ratings table: tab-separated, no header row, latin-1 encoded.
rating_columns = ['userId', 'movieId', 'rating', 'timestamp']
dfratings = pd.read_csv(
    f"{DATAPATH}/u.data",
    sep='\t',
    header=None,
    names=rating_columns,
    engine='python',
    encoding='latin-1',
)

In [31]:
# Interpret the timestamp column as seconds since the Unix epoch and derive
# the calendar year of each rating from it.
rating_time = pd.to_datetime(dfratings['timestamp'], unit='s')
dfratings['timestamp'] = rating_time
dfratings['year'] = rating_time.dt.year

In [32]:
# Min-max scale `year` and `rating`. This re-fits the shared `scaler` object,
# replacing the fit it obtained earlier on the user columns.
# NOTE(review): `rating` is later used as the prediction target `y`, so the
# target is on the scaled range from here on — confirm that the evaluation
# modules expect this.
scaled_columns = ['year', 'rating']
dfratings[scaled_columns] = scaler.fit_transform(dfratings[scaled_columns])
# The raw timestamp is not used as a feature once `year` has been derived.
dfratings = dfratings.drop(columns='timestamp')

In [33]:
# Number of distinct (scaled) year values among the ratings.
dfratings['year'].nunique()

2

In [34]:
# Attach each user's feature row to every one of that user's ratings
# (default inner join on userId; user columns come first).
dfratings = dfusers.merge(dfratings, on='userId')

In [35]:
# Preview the merged user-feature / rating table.
dfratings.head()

Unnamed: 0,userId,age,gender,occupation,zipcode,reach_dis,cluster 0,cluster 1,cluster 2,cluster 3,...,cluster 70,cluster 71,cluster 72,cluster 73,cluster 74,cluster 75,cluster 76,movieId,rating,year
0,1,0.257576,1.0,0.95,0.783375,0.088678,0.056325,0.06223,0.057099,0.043207,...,-0.010522,0.027576,-0.021629,-0.040617,-0.045622,-0.006359,-0.012494,61,0.75,0.0
1,1,0.257576,1.0,0.95,0.783375,0.088678,0.056325,0.06223,0.057099,0.043207,...,-0.010522,0.027576,-0.021629,-0.040617,-0.045622,-0.006359,-0.012494,189,0.5,1.0
2,1,0.257576,1.0,0.95,0.783375,0.088678,0.056325,0.06223,0.057099,0.043207,...,-0.010522,0.027576,-0.021629,-0.040617,-0.045622,-0.006359,-0.012494,33,0.75,0.0
3,1,0.257576,1.0,0.95,0.783375,0.088678,0.056325,0.06223,0.057099,0.043207,...,-0.010522,0.027576,-0.021629,-0.040617,-0.045622,-0.006359,-0.012494,160,0.75,0.0
4,1,0.257576,1.0,0.95,0.783375,0.088678,0.056325,0.06223,0.057099,0.043207,...,-0.010522,0.027576,-0.021629,-0.040617,-0.045622,-0.006359,-0.012494,20,0.75,1.0


In [36]:
# Target vector: the rating column.
y = dfratings['rating'].values
# Feature matrix: every other column of the merged table, which includes the
# raw userId and movieId integers alongside the engineered features.
X = dfratings.drop(columns='rating').values

In [37]:
# kfold cross validation
# `evaluation` comes from the local `validate` module, whose source is not
# part of this notebook.
# NOTE(review): the positional arguments 5 and 100 are presumably the number
# of folds and a model size parameter (e.g. number of trees) — TODO confirm
# against `validate.evaluation`.
K = evaluation( X, y, 5, 100)
cv = K.kfold()

Fold 1:
Fold 2:
Fold 3:
Fold 4:
Fold 5:


In [38]:
# Unpack the 2-tuple returned by `K.kfold()` and display both arrays.
# NOTE(review): the names suggest class labels and Euclidean distances, but the
# displayed arrays look like per-fold predicted values and per-fold 0-1 scaled
# ratings — confirm the meaning of both against `validate.evaluation.kfold`.
classes, eucdis = cv
classes, eucdis

(array([[2.64, 3.12, 4.4 , ..., 3.22, 2.74, 3.04],
        [3.59, 3.47, 3.84, ..., 4.05, 2.7 , 2.92],
        [4.47, 3.4 , 2.81, ..., 2.99, 4.11, 3.09],
        [4.32, 3.47, 4.01, ..., 4.1 , 3.91, 1.7 ],
        [4.52, 3.91, 4.76, ..., 3.  , 3.39, 4.37]]),
 array([[0.75, 0.75, 0.5 , ..., 0.5 , 0.  , 0.75],
        [1.  , 0.75, 0.25, ..., 0.75, 0.25, 0.75],
        [1.  , 0.5 , 0.  , ..., 0.5 , 0.75, 0.5 ],
        [0.5 , 0.75, 0.75, ..., 0.75, 0.5 , 0.25],
        [0.75, 0.75, 1.  , ..., 0.  , 0.5 , 0.75]]))

In [40]:
# metrics
# `result` comes from the local `valres` module (source not visible here);
# `validate()` prints the mean and standard deviation of RMSE and MAE.
# NOTE(review): the first array displayed above holds values around 2-5 while
# the second holds values in [0, 1]; if these are predictions and targets,
# they are on different scales and the reported errors are inflated — verify.
ans = result(classes, eucdis)
metrics = ans.validate()

Metric | Mean | Standard Deviation
RMSE 2.943602276404458, 0.005240262063594165
MAE 2.8813364000000004, 0.005565491365549034
