In [1]:
PATH = '/home/g056122/FeatExt_Data_Clustering/final_github/collaborative_filtering'

In [2]:
DATAPATH = '/home/g056122/ISG_studienarbeit/Movielens_100K'

In [3]:
# import required libraries
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import mean_squared_error,mean_absolute_error 

In [4]:
# import users dataset
dfusers = pd.read_csv(f"{DATAPATH}/u.user", sep='|', header=None, engine='python', encoding='latin-1')
dfusers.columns = ['userId', 'age', 'gender', 'occupation', 'zipcode']

In [5]:
# copy of users dataset
copydfuser = dfusers.copy()

In [6]:
dfusers.head()

Unnamed: 0,userId,age,gender,occupation,zipcode
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [7]:
# encode users dataset
labelencoder = LabelEncoder()
categorylist=['occupation','gender', 'zipcode']
dfusers[categorylist]=dfusers[categorylist].apply(labelencoder.fit_transform)

In [8]:
# scalerize the users dataset
scaler = MinMaxScaler()
minmaxindex = ['age', 'occupation', 'gender', 'zipcode']
dfusers[minmaxindex] = scaler.fit_transform(dfusers[minmaxindex])

In [9]:
dfusers.describe()

Unnamed: 0,userId,age,gender,occupation,zipcode
count,943.0,943.0,943.0,943.0,943.0
mean,472.0,0.409878,0.710498,0.541676,0.499102
std,272.364951,0.184738,0.453772,0.332944,0.284177
min,1.0,0.0,0.0,0.0,0.0
25%,236.5,0.272727,0.0,0.2,0.255038
50%,472.0,0.363636,1.0,0.65,0.517632
75%,707.5,0.545455,1.0,0.9,0.739924
max,943.0,1.0,1.0,1.0,1.0


In [10]:
# import ratings dataset
dfratings = pd.read_csv(f"{DATAPATH}/u.data", sep='\t', header=None, engine='python', encoding='latin-1')
dfratings.columns = ['userId', 'movieId', 'rating', 'timestamp']

In [11]:
# timestamp feature transformation
dfratings['timestamp'] = pd.to_datetime (dfratings['timestamp'], unit='s')
dfratings['year']=dfratings['timestamp'].dt.year

In [12]:
# minimum and maximum rating 
min_rating = np.min(dfratings.rating)
max_rating = np.max(dfratings.rating)

In [13]:
# scalerize ratings dataset
scaler = MinMaxScaler()
dfratings[['year', 'rating']]=scaler.fit_transform(dfratings[['year', 'rating']])

In [14]:
# drop timestamp feature 
dfratings = dfratings.drop('timestamp', axis=1)

In [15]:
dfratings.describe()

Unnamed: 0,userId,movieId,rating,year
count,100000.0,100000.0,100000.0,100000.0
mean,462.48475,425.53013,0.632465,0.47101
std,266.61442,330.798356,0.281418,0.499161
min,1.0,1.0,0.0,0.0
25%,254.0,175.0,0.5,0.0
50%,447.0,322.0,0.75,0.0
75%,682.0,631.0,0.75,1.0
max,943.0,1682.0,1.0,1.0


In [16]:
# merge users and ratings dataset
dfratings = pd.merge(dfusers, dfratings, on='userId')

In [17]:
dfratings.describe()

Unnamed: 0,userId,age,gender,occupation,zipcode,movieId,rating,year
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,462.48475,0.393483,0.7426,0.553817,0.511097,425.53013,0.632465,0.47101
std,266.61442,0.175191,0.437204,0.331142,0.283895,330.798356,0.281418,0.499161
min,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
25%,254.0,0.257576,0.0,0.2,0.260705,175.0,0.5,0.0
50%,447.0,0.348485,1.0,0.65,0.531486,322.0,0.75,0.0
75%,682.0,0.5,1.0,0.9,0.754408,631.0,0.75,1.0
max,943.0,1.0,1.0,1.0,1.0,1682.0,1.0,1.0


In [18]:
# split dataset into X (features) and y (labels)
X = dfratings.drop('rating', axis=1).values
y = dfratings['rating'].values

In [19]:
# Kfold cross validation
kfold = KFold(n_splits=5, shuffle=True, random_state=32)

In [20]:
# lists with predictions and real values for kfold
classes = []
truth_val = []
for i, (train_index, test_index) in enumerate(kfold.split(X, y)):
  val = i+1
  print(f'Fold {val}:')
  # define model 
  rf_model = RandomForestRegressor(n_estimators = 100)
  # model fit
  rf_model.fit(X[train_index],y[train_index])
  # model predict
  predict = rf_model.predict(X[test_index])
  # transform ratings in order 1 to 5
  x =  (predict * (max_rating - min_rating) ) + min_rating
  # append predictions and real values
  classes.append(x)
  truth = y[test_index]
  truth_val.append(truth)
# convert list to array
classes = np.array(classes)
truth_val = np.array(truth_val)

Fold 1:
Fold 2:
Fold 3:
Fold 4:
Fold 5:


In [21]:
# lists with rmse and mae of kfold
rmse, mae = [], []
for i in range(5):
  # root mean square error
  rmse_val = mean_squared_error(truth_val[i], classes[i], squared = False)
  # mean absolute error
  mae_val = mean_absolute_error(truth_val[i], classes[i])
  # append rmse and and mae
  rmse.append(rmse_val)
  mae.append(mae_val)

In [22]:
# results
print('Metric Mean Standard Deviation')
print(f'RMSE: {np.mean(rmse)}, {np.std(rmse)}')
print(f'MAE: {np.mean(mae)}, {np.std(mae)}')

Metric Mean Standard Deviation
RMSE: 2.942351647495312, 0.004668563121822674
MAE: 2.8818229000000004, 0.0050994733002536225
