In [80]:
import numpy as np
import sys
import time
import pandas as pd
sys.path.append("../../")
sys.path.append("../../reco_utils/recommender/rlrmc/")

from reco_utils.dataset.python_splitters import python_random_split
from reco_utils.dataset.python_splitters import python_stratified_split
from reco_utils.dataset import movielens
from reco_utils.recommender.rlrmc.RLRMCdataset import RLRMCdataset 
from reco_utils.recommender.rlrmc.RLRMCalgorithm import RLRMCalgorithm 
# Pymanopt installation is required via
# pip install pymanopt 
from reco_utils.evaluation.python_evaluation import (
    rmse, mae
)

# import logging

# %load_ext autoreload
# %autoreload 2

In [81]:
print("Pandas version: {}".format(pd.__version__))
print("System version: {}".format(sys.version))

Pandas version: 0.25.3
System version: 3.7.4 (default, Aug  9 2019, 18:34:13) [MSC v.1915 64 bit (AMD64)]


In [82]:
df = pd.read_csv('C:/Users/prath/Downloads/argos_r.csv', usecols=["userID", "itemID", "rating"])
df.head(10)

Unnamed: 0,userID,itemID,rating
0,17850.0,85123A,4
1,17850.0,71053,2
2,17850.0,84406B,5
3,17850.0,84029G,5
4,17850.0,84029E,3
5,17850.0,22752,3
6,17850.0,21730,1
7,17850.0,22633,2
8,17850.0,22632,4
9,13047.0,84879,4


In [83]:
df.dropna(subset=['userID'],how='all',inplace=True)
df["userID"] = df["userID"].apply(lambda x: str(x)[:-2])

In [84]:
df.head(10)

Unnamed: 0,userID,itemID,rating
0,17850,85123A,4
1,17850,71053,2
2,17850,84406B,5
3,17850,84029G,5
4,17850,84029E,3
5,17850,22752,3
6,17850,21730,1
7,17850,22633,2
8,17850,22632,4
9,13047,84879,4


In [85]:
df['userID'].astype(str)
df['itemID'].astype(str)

0         85123A
1          71053
2         84406B
3         84029G
4         84029E
           ...  
541904     22613
541905     22899
541906     23254
541907     23255
541908     22138
Name: itemID, Length: 406829, dtype: object

In [86]:
df.dtypes

userID    object
itemID    object
rating     int64
dtype: object

In [87]:
df.shape

(406829, 3)

In [88]:
df.head(10)

Unnamed: 0,userID,itemID,rating
0,17850,85123A,4
1,17850,71053,2
2,17850,84406B,5
3,17850,84029G,5
4,17850,84029E,3
5,17850,22752,3
6,17850,21730,1
7,17850,22633,2
8,17850,22632,4
9,13047,84879,4


In [89]:
# Model parameters

# rank of the model, a positive integer (usually small), required parameter
rank_parameter = 10
# regularization parameter multiplied to loss function, a positive number (usually small), required parameter
regularization_parameter = 0.001
# initialization option for the model, 'svd' employs singular value decomposition, optional parameter
initialization_flag = 'svd' #default is 'random'
# maximum number of iterations for the solver, a positive integer, optional parameter
maximum_iteration = 100 #optional, default is 100
# maximum time in seconds for the solver, a positive integer, optional parameter
maximum_time = 300#optional, default is 1000

# Verbosity of the intermediate results
verbosity=0 #optional parameter, valid values are 0,1,2, default is 0
# Whether to compute per iteration train RMSE (and test RMSE, if test data is given)
compute_iter_rmse=True #optional parameter, boolean value, default is False

In [90]:
## Logging utilities. Please import 'logging' in order to use the following command. 
# logging.basicConfig(level=logging.INFO)

In [91]:
## If both validation and test sets are required
# train, validation, test = python_random_split(df,[0.6, 0.2, 0.2])

## If validation set is not required
train, test = python_random_split(df,[0.8, 0.2])

## If test set is not required
# train, validation = python_random_split(df,[0.8, 0.2])

## If both validation and test sets are not required (i.e., the complete dataset is for training the model)
# train = df

In [92]:

# data = RLRMCdataset(train=train, validation=validation, test=test)
data = RLRMCdataset(train=train, test=test) # No validation set
# data = RLRMCdataset(train=train, validation=validation) # No test set
# data = RLRMCdataset(train=train) # No validation or test set

In [93]:
model = RLRMCalgorithm(rank = rank_parameter,
                       C = regularization_parameter,
                       model_param = data.model_param,
                       initialize_flag = initialization_flag,
                       maxiter=maximum_iteration,
                       max_time=maximum_time)

In [94]:
start_time = time.time()

model.fit(data,verbosity=verbosity)

# fit_and_evaluate will compute RMSE on the validation set (if given) at every iteration
# model.fit_and_evaluate(data,verbosity=verbosity)

train_time = time.time() - start_time # train_time includes both model initialization and model training time. 

print("Took {} seconds for training.".format(train_time))

Took 6.46556830406189 seconds for training.


In [95]:

# Obtain prediction on the full test set
predictions_ndarr = model.predict(test['userID'].values,test['itemID'].values)

In [96]:
predictions_df = pd.DataFrame(data={"userID": test['userID'].values, "itemID":test['itemID'].values, "prediction":predictions_ndarr})

## Compute test RMSE 
eval_rmse = rmse(test, predictions_df)
## Compute test MAE 
eval_mae = mae(test, predictions_df)

print("RMSE:\t%f" % eval_rmse,
      "MAE:\t%f" % eval_mae, sep='\n')

RMSE:	3.082148
MAE:	2.227280


In [97]:
predictions_df.shape

(81366, 3)

In [98]:
predictions_df.head(20)

Unnamed: 0,userID,itemID,prediction
0,14375,22469,2.591529
1,17375,22325,3.554836
2,15498,84945,6.065064
3,16374,85078,-2.998823
4,13735,22489,2.880092
5,15644,84692,0.576318
6,16523,23581,0.929753
7,18125,22197,7.680851
8,16984,21929,2.493899
9,14911,15060B,9.328908


In [99]:
predictions_df.to_csv('RLRMC_final.csv')

In [100]:
import joblib
# save the model to disk
filename = 'RLRMC_final.sav'
joblib.dump(model, filename)

['RLRMC_final.sav']

In [101]:
thislist1 = ['14375']
thislist2 = ['22469']
model.predict(thislist1, thislist2)

array([3.37364433])

In [102]:
thislist1 = ['17375']
thislist2 = ['22325']
model.predict(thislist1, thislist2)

array([2.57537032])

In [112]:
df1 = pd.read_csv('RLRMC_final.csv')
df1['userID'] = df1['userID'].apply(str)

In [113]:
df1.dtypes

Unnamed: 0      int64
userID         object
itemID         object
prediction    float64
dtype: object

In [None]:
prediction = dataframe.loc[(dataframe['userID'] == user_id) & (dataframe['itemID'] == item_id), ['prediction']]

In [117]:
result = df1.loc[df1['userID'] == '14375']
result.head()

Unnamed: 0.1,Unnamed: 0,userID,itemID,prediction
0,0,14375,22469,2.591529
10378,10378,14375,22374,3.5566
11494,11494,14375,84596B,2.347066
13786,13786,14375,21272,2.651583
14434,14434,14375,22342,3.592249


In [67]:
result = predictions_df.loc[predictions_df['userID'] == '17375'].sort_values(by = 'prediction', ascending = False).head(5)

In [69]:
result = result.drop(columns=["prediction", "userID"], axis=1)

In [70]:
result.to_json()

'{"0":"23201","1":"22096","2":"84313B","3":"22325","4":"17003"}'