In [1]:
import pandas as pd
import numpy as np
import os 
from surprise import Dataset
from surprise import KNNWithMeans
from surprise import Reader
from surprise.model_selection import GridSearchCV
import pickle
import time
import sys

In [2]:
root_dir, curr=os.path.split(os.path.abspath(os.curdir))

print("Root Dir:", root_dir)

Root Dir: /Users/adi/Desktop/MLIP/group-project-f22-pulp-prediction


In [3]:
data=pd.read_csv('/Users/adi/Desktop/MLIP/group-project-f22-pulp-prediction/data/kafka_log-movielog6_stream.csv', 
        index_col=0)

In [4]:
data.loc[~data['rating'].isna(),'user_id']

149       108166
242       301361
8765      108166
8858      301361
9178      184424
           ...  
198148    309981
198297    175334
198506    280872
198774    102528
198911    293569
Name: user_id, Length: 3674, dtype: int64

In [5]:
data=data.loc[~data['rating'].isna()]

In [6]:
top_20=data.groupby(by='movie_id').sum().sort_values(by='rating', ascending=False)
top_20.reset_index(inplace=True)

In [7]:
top_20=top_20['movie_id'][:20].values

In [8]:
top_20

array(['interstellar+2014', 'monsters_+inc.+2001', 'inception+2010',
       'raiders+of+the+lost+ark+1981',
       'the+lord+of+the+rings+the+two+towers+2002',
       'the+lord+of+the+rings+the+return+of+the+king+2003',
       'the+shawshank+redemption+1994',
       'the+lord+of+the+rings+the+fellowship+of+the+ring+2001',
       'fight+club+1999', 'spirited+away+2001',
       'howls+moving+castle+2004', 'monty+python+and+the+holy+grail+1975',
       'my+neighbor+totoro+1988', 'pulp+fiction+1994',
       'one+flew+over+the+cuckoos+nest+1975', 'seven+samurai+1954',
       'indiana+jones+and+the+last+crusade+1989',
       'harry+potter+and+the+goblet+of+fire+2005', 'up+2009',
       'star+wars+1977'], dtype=object)

In [9]:


df = data.loc[:,['user_id','movie_id','rating']]

In [10]:
top_20

array(['interstellar+2014', 'monsters_+inc.+2001', 'inception+2010',
       'raiders+of+the+lost+ark+1981',
       'the+lord+of+the+rings+the+two+towers+2002',
       'the+lord+of+the+rings+the+return+of+the+king+2003',
       'the+shawshank+redemption+1994',
       'the+lord+of+the+rings+the+fellowship+of+the+ring+2001',
       'fight+club+1999', 'spirited+away+2001',
       'howls+moving+castle+2004', 'monty+python+and+the+holy+grail+1975',
       'my+neighbor+totoro+1988', 'pulp+fiction+1994',
       'one+flew+over+the+cuckoos+nest+1975', 'seven+samurai+1954',
       'indiana+jones+and+the+last+crusade+1989',
       'harry+potter+and+the+goblet+of+fire+2005', 'up+2009',
       'star+wars+1977'], dtype=object)

In [11]:

reader = Reader(rating_scale=(1, 5))
# Loads Pandas dataframe
data = Dataset.load_from_df(df[["user_id", "movie_id", "rating"]], reader)
trainingSet = data.build_full_trainset()

In [12]:
items=df['movie_id'].unique()
users=df['user_id'].unique()

np.savetxt(root_dir+"/Inference/Models/items.txt", items, fmt="%s" )
np.savetxt(root_dir+"/Inference/Models/users.py", users, fmt="%s")
np.savetxt(root_dir+"/Inference/Models/top_20.py", top_20, fmt="%s")

# Model 1: KNN

## Tuning

In [13]:

sim_options = {
    "name": ["msd", "cosine"],
    "min_support": [3, 4, 5],
    "user_based": [False, True],
}

param_grid = {"sim_options": sim_options}
gs = GridSearchCV(KNNWithMeans, param_grid, measures=["rmse", "mae"], cv=3)
gs.fit(data)

print("Best RMSE",gs.best_score["rmse"])
print(gs.best_params["rmse"])

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

In [14]:
# Train with best model

sim_options=gs.best_params["rmse"]['sim_options']

knn = KNNWithMeans(sim_options=sim_options)

start=time.time()
knn.fit(trainingSet)
stop = time.time()

print(f"Training time: {round(stop - start,10)}s")


Computing the msd similarity matrix...
Done computing similarity matrix.
Training time: 0.1699829102s


In [15]:
times=[]
for i in range(100):
    knn = KNNWithMeans(sim_options=sim_options)
    start=time.time()
    knn.fit(trainingSet)
    stop = time.time()
    times.append(round(stop-start,10))

print (np.mean(times))

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

In [16]:
# Inference time

times=[]

for i in range(100000):
    start=time.time()
    knn.predict(57687,"screamers+the+hunting+2009").est
    stop = time.time()
    times.append(round(stop-start,10))

print (np.mean(times))

3.6143784409999997e-06


In [17]:
model_path=root_dir+"/Inference/Models/"
knn_filename=model_path+"KNN_model.sav"
p=pickle.dump(knn, open(knn_filename, 'wb'))
pickle.dump(knn, open(knn_filename, 'wb'))

print(sys.getsizeof(p))



16


# SVD model 

## Tuning

In [18]:
from surprise import SVD

param_grid = {
    "n_epochs": [5, 10],
    "lr_all": [0.002, 0.005],
    "reg_all": [0.4, 0.6]
}
gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=3)

gs.fit(data)

print("Best RMSE", gs.best_score["rmse"])
print(gs.best_params["rmse"])

Best RMSE 0.7368330901491941
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}


In [19]:
# Train with best model
best_params = gs.best_params["rmse"]

svd_algo = SVD(n_epochs=best_params['n_epochs'],
               lr_all=best_params['lr_all'],
               reg_all=best_params['reg_all'])

start=time.time()
svd_algo.fit(trainingSet)
stop = time.time()
print(f"Training time: {round(stop - start,10)}s")




Training time: 0.1089730263s


In [20]:
times=[]

for i in range(100):
    svd_algo = SVD(n_epochs=best_params['n_epochs'],
                lr_all=best_params['lr_all'],
                reg_all=best_params['reg_all'])

    start=time.time()
    svd_algo.fit(trainingSet)
    stop = time.time()
    times.append(round(stop-start,10))

print (np.mean(times))

0.102660918239


In [21]:
# Inference time

times=[]

for i in range(100000):
    start=time.time()
    svd_algo.predict(57687,"screamers+the+hunting+2009").est
    stop = time.time()
    times.append(round(stop-start,10))

print (np.mean(times))

3.6978116169999997e-06


In [22]:
svd_filename=model_path+"SVD_model.sav"
p=pickle.dump(svd_algo, open(svd_filename, 'wb'))
pickle.dump(svd_algo, open(svd_filename, 'wb'))

print(sys.getsizeof(p))

16


### Creating model object and making recommendations

In [23]:
sys.path.append(root_dir+"/Inference")

In [24]:
sys.path

['/Users/adi/Desktop/MLIP/group-project-f22-pulp-prediction/Notebooks',
 '/Users/adi/.vscode/extensions/ms-toolsai.jupyter-2022.8.1002431955/pythonFiles',
 '/Users/adi/.vscode/extensions/ms-toolsai.jupyter-2022.8.1002431955/pythonFiles/lib/python',
 '/Users/adi/opt/anaconda3/envs/ML_fall/lib/python38.zip',
 '/Users/adi/opt/anaconda3/envs/ML_fall/lib/python3.8',
 '/Users/adi/opt/anaconda3/envs/ML_fall/lib/python3.8/lib-dynload',
 '',
 '/Users/adi/opt/anaconda3/envs/ML_fall/lib/python3.8/site-packages',
 '/Users/adi/Desktop/MLIP/group-project-f22-pulp-prediction/Inference']

In [25]:
from Inference import model

In [34]:
recommender=model(model_path=model_path+"SVD_model.sav", 
                    items_path=model_path+"items.txt", users_path=model_path+"users.py", top_20_path=model_path+"top_20.py", do_rand=False)

In [35]:
recommender.recommend(240127)

['interstellar+2014',
 'monsters_+inc.+2001',
 'inception+2010',
 'raiders+of+the+lost+ark+1981',
 'the+lord+of+the+rings+the+two+towers+2002',
 'the+lord+of+the+rings+the+return+of+the+king+2003',
 'the+shawshank+redemption+1994',
 'the+lord+of+the+rings+the+fellowship+of+the+ring+2001',
 'fight+club+1999',
 'spirited+away+2001',
 'howls+moving+castle+2004',
 'monty+python+and+the+holy+grail+1975',
 'my+neighbor+totoro+1988',
 'pulp+fiction+1994',
 'one+flew+over+the+cuckoos+nest+1975',
 'seven+samurai+1954',
 'indiana+jones+and+the+last+crusade+1989',
 'harry+potter+and+the+goblet+of+fire+2005',
 'up+2009',
 'star+wars+1977']