<a href="https://colab.research.google.com/github/shaya-686/AI/blob/main/%D0%9F%D1%80%D0%B0%D0%BA%D1%82%D0%B8%D1%87%D0%BD%D0%B0_recommendation_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Завдання

* Створіть Reader
* Створіть датасет та розділіть його на тренувальні та тестові дані
* Виберіть метрики для поріняння якості моделей
* На основі метрик виберіть найкращу модель
* Використайте `optuna` для підбору найкращих параметрів



In [52]:
import pandas as pd

df = pd.read_csv("https://raw.githubusercontent.com/HalyshAnton/IT-Step-Pyton-AI/main/module7/data/Blog%20Ratings.csv")

df.head()

Unnamed: 0,blog_id,userId,ratings
0,9025,11,3.5
1,9320,11,5.0
2,9246,11,3.5
3,9431,11,5.0
4,875,11,2.0


In [53]:
df.describe()

Unnamed: 0,blog_id,userId,ratings
count,200140.0,200140.0,200140.0
mean,5652.533621,2545.710158,3.117468
std,2970.685946,1446.195478,1.768113
min,1.0,10.0,0.5
25%,2906.0,1314.0,2.0
50%,5994.0,2552.0,3.5
75%,8510.0,3795.0,5.0
max,9755.0,5010.0,5.0


In [54]:
df.shape

(200140, 3)

In [55]:
!pip install -q surprise

In [56]:
from surprise import Dataset, SVD, Reader

reader = Reader(rating_scale=(0.5, 5))

# The columns must correspond to user id, item id and ratings (in that order).
data = Dataset.load_from_df(df[["userId", "blog_id", "ratings"]], reader)

In [57]:
from surprise.model_selection import train_test_split

trainset, testset = train_test_split(data, train_size=0.8)

In [58]:
from surprise import BaselineOnly

bsl_options = {'method': 'sgd',
               'reg_u': 0.0001,
               'reg_i': 0.0001}

algo = BaselineOnly(bsl_options=bsl_options)
algo.fit(trainset)

Estimating biases using sgd...


<surprise.prediction_algorithms.baseline_only.BaselineOnly at 0x7a24d3145c00>

In [59]:
from surprise import accuracy

In [60]:
preds = algo.test(testset)

print(f"mae = {accuracy.mae(preds, verbose=False)}")
print(f"mse = {accuracy.mse(preds, verbose=False)}")
print(f"rmse= {accuracy.rmse(preds, verbose=False)}")
print(f"fcp = {accuracy.fcp(preds, verbose=False)}")

mae = 1.632868481150443
mse = 3.31918087123421
rmse= 1.8218619243055194
fcp = 0.5010723629970232


In [61]:
from surprise import SVD

algo = SVD(n_factors=150,
           n_epochs=50,
           )

algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7a24d31465c0>

In [62]:
preds = algo.test(testset)

print(f"mae = {accuracy.mae(preds, verbose=False)}")
print(f"mse = {accuracy.mse(preds, verbose=False)}")
print(f"rmse= {accuracy.rmse(preds, verbose=False)}")
print(f"fcp = {accuracy.fcp(preds, verbose=False)}")

mae = 1.6651509194018617
mse = 3.5918484266173563
rmse= 1.895217250506484
fcp = 0.49123165865561513


In [63]:
from surprise import KNNBasic

algo = KNNBasic(k=15,
                min_k=5,
                sim_options={'name': 'cosine',
                             'user_based': True})

algo.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7a24d3146d70>

In [64]:
preds = algo.test(testset)

print(f"mae = {accuracy.mae(preds, verbose=False)}")
print(f"mse = {accuracy.mse(preds, verbose=False)}")
print(f"rmse= {accuracy.rmse(preds, verbose=False)}")
print(f"fcp = {accuracy.fcp(preds, verbose=False)}")

mae = 1.6408597053866967
mse = 3.402199984062349
rmse= 1.8445053494263304
fcp = 0.5089021780467494


In [65]:
from surprise import CoClustering

algo = CoClustering(n_cltr_u = 5,
                    n_cltr_i = 15,
                    n_epochs = 40)

algo.fit(trainset)

<surprise.prediction_algorithms.co_clustering.CoClustering at 0x7a24d315db10>

In [66]:
preds = algo.test(testset)

print(f"mae = {accuracy.mae(preds, verbose=False)}")
print(f"mse = {accuracy.mse(preds, verbose=False)}")
print(f"rmse= {accuracy.rmse(preds, verbose=False)}")
print(f"fcp = {accuracy.fcp(preds, verbose=False)}")

mae = 1.7449817858354484
mse = 4.187903711588813
rmse= 2.0464368330317
fcp = 0.49752246044595894


In [67]:
from sklearn import metrics


def objective(trial):


  params = {
        'n_factors': trial.suggest_int('n_factors', 10, 100),
        'n_epochs': trial.suggest_int('n_epochs', 10, 50),
        'lr_all': trial.suggest_float('lr_all', 1e-4, 0.1),
        'reg_all': trial.suggest_float('reg_all', 1e-3, 1.0)
    }


  model = SVD(**params)

  model.fit(trainset)

  preds = model.test(testset)
  mae = accuracy.mae(preds, verbose=False)

  return mae

In [68]:
!pip install -q optuna

In [69]:
import optuna
study = optuna.create_study(directions=['minimize'])
study.optimize(objective, n_trials=20)

[I 2024-08-29 18:14:40,361] A new study created in memory with name: no-name-145054c5-4b4a-4589-b725-e19615aa075e
[I 2024-08-29 18:14:51,871] Trial 0 finished with value: 1.6567420864210114 and parameters: {'n_factors': 22, 'n_epochs': 45, 'lr_all': 0.05843168531452816, 'reg_all': 0.3240272504922287}. Best is trial 0 with value: 1.6567420864210114.
[I 2024-08-29 18:14:56,791] Trial 1 finished with value: 1.6756330784440816 and parameters: {'n_factors': 83, 'n_epochs': 26, 'lr_all': 0.08112321687138428, 'reg_all': 0.03755611469458074}. Best is trial 0 with value: 1.6567420864210114.
[I 2024-08-29 18:15:00,857] Trial 2 finished with value: 1.632645895143639 and parameters: {'n_factors': 95, 'n_epochs': 16, 'lr_all': 0.0606836598918591, 'reg_all': 0.761569506576412}. Best is trial 2 with value: 1.632645895143639.
[I 2024-08-29 18:15:05,586] Trial 3 finished with value: 1.6288813353637912 and parameters: {'n_factors': 36, 'n_epochs': 33, 'lr_all': 0.0020289943074293555, 'reg_all': 0.042064

In [70]:
best_params = study.best_trials[0].params
best_params

{'n_factors': 72,
 'n_epochs': 18,
 'lr_all': 0.0002649056346091809,
 'reg_all': 0.4323715121325345}

In [74]:
model = SVD(
    n_factors = 72,
    n_epochs = 18,
    lr_all = 0.0002649056346091809,
    reg_all = 0.4323715121325345
)

In [75]:
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7a24d3147af0>

In [76]:
preds = model.test(testset)

print(f"mae = {accuracy.mae(preds, verbose=False)}")
print(f"mse = {accuracy.mse(preds, verbose=False)}")
print(f"rmse= {accuracy.rmse(preds, verbose=False)}")
print(f"fcp = {accuracy.fcp(preds, verbose=False)}")

mae = 1.6205873512096398
mse = 3.1557578256780383
rmse= 1.7764452779857978
fcp = 0.5001399588478114
