# 수행목표
- 사용자별 평균 평점을 예측값으로 사용하여 실제 사용자 평점과 비교한다.

# 수행단계
사용자별로 평균 평점을 계산해서 비교한다.

- 추천 코드 class 명을 `AvgUserRecommender`로 한다.
- 결과에 `rate_avg_user` 필드를 추가해서 사용자 평점 평균을 계산한 다음 업데이트 한다.
- `rate_avg_user` 결과를 회귀 평가 한다.
- `rate_avg_user_class` 필드를 추가해서 `rate_avg_user` 값을 반올림한 값을 업데이트 한다.
- `rate_avg_user_class` 결과를 분류 평가 한다.
- 결과 데이터 평점 높은 순 n개를 출력하고 그 아래에 분석 결과를 출력한다.
- 랜덤 추천, 영화 평균 평점 추천과 평가를 비교한다.

# 라이브러리 불러오기

In [4]:
# Library

import os
import sys
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Plot configuration: select the AppleGothic font family and turn off the
# Unicode minus glyph for axis labels.
# NOTE(review): AppleGothic is presumably chosen so Korean labels render; it is
# typically only available on macOS -- confirm for other platforms.
mpl.rcParams.update({
    'font.family': 'AppleGothic',
    'axes.unicode_minus': False,
})

# DataLoader

In [5]:
class MovieDataLoader:
    """Read the movie and rating tables from a directory and clean the movies.

    ``movies.txt`` (tab-separated) and ``rates.csv`` are read in the
    constructor; ``load`` runs the clean-up steps and returns both frames.
    """

    def __init__(self, file_path):
        """Read both data files located under ``file_path``.

        Args:
            file_path: Directory containing ``movies.txt`` and ``rates.csv``.
        """
        self.file_path = file_path
        movie_path = os.path.join(self.file_path, 'movies.txt')
        rate_path = os.path.join(self.file_path, 'rates.csv')
        self.movies = pd.read_csv(movie_path, sep='\t')
        self.rates = pd.read_csv(rate_path)

    def load(self):
        """Clean the movie table and return ``(movies, rates)``."""
        self._preprocess()

        return self.movies, self.rates

    def _preprocess(self):
        """Clean ``self.movies`` in place.

        * Rows without ``title_eng`` are dropped.
        * For rows without ``year``, ``title_eng`` is split at its last
          ``' , '``: the tail becomes ``year`` (kept as a string) and the head
          becomes the new ``title_eng``.
        * Missing ``grade`` values are replaced by ``'NR grade'``.
        """
        self.movies.dropna(subset=['title_eng'], inplace=True)

        missing_year = self.movies['year'].isnull()
        if missing_year.any():
            # The recovered year is a string. Writing a string into a numeric
            # column is a deprecated implicit upcast in pandas, so switch the
            # column to object dtype explicitly before filling it.
            self.movies['year'] = self.movies['year'].astype(object)

            # Iterate over a snapshot so the updates below do not affect the
            # rows still to be processed.
            for _, row in self.movies[missing_year].iterrows():
                title_eng, _sep, year = row['title_eng'].rpartition(' , ')
                same_movie = self.movies['movie'] == row['movie']
                self.movies.loc[same_movie, 'title_eng'] = title_eng
                self.movies.loc[same_movie, 'year'] = year

        if self.movies['grade'].isnull().any():
            self.movies['grade'] = self.movies['grade'].fillna('NR grade')



In [6]:
# Load the cleaned movie table and the rating table from the KMRD data
# directory, then print a structural summary of each frame.
movies_df, rates_df = MovieDataLoader(file_path='../data/kmrd/').load()

for _frame in (movies_df, rates_df):
    _frame.info()

<class 'pandas.core.frame.DataFrame'>
Index: 991 entries, 0 to 998
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   movie      991 non-null    int64 
 1   title      991 non-null    object
 2   title_eng  991 non-null    object
 3   year       991 non-null    object
 4   grade      991 non-null    object
dtypes: int64(1), object(4)
memory usage: 46.5+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140710 entries, 0 to 140709
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype
---  ------  --------------   -----
 0   user    140710 non-null  int64
 1   movie   140710 non-null  int64
 2   rate    140710 non-null  int64
 3   time    140710 non-null  int64
dtypes: int64(4)
memory usage: 4.3 MB


  self.movies.loc[self.movies['movie'] == movie, 'year'] = year


# Random Recommend

In [7]:
# 결과에 random 평점 field(rate_random)를 추가하고 random 값을 반영해서 결과를 return 한다.

class RandomRecommender:
    """Baseline recommender that scores each rating row with a random value.

    Scores are drawn with ``np.random.rand`` and scaled by 10, so they fall in
    ``[0, 10)``.
    """

    def __init__(self, movies_df, rates_df):
        """Store the input frames.

        Args:
            movies_df: Movie table (stored, not read by this class).
            rates_df: Rating table; must contain a ``user`` column.
        """
        self.movies_df = movies_df
        self.rates_df = rates_df

    def run(self, n_items=10):
        """Return, for every user, the ``n_items`` rows with the highest random score.

        Args:
            n_items: Maximum number of rows kept per user.

        Returns:
            A copy of the rating rows with ``rate_random`` (the score) and
            ``rate_random_class`` (the score rounded to the nearest integer
            value) added. Rows are grouped by ascending ``user`` and ordered
            by descending score within a user; the index is renumbered from 0.
        """
        df = self.rates_df.copy()
        df['rate_random'] = np.random.rand(len(df)) * 10

        # Sort + groupby.head selects the same rows in the same order as
        # groupby('user').apply(nlargest) without applying a function to the
        # grouping column, which pandas has deprecated.
        top_n = (
            df.sort_values(['user', 'rate_random'], ascending=[True, False])
            .groupby('user', sort=False)
            .head(n_items)
            .reset_index(drop=True)
        )
        top_n['rate_random_class'] = top_n['rate_random'].round()
        return top_n

    def run_for_user(self, user_id, n_items=10):
        """Return the ``n_items`` highest randomly-scored rows of one user.

        Args:
            user_id: Value matched against the ``user`` column.
            n_items: Maximum number of rows returned.

        Returns:
            The user's rating rows (original index labels kept) ordered by
            descending ``rate_random``, with ``rate_random_class`` added.
        """
        user_rates = self.rates_df[self.rates_df['user'] == user_id].copy()
        user_rates['rate_random'] = np.random.rand(len(user_rates)) * 10
        user_rates = user_rates.sort_values(by='rate_random', ascending=False)
        user_rates = user_rates.head(n_items)
        user_rates['rate_random_class'] = user_rates['rate_random'].round()

        return user_rates

In [8]:
# Keep up to 20 randomly scored rows per user and display the resulting frame.
result = RandomRecommender(movies_df=movies_df, rates_df=rates_df).run(n_items=20)
result

  df.groupby('user', group_keys=True)


Unnamed: 0,user,movie,rate,time,rate_random,rate_random_class
0,0,10941,7,1457400720,9.783018,10.0
1,0,10746,7,1442579400,9.667641,10.0
2,0,10614,3,1438409760,9.638738,10.0
3,0,10104,8,1454133300,9.553418,10.0
4,0,10039,8,1427980080,9.523505,10.0
...,...,...,...,...,...,...
117757,52023,10998,10,1421679660,6.076770,6.0
117758,52024,10998,10,1204622460,0.440702,0.0
117759,52025,10998,7,1498546920,5.644030,6.0
117760,52026,10998,9,1434090420,4.620192,5.0


# Analyzer

In [9]:
class Analyzer:
    """Print and store regression-error and classification metrics.

    Every ``_analyze_*`` method stores its result on the instance under the
    metric's name (``mae``, ``mse``, ``rmse``, ``mape``, ``confusion_matrix``,
    ``accuracy``, ``precision``, ``recall``, ``f1``) and prints it.
    """

    def __init__(self, y_true, y_pred, y_pred_class):
        """Store the series to compare.

        Args:
            y_true: Actual values.
            y_pred: Continuous predictions, used by the error metrics.
            y_pred_class: Discrete predictions, used by the confusion matrix.
        """
        self.y_true = y_true
        self.y_pred = y_pred
        self.y_pred_class = y_pred_class

    def analyze_error(self):
        """Compute and print MAE, MSE, RMSE and MAPE."""
        self._analyze_mae()
        self._analyze_mse()
        self._analyze_rmse()
        self._analyze_mape()

    def analyze_confusion_matrix(self):
        """Build the confusion matrix, then print accuracy, precision, recall and F1."""
        self._make_confusion_matrix()
        self._analyze_accuracy()
        self._analyze_precision()
        self._analyze_recall()
        self._analyze_f1()

    def _analyze_mae(self):
        """Mean absolute error."""
        self.mae = np.abs(self.y_true - self.y_pred).mean()
        print(f'MAE: {self.mae}')

    def _analyze_mse(self):
        """Mean squared error."""
        self.mse = ((self.y_true - self.y_pred) ** 2).mean()
        print(f'MSE: {self.mse}')

    def _analyze_rmse(self):
        """Root mean squared error."""
        # Derived from the inputs directly so the method does not depend on
        # _analyze_mse having been called first.
        self.rmse = np.sqrt(((self.y_true - self.y_pred) ** 2).mean())
        print(f'RMSE: {self.rmse}')

    def _analyze_mape(self):
        """Mean absolute error relative to ``y_true`` (a fraction, not a percent)."""
        self.mape = (np.abs(self.y_true - self.y_pred) / self.y_true).mean()
        print(f'MAPE: {self.mape}')

    def _make_confusion_matrix(self):
        """Count (true, predicted) pairs.

        Rows are true labels and columns are predicted labels, both ordered by
        the sorted union of the values in ``y_true`` and ``y_pred_class``.
        """
        unique_labels = np.unique(np.concatenate((self.y_true, self.y_pred_class)))
        self.confusion_matrix = np.zeros((len(unique_labels), len(unique_labels)), dtype=int)

        label_to_index = {label: index for index, label in enumerate(unique_labels)}
        for true, pred in zip(self.y_true, self.y_pred_class):
            self.confusion_matrix[label_to_index[true]][label_to_index[pred]] += 1

        print('Confusion Matrix :')
        print(self.confusion_matrix)

    def _analyze_accuracy(self):
        """Share of samples on the diagonal of the confusion matrix."""
        self.accuracy = np.diag(self.confusion_matrix).sum() / self.confusion_matrix.sum()
        print(f'Accuracy: {self.accuracy}')

    def _analyze_precision(self):
        """Per-label precision; NaN for a label that was never predicted."""
        # numpy signals 0/0 through the floating-point error state, not by
        # raising, so silencing the state is the only handling needed.
        with np.errstate(divide='ignore', invalid='ignore'):
            self.precision = np.diag(self.confusion_matrix) / self.confusion_matrix.sum(axis=0)
        print(f'Precision: {self.precision}')

    def _analyze_recall(self):
        """Per-label recall; NaN for a label that never occurs in ``y_true``."""
        with np.errstate(divide='ignore', invalid='ignore'):
            self.recall = np.diag(self.confusion_matrix) / self.confusion_matrix.sum(axis=1)
        print(f'Recall: {self.recall}')

    def _analyze_f1(self):
        """Per-label F1; NaN where precision + recall is 0 or undefined."""
        # Same error-state handling as precision/recall so a 0/0 entry yields
        # NaN without emitting a RuntimeWarning.
        with np.errstate(divide='ignore', invalid='ignore'):
            self.f1 = 2 * (self.precision * self.recall) / (self.precision + self.recall)
        print(f'F1: {self.f1}')


In [10]:
# Regression-style evaluation of the random scores against the actual ratings.
analyzer = Analyzer(
    y_true=result["rate"],
    y_pred=result["rate_random"],
    y_pred_class=result["rate_random_class"],
)
analyzer.analyze_error()

MAE: 4.375831784359974
MSE: 27.273381360781137
RMSE: 5.22239230245882
MAPE: 0.6167491623823316


In [11]:
# Classification-style evaluation: confusion matrix, accuracy, precision, recall, F1.
analyzer.analyze_confusion_matrix()

Confusion Matrix :
[[   0    0    0    0    0    0    0    0    0    0    0]
 [ 189  381  411  446  446  405  474  510  454  457  269]
 [  24   43   38   54   46   60   57   49   76   80   33]
 [  22   36   47   40   40   40   62   53   74   68   47]
 [  27   60   62   62   74   59   72   66   90  115   53]
 [  43  123  120  113  109  137  125  151  147  199  110]
 [  90  181  153  215  206  225  235  242  272  335  173]
 [ 147  297  312  323  372  388  436  465  539  610  303]
 [ 312  611  674  670  769  761  802  895 1001 1155  574]
 [ 453  903 1063 1004 1081 1174 1166 1217 1309 1448  702]
 [3808 7644 7734 7971 8156 8294 8514 8740 9185 9190 4615]]
Accuracy: 0.07161902820943938
Precision: [0.         0.03706586 0.00358018 0.0036704  0.00654925 0.01186866
 0.0196768  0.03753633 0.07613904 0.10602621 0.6708824 ]
Recall: [       nan 0.08577217 0.06785714 0.07561437 0.1        0.09949165
 0.1009884  0.11092557 0.12171693 0.12569444 0.0550381 ]
F1: [       nan 0.05176279 0.0068015  0.00700

# Average rate Recommend

In [12]:
# Preview: the mean rating of each row's movie, aligned row-by-row with the
# rating table.
df = rates_df.copy()
df['rate'].groupby(df['movie']).transform('mean')

0         9.270981
1         9.120000
2         9.205059
3         9.317726
4         9.132075
            ...   
140705    8.882759
140706    8.882759
140707    8.882759
140708    8.882759
140709    8.882759
Name: rate, Length: 140710, dtype: float64

In [13]:
class AvgMovieRecommender:
    """Recommender that scores each rating row with its movie's mean rating."""

    def __init__(self, movies_df, rates_df):
        """Store the input frames.

        Args:
            movies_df: Movie table (stored, not read by this class).
            rates_df: Rating table; must contain ``user``, ``movie`` and ``rate``.
        """
        self.movies_df = movies_df
        self.rates_df = rates_df

    def run(self, n_items=10):
        """Return, for every user, the ``n_items`` rows with the highest movie mean.

        Args:
            n_items: Maximum number of rows kept per user.

        Returns:
            A copy of the rating rows with ``rate_avg_movie`` (mean ``rate`` of
            the row's movie over all rows) and ``rate_avg_movie_class`` (that
            mean rounded to the nearest integer value) added. Rows are grouped
            by ascending ``user`` and ordered by descending score within a
            user, ties keeping their original order; the index is renumbered
            from 0.
        """
        df = self.rates_df.copy()
        df['rate_avg_movie'] = df.groupby('movie')['rate'].transform('mean')

        # Sort + groupby.head selects the same rows in the same order as
        # groupby('user').apply(nlargest) without applying a function to the
        # grouping column, which pandas has deprecated.
        top_n = (
            df.sort_values(['user', 'rate_avg_movie'], ascending=[True, False])
            .groupby('user', sort=False)
            .head(n_items)
            .reset_index(drop=True)
        )
        top_n['rate_avg_movie_class'] = top_n['rate_avg_movie'].round()

        return top_n

In [14]:
# Keep up to 20 rows per user, ranked by movie mean rating, and display the frame.
result = AvgMovieRecommender(movies_df=movies_df, rates_df=rates_df).run(n_items=20)
result

  df.groupby('user', group_keys=False)


Unnamed: 0,user,movie,rate,time,rate_avg_movie,rate_avg_movie_class
0,0,10200,10,1483582800,9.434136,9.0
1,0,10048,7,1394114760,9.378706,9.0
2,0,10038,9,1495625940,9.343516,9.0
3,0,10021,9,1424497980,9.317726,9.0
4,0,10185,7,1473424740,9.299180,9.0
...,...,...,...,...,...,...
117757,52023,10998,10,1421679660,8.882759,9.0
117758,52024,10998,10,1204622460,8.882759,9.0
117759,52025,10998,7,1498546920,8.882759,9.0
117760,52026,10998,9,1434090420,8.882759,9.0


In [15]:
# Regression-style evaluation of the movie-mean scores against the actual ratings.
analyzer = Analyzer(
    y_true=result["rate"],
    y_pred=result["rate_avg_movie"],
    y_pred_class=result["rate_avg_movie_class"],
)
analyzer.analyze_error()

MAE: 1.255901668956898
MSE: 3.952888516210945
RMSE: 1.9881872437501817
MAPE: 0.42152373306057334


In [16]:
# Classification-style evaluation: confusion matrix, accuracy, precision, recall, F1.
analyzer.analyze_confusion_matrix()

Confusion Matrix :
[[    0     1     1    11    68    56   316   406  3570     4]
 [    0     0     0     4     9    13    27   117   321     0]
 [    0     0     0     0    11    13    47    88   283     0]
 [    0     0     0     3     5    14    45   166   424     0]
 [    0     0     1     2     8    25    70   256   846     0]
 [    0     0     0     0    16    27   105   484  1456     5]
 [    0     0     0     2    14    31   146   746  2903     7]
 [    0     0     0     0    14    31   238  1357  6288    17]
 [    0     0     0     1    14    22   151  1416  9957    40]
 [    0     0     0     7    41   114   878  5497 78155   351]]
Accuracy: 0.10061819602248603
Precision: [       nan 0.         0.         0.1        0.04       0.07803468
 0.07217004 0.12883319 0.09555387 0.82783019]
Recall: [0.         0.         0.         0.00456621 0.00662252 0.01290014
 0.03793193 0.17079924 0.85828808 0.00412732]
F1: [       nan        nan        nan 0.00873362 0.01136364 0.02214022
 0.0

  self.f1 = 2 * (self.precision * self.recall) / (self.precision + self.recall)


# Average User Rate Recommend

In [18]:
class AvgUserRecommender:
    """Recommender that scores each rating row with its user's mean rating."""

    def __init__(self, movies_df, rates_df):
        """Store the input frames.

        Args:
            movies_df: Movie table (stored, not read by this class).
            rates_df: Rating table; must contain ``user`` and ``rate``.
        """
        self.movies_df = movies_df
        self.rates_df = rates_df

    def run(self, n_items=10):
        """Return up to ``n_items`` rows per user, scored by the user's mean rating.

        Every row of a user carries the same score, so the rows kept are the
        user's first ``n_items`` rows in their original order.

        Args:
            n_items: Maximum number of rows kept per user.

        Returns:
            A copy of the rating rows with ``rate_avg_user`` (mean ``rate`` of
            the row's user over all rows) and ``rate_avg_user_class`` (that
            mean rounded to the nearest integer value) added. Rows are grouped
            by ascending ``user``; the index is renumbered from 0.
        """
        df = self.rates_df.copy()
        df['rate_avg_user'] = df.groupby('user')['rate'].transform('mean')

        # Sort + groupby.head selects the same rows in the same order as
        # groupby('user').apply(nlargest) without applying a function to the
        # grouping column, which pandas has deprecated.
        top_n = (
            df.sort_values(['user', 'rate_avg_user'], ascending=[True, False])
            .groupby('user', sort=False)
            .head(n_items)
            .reset_index(drop=True)
        )
        top_n['rate_avg_user_class'] = top_n['rate_avg_user'].round()

        return top_n

In [19]:
# Keep up to 20 rows per user, scored by the user's mean rating, and display the frame.
result = AvgUserRecommender(movies_df=movies_df, rates_df=rates_df).run(n_items=20)
result

  df.groupby('user', group_keys=False)


Unnamed: 0,user,movie,rate,time,rate_avg_user,rate_avg_user_class
0,0,10003,7,1494128040,7.388235,7.0
1,0,10004,7,1467529800,7.388235,7.0
2,0,10018,9,1513344120,7.388235,7.0
3,0,10021,9,1424497980,7.388235,7.0
4,0,10022,7,1427627340,7.388235,7.0
...,...,...,...,...,...,...
117757,52023,10998,10,1421679660,10.000000,10.0
117758,52024,10998,10,1204622460,10.000000,10.0
117759,52025,10998,7,1498546920,7.000000,7.0
117760,52026,10998,9,1434090420,9.000000,9.0


In [20]:
# Regression-style evaluation of the user-mean scores against the actual ratings.
analyzer = Analyzer(
    y_true=result["rate"],
    y_pred=result["rate_avg_user"],
    y_pred_class=result["rate_avg_user_class"],
)
analyzer.analyze_error()

MAE: 0.6168470944568313
MSE: 1.7458033564580981
RMSE: 1.3212885212769003
MAPE: 0.18796969777773848


In [21]:
# Classification-style evaluation: confusion matrix, accuracy, precision, recall, F1.
analyzer.analyze_confusion_matrix()

Confusion Matrix :
[[ 1842   101   180   225   163   538   420   499   456    10]
 [    2   163     9    21    39    78    68    88    66     2]
 [    0     5   148     8    30    76    62    83    75     6]
 [    0     6     3   205    33    87   122   141    91     5]
 [    0     8    11    14   354   139   202   352   207    22]
 [    0     0     6    22    33   601   408   621   495    53]
 [    0     0     6    19    41   194  1238  1391  1058   127]
 [    0     0    14    16    37   167   638  3734  2896   630]
 [    0     0     1    11    55    97   413  1801  6567  2591]
 [    4     8    34    91    88   649  1156  4181 14421 63684]]
Accuracy: 0.6669044343676228
Precision: [0.99675325 0.56013746 0.3592233  0.32436709 0.40549828 0.22886519
 0.26189972 0.28965945 0.24939237 0.94866677]
Recall: [0.41542625 0.30410448 0.30020284 0.2958153  0.27043545 0.2684234
 0.30387825 0.45917364 0.56926144 0.75530148]
F1: [0.58643744 0.39419589 0.32707182 0.30943396 0.32447296 0.24707091
 0.281