# 수행목표
- 장르별 평균 평점으로 사용자별 평점과 비교한다.
# 수행단계
장르별로 평균 평점을 계산해서 비교한다.

- 추천 코드 class 명을 `AvgGenreRecommender로` 한다.
- 비슷한 클래스가 계속해서 추가되기 때문에 공통으로 사용하고 있는 - 코드를 부모 클래스로 추출하는 것을 권장한다.
    - 이름은 `BaseRecommender`로 한다.
    - 공통 코드는 데이터 load와 전처리
    - 추천 후처리 등이 해당 된다.
- 장르의 경우 별도 파일(genre.csv)에 있으므로 `MovieDataLoader`에 추가로 data를 load 한다.
- 한 영화가 복수의 장르에 속할 수 있다는 점을 고려해야 한다.
- 결과에 `rate_avg_genre` 필드를 추가해서 장르 평점 평균을 계산한 다음 업데이트 한다.
- `rate_avg_genre` 결과를 회귀 평가 한다.
- `rate_avg_genre_class` 필드를 추가해서 `rate_avg_genre` 값을 반올림한 값을 업데이트 한다.
- `rate_avg_genre_class` 결과를 분류 평가 한다.
- 결과 데이터 평점 높은 순 n개를 출력하고 그 아래에 분석 결과를 출력한다.
- 랜덤, 영화 평균 평점, 사용자 평균 평점 추천과 평가를 비교한다.

# 라이브러리 설치

In [1]:
# Library

import os
import sys
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

mpl.rcParams['font.family'] = 'AppleGothic'
mpl.rcParams['axes.unicode_minus'] = False

# DataLoader

In [2]:
class MovieDataLoader:
    def __init__(self, file_path):
        self.file_path = file_path
        movie_path = os.path.join(self.file_path, 'movies.txt')
        rate_path = os.path.join(self.file_path, 'rates.csv')
        gerne_path = os.path.join(self.file_path, 'genres.csv')
        self.movies = pd.read_csv(movie_path, sep='\t')
        self.rates = pd.read_csv(rate_path)
        self.gernes = pd.read_csv(gerne_path)

    def load(self):
        self._preprocess()

        return self.movies, self.rates, self.gernes

    def _preprocess(self):
        self.movies.dropna(subset=['title_eng'], inplace=True)

        if self.movies['year'].isnull().sum() > 0:
            non_year = self.movies[self.movies['year'].isnull()]
            for row in non_year.iterrows():
                movie = row[1]['movie']
                title_eng = row[1]['title_eng'].split(' , ')[:-1]
                title_eng = ' , '.join(title_eng)
                year = row[1]['title_eng'].split(' , ')[-1]
                self.movies.loc[self.movies['movie'] == movie, 'title_eng'] = title_eng
                self.movies.loc[self.movies['movie'] == movie, 'year'] = year

        if self.movies['grade'].isnull().sum() > 0:
            self.movies['grade'] = self.movies['grade'].fillna('NR grade')



In [3]:
movies_df, rates_df, genres_df = MovieDataLoader('../data/kmrd/').load()

movies_df.info()
rates_df.info()
genres_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 991 entries, 0 to 998
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   movie      991 non-null    int64 
 1   title      991 non-null    object
 2   title_eng  991 non-null    object
 3   year       991 non-null    object
 4   grade      991 non-null    object
dtypes: int64(1), object(4)
memory usage: 46.5+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140710 entries, 0 to 140709
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype
---  ------  --------------   -----
 0   user    140710 non-null  int64
 1   movie   140710 non-null  int64
 2   rate    140710 non-null  int64
 3   time    140710 non-null  int64
dtypes: int64(4)
memory usage: 4.3 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2025 entries, 0 to 2024
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   movie   2025 non-null   int64 

  self.movies.loc[self.movies['movie'] == movie, 'year'] = year


# Random Recommend

In [4]:
# 결과에 random 평점 field(rate_random)를 추가하고 random 값을 반영해서 결과를 return 한다.
# 결과에 random 평점 field(rate_random)를 추가하고 random 값을 반영해서 결과를 return 한다.

class RandomRecommender:
    def __init__(self, movies_df, rates_df):
        self.movies_df = movies_df
        self.rates_df = rates_df

    def run(self, n_items=10):
        df = self.rates_df.copy()
        df['rate_random'] = np.random.rand(len(df)) * 10

        top_n = (
            df.groupby('user', group_keys=True)
            .apply(lambda x: x.nlargest(n_items, 'rate_random'))
        )

        top_n = top_n.reset_index(drop=True)
        top_n['rate_random_class'] = top_n['rate_random'].apply(lambda x: np.round(x))
        return top_n

    def run_for_user(self, user_id, n_items=10):
        user_rates = self.rates_df[self.rates_df['user'] == user_id].copy()
        rate_random = np.random.rand(len(user_rates)) * 10
        user_rates['rate_random'] = rate_random
        user_rates = user_rates.sort_values(by='rate_random', ascending=False)
        user_rates = user_rates.head(n_items)
        user_rates['rate_random_class'] = user_rates['rate_random'].apply(lambda x: np.round(x))


        return user_rates

In [5]:
result = RandomRecommender(movies_df, rates_df).run(20)
result

  df.groupby('user', group_keys=True)


Unnamed: 0,user,movie,rate,time,rate_random,rate_random_class
0,0,10069,7,1430816940,9.818150,10.0
1,0,10514,8,1462067280,9.675350,10.0
2,0,10701,7,1423288920,9.651095,10.0
3,0,10647,6,1434248460,9.642635,10.0
4,0,10453,8,1449327840,9.627567,10.0
...,...,...,...,...,...,...
117757,52023,10998,10,1421679660,6.610486,7.0
117758,52024,10998,10,1204622460,6.342834,6.0
117759,52025,10998,7,1498546920,8.516215,9.0
117760,52026,10998,9,1434090420,2.164140,2.0


# Analyzer

In [26]:
class Analyzer:
    def __init__(self, y_true, y_pred, y_pred_class):
        self.y_true = y_true
        self.y_pred = y_pred
        self.y_pred_class = y_pred_class

    def analyze_error(self):
        self._analyze_mae()
        self._analyze_mse()
        self._analyze_rmse()
        self._analyze_mape()

    def analyze_confusion_matrix(self):
        self._make_confusion_matrix()
        self._analyze_accuracy()
        self._analyze_precision()
        self._analyze_recall()
        self._analyze_f1()

    def _analyze_mae(self):
        self.mae = np.abs(self.y_true - self.y_pred).mean()
        print(f'MAE: {self.mae}')

    def _analyze_mse(self):
        self.mse = ((self.y_true - self.y_pred) ** 2).mean()
        print(f'MSE: {self.mse}')

    def _analyze_rmse(self):
        self.rmse = np.sqrt(self.mse)
        print(f'RMSE: {self.rmse}')

    def _analyze_mape(self):
        self.mape = (np.abs(self.y_true - self.y_pred) / self.y_true).mean()
        print(f'MAPE: {self.mape}')

    def _make_confusion_matrix(self):
        unique_labels = np.unique(np.concatenate((self.y_true, self.y_pred_class)))
        self.confusion_matrix = np.zeros((len(unique_labels), len(unique_labels)), dtype=int)

        label_to_index = {label: index for index, label in enumerate(unique_labels)}
        for true, pred in zip(self.y_true, self.y_pred_class):
            self.confusion_matrix[label_to_index[true]][label_to_index[pred]] += 1

        print('Confusion Matrix :')
        print(self.confusion_matrix)

    def _analyze_accuracy(self):
        self.accuracy = np.diag(self.confusion_matrix).sum() / self.confusion_matrix.sum()
        print(f'Accuracy: {self.accuracy}')

    def _analyze_precision(self):
        with np.errstate(divide='ignore', invalid='ignore'):
            try:
                self.precision = np.diag(self.confusion_matrix) / self.confusion_matrix.sum(axis=0)
            except:
                self.precision = np.nan
        print(f'Precision: {self.precision}')

    def _analyze_recall(self):
        with np.errstate(divide='ignore', invalid='ignore'):
            try:
                self.recall = np.diag(self.confusion_matrix) / self.confusion_matrix.sum(axis=1)
            except:
                self.recall = np.nan
        print(f'Recall: {self.recall}')

    def _analyze_f1(self):
        self.f1 = 2 * (self.precision * self.recall) / (self.precision + self.recall)
        print(f'F1: {self.f1}')


In [7]:
analyzer = Analyzer(result["rate"], result["rate_random"], result["rate_random_class"])

analyzer.analyze_error()

MAE: 4.379506817104283
MSE: 27.309803227788276
RMSE: 5.225878225503181
MAPE: 0.6171994754469737


In [8]:
analyzer.analyze_confusion_matrix()

Confusion Matrix :
[[   0    0    0    0    0    0    0    0    0    0    0]
 [ 204  414  388  436  430  445  429  446  492  520  247]
 [  31   46   47   42   50   50   55   66   58   75   37]
 [  27   48   32   44   53   48   37   57   55   58   40]
 [  32   54   56   46   65   77   77   81   90  106   56]
 [  49  108  102  113   98  126  150  154  156  200   99]
 [  72  186  172  177  187  199  213  249  295  372  178]
 [ 145  305  340  348  365  391  385  472  534  633  318]
 [ 290  584  684  684  755  768  856  960  987 1129  592]
 [ 485  979  961  986 1099 1110 1227 1267 1328 1377  680]
 [3789 7747 7701 7940 8279 8227 8514 8652 9159 9230 4598]]
Accuracy: 0.07084628318133183
Precision: [0.         0.03953777 0.00448345 0.00406805 0.00571127 0.01101302
 0.01783471 0.03805224 0.07503421 0.10051095 0.67173119]
Recall: [       nan 0.09301281 0.08438061 0.08817635 0.08783784 0.09298893
 0.0926087  0.11142587 0.11907347 0.11974954 0.05484517]
F1: [       nan 0.05548854 0.00851449 0.00777

# Average rate Recommend

In [9]:
df = rates_df.copy()
df.groupby('movie')['rate'].transform('mean')

0         9.270981
1         9.120000
2         9.205059
3         9.317726
4         9.132075
            ...   
140705    8.882759
140706    8.882759
140707    8.882759
140708    8.882759
140709    8.882759
Name: rate, Length: 140710, dtype: float64

In [10]:
class AvgMovieRecommender:
    def __init__(self, movies_df, rates_df):
        self.movies_df = movies_df
        self.rates_df = rates_df

    def run(self, n_items=10):
        df = self.rates_df.copy()
        df['rate_avg_movie'] = df.groupby('movie')['rate'].transform('mean')

        top_n = (
            df.groupby('user', group_keys=False)
            .apply(lambda x: x.nlargest(n_items, 'rate_avg_movie'))
        )

        top_n = top_n.reset_index(drop=True)
        top_n['rate_avg_movie_class'] = top_n['rate_avg_movie'].apply(lambda x: np.round(x))

        return top_n

In [11]:
result = AvgMovieRecommender(movies_df, rates_df).run(20)
result

  df.groupby('user', group_keys=False)


Unnamed: 0,user,movie,rate,time,rate_avg_movie,rate_avg_movie_class
0,0,10200,10,1483582800,9.434136,9.0
1,0,10048,7,1394114760,9.378706,9.0
2,0,10038,9,1495625940,9.343516,9.0
3,0,10021,9,1424497980,9.317726,9.0
4,0,10185,7,1473424740,9.299180,9.0
...,...,...,...,...,...,...
117757,52023,10998,10,1421679660,8.882759,9.0
117758,52024,10998,10,1204622460,8.882759,9.0
117759,52025,10998,7,1498546920,8.882759,9.0
117760,52026,10998,9,1434090420,8.882759,9.0


In [12]:
analyzer = Analyzer(result["rate"], result["rate_avg_movie"], result["rate_avg_movie_class"])
analyzer.analyze_error()

MAE: 1.255901668956898
MSE: 3.952888516210945
RMSE: 1.9881872437501817
MAPE: 0.42152373306057334


In [13]:
analyzer.analyze_confusion_matrix()

Confusion Matrix :
[[    0     1     1    11    68    56   316   406  3570     4]
 [    0     0     0     4     9    13    27   117   321     0]
 [    0     0     0     0    11    13    47    88   283     0]
 [    0     0     0     3     5    14    45   166   424     0]
 [    0     0     1     2     8    25    70   256   846     0]
 [    0     0     0     0    16    27   105   484  1456     5]
 [    0     0     0     2    14    31   146   746  2903     7]
 [    0     0     0     0    14    31   238  1357  6288    17]
 [    0     0     0     1    14    22   151  1416  9957    40]
 [    0     0     0     7    41   114   878  5497 78155   351]]
Accuracy: 0.10061819602248603
Precision: [       nan 0.         0.         0.1        0.04       0.07803468
 0.07217004 0.12883319 0.09555387 0.82783019]
Recall: [0.         0.         0.         0.00456621 0.00662252 0.01290014
 0.03793193 0.17079924 0.85828808 0.00412732]
F1: [       nan        nan        nan 0.00873362 0.01136364 0.02214022
 0.0

  self.f1 = 2 * (self.precision * self.recall) / (self.precision + self.recall)


# Average User Rate Recommend

In [14]:
class AvgUserRecommender:
    def __init__(self, movies_df, rates_df):
        self.movies_df = movies_df
        self.rates_df = rates_df

    def run(self, n_items=10):
        df = self.rates_df.copy()
        df['rate_avg_user'] = df.groupby('user')['rate'].transform('mean')

        top_n = (
            df.groupby('user', group_keys=False)
            .apply(lambda x: x.nlargest(n_items, 'rate_avg_user'))
        )

        top_n = top_n.reset_index(drop=True)
        top_n['rate_avg_user_class'] = top_n['rate_avg_user'].apply(lambda x: np.round(x))

        return top_n

In [15]:
result = AvgUserRecommender(movies_df, rates_df).run(20)
result

  df.groupby('user', group_keys=False)


Unnamed: 0,user,movie,rate,time,rate_avg_user,rate_avg_user_class
0,0,10003,7,1494128040,7.388235,7.0
1,0,10004,7,1467529800,7.388235,7.0
2,0,10018,9,1513344120,7.388235,7.0
3,0,10021,9,1424497980,7.388235,7.0
4,0,10022,7,1427627340,7.388235,7.0
...,...,...,...,...,...,...
117757,52023,10998,10,1421679660,10.000000,10.0
117758,52024,10998,10,1204622460,10.000000,10.0
117759,52025,10998,7,1498546920,7.000000,7.0
117760,52026,10998,9,1434090420,9.000000,9.0


In [16]:
analyzer = Analyzer(result["rate"], result["rate_avg_user"], result["rate_avg_user_class"])
analyzer.analyze_error()

MAE: 0.6168470944568313
MSE: 1.7458033564580981
RMSE: 1.3212885212769003
MAPE: 0.18796969777773848


In [17]:
analyzer.analyze_confusion_matrix()

Confusion Matrix :
[[ 1842   101   180   225   163   538   420   499   456    10]
 [    2   163     9    21    39    78    68    88    66     2]
 [    0     5   148     8    30    76    62    83    75     6]
 [    0     6     3   205    33    87   122   141    91     5]
 [    0     8    11    14   354   139   202   352   207    22]
 [    0     0     6    22    33   601   408   621   495    53]
 [    0     0     6    19    41   194  1238  1391  1058   127]
 [    0     0    14    16    37   167   638  3734  2896   630]
 [    0     0     1    11    55    97   413  1801  6567  2591]
 [    4     8    34    91    88   649  1156  4181 14421 63684]]
Accuracy: 0.6669044343676228
Precision: [0.99675325 0.56013746 0.3592233  0.32436709 0.40549828 0.22886519
 0.26189972 0.28965945 0.24939237 0.94866677]
Recall: [0.41542625 0.30410448 0.30020284 0.2958153  0.27043545 0.2684234
 0.30387825 0.45917364 0.56926144 0.75530148]
F1: [0.58643744 0.39419589 0.32707182 0.30943396 0.32447296 0.24707091
 0.281

In [94]:
class AvgGenreRecommender:
    def __init__(self, movies_df, rates_df, genres_df):
        self.movies_df = movies_df
        self.rates_df = rates_df
        self.genres_df = genres_df

    def run(self, n_items=10):
        df = self.rates_df.copy()
        df = pd.merge(df, self.genres_df, on='movie', how='left')

        gerne_avg = df.groupby('genre')['rate'].mean()
        movie_genre = self.genres_df.groupby('movie')['genre'].apply(list)

        movie_genre_rating = {}
        for movie, gerne_list in movie_genre.items():
            valid_avg = [gerne_avg[g] for g in gerne_list if g in gerne_avg]
            if len(valid_avg) > 0:
                movie_genre_rating[movie] = np.mean(valid_avg)
            else:
                movie_genre_rating[movie] = np.nan

        df['rate_avg_genre'] = df['movie'].apply(lambda x: movie_genre_rating.get(x, np.nan))
        df['rate_avg_genre_class'] = df['rate_avg_genre'].apply(lambda x: np.round(x))

        df = df.dropna(subset=['rate_avg_genre'])
        return df

In [95]:
result = AvgGenreRecommender(movies_df, rates_df, genres_df).run(20)
result

Unnamed: 0,user,movie,rate,time,genre,rate_avg_genre,rate_avg_genre_class
0,0,10003,7,1494128040,SF,9.003956,9.0
1,0,10003,7,1494128040,코미디,9.003956,9.0
2,0,10004,7,1467529800,서부,8.997516,9.0
3,0,10004,7,1467529800,SF,8.997516,9.0
4,0,10004,7,1467529800,판타지,8.997516,9.0
...,...,...,...,...,...,...,...
350669,52026,10998,9,1434090420,스릴러,8.953182,9.0
350670,52027,10998,10,1227036960,드라마,8.953182,9.0
350671,52027,10998,10,1227036960,액션,8.953182,9.0
350672,52027,10998,10,1227036960,모험,8.953182,9.0


In [96]:
analyze = Analyzer(result["rate"], result["rate_avg_genre"], result["rate_avg_genre_class"])
analyze.analyze_error()

MAE: 1.4080493919945725
MSE: 4.446942192985892
RMSE: 2.1087774166530453
MAPE: 0.4573179798127568


In [99]:
analyze.analyze_confusion_matrix()

Confusion Matrix :
[[     0      0      0      0      0      7     30      9  13256      0]
 [     0      0      0      0      0      2     12      3   1875      0]
 [     0      0      0      0      0      5     22      0   1877      0]
 [     0      0      0      0      0      2     22      0   2586      0]
 [     0      0      0      0      0      2     28      0   4889      0]
 [     0      0      0      0      0      4     26      0   8730      0]
 [     0      0      0      0      0      2     14      3  15865      0]
 [     0      0      0      0      0      5     12      2  27841      0]
 [     0      0      0      0      0      1      8      0  35567      0]
 [     0      0      0      0      0      5     56      3 236643      0]]
Accuracy: 0.10184766494759798
Precision: [       nan        nan        nan        nan        nan 0.11428571
 0.06086957 0.1        0.10187352        nan]
Recall: [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 4.56621005e