# 수행목표
- 주연 배우별 평균 평점으로 사용자별 평점과 비교한다.

# 수행단계
주연 배우별로 평균 평점을 계산해서 비교한다.

- 추천 코드 class 명을 `AvgActorRecommender로` 한다.
- 주연배우의 경우 별도 파일(castings.csv)에 있으므로 `MovieDataLoader에` 추가로 data를 load 한다.
- 주연 배우(단수,복수)가 출연한 모든 영화의 평균을 고려해야 한다.
- 결과에 `rate_avg_actor` 필드를 추가해서 주연배우 평점 평균을 계산한 다음 업데이트 한다.
- `rate_avg_actor` 결과를 회귀 평가 한다.
- `rate_avg_actor_class` 필드를 추가해서 `rate_avg_actor` 값을 반올림한 값을 업데이트 한다.
- `rate_avg_actor_class` 결과를 분류 평가 한다.
- 결과 데이터 평점 높은 순 n개를 출력하고 그 아래에 분석 결과를 출력한다.
- 랜덤, 영화 평균 평점, 사용자 평균 평점, 장르 평균 평점 추천과 평가를 비교한다.

# Library 설치

In [1]:
# Library

import os
import sys
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

mpl.rcParams['font.family'] = 'AppleGothic'
mpl.rcParams['axes.unicode_minus'] = False

# DataLoader

In [27]:
class MovieDataLoader:
    def __init__(self, file_path):
        self.file_path = file_path
        movie_path = os.path.join(self.file_path, 'movies.txt')
        rate_path = os.path.join(self.file_path, 'rates.csv')
        gerne_path = os.path.join(self.file_path, 'genres.csv')
        casting_path = os.path.join(self.file_path, 'castings.csv')
        self.movies = pd.read_csv(movie_path, sep='\t')
        self.rates = pd.read_csv(rate_path)
        self.gernes = pd.read_csv(gerne_path)
        self.castings = pd.read_csv(casting_path)

    def load(self):
        self._preprocess()

        return self.movies, self.rates, self.gernes, self.castings

    def _preprocess(self):
        self.movies.dropna(subset=['title_eng'], inplace=True)

        if self.movies['year'].isnull().sum() > 0:
            non_year = self.movies[self.movies['year'].isnull()]
            for row in non_year.iterrows():
                movie = row[1]['movie']
                title_eng = row[1]['title_eng'].split(' , ')[:-1]
                title_eng = ' , '.join(title_eng)
                year = row[1]['title_eng'].split(' , ')[-1]
                self.movies.loc[self.movies['movie'] == movie, 'title_eng'] = title_eng
                self.movies.loc[self.movies['movie'] == movie, 'year'] = year

        if self.movies['grade'].isnull().sum() > 0:
            self.movies['grade'] = self.movies['grade'].fillna('NR grade')


In [28]:
movies_df, rates_df, genres_df, castings_df = MovieDataLoader('../data/kmrd/').load()

movies_df.info()
rates_df.info()
genres_df.info()
castings_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 991 entries, 0 to 998
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   movie      991 non-null    int64 
 1   title      991 non-null    object
 2   title_eng  991 non-null    object
 3   year       991 non-null    object
 4   grade      991 non-null    object
dtypes: int64(1), object(4)
memory usage: 46.5+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140710 entries, 0 to 140709
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype
---  ------  --------------   -----
 0   user    140710 non-null  int64
 1   movie   140710 non-null  int64
 2   rate    140710 non-null  int64
 3   time    140710 non-null  int64
dtypes: int64(4)
memory usage: 4.3 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2025 entries, 0 to 2024
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   movie   2025 non-null   int64 

  self.movies.loc[self.movies['movie'] == movie, 'year'] = year


# Analyzer

In [19]:
class Analyzer:
    def __init__(self, y_true, y_pred, y_pred_class):
        self.y_true = y_true
        self.y_pred = y_pred
        self.y_pred_class = y_pred_class

    def analyze_error(self):
        self._analyze_mae()
        self._analyze_mse()
        self._analyze_rmse()
        self._analyze_mape()

    def analyze_confusion_matrix(self):
        self._make_confusion_matrix()
        self._analyze_accuracy()
        self._analyze_precision()
        self._analyze_recall()
        self._analyze_f1()

    def _analyze_mae(self):
        self.mae = np.abs(self.y_true - self.y_pred).mean()
        print(f'MAE: {self.mae}')

    def _analyze_mse(self):
        self.mse = ((self.y_true - self.y_pred) ** 2).mean()
        print(f'MSE: {self.mse}')

    def _analyze_rmse(self):
        self.rmse = np.sqrt(self.mse)
        print(f'RMSE: {self.rmse}')

    def _analyze_mape(self):
        self.mape = (np.abs(self.y_true - self.y_pred) / self.y_true).mean()
        print(f'MAPE: {self.mape}')

    def _make_confusion_matrix(self):
        unique_labels = np.unique(np.concatenate((self.y_true, self.y_pred_class)))
        self.confusion_matrix = np.zeros((len(unique_labels), len(unique_labels)), dtype=int)

        label_to_index = {label: index for index, label in enumerate(unique_labels)}
        for true, pred in zip(self.y_true, self.y_pred_class):
            self.confusion_matrix[label_to_index[true]][label_to_index[pred]] += 1

        print('Confusion Matrix :')
        print(self.confusion_matrix)

    def _analyze_accuracy(self):
        self.accuracy = np.diag(self.confusion_matrix).sum() / self.confusion_matrix.sum()
        print(f'Accuracy: {self.accuracy}')

    def _analyze_precision(self):
        with np.errstate(divide='ignore', invalid='ignore'):
            try:
                self.precision = np.diag(self.confusion_matrix) / self.confusion_matrix.sum(axis=0)
            except:
                self.precision = np.nan
        print(f'Precision: {self.precision}')

    def _analyze_recall(self):
        with np.errstate(divide='ignore', invalid='ignore'):
            try:
                self.recall = np.diag(self.confusion_matrix) / self.confusion_matrix.sum(axis=1)
            except:
                self.recall = np.nan
        print(f'Recall: {self.recall}')

    def _analyze_f1(self):
        self.f1 = 2 * (self.precision * self.recall) / (self.precision + self.recall)
        print(f'F1: {self.f1}')


# Recommenders

In [54]:
class BaseRecommender:
    def __init__(self, movies_df, rates_df, genres_df=None, castings_df=None):
        self.movies_df = movies_df
        self.rates_df = rates_df
        self.genres_df = genres_df
        self.castings_df = castings_df

    def get_top_n(self, df, score_col, n_items=10):
        top_n = (
            df.groupby('user', group_keys=False)
            .apply(lambda x: x.nlargest(n_items, score_col))
        )
        top_n = top_n.reset_index(drop=True)

        class_col = f'{score_col}_class'
        top_n[class_col] = top_n[score_col].apply(lambda x: np.round(x))
        return top_n

    def run(self, n_items=10):
        raise NotImplementedError


In [55]:
class RandomRecommender(BaseRecommender):
    def __init__(self, movies_df, rates_df, genres_df=None):
        super().__init__(movies_df, rates_df, genres_df)

    def run(self, n_items=10):
        df = self.rates_df.copy()
        df['rate_random'] = np.random.rand(len(df)) * 10

        top_n = self.get_top_n(df, 'rate_random', n_items)
        return top_n

In [56]:
class AvgMovieRecommender(BaseRecommender):
    def __init__(self, movies_df, rates_df, genres_df=None):
        super().__init__(movies_df, rates_df, genres_df)

    def run(self, n_items=10):
        df = self.rates_df.copy()
        df['rate_avg_movie'] = df.groupby('movie')['rate'].transform('mean')

        top_n = self.get_top_n(df, 'rate_avg_movie', n_items)
        return top_n

In [57]:
class AvgUserRecommender(BaseRecommender):
    def __init__(self, movies_df, rates_df, genres_df=None):
        super().__init__(movies_df, rates_df, genres_df)

    def run(self, n_items=10):
        df = self.rates_df.copy()
        df['rate_avg_user'] = df.groupby('user')['rate'].transform('mean')

        top_n = self.get_top_n(df, 'rate_avg_user', n_items)
        return top_n


In [58]:
class AvgGenreRecommender(BaseRecommender):
    def __init__(self, movies_df, rates_df, genres_df):
        super().__init__(movies_df, rates_df, genres_df)

    def run(self, n_items=10):
        df = self.rates_df.copy()
        df = pd.merge(df, self.genres_df, on='movie', how='left')

        genre_avg = df.groupby('genre')['rate'].mean()
        movie_genre = self.genres_df.groupby('movie')['genre'].apply(list)
        movie_genre_rating = {}
        for movie, g_list in movie_genre.items():
            valid_avg = [genre_avg[g] for g in g_list if g in genre_avg]
            movie_genre_rating[movie] = np.mean(valid_avg) if valid_avg else np.nan

        df['rate_avg_genre'] = df['movie'].apply(lambda m: movie_genre_rating.get(m, np.nan))
        df = df.dropna(subset=['rate_avg_genre'])  # NaN 제거

        top_n = self.get_top_n(df, 'rate_avg_genre', n_items)

        return top_n


In [59]:
random_df = RandomRecommender(movies_df, rates_df).run(20)
random_df.head()

  df.groupby('user', group_keys=False)


Unnamed: 0,user,movie,rate,time,rate_random,rate_random_class
0,0,10746,7,1442579400,9.843231,10.0
1,0,10073,7,1421845740,9.499658,9.0
2,0,10096,5,1427543880,9.410215,9.0
3,0,10024,4,1429359420,9.385105,9.0
4,0,10132,9,1403325360,9.366862,9.0


In [60]:
avg_movie_df = AvgMovieRecommender(movies_df, rates_df).run(20)
avg_movie_df.head()

  df.groupby('user', group_keys=False)


Unnamed: 0,user,movie,rate,time,rate_avg_movie,rate_avg_movie_class
0,0,10200,10,1483582800,9.434136,9.0
1,0,10048,7,1394114760,9.378706,9.0
2,0,10038,9,1495625940,9.343516,9.0
3,0,10021,9,1424497980,9.317726,9.0
4,0,10185,7,1473424740,9.29918,9.0


In [61]:
avg_user_df = AvgUserRecommender(movies_df, rates_df).run(20)
avg_user_df.head()

  df.groupby('user', group_keys=False)


Unnamed: 0,user,movie,rate,time,rate_avg_user,rate_avg_user_class
0,0,10003,7,1494128040,7.388235,7.0
1,0,10004,7,1467529800,7.388235,7.0
2,0,10018,9,1513344120,7.388235,7.0
3,0,10021,9,1424497980,7.388235,7.0
4,0,10022,7,1427627340,7.388235,7.0


In [62]:
avg_genre_df = AvgGenreRecommender(movies_df, rates_df, genres_df).run(20)
avg_genre_df.head()

  df.groupby('user', group_keys=False)


Unnamed: 0,user,movie,rate,time,genre,rate_avg_genre,rate_avg_genre_class
0,0,10087,6,1455366120,멜로/로맨스,9.068975,9.0
1,0,10087,6,1455366120,드라마,9.068975,9.0
2,0,10087,6,1455366120,뮤지컬,9.068975,9.0
3,0,10395,8,1452179460,멜로/로맨스,9.068975,9.0
4,0,10395,8,1452179460,드라마,9.068975,9.0


In [63]:
class AvgActorRecommender(BaseRecommender):
    def __init__(self, movies_df, rates_df, castings_df):
        super().__init__(movies_df, rates_df, castings_df=castings_df)

    def run(self, n_items=10):
        df = self.rates_df.copy()
        df = pd.merge(df, self.castings_df, on='movie', how='left')

        df = df[df['leading'] == 1]
        df['rate_avg_actor'] = df.groupby('people')['rate'].transform('mean')

        top_n = self.get_top_n(df, 'rate_avg_actor', n_items)

        return top_n

In [64]:
avg_actor_df = AvgActorRecommender(movies_df, rates_df, castings_df).run(20)
avg_actor_df.head()

  df.groupby('user', group_keys=False)


Unnamed: 0,user,movie,rate,time,people,order,leading,rate_avg_actor,rate_avg_actor_class
0,0,10048,7,1394114760,1349.0,1.0,1.0,9.367127,9.0
1,0,10004,7,1467529800,5104.0,4.0,1.0,9.360691,9.0
2,0,10038,9,1495625940,7049.0,3.0,1.0,9.343516,9.0
3,0,10038,9,1495625940,10587.0,5.0,1.0,9.343516,9.0
4,0,10003,7,1494128040,1076.0,1.0,1.0,9.336012,9.0


In [66]:
# analysis all recommenders

random_analyzer = Analyzer(random_df['rate'], random_df['rate_random'], random_df['rate_random_class'])
avg_movie_analyzer = Analyzer(avg_movie_df['rate'], avg_movie_df['rate_avg_movie'], avg_movie_df['rate_avg_movie_class'])
avg_user_analyzer = Analyzer(avg_user_df['rate'], avg_user_df['rate_avg_user'], avg_user_df['rate_avg_user_class'])
avg_genre_analyzer = Analyzer(avg_genre_df['rate'], avg_genre_df['rate_avg_genre'], avg_genre_df['rate_avg_genre_class'])
avg_actor_analyzer = Analyzer(avg_actor_df['rate'], avg_actor_df['rate_avg_actor'], avg_actor_df['rate_avg_actor_class'])

In [67]:
random_analyzer.analyze_error()

MAE: 4.3769186819542565
MSE: 27.287828738770077
RMSE: 5.2237753338720525
MAPE: 0.620532870691746


In [68]:
random_analyzer.analyze_confusion_matrix()

Confusion Matrix :
[[   0    0    0    0    0    0    0    0    0    0    0]
 [ 151  409  426  411  421  445  463  468  480  513  290]
 [  23   42   50   47   58   54   47   49   57   76   44]
 [  18   32   38   46   36   36   54   67   79   75   38]
 [  27   54   60   72   59   67   72   84   90   96   46]
 [  30  112  123  108  133  131  135  146  152  175  108]
 [  69  186  185  203  190  226  231  260  298  328  168]
 [ 164  323  291  330  360  406  400  499  497  622  314]
 [ 300  607  678  717  756  808  809  892 1032 1119  551]
 [ 459  989 1002 1048 1083 1080 1184 1259 1344 1390  719]
 [3793 7423 7854 8130 8084 8404 8318 8758 8984 9341 4674]]
Accuracy: 0.0723578064231246
Precision: [0.         0.04018866 0.00466984 0.00413967 0.00527728 0.01123788
 0.01972168 0.03997757 0.07930531 0.10120131 0.67232451]
Recall: [       nan 0.09135582 0.09140768 0.08863198 0.08115543 0.09682188
 0.09854949 0.11864004 0.12480348 0.12027343 0.05580029]
F1: [       nan 0.05582094 0.00888573 0.007909

In [69]:
avg_movie_analyzer.analyze_error()

MAE: 1.255901668956898
MSE: 3.952888516210945
RMSE: 1.9881872437501817
MAPE: 0.42152373306057334


In [70]:
avg_movie_analyzer.analyze_confusion_matrix()

Confusion Matrix :
[[    0     1     1    11    68    56   316   406  3570     4]
 [    0     0     0     4     9    13    27   117   321     0]
 [    0     0     0     0    11    13    47    88   283     0]
 [    0     0     0     3     5    14    45   166   424     0]
 [    0     0     1     2     8    25    70   256   846     0]
 [    0     0     0     0    16    27   105   484  1456     5]
 [    0     0     0     2    14    31   146   746  2903     7]
 [    0     0     0     0    14    31   238  1357  6288    17]
 [    0     0     0     1    14    22   151  1416  9957    40]
 [    0     0     0     7    41   114   878  5497 78155   351]]
Accuracy: 0.10061819602248603
Precision: [       nan 0.         0.         0.1        0.04       0.07803468
 0.07217004 0.12883319 0.09555387 0.82783019]
Recall: [0.         0.         0.         0.00456621 0.00662252 0.01290014
 0.03793193 0.17079924 0.85828808 0.00412732]
F1: [       nan        nan        nan 0.00873362 0.01136364 0.02214022
 0.0

  self.f1 = 2 * (self.precision * self.recall) / (self.precision + self.recall)


In [71]:
avg_user_analyzer.analyze_error()

MAE: 0.6168470944568313
MSE: 1.7458033564580981
RMSE: 1.3212885212769003
MAPE: 0.18796969777773848


In [72]:
avg_user_analyzer.analyze_confusion_matrix()

Confusion Matrix :
[[ 1842   101   180   225   163   538   420   499   456    10]
 [    2   163     9    21    39    78    68    88    66     2]
 [    0     5   148     8    30    76    62    83    75     6]
 [    0     6     3   205    33    87   122   141    91     5]
 [    0     8    11    14   354   139   202   352   207    22]
 [    0     0     6    22    33   601   408   621   495    53]
 [    0     0     6    19    41   194  1238  1391  1058   127]
 [    0     0    14    16    37   167   638  3734  2896   630]
 [    0     0     1    11    55    97   413  1801  6567  2591]
 [    4     8    34    91    88   649  1156  4181 14421 63684]]
Accuracy: 0.6669044343676228
Precision: [0.99675325 0.56013746 0.3592233  0.32436709 0.40549828 0.22886519
 0.26189972 0.28965945 0.24939237 0.94866677]
Recall: [0.41542625 0.30410448 0.30020284 0.2958153  0.27043545 0.2684234
 0.30387825 0.45917364 0.56926144 0.75530148]
F1: [0.58643744 0.39419589 0.32707182 0.30943396 0.32447296 0.24707091
 0.281

In [73]:
avg_genre_analyzer.analyze_error()

MAE: 1.362633892775186
MSE: 4.2375135853945505
RMSE: 2.0585221848196222
MAPE: 0.4528222011236738


In [74]:
avg_genre_analyzer.analyze_confusion_matrix()

Confusion Matrix :
[[     0      0      0      0      0      1      6      0   9851      0]
 [     0      0      0      0      0      0      6      0   1060      0]
 [     0      0      0      0      0      1      6      0    909      0]
 [     0      0      0      0      0      0      0      0   1354      0]
 [     0      0      0      0      0      1     12      0   2459      0]
 [     0      0      0      0      0      1      2      0   4004      0]
 [     0      0      0      0      0      0      4      0   7172      0]
 [     0      0      0      0      0      2      2      2  14895      0]
 [     0      0      0      0      0      0      2      0  22249      0]
 [     0      0      0      0      0      2     22      3 184171      0]]
Accuracy: 0.08966998255432133
Precision: [       nan        nan        nan        nan        nan 0.125
 0.06451613 0.4        0.08966888        nan]
Recall: [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 2.49563264e-04 5

In [75]:
avg_actor_analyzer.analyze_error()

MAE: 1.2355143346378739
MSE: 3.8636906313595714
RMSE: 1.9656272869899754
MAPE: 0.4157085561502365


In [76]:
avg_actor_analyzer.analyze_confusion_matrix()

Confusion Matrix :
[[     0      0      2     11     72     84    424    778   9191      2]
 [     0      0      0      6      3     11     34    182    822      0]
 [     0      0      0      0      2     10     46    170    711      0]
 [     0      0      0      2      0     11     43    294   1099      0]
 [     0      0      2      0      8     33     65    397   2015      0]
 [     0      0      0      0     25     31     78    745   3176     18]
 [     0      0      0      6     14     32    123   1050   6126      8]
 [     0      0      0      0     16     30    236   2303  13723     28]
 [     0      0      0      0     12     33    142   2449  22847     98]
 [     0      0      1      9     28    165   1025  10656 203254    782]]
Accuracy: 0.09130892690317322
Precision: [       nan        nan 0.         0.05882353 0.04444444 0.07045455
 0.05550542 0.12105761 0.08688262 0.83547009]
Recall: [0.         0.         0.         0.00138026 0.0031746  0.0076111
 0.01671423 0.14097698

  self.f1 = 2 * (self.precision * self.recall) / (self.precision + self.recall)
