# 수행목표
- 영화, 사용자, 장르, 주연배우 평균을 조합해서 평균 평점으로 사용자별 평점과 비교한다.

# 수행단계
영화, 사용자, 장르, 주연배우 평균을 조합해서 평균 평점을 계산해서 비교한다.

- 추천 코드 class 명을 `AvgMultiRecommender로` 한다.
- 영화, 사용자, 장르, 주연배우 평균에 각각 가중치를 곱해서 더하고 가중치 합으로 나눈다.
- 결과에 `rate_avg_multi` 필드를 추가해서 특징들의 가중치 평점을 계산한 다음 업데이트 한다.
- `rate_avg_multi` 결과를 회귀 평가 한다.
- `rate_avg_multi_class` 필드를 추가해서 `rate_avg_multi` 값을 반올림한 값을 업데이트 한다.
- `rate_avg_multi_class` 결과를 분류 평가 한다.
- 결과 데이터 평점 높은 순 n개를 출력하고 그 아래에 분석 결과를 출력한다.
- 사용자 평균 평점 추천과 평가를 비교한다.

# Library 설치

In [1]:
# Library

import os
import sys
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

mpl.rcParams['font.family'] = 'AppleGothic'
mpl.rcParams['axes.unicode_minus'] = False

# DataLoader

In [2]:
class MovieDataLoader:
    def __init__(self, file_path):
        self.file_path = file_path
        movie_path = os.path.join(self.file_path, 'movies.txt')
        rate_path = os.path.join(self.file_path, 'rates.csv')
        gerne_path = os.path.join(self.file_path, 'genres.csv')
        casting_path = os.path.join(self.file_path, 'castings.csv')
        self.movies = pd.read_csv(movie_path, sep='\t')
        self.rates = pd.read_csv(rate_path)
        self.gernes = pd.read_csv(gerne_path)
        self.castings = pd.read_csv(casting_path)

    def load(self):
        self._preprocess()

        return self.movies, self.rates, self.gernes, self.castings

    def _preprocess(self):
        self.movies.dropna(subset=['title_eng'], inplace=True)

        if self.movies['year'].isnull().sum() > 0:
            non_year = self.movies[self.movies['year'].isnull()]
            for row in non_year.iterrows():
                movie = row[1]['movie']
                title_eng = row[1]['title_eng'].split(' , ')[:-1]
                title_eng = ' , '.join(title_eng)
                year = row[1]['title_eng'].split(' , ')[-1]
                self.movies.loc[self.movies['movie'] == movie, 'title_eng'] = title_eng
                self.movies.loc[self.movies['movie'] == movie, 'year'] = year

        if self.movies['grade'].isnull().sum() > 0:
            self.movies['grade'] = self.movies['grade'].fillna('NR grade')


In [3]:
movies_df, rates_df, genres_df, castings_df = MovieDataLoader('../data/kmrd/').load()

movies_df.info()
rates_df.info()
genres_df.info()
castings_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 991 entries, 0 to 998
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   movie      991 non-null    int64 
 1   title      991 non-null    object
 2   title_eng  991 non-null    object
 3   year       991 non-null    object
 4   grade      991 non-null    object
dtypes: int64(1), object(4)
memory usage: 46.5+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140710 entries, 0 to 140709
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype
---  ------  --------------   -----
 0   user    140710 non-null  int64
 1   movie   140710 non-null  int64
 2   rate    140710 non-null  int64
 3   time    140710 non-null  int64
dtypes: int64(4)
memory usage: 4.3 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2025 entries, 0 to 2024
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   movie   2025 non-null   int64 

  self.movies.loc[self.movies['movie'] == movie, 'year'] = year


# Analyzer

In [4]:
class Analyzer:
    def __init__(self, y_true, y_pred, y_pred_class):
        self.y_true = y_true
        self.y_pred = y_pred
        self.y_pred_class = y_pred_class

    def analyze_error(self):
        self._analyze_mae()
        self._analyze_mse()
        self._analyze_rmse()
        self._analyze_mape()

    def analyze_confusion_matrix(self):
        self._make_confusion_matrix()
        self._analyze_accuracy()
        self._analyze_precision()
        self._analyze_recall()
        self._analyze_f1()

    def _analyze_mae(self):
        self.mae = np.abs(self.y_true - self.y_pred).mean()
        print(f'MAE: {self.mae}')

    def _analyze_mse(self):
        self.mse = ((self.y_true - self.y_pred) ** 2).mean()
        print(f'MSE: {self.mse}')

    def _analyze_rmse(self):
        self.rmse = np.sqrt(self.mse)
        print(f'RMSE: {self.rmse}')

    def _analyze_mape(self):
        self.mape = (np.abs(self.y_true - self.y_pred) / self.y_true).mean()
        print(f'MAPE: {self.mape}')

    def _make_confusion_matrix(self):
        unique_labels = np.unique(np.concatenate((self.y_true, self.y_pred_class)))
        self.confusion_matrix = np.zeros((len(unique_labels), len(unique_labels)), dtype=int)

        label_to_index = {label: index for index, label in enumerate(unique_labels)}
        for true, pred in zip(self.y_true, self.y_pred_class):
            self.confusion_matrix[label_to_index[true]][label_to_index[pred]] += 1

        print('Confusion Matrix :')
        print(self.confusion_matrix)

    def _analyze_accuracy(self):
        self.accuracy = np.diag(self.confusion_matrix).sum() / self.confusion_matrix.sum()
        print(f'Accuracy: {self.accuracy}')

    def _analyze_precision(self):
        with np.errstate(divide='ignore', invalid='ignore'):
            try:
                self.precision = np.diag(self.confusion_matrix) / self.confusion_matrix.sum(axis=0)
            except:
                self.precision = np.nan
        print(f'Precision: {self.precision}')

    def _analyze_recall(self):
        with np.errstate(divide='ignore', invalid='ignore'):
            try:
                self.recall = np.diag(self.confusion_matrix) / self.confusion_matrix.sum(axis=1)
            except:
                self.recall = np.nan
        print(f'Recall: {self.recall}')

    def _analyze_f1(self):
        self.f1 = 2 * (self.precision * self.recall) / (self.precision + self.recall)
        print(f'F1: {self.f1}')


# Recommenders

In [5]:
class BaseRecommender:
    def __init__(self, movies_df, rates_df, genres_df=None, castings_df=None):
        self.movies_df = movies_df
        self.rates_df = rates_df
        self.genres_df = genres_df
        self.castings_df = castings_df

    def get_top_n(self, df, score_col, n_items=10):
        top_n = (
            df.groupby('user', group_keys=False)
            .apply(lambda x: x.nlargest(n_items, score_col))
        )
        top_n = top_n.reset_index(drop=True)

        class_col = f'{score_col}_class'
        top_n[class_col] = top_n[score_col].apply(lambda x: np.round(x))
        return top_n

    def run(self, n_items=10):
        raise NotImplementedError


In [6]:
class RandomRecommender(BaseRecommender):
    def __init__(self, movies_df, rates_df, genres_df=None):
        super().__init__(movies_df, rates_df, genres_df)

    def run(self, n_items=10):
        df = self.rates_df.copy()
        df['rate_random'] = np.random.rand(len(df)) * 10

        top_n = self.get_top_n(df, 'rate_random', n_items)
        return top_n

In [7]:
class AvgMovieRecommender(BaseRecommender):
    def __init__(self, movies_df, rates_df, genres_df=None):
        super().__init__(movies_df, rates_df, genres_df)

    def run(self, n_items=10):
        df = self.rates_df.copy()
        df['rate_avg_movie'] = df.groupby('movie')['rate'].transform('mean')

        top_n = self.get_top_n(df, 'rate_avg_movie', n_items)
        return top_n

In [8]:
class AvgUserRecommender(BaseRecommender):
    def __init__(self, movies_df, rates_df, genres_df=None):
        super().__init__(movies_df, rates_df, genres_df)

    def run(self, n_items=10):
        df = self.rates_df.copy()
        df['rate_avg_user'] = df.groupby('user')['rate'].transform('mean')

        top_n = self.get_top_n(df, 'rate_avg_user', n_items)
        return top_n


In [9]:
class AvgGenreRecommender(BaseRecommender):
    def __init__(self, movies_df, rates_df, genres_df):
        super().__init__(movies_df, rates_df, genres_df)

    def run(self, n_items=10):
        df = self.rates_df.copy()
        df = pd.merge(df, self.genres_df, on='movie', how='left')

        genre_avg = df.groupby('genre')['rate'].mean()
        movie_genre = self.genres_df.groupby('movie')['genre'].apply(list)
        movie_genre_rating = {}
        for movie, g_list in movie_genre.items():
            valid_avg = [genre_avg[g] for g in g_list if g in genre_avg]
            movie_genre_rating[movie] = np.mean(valid_avg) if valid_avg else np.nan

        df['rate_avg_genre'] = df['movie'].apply(lambda m: movie_genre_rating.get(m, np.nan))
        df = df.dropna(subset=['rate_avg_genre'])  # NaN 제거

        top_n = self.get_top_n(df, 'rate_avg_genre', n_items)

        return top_n


In [10]:
class AvgActorRecommender(BaseRecommender):
    def __init__(self, movies_df, rates_df, castings_df):
        super().__init__(movies_df, rates_df, castings_df=castings_df)

    def run(self, n_items=10):
        df = self.rates_df.copy()
        df = pd.merge(df, self.castings_df, on='movie', how='left')

        df = df[df['leading'] == 1]
        df['rate_avg_actor'] = df.groupby('people')['rate'].transform('mean')

        top_n = self.get_top_n(df, 'rate_avg_actor', n_items)

        return top_n

In [11]:
random_df = RandomRecommender(movies_df, rates_df).run(20)
random_df.head()

  df.groupby('user', group_keys=False)


Unnamed: 0,user,movie,rate,time,rate_random,rate_random_class
0,0,10980,7,1452516180,9.983309,10.0
1,0,10330,10,1388915880,9.884021,10.0
2,0,10592,6,1505982840,9.865936,10.0
3,0,10561,8,1421476740,9.755048,10.0
4,0,10217,8,1423819860,9.75031,10.0


In [12]:
avg_movie_df = AvgMovieRecommender(movies_df, rates_df).run(20)
avg_movie_df.head()

  df.groupby('user', group_keys=False)


Unnamed: 0,user,movie,rate,time,rate_avg_movie,rate_avg_movie_class
0,0,10200,10,1483582800,9.434136,9.0
1,0,10048,7,1394114760,9.378706,9.0
2,0,10038,9,1495625940,9.343516,9.0
3,0,10021,9,1424497980,9.317726,9.0
4,0,10185,7,1473424740,9.29918,9.0


In [13]:
avg_user_df = AvgUserRecommender(movies_df, rates_df).run(20)
avg_user_df.head()

  df.groupby('user', group_keys=False)


Unnamed: 0,user,movie,rate,time,rate_avg_user,rate_avg_user_class
0,0,10003,7,1494128040,7.388235,7.0
1,0,10004,7,1467529800,7.388235,7.0
2,0,10018,9,1513344120,7.388235,7.0
3,0,10021,9,1424497980,7.388235,7.0
4,0,10022,7,1427627340,7.388235,7.0


In [14]:
avg_genre_df = AvgGenreRecommender(movies_df, rates_df, genres_df).run(20)
avg_genre_df.head()

  df.groupby('user', group_keys=False)


Unnamed: 0,user,movie,rate,time,genre,rate_avg_genre,rate_avg_genre_class
0,0,10087,6,1455366120,멜로/로맨스,9.068975,9.0
1,0,10087,6,1455366120,드라마,9.068975,9.0
2,0,10087,6,1455366120,뮤지컬,9.068975,9.0
3,0,10395,8,1452179460,멜로/로맨스,9.068975,9.0
4,0,10395,8,1452179460,드라마,9.068975,9.0


In [15]:
avg_actor_df = AvgActorRecommender(movies_df, rates_df, castings_df).run(20)
avg_actor_df.head()

  df.groupby('user', group_keys=False)


Unnamed: 0,user,movie,rate,time,people,order,leading,rate_avg_actor,rate_avg_actor_class
0,0,10048,7,1394114760,1349.0,1.0,1.0,9.367127,9.0
1,0,10004,7,1467529800,5104.0,4.0,1.0,9.360691,9.0
2,0,10038,9,1495625940,7049.0,3.0,1.0,9.343516,9.0
3,0,10038,9,1495625940,10587.0,5.0,1.0,9.343516,9.0
4,0,10003,7,1494128040,1076.0,1.0,1.0,9.336012,9.0


In [16]:
# analysis all recommenders

random_analyzer = Analyzer(random_df['rate'], random_df['rate_random'], random_df['rate_random_class'])
avg_movie_analyzer = Analyzer(avg_movie_df['rate'], avg_movie_df['rate_avg_movie'], avg_movie_df['rate_avg_movie_class'])
avg_user_analyzer = Analyzer(avg_user_df['rate'], avg_user_df['rate_avg_user'], avg_user_df['rate_avg_user_class'])
avg_genre_analyzer = Analyzer(avg_genre_df['rate'], avg_genre_df['rate_avg_genre'], avg_genre_df['rate_avg_genre_class'])
avg_actor_analyzer = Analyzer(avg_actor_df['rate'], avg_actor_df['rate_avg_actor'], avg_actor_df['rate_avg_actor_class'])

In [17]:
random_analyzer.analyze_error()

MAE: 4.361968570800101
MSE: 27.143180688332453
RMSE: 5.2099117735651195
MAPE: 0.6170737387712044


In [18]:
random_analyzer.analyze_confusion_matrix()

Confusion Matrix :
[[   0    0    0    0    0    0    0    0    0    0    0]
 [ 178  418  414  379  432  424  474  468  483  520  264]
 [  32   50   49   51   57   46   50   55   53   70   33]
 [  14   47   42   40   41   48   44   63   60   74   40]
 [  22   55   60   67   53   73   71   78   76   93   63]
 [  53   96  122  120  113  127  109  143  170  202  105]
 [  85  175  190  172  207  207  224  262  261  328  205]
 [ 160  278  311  324  345  366  426  486  595  617  326]
 [ 349  663  609  709  727  767  878  872 1013 1096  610]
 [ 475  975 1033  971 1080 1102 1157 1271 1307 1433  728]
 [3743 7430 7815 7832 8203 8268 8614 8873 9076 9165 4784]]
Accuracy: 0.0732579270053158
Precision: [0.         0.04103269 0.0046031  0.00375059 0.00470776 0.01111306
 0.01859384 0.03866041 0.07736368 0.10538314 0.66834311]
Recall: [       nan 0.09384823 0.08974359 0.07797271 0.0745429  0.09338235
 0.09671848 0.11478507 0.12215121 0.12426292 0.05708626]
F1: [       nan 0.05709992 0.00875704 0.007156

In [19]:
avg_movie_analyzer.analyze_error()

MAE: 1.255901668956898
MSE: 3.952888516210945
RMSE: 1.9881872437501817
MAPE: 0.42152373306057334


In [20]:
avg_movie_analyzer.analyze_confusion_matrix()

Confusion Matrix :
[[    0     1     1    11    68    56   316   406  3570     4]
 [    0     0     0     4     9    13    27   117   321     0]
 [    0     0     0     0    11    13    47    88   283     0]
 [    0     0     0     3     5    14    45   166   424     0]
 [    0     0     1     2     8    25    70   256   846     0]
 [    0     0     0     0    16    27   105   484  1456     5]
 [    0     0     0     2    14    31   146   746  2903     7]
 [    0     0     0     0    14    31   238  1357  6288    17]
 [    0     0     0     1    14    22   151  1416  9957    40]
 [    0     0     0     7    41   114   878  5497 78155   351]]
Accuracy: 0.10061819602248603
Precision: [       nan 0.         0.         0.1        0.04       0.07803468
 0.07217004 0.12883319 0.09555387 0.82783019]
Recall: [0.         0.         0.         0.00456621 0.00662252 0.01290014
 0.03793193 0.17079924 0.85828808 0.00412732]
F1: [       nan        nan        nan 0.00873362 0.01136364 0.02214022
 0.0

  self.f1 = 2 * (self.precision * self.recall) / (self.precision + self.recall)


In [21]:
avg_user_analyzer.analyze_error()

MAE: 0.6168470944568313
MSE: 1.7458033564580981
RMSE: 1.3212885212769003
MAPE: 0.18796969777773848


In [22]:
avg_user_analyzer.analyze_confusion_matrix()

Confusion Matrix :
[[ 1842   101   180   225   163   538   420   499   456    10]
 [    2   163     9    21    39    78    68    88    66     2]
 [    0     5   148     8    30    76    62    83    75     6]
 [    0     6     3   205    33    87   122   141    91     5]
 [    0     8    11    14   354   139   202   352   207    22]
 [    0     0     6    22    33   601   408   621   495    53]
 [    0     0     6    19    41   194  1238  1391  1058   127]
 [    0     0    14    16    37   167   638  3734  2896   630]
 [    0     0     1    11    55    97   413  1801  6567  2591]
 [    4     8    34    91    88   649  1156  4181 14421 63684]]
Accuracy: 0.6669044343676228
Precision: [0.99675325 0.56013746 0.3592233  0.32436709 0.40549828 0.22886519
 0.26189972 0.28965945 0.24939237 0.94866677]
Recall: [0.41542625 0.30410448 0.30020284 0.2958153  0.27043545 0.2684234
 0.30387825 0.45917364 0.56926144 0.75530148]
F1: [0.58643744 0.39419589 0.32707182 0.30943396 0.32447296 0.24707091
 0.281

In [23]:
avg_genre_analyzer.analyze_error()

MAE: 1.362633892775186
MSE: 4.2375135853945505
RMSE: 2.0585221848196222
MAPE: 0.4528222011236738


In [24]:
avg_genre_analyzer.analyze_confusion_matrix()

Confusion Matrix :
[[     0      0      0      0      0      1      6      0   9851      0]
 [     0      0      0      0      0      0      6      0   1060      0]
 [     0      0      0      0      0      1      6      0    909      0]
 [     0      0      0      0      0      0      0      0   1354      0]
 [     0      0      0      0      0      1     12      0   2459      0]
 [     0      0      0      0      0      1      2      0   4004      0]
 [     0      0      0      0      0      0      4      0   7172      0]
 [     0      0      0      0      0      2      2      2  14895      0]
 [     0      0      0      0      0      0      2      0  22249      0]
 [     0      0      0      0      0      2     22      3 184171      0]]
Accuracy: 0.08966998255432133
Precision: [       nan        nan        nan        nan        nan 0.125
 0.06451613 0.4        0.08966888        nan]
Recall: [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 2.49563264e-04 5

In [25]:
avg_actor_analyzer.analyze_error()

MAE: 1.2355143346378739
MSE: 3.8636906313595714
RMSE: 1.9656272869899754
MAPE: 0.4157085561502365


In [26]:
avg_actor_analyzer.analyze_confusion_matrix()

Confusion Matrix :
[[     0      0      2     11     72     84    424    778   9191      2]
 [     0      0      0      6      3     11     34    182    822      0]
 [     0      0      0      0      2     10     46    170    711      0]
 [     0      0      0      2      0     11     43    294   1099      0]
 [     0      0      2      0      8     33     65    397   2015      0]
 [     0      0      0      0     25     31     78    745   3176     18]
 [     0      0      0      6     14     32    123   1050   6126      8]
 [     0      0      0      0     16     30    236   2303  13723     28]
 [     0      0      0      0     12     33    142   2449  22847     98]
 [     0      0      1      9     28    165   1025  10656 203254    782]]
Accuracy: 0.09130892690317322
Precision: [       nan        nan 0.         0.05882353 0.04444444 0.07045455
 0.05550542 0.12105761 0.08688262 0.83547009]
Recall: [0.         0.         0.         0.00138026 0.0031746  0.0076111
 0.01671423 0.14097698

  self.f1 = 2 * (self.precision * self.recall) / (self.precision + self.recall)


In [28]:
class AvgMultiRecommender(BaseRecommender):
    def __init__(self, movies_df, rates_df, genres_df, castings_df, weights=(1, 1, 1, 1)):
        super().__init__(movies_df, rates_df, genres_df, castings_df)
        self.weights = weights

    def run(self, n_items=10):
        df = self.rates_df.copy()

        # avg_movie
        df['rate_avg_movie'] = df.groupby('movie')['rate'].transform('mean')

        # avg_user
        df['rate_avg_user'] = df.groupby('user')['rate'].transform('mean')

        # avg_genre
        df = pd.merge(df, self.genres_df, on='movie', how='left')
        genre_avg = df.groupby('genre')['rate'].mean()
        movie_genre = self.genres_df.groupby('movie')['genre'].apply(list)
        movie_genre_rating = {}
        for movie, g_list in movie_genre.items():
            valid_avg = [genre_avg[g] for g in g_list if g in genre_avg]
            movie_genre_rating[movie] = np.mean(valid_avg) if valid_avg else np.nan
        df['rate_avg_genre'] = df['movie'].apply(lambda m: movie_genre_rating.get(m, np.nan))
        df = df.dropna(subset=['rate_avg_genre'])

        # avg_actor
        df = pd.merge(df, self.castings_df, on='movie', how='left')
        df = df[df['leading'] == 1]
        df['rate_avg_actor'] = df.groupby('people')['rate'].transform('mean')

        # avg_multi
        df['rate_avg_multi'] = (
            self.weights[0] * df['rate_avg_movie'] +
            self.weights[1] * df['rate_avg_user'] +
            self.weights[2] * df['rate_avg_genre'] +
            self.weights[3] * df['rate_avg_actor']
        ) / sum(self.weights)

        top_n = self.get_top_n(df, 'rate_avg_multi', n_items)

        return top_n


In [29]:
avg_multi_df = AvgMultiRecommender(movies_df, rates_df, genres_df, castings_df, weights=(1, 1, 1, 1)).run(20)
avg_multi_df.head()

  df.groupby('user', group_keys=False)


Unnamed: 0,user,movie,rate,time,rate_avg_movie,rate_avg_user,genre,rate_avg_genre,people,order,leading,rate_avg_actor,rate_avg_multi,rate_avg_multi_class
0,0,10200,10,1483582800,9.434136,7.388235,SF,8.980535,99,1,1,9.318143,8.780263,9.0
1,0,10200,10,1483582800,9.434136,7.388235,액션,8.980535,99,1,1,9.318143,8.780263,9.0
2,0,10200,10,1483582800,9.434136,7.388235,스릴러,8.980535,99,1,1,9.318143,8.780263,9.0
3,0,10048,7,1394114760,9.378706,7.388235,드라마,8.965973,1349,1,1,9.338617,8.767883,9.0
4,0,10038,9,1495625940,9.343516,7.388235,SF,8.943981,7049,3,1,9.343516,8.754812,9.0


In [30]:
avg_multi_analyzer = Analyzer(avg_multi_df['rate'], avg_multi_df['rate_avg_multi'], avg_multi_df['rate_avg_multi_class'])
avg_multi_analyzer.analyze_error()

MAE: 1.01168918780073
MSE: 2.567496611301981
RMSE: 1.6023409784755493
MAPE: 0.34056344349677065


In [31]:
avg_multi_analyzer.analyze_confusion_matrix()

Confusion Matrix :
[[     0      0      0     16    136   1436  10837   4366   3269      0]
 [     0      0      0      0     18     97   1077    450    265      0]
 [     0      0      0      0      0     65    583    698    253      0]
 [     0      0      0      0      6      9    466   1409    538      0]
 [     0      0      0      2      2     27    476   2725   1195      0]
 [     0      0      0      0      0     42    209   3776   2088      0]
 [     0      0      0      0      0      4    156   3796   7264      0]
 [     0      0      0      0      0      0     98   4147  21278      0]
 [     0      0      0      0      0      0     50   1977  39431     22]
 [     0      0      0      0      0      4    201   7829 411867    596]]
Accuracy: 0.08290238689524265
Precision: [       nan        nan        nan 0.         0.01234568 0.02494062
 0.0110224  0.13303179 0.08089273 0.96440129]
Recall: [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 4.51773210e-04 6.86835650e

  self.f1 = 2 * (self.precision * self.recall) / (self.precision + self.recall)


In [32]:
avg_multi_df = AvgMultiRecommender(movies_df, rates_df, genres_df, castings_df, weights=(1, 7, 1, 1)).run(20)
avg_multi_df.head()

  df.groupby('user', group_keys=False)


Unnamed: 0,user,movie,rate,time,rate_avg_movie,rate_avg_user,genre,rate_avg_genre,people,order,leading,rate_avg_actor,rate_avg_multi,rate_avg_multi_class
0,0,10200,10,1483582800,9.434136,7.388235,SF,8.980535,99,1,1,9.318143,7.945046,8.0
1,0,10200,10,1483582800,9.434136,7.388235,액션,8.980535,99,1,1,9.318143,7.945046,8.0
2,0,10200,10,1483582800,9.434136,7.388235,스릴러,8.980535,99,1,1,9.318143,7.945046,8.0
3,0,10048,7,1394114760,9.378706,7.388235,드라마,8.965973,1349,1,1,9.338617,7.940094,8.0
4,0,10038,9,1495625940,9.343516,7.388235,SF,8.943981,7049,3,1,9.343516,7.934866,8.0


In [33]:
avg_multi_analyzer = Analyzer(avg_multi_df['rate'], avg_multi_df['rate_avg_multi'], avg_multi_df['rate_avg_multi_class'])
avg_multi_analyzer.analyze_error()

MAE: 0.6201643636429537
MSE: 1.2702337352117594
RMSE: 1.127046465418245
MAPE: 0.2046322787991987


In [34]:
avg_multi_analyzer.analyze_confusion_matrix()

Confusion Matrix :
[[     0     18  11502    444    911   1508   2560   1894   1207     16]
 [     0      0     23   1101     66    145    277    217     78      0]
 [     0      0      0    124    929     74    237    122    113      0]
 [     0      0      0     27    779    661    366    438    157      0]
 [     0      0      0     34    112   2313    656    873    429     10]
 [     0      0      0      0    101    275   3430   1522    762     25]
 [     0      0      0      0     49    193   1750   6150   3054     24]
 [     0      0      0      0     28    210    838  15054   9242    151]
 [     0      0      0      0     18    225    627   3470  34833   2307]
 [     0      0      0     36    156    747   3916  11523  60483 343636]]
Accuracy: 0.7392481354716248
Precision: [       nan 0.         0.         0.01528879 0.03556685 0.04330027
 0.11939688 0.36483048 0.31563638 0.99268276]
Recall: [0.         0.         0.         0.01112026 0.0252993  0.04497138
 0.15597148 0.58982095

  self.f1 = 2 * (self.precision * self.recall) / (self.precision + self.recall)
