# 수행목표
- 랜덤 추천의 결과를 통해 분류 평가를 계산하는 기능을 개발한다.

# 수행단계
- 기존 `rate_random`은 그대로 두고 `rate_random_class` 필드를 추가해서 랜덤 평점을 반올림한 값을 업데이트 한다.
- 아래와 같은 평가 계산을 Analyzer class에 추가한다.
    - 혼동 행렬 (`Confusion Matrix`)을 생성한다.
    - 정확도 (`Accuracy`)를 계산한다.
    - 정밀도 (`Precision`)를 계산한다.
    - 재현율 (`Recall`)을 계산한다.
    - F1 점수 (`F1 Score`)를 계산한다.
- 결과 데이터 평점 높은 순 n개를 출력하고 그 아래에 분석 결과를 출력한다.


In [20]:
# Library

import os
import sys
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

mpl.rcParams['font.family'] = 'AppleGothic'
mpl.rcParams['axes.unicode_minus'] = False

In [21]:
class MovieDataLoader:
    def __init__(self, file_path):
        self.file_path = file_path
        movie_path = os.path.join(self.file_path, 'movies.txt')
        rate_path = os.path.join(self.file_path, 'rates.csv')
        self.movies = pd.read_csv(movie_path, sep='\t')
        self.rates = pd.read_csv(rate_path)

    def load(self):
        self._preprocess()

        return self.movies, self.rates

    def _preprocess(self):
        self.movies.dropna(subset=['title_eng'], inplace=True)

        if self.movies['year'].isnull().sum() > 0:
            non_year = self.movies[self.movies['year'].isnull()]
            for row in non_year.iterrows():
                movie = row[1]['movie']
                title_eng = row[1]['title_eng'].split(' , ')[:-1]
                title_eng = ' , '.join(title_eng)
                year = row[1]['title_eng'].split(' , ')[-1]
                self.movies.loc[self.movies['movie'] == movie, 'title_eng'] = title_eng
                self.movies.loc[self.movies['movie'] == movie, 'year'] = year

        if self.movies['grade'].isnull().sum() > 0:
            self.movies['grade'] = self.movies['grade'].fillna('NR grade')



In [22]:
movies_df, rates_df = MovieDataLoader('data/kmrd/').load()

movies_df.info()
rates_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 991 entries, 0 to 998
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   movie      991 non-null    int64 
 1   title      991 non-null    object
 2   title_eng  991 non-null    object
 3   year       991 non-null    object
 4   grade      991 non-null    object
dtypes: int64(1), object(4)
memory usage: 46.5+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140710 entries, 0 to 140709
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype
---  ------  --------------   -----
 0   user    140710 non-null  int64
 1   movie   140710 non-null  int64
 2   rate    140710 non-null  int64
 3   time    140710 non-null  int64
dtypes: int64(4)
memory usage: 4.3 MB


  self.movies.loc[self.movies['movie'] == movie, 'year'] = year


In [23]:
# 결과에 random 평점 field(rate_random)를 추가하고 random 값을 반영해서 결과를 return 한다.
# 결과에 random 평점 field(rate_random)를 추가하고 random 값을 반영해서 결과를 return 한다.

class RandomRecommender:
    def __init__(self, movies_df, rates_df):
        self.movies_df = movies_df
        self.rates_df = rates_df

    def run(self, n_items=10):
        df = self.rates_df.copy()
        df['rate_random'] = np.random.rand(len(df)) * 10

        top_n = (
            df.groupby('user', group_keys=True)
            .apply(lambda x: x.nlargest(n_items, 'rate_random'))
        )

        top_n = top_n.reset_index(drop=True)
        top_n['rate_random_class'] = top_n['rate_random'].apply(lambda x: np.round(x))
        return top_n

    def run_for_user(self, user_id, n_items=10):
        user_rates = self.rates_df[self.rates_df['user'] == user_id].copy()
        rate_random = np.random.rand(len(user_rates)) * 10
        user_rates['rate_random'] = rate_random
        user_rates = user_rates.sort_values(by='rate_random', ascending=False)
        user_rates = user_rates.head(n_items)
        user_rates['rate_random_class'] = user_rates['rate_random'].apply(lambda x: np.round(x))


        return user_rates

In [25]:
result = RandomRecommender(movies_df, rates_df).run(10)
result

  df.groupby('user', group_keys=True)


Unnamed: 0,user,movie,rate,time,rate_random,rate_random_class
0,0,10215,7,1452407820,9.994730,10.0
1,0,10349,6,1489312440,9.902076,10.0
2,0,10021,9,1424497980,9.856270,10.0
3,0,10395,8,1452179460,9.652395,10.0
4,0,10701,7,1423288920,9.546071,10.0
...,...,...,...,...,...,...
104703,52023,10998,10,1421679660,1.692361,2.0
104704,52024,10998,10,1204622460,1.825016,2.0
104705,52025,10998,7,1498546920,2.662273,3.0
104706,52026,10998,9,1434090420,9.395504,9.0


In [32]:
class Analyzer:
    def __init__(self, result_df):
        self.result_df = result_df

    def analyze_error(self):
        self._analyze_mae()
        self._analyze_mse()
        self._analyze_rmse()
        self._analyze_mape()

    def analyze_confusion_matrix(self):
        self._make_confusion_matrix()
        self._analyze_accuracy()
        self._analyze_precision()
        self._analyze_recall()
        self._analyze_f1()

    def _analyze_mae(self):
        self.mae = np.abs(self.result_df['rate'] - self.result_df['rate_random']).mean()
        print(f'MAE: {self.mae}')

    def _analyze_mse(self):
        self.mse = ((self.result_df['rate'] - self.result_df['rate_random']) ** 2).mean()
        print(f'MSE: {self.mse}')

    def _analyze_rmse(self):
        self.rmse = np.sqrt(self.mse)
        print(f'RMSE: {self.rmse}')

    def _analyze_mape(self):
        self.mape = (np.abs(self.result_df['rate'] - self.result_df['rate_random']) / self.result_df['rate']).mean()
        print(f'MAPE: {self.mape}')

    def _make_confusion_matrix(self):
        y_true = self.result_df['rate']
        y_pred = self.result_df['rate_random_class']
        unique_labels = np.unique(np.concatenate((y_true, y_pred)))
        self.confusion_matrix = np.zeros((len(unique_labels), len(unique_labels)), dtype=int)

        label_to_index = {label: index for index, label in enumerate(unique_labels)}
        for true, pred in zip(y_true, y_pred):
            self.confusion_matrix[label_to_index[true]][label_to_index[pred]] += 1

        print('Confusion Matrix :')
        print(self.confusion_matrix)

    def _analyze_accuracy(self):
        self.accuracy = np.diag(self.confusion_matrix).sum() / self.confusion_matrix.sum()
        print(f'Accuracy: {self.accuracy}')

    def _analyze_precision(self):
        with np.errstate(divide='ignore', invalid='ignore'):
            try:
                self.precision = np.diag(self.confusion_matrix) / self.confusion_matrix.sum(axis=0)
            except:
                self.precision = np.nan
        print(f'Precision: {self.precision}')

    def _analyze_recall(self):
        with np.errstate(divide='ignore', invalid='ignore'):
            try:
                self.recall = np.diag(self.confusion_matrix) / self.confusion_matrix.sum(axis=1)
            except:
                self.recall = np.nan
        print(f'Recall: {self.recall}')

    def _analyze_f1(self):
        self.f1 = 2 * (self.precision * self.recall) / (self.precision + self.recall)
        print(f'F1: {self.f1}')


In [33]:
analyzer = Analyzer(result)

analyzer.analyze_confusion_matrix()

Confusion Matrix :
[[   0    0    0    0    0    0    0    0    0    0    0]
 [ 134  353  339  384  423  401  396  416  435  456  278]
 [  16   42   38   44   34   31   53   47   63   75   35]
 [  17   40   28   41   40   27   42   40   54   59   41]
 [  20   46   53   45   61   46   57   62   85   94   54]
 [  36   87   93   87   97   95  105  118  136  163  103]
 [  60  113  148  137  164  160  180  200  232  286  169]
 [ 109  246  247  237  267  274  317  342  441  536  295]
 [ 215  476  538  545  586  585  646  755  860 1031  549]
 [ 398  808  820  813  845  982  932 1035 1141 1343  736]
 [3308 6589 6750 7174 7196 7346 7668 8156 8426 8960 4671]]
Accuracy: 0.07625014325552966
Precision: [0.         0.04011364 0.00419704 0.00431261 0.00628024 0.00955062
 0.01731435 0.03061499 0.07243325 0.10328386 0.67392873]
Recall: [       nan 0.0879203  0.07949791 0.0955711  0.09791332 0.08482143
 0.09734992 0.10329206 0.12673151 0.13630366 0.06126384]
F1: [       nan 0.05509169 0.00797314 0.00825