# 수행목표
- 랜덤 추천의 결과를 통해 회귀 평가를 계산하는 기능을 개발한다.

# 수행단계
- RandomRecommender.run()에서 return한 결과를 통해 random 평점이 결과(rate)와 얼마나 비슷한지 확인하는 코드를 만든다.
- 결과를 분석하는 class 명을 `Analyzer`로 하고 결과 데이터를 입력하여 분석 결과를 return 한다.
- 평균 절대 오차 (Mean Absolute Error - `MAE`)를 계산한다.
- 평균 제곱 오차 (Mean Squared Error - `MSE`)를 계산한다.
- 평균 제곱근 오차 (Root Mean Squared Error - `RMSE`)를 계산한다.
- 평균 절대 비율 오차 (Mean Absolute Percentage Error - `MAPE`)를 계산한다.
- 결과 데이터 평점 높은 순 n개를 출력하고 그 아래에 분석 결과를 출력한다.


In [1]:
# Library

import os
import sys
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

mpl.rcParams['font.family'] = 'AppleGothic'
mpl.rcParams['axes.unicode_minus'] = False

In [2]:
class MovieDataLoader:
    def __init__(self, file_path):
        self.file_path = file_path
        movie_path = os.path.join(self.file_path, 'movies.txt')
        rate_path = os.path.join(self.file_path, 'rates.csv')
        self.movies = pd.read_csv(movie_path, sep='\t')
        self.rates = pd.read_csv(rate_path)

    def load(self):
        self._preprocess()

        return self.movies, self.rates

    def _preprocess(self):
        self.movies.dropna(subset=['title_eng'], inplace=True)

        if self.movies['year'].isnull().sum() > 0:
            non_year = self.movies[self.movies['year'].isnull()]
            for row in non_year.iterrows():
                movie = row[1]['movie']
                title_eng = row[1]['title_eng'].split(' , ')[:-1]
                title_eng = ' , '.join(title_eng)
                year = row[1]['title_eng'].split(' , ')[-1]
                self.movies.loc[self.movies['movie'] == movie, 'title_eng'] = title_eng
                self.movies.loc[self.movies['movie'] == movie, 'year'] = year

        if self.movies['grade'].isnull().sum() > 0:
            self.movies['grade'] = self.movies['grade'].fillna('NR grade')



In [3]:
movies_df, rates_df = MovieDataLoader('data/kmrd/').load()

movies_df.info()
rates_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 991 entries, 0 to 998
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   movie      991 non-null    int64 
 1   title      991 non-null    object
 2   title_eng  991 non-null    object
 3   year       991 non-null    object
 4   grade      991 non-null    object
dtypes: int64(1), object(4)
memory usage: 46.5+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140710 entries, 0 to 140709
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype
---  ------  --------------   -----
 0   user    140710 non-null  int64
 1   movie   140710 non-null  int64
 2   rate    140710 non-null  int64
 3   time    140710 non-null  int64
dtypes: int64(4)
memory usage: 4.3 MB


  self.movies.loc[self.movies['movie'] == movie, 'year'] = year


In [4]:
# 결과에 random 평점 field(rate_random)를 추가하고 random 값을 반영해서 결과를 return 한다.

class RandomRecommender:
    def __init__(self, movies_df, rates_df):
        self.movies_df = movies_df
        self.rates_df = rates_df

    def run(self, user_id, n_items=10):
        user_rates = self.rates_df[self.rates_df['user'] == user_id].copy()
        rate_random = np.random.rand(len(user_rates)) * 10
        user_rates['rate_random'] = rate_random
        user_rates = user_rates.sort_values(by='rate_random', ascending=False)
        user_rates = user_rates.head(n_items)

        return user_rates

In [5]:
result = RandomRecommender(movies_df, rates_df).run(0, 20)
result

Unnamed: 0,user,movie,rate,time,rate_random
54,0,10433,7,1496214240,9.766687
21,0,10073,7,1421845740,9.751335
24,0,10095,6,1423398360,9.722403
77,0,10813,8,1501673820,9.690677
41,0,10217,8,1423819860,9.49957
69,0,10629,8,1429710180,9.157018
5,0,10023,7,1428738480,8.826976
10,0,10038,9,1495625940,8.513086
35,0,10173,8,1451450460,8.47417
74,0,10746,7,1442579400,8.32582


In [6]:
class Analyzer:
    def __init__(self, result_df):
        self.result_df = result_df

    def analyze(self):
        self._analyze_mae()
        self._analyze_mse()
        self._analyze_rmse()
        self._analyze_mape()

    def _analyze_mae(self):
        self.mae = np.abs(self.result_df['rate'] - self.result_df['rate_random']).mean()
        print(f'MAE: {self.mae}')

    def _analyze_mse(self):
        self.mse = ((self.result_df['rate'] - self.result_df['rate_random']) ** 2).mean()
        print(f'MSE: {self.mse}')

    def _analyze_rmse(self):
        self.rmse = np.sqrt(self.mse)
        print(f'RMSE: {self.rmse}')

    def _analyze_mape(self):
        self.mape = (np.abs(self.result_df['rate'] - self.result_df['rate_random']) / self.result_df['rate']).mean()
        print(f'MAPE: {self.mape}')


In [7]:
result

Unnamed: 0,user,movie,rate,time,rate_random
54,0,10433,7,1496214240,9.766687
21,0,10073,7,1421845740,9.751335
24,0,10095,6,1423398360,9.722403
77,0,10813,8,1501673820,9.690677
41,0,10217,8,1423819860,9.49957
69,0,10629,8,1429710180,9.157018
5,0,10023,7,1428738480,8.826976
10,0,10038,9,1495625940,8.513086
35,0,10173,8,1451450460,8.47417
74,0,10746,7,1442579400,8.32582


In [8]:
analyzer = Analyzer(result)

analyzer.analyze()

MAE: 1.6306536038331345
MSE: 3.7854329713989308
RMSE: 1.9456189173111293
MAPE: 0.24064244656914355
