# 기본적인 추천 시스템
1. "MovieLens" 데이터 사용
2. 영화 1점 ~ 5점 평가
3. MovieLens 100,000개, 2,000만개 사용

## 2.1 데이터 읽기

In [2]:
import os
import pandas as pd

base_src = 'D:/개인화추천/'
u_user_src = os.path.join(base_src, 'u.user')
u_cols = ['user_id','age','sex','occupation','zip_code']
users = pd.read_csv(u_user_src, sep='|', names=u_cols, encoding='latin-1')
users = users.set_index('user_id')
users.head()

Unnamed: 0_level_0,age,sex,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067
4,24,M,technician,43537
5,33,F,other,15213


In [3]:
u_item_src = os.path.join(base_src,'u.item')
i_cols = ['movie_id','title','release','date','video release date',
'IMDB','URL','unknown','Action','Adventure','Animation',
'Children\'s','Comedy','Crime','Documentary'
,'Drama','Fantasy','Film-Noir','Horror','Musical','Mystery','Romance'
,'Sci-Fi','Thriller','War','Western']
movies = pd.read_csv(u_item_src, sep='|', names=i_cols, encoding='latin-1')
movies = movies.set_index('movie_id')
movies.head()

Unnamed: 0_level_0,title,release,date,video release date,IMDB,URL,unknown,Action,Adventure,Animation,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,,
2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,...,0,0,0,0,0,1,0,0,,
3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,,
4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,,
5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,,


In [4]:
u_data_src = os.path.join(base_src, 'u.data')
r_cols = ['user_id','movie_id','rating','timestamp']
ratings = pd.read_csv(u_data_src, sep='\t', names=r_cols, encoding='latin-1')
ratings = ratings.set_index('user_id')
ratings.head()

Unnamed: 0_level_0,movie_id,rating,timestamp
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
196,242,3,881250949
186,302,3,891717742
22,377,1,878887116
244,51,2,880606923
166,346,1,886397596


## 2.2 인기 제품 방식 추천
- 각 제품의 평균을 구해서 가장 높은 평점의 제품을 추천

In [5]:
def recom_movie(n_items):
    movie_mean = ratings.groupby(['movie_id'])['rating'].mean()
    # sort할 떄, n_items의 개수만큼 뽑아내겠다. -> [:n_items]
    movie_sort = movie_mean.sort_values(ascending=False)[:n_items]
    # movie_sort에서는 movie에 대한 이름을 가져올 수 없음 
    # 따라서 index를 가져옴
    recom_movies = movies.loc[movie_sort.index]
    # index를 가져와서 title만 다시 또 가져옴
    recommendation = recom_movies['title']
    return recommendation

recom_movie(5)

movie_id
1293                                      Star Kid (1997)
1467                 Saint of Fort Washington, The (1993)
1653    Entertaining Angels: The Dorothy Day Story (1996)
814                         Great Day in Harlem, A (1994)
1122                       They Made Me a Criminal (1939)
Name: title, dtype: object

## 2.3 추천시스템의 정확도 측정
- 추천시스템의 성능 = '정확성'!!
- 예측 선호도와 실제 선호도의 차이가 없어야 됌
- 추천시스템도 훈련 데이터와 테스트 데이터로 분리함
- 정확도 측정 방법 : RMSE

In [6]:
# 100k 영화 평점에 대해서 실제값과 best-seller 방식으로 구한 예측값의 RMSE룰 계산
import numpy as np

def RMSE(y,y_pred):
    return np.sqrt(np.mean((np.array(y) - np.array(y_pred))**2))

# 정확도 계산
rmse = []
movie_mean = ratings.groupby(["movie_id"])["rating"].mean()

for user in set(ratings.index):
    y = ratings.loc[user]["rating"]
    # best-seller 방식으로 (movie_id에 대한 예측값이 나옴)
    y_pred = movie_mean[ratings.loc[user]['movie_id']]
    accuracy = RMSE(y,y_pred)
    rmse.append(accuracy)
    
# RMSE 계산
print(np.mean(rmse))

0.996007224010567


### 2.4 사용자 집단별 추천
- 집단을 나누기 위한 변수 설정
(남자 or 여자)

In [9]:
import os
import pandas as pd
import numpy as np

base_src = 'D:/개인화추천/'
u_user_src = os.path.join(base_src, 'u.user')
u_cols = ['user_id','age','sex','occupation','zip_code']
users = pd.read_csv(u_user_src, sep='|', names=u_cols, encoding='latin-1')

u_item_src = os.path.join(base_src,'u.item')
i_cols = ['movie_id','title','release','date','video release date',
'IMDB','URL','unknown','Action','Adventure','Animation',
'Children\'s','Comedy','Crime','Documentary'
,'Drama','Fantasy','Film-Noir','Horror','Musical','Mystery','Romance'
,'Sci-Fi','Thriller','War','Western']
movies = pd.read_csv(u_item_src, sep='|', names=i_cols, encoding='latin-1')

u_data_src = os.path.join(base_src, 'u.data')
r_cols = ['user_id','movie_id','rating','timestamp']
ratings = pd.read_csv(u_data_src, sep='\t', names=r_cols, encoding='latin-1')

In [10]:
# ratings DataFrame에서 timestamp 제거
ratings = ratings.drop("timestamp",axis=1)
movies = movies[["movie_id","title"]]

In [11]:
# train, test set 분리
from sklearn.model_selection import train_test_split
x = ratings.copy()
y = ratings["user_id"]

# stratify -> 계층화 추출 (원천 데이터 y 환경을 똑같이 train, test 셋에 반영)
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.25,stratify = y)

In [12]:
## 정확도(RMSE) 계산
def RMSE(y,y_pred):
    return np.sqrt(np.mean((np.array(y) - np.array(y_pred))**2))

In [13]:
# 모델별 RMSE 계산
def score(model):
    id_pairs = zip(x_test["user_id"],x_test["movie_id"])
    y_pred = np.array([model(user,movie) for (user,movie) in id_pairs])
    y = np.array(x_test["rating"])
    return RMSE(y,y_pred)

In [14]:
# best_seller 함수로 정확도 계산
train_mean = x_train.groupby(["movie_id"])["rating"].mean()
def best_seller(user_id,movie_id):
    try:
        rating = train_mean[movie_id]
    except:
        rating = 3.0
    return rating

score(best_seller)

1.0296149100165848

In [22]:
# 성별에 따른 예측값 계산
users = users.reset_index()
merged_ratings = x_train.merge(users, how = 'inner', on ="user_id")

users = users.set_index('user_id')

g_mean = merged_ratings[["movie_id","sex","rating"]].groupby(["movie_id","sex"])["rating"].mean()
g_mean

movie_id  sex
1         F      3.766667
          M      3.888889
2         F      3.294118
          M      3.197531
3         F      2.642857
                   ...   
1678      M      1.000000
1679      M      3.000000
1680      M      2.000000
1681      M      3.000000
1682      M      3.000000
Name: rating, Length: 3037, dtype: float64

In [25]:
# full matrix 형태
rating_matrix = x_train.pivot(index="user_id",
                             columns = "movie_id",
                             values = "rating")
rating_matrix

movie_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,3.0,4.0,,3.0,5.0,,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,3.0,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,5.0,,...,,,,,,,,,,
940,,,,2.0,,,,5.0,3.0,,...,,,,,,,,,,
941,5.0,,,,,,,,,,...,,,,,,,,,,
942,,,,,,,,,,,...,,,,,,,,,,


In [26]:
# Gender 기준 추천
def cf_gender(user_id,movie_id):
    if movie_id in rating_matrix.columns:
        gender = users.loc[user_id]["sex"]
        if gender in g_mean[movie_id].index:
            gender_rating = g_mean[movie_id][gender]
        else:
            gender_rating = 3.0
    else:
        gender_rating = 3.0
    return gender_rating

score(cf_gender)

1.0397810399982537