# 2.1 데이터 읽기
- MovieLens 100K 데이터는 3가지 파일로 구성
  1. 사용자 데이터 : u.user
  2. 영화에 대한 데이터 : u.item
  3. 영화 평가 : u.data

In [32]:
import pandas as pd

In [33]:
# Load the user file (u.user) into a DataFrame indexed by user_id.
# The file is pipe-delimited and has no header row, so column names are supplied here.
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('u.user', sep='|', names=u_cols, encoding='latin-1').set_index('user_id')
users.head(2)

Unnamed: 0_level_0,age,sex,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043


In [34]:
# Load the movie file (u.item) into a DataFrame indexed by movie_id.
# The first five columns describe the movie; the remaining columns are 0/1 genre flags.
info_cols = ['movie_id', 'title', 'release date', 'video release date', 'IMDB URL']
genre_cols = [
    'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
i_cols = info_cols + genre_cols
movies = pd.read_csv('u.item', sep='|', names=i_cols, encoding='latin-1').set_index('movie_id')
movies.head(2)

Unnamed: 0_level_0,title,release date,video release date,IMDB URL,unknown,Action,Adventure,Animation,Children's,Comedy,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [35]:
# Load the rating file (u.data) into a DataFrame indexed by user_id.
# The file is tab-delimited and has no header row.
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('u.data', sep='\t', names=r_cols, encoding='latin-1').set_index('user_id')
ratings.head(2)

Unnamed: 0_level_0,movie_id,rating,timestamp
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
196,242,3,881250949
186,302,3,891717742


---
# 2.2 인기제품 방식

In [36]:
# Popularity ("best-seller") recommendation:
# used when little is known about individual users or only a simple
# recommendation is needed. Every user gets the same list: the movies
# with the highest mean rating.

def rec_movie(n_items):
    """Return the titles of the ``n_items`` movies with the highest mean rating.

    Reads the module-level ``ratings`` and ``movies`` frames; ``movies`` is
    looked up by movie_id label, so it must be indexed by movie_id.
    """
    mean_by_movie = ratings.groupby(['movie_id'])['rating'].mean()
    # Highest averages first, then keep the first n_items movie ids.
    top_ids = mean_by_movie.sort_values(ascending=False).iloc[:n_items].index
    return movies.loc[top_ids, 'title']

rec_movie(5)

movie_id
814                         Great Day in Harlem, A (1994)
1599                        Someone Else's America (1995)
1201           Marlene Dietrich: Shadow and Light (1996) 
1122                       They Made Me a Criminal (1939)
1653    Entertaining Angels: The Dorothy Day Story (1996)
Name: title, dtype: object

---
# 2.3 추천 시스템의 정확도 측정

In [37]:
# 영화 평점에 대해서 실제값과 best-seller 방식으로 구한 예측값의 RMSE를 계산
import numpy as np

def RMSE(y, y_pred):
    """Return the root-mean-square error between ``y`` and ``y_pred``.

    Both arguments are converted to NumPy arrays first, so any index labels
    on pandas inputs are ignored and values are compared position by position.
    """
    errors = np.asarray(y) - np.asarray(y_pred)
    return np.sqrt(np.mean(np.square(errors)))

In [38]:
# Accuracy of the best-seller baseline: one RMSE per user, then averaged.
movie_mean = ratings.groupby('movie_id')['rating'].mean()
rmse = []

for user in set(ratings.index):
    # All rows for this user (the index of ``ratings`` holds the user ids here).
    user_rows = ratings.loc[user]
    # Actual ratings this user gave.
    y = user_rows['rating']
    # Best-seller prediction: the overall mean rating of each of those movies.
    y_pred = movie_mean[user_rows['movie_id']]
    rmse.append(RMSE(y, y_pred))

print(np.mean(rmse))

0.996007224010567


---
# 2.4 사용자 집단별 추천

In [39]:
# 전체 사용자를 대상으로 best-seller를 구하면 실젯값과의 noise가 커질 것 
# 집단을 나누기 위한 변수를 설정하자 (여기서는 성별)
from sklearn.model_selection import train_test_split

# Move the id indexes back into ordinary columns so they can be used as
# merge / pivot / zip keys below.
users = users.reset_index()
movies = movies.reset_index()
ratings = ratings.reset_index()

# Split the ratings into train/test sets before looking for best-sellers.
# Stratifying on user_id keeps each user's ratings in both splits at the
# requested 80/20 ratio.
X = ratings.copy()
y = ratings['user_id']
# A fixed random_state makes the split, and therefore every RMSE reported
# below, reproducible from run to run; without it each execution shuffles
# differently and the printed scores cannot be compared.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Shared evaluation helper so that every model's RMSE is computed the same way.
def score(model):
    """Return the RMSE of ``model`` over the rows of ``X_test``.

    ``model`` is called as ``model(user_id, movie_id)`` for each test row and
    must return the predicted rating for that pair.
    """
    predicted = np.array([
        model(uid, mid)
        for uid, mid in zip(X_test['user_id'], X_test['movie_id'])
    ])
    actual = np.array(X_test['rating'])
    return RMSE(actual, predicted)

# Mean rating of each movie, computed from the training split only.
train_mean = X_train.groupby(['movie_id'])['rating'].mean()

def best_seller(user_id, movie_id):
    """Predict a rating as the movie's mean rating in the training split.

    ``user_id`` is accepted so the function matches the
    ``model(user_id, movie_id)`` call made by ``score``; it is not used.
    Returns 3.0 when ``movie_id`` has no entry in ``train_mean``.
    """
    try:
        return train_mean[movie_id]
    except KeyError:
        # Only a missing movie is an expected failure. Catching KeyError
        # alone lets real errors (typos, interrupts) surface instead of
        # silently becoming a 3.0 prediction.
        return 3.0

score(best_seller)

1.0258108908333214

In [40]:
# Mean training rating per (movie, gender).
# merge() without ``on`` joins on the columns the two frames share
# (presumably only user_id here, so ``users`` must still carry it as a column).
merged_ratings = X_train.merge(users)
# Index users by user_id for the per-user gender lookup later on.
users = users.set_index('user_id')
g_mean = merged_ratings.groupby(['movie_id', 'sex'])['rating'].mean()

# user x movie matrix of training ratings.
rating_matrix = X_train.pivot(index='user_id', columns='movie_id', values='rating')

In [41]:
rating_matrix.shape

(943, 1647)

In [42]:
# Gender-based recommendation
def cf_gender(user_id, movie_id):
    """Predict a rating as the movie's mean training rating among users of the same sex.

    Returns 3.0 when the movie is not a column of ``rating_matrix`` or when
    ``g_mean`` has no entry for the user's sex on that movie.
    """
    fallback = 3.0
    if movie_id not in rating_matrix.columns:
        return fallback

    gender = users.loc[user_id, 'sex']
    # Selecting on the first index level leaves a Series keyed by sex.
    by_gender = g_mean[movie_id]
    if gender not in by_gender.index:
        return fallback
    return by_gender[gender]

score(cf_gender)

1.0361820582655747

###### 성별을 나누어 추천했을 때 추천 성능에는 별다른 개선이 없었다. 