# 1. Collaborative Filtering

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [3]:
u_cols = ['user_id','age','sex','occupation','zip_code']
users = pd.read_csv(
    'u.user',
    sep='|',
    names=u_cols,
    encoding='latin-1'
)
users = users.set_index('user_id')

In [4]:
i_cols = ['movie_id','title','release date','video release date',
          'IMDB URL','unknown','Action','Adventure','Animation',
          'Children\'s','Comedy','Crime','Documentary','Drama','Fantasy',
          'Film-Noir','Horror','Musical','Mystery','Romance','Sci-Fi','Thriller',
          'War','Western']
movies = pd.read_csv(
    'u.item',
    sep='|',
    names=i_cols,
    encoding='latin-1'
)
movies = movies.set_index('movie_id')

In [5]:
r_cols = ['user_id','movie_id','rating','timestamp']
ratings = pd.read_csv(
    'u.data',
    sep='\t',
    names=r_cols,
    encoding='latin-1'
)

In [6]:
def RMSE(y, y_pred):
    return np.sqrt(np.mean((np.array(y) - np.array(y_pred)) ** 2 ))

In [7]:
def score(model):
    id_pairs = zip(X_test['user_id'], X_test['movie_id'])
    y_pred = np.array([model(user, movie) for (user, movie) in id_pairs])
    y = np.array(X_test['rating'])
    return RMSE(y, y_pred)

In [8]:
X = ratings.copy()
y = ratings['user_id']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)
ratings_matrix = X_train.pivot(
    index='user_id',
    columns='movie_id',
    values='rating'
)

In [9]:
# 코사인 유사도 계산하기 
from sklearn.metrics.pairwise import cosine_similarity

# NaN값 에러를 방지하기 위해 결측치 대치 
matrix_dummy = ratings_matrix.copy().fillna(0)
user_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
user_similarity = pd.DataFrame(
    user_similarity,
    index=ratings_matrix.index,
    columns=ratings_matrix.index
)

def CF_simple(user_id, movie_id):
    if movie_id in ratings_matrix.columns:
        sim_scores = user_similarity[user_id].copy()
        movie_ratings = ratings_matrix[movie_id].copy()
        none_rating_idx = movie_ratings[movie_ratings.isnull()].index
        movie_ratings = movie_ratings.dropna()
        sim_scores = sim_scores.drop(none_rating_idx)
        mean_rating = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
    else:
        mean_rating = 3.0
    return mean_rating

score(CF_simple)

1.0177463106968754

In [10]:
user_similarity

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.130780,0.053644,0.058227,0.294037,0.347905,0.342826,0.301662,0.058481,0.283839,...,0.302085,0.106329,0.216712,0.160011,0.161887,0.101127,0.252517,0.140626,0.141501,0.327871
2,0.130780,1.000000,0.059370,0.087744,0.091283,0.205126,0.087883,0.069980,0.131468,0.131102,...,0.151702,0.319292,0.288292,0.375486,0.251483,0.210715,0.214435,0.100997,0.119622,0.114557
3,0.053644,0.059370,1.000000,0.319858,0.027061,0.056553,0.040382,0.092827,0.022305,0.074243,...,0.027277,0.054172,0.176762,0.082120,0.110510,0.014523,0.098533,0.081214,0.056912,0.019287
4,0.058227,0.087744,0.319858,1.000000,0.000000,0.055113,0.073904,0.116686,0.058604,0.052360,...,0.014333,0.047443,0.093526,0.183072,0.111369,0.038159,0.144005,0.073160,0.156433,0.015203
5,0.294037,0.091283,0.027061,0.000000,1.000000,0.195833,0.314563,0.166586,0.072277,0.110081,...,0.261552,0.075230,0.089645,0.051344,0.140155,0.049943,0.206068,0.162656,0.148228,0.215557
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.101127,0.210715,0.014523,0.038159,0.049943,0.086856,0.064742,0.090440,0.049478,0.062409,...,0.044371,0.352484,0.221523,0.268496,0.340641,1.000000,0.087428,0.187359,0.037291,0.135198
940,0.252517,0.214435,0.098533,0.144005,0.206068,0.279776,0.265721,0.163255,0.075527,0.281681,...,0.247665,0.135874,0.132824,0.181259,0.156742,0.087428,1.000000,0.057619,0.220491,0.202459
941,0.140626,0.100997,0.081214,0.073160,0.162656,0.094456,0.044218,0.130045,0.079051,0.080876,...,0.059291,0.191987,0.233647,0.163688,0.331351,0.187359,0.057619,1.000000,0.026480,0.091597
942,0.141501,0.119622,0.056912,0.156433,0.148228,0.242891,0.235973,0.126701,0.084315,0.172234,...,0.208162,0.070834,0.047994,0.109331,0.094151,0.037291,0.220491,0.026480,1.000000,0.159573


# 2. 이웃을 고려한 CF

In [11]:
# 유사집단의 크기를 미리 정하기 위해 기존 score 함수에 인자를 추가 
def score(model, neighbor_size=0):
    id_pairs = zip(X_test['user_id'], X_test['movie_id'])
    y_pred = np.array([model(user, movie, neighbor_size) for (user, movie) in id_pairs])
    y = np.array(X_test['rating'])
    return RMSE(y, y_pred)

In [12]:
X = ratings.copy()
y = ratings['user_id']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)
rating_matrix = X_train.pivot(
    index='user_id',
    columns='movie_id',
    values='rating'
)

In [13]:
def CF_knn(user_id, movie_id, neighbor_size=0):
    if movie_id in rating_matrix.columns:
        sim_scores = user_similarity[user_id].copy()
        movie_ratings = rating_matrix[movie_id].copy()
        none_rating_idx = movie_ratings[movie_ratings.isnull()].index
        movie_ratings = movie_ratings.dropna()
        sim_scores = sim_scores.drop(none_rating_idx)
        
        if neighbor_size == 0:
            mean_rating = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
        else:
            if len(sim_scores) > 1:
                neighbor_size = min(neighbor_size, len(sim_scores))
                sim_scores = np.array(sim_scores)
                movie_ratings = np.array(movie_ratings)
                user_idx = np.argsort(sim_scores)
                sim_scores = sim_scores[user_idx][-neighbor_size:]
                movie_ratings = movie_ratings[user_idx][-neighbor_size:]
                mean_rating = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
            else:
                mean_rating = 3.0
                
    else:
        mean_rating = 3.0
        
    return mean_rating

score(CF_knn, neighbor_size=30)

1.0145902130335374

In [15]:
# 실제 주어진 사용자에 대한 추천 기능 구현 
rating_matrix = ratings.pivot_table(
    values='rating',
    index='user_id',
    columns = 'movie_id'
)
matrix_dummy = rating_matrix.copy().fillna(0)
user_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
user_similarity = pd.DataFrame(
    user_similarity, 
    index=rating_matrix.index, 
    columns=rating_matrix.index
)

def recom_movie(user_id, n_items, neighbor_size=30):
    user_movie = rating_matrix.loc[user_id].copy()
    
    for movie in rating_matrix.columns:
        if pd.notnull(user_movie.loc[movie]):
            user_movie.loc[movie] = 0
        else:
            user_movie.loc[movie] = CF_knn(user_id, movie, neighbor_size)
    
    movie_sort = user_movie.sort_values(ascending=False)[:n_items]
    recom_movies = movies.loc[movie_sort.index]
    recommendations = recom_movies['title']
    return recommendations

recom_movie(user_id=729, n_items=5, neighbor_size=30)

movie_id
1189                      Prefontaine (1997)
1293                         Star Kid (1997)
1467    Saint of Fort Washington, The (1993)
1500               Santa with Muscles (1996)
22                         Braveheart (1995)
Name: title, dtype: object

# 3. 최적의 neightbor size 결정

In [17]:
# 실행 환경을 고려해서 neighbor size를 5씩 증가시키며 확인
for neighbor_size in range(10, 50, 5):
    print(f'n_size: {neighbor_size}, RMSE={score(CF_knn, neighbor_size):.4f}')

n_size: 10, RMSE=0.8184
n_size: 15, RMSE=0.8612
n_size: 20, RMSE=0.8856
n_size: 25, RMSE=0.9015
n_size: 30, RMSE=0.9128
n_size: 35, RMSE=0.9208
n_size: 40, RMSE=0.9276
n_size: 45, RMSE=0.9331


# 4. 사용자 평가경향을 고려한 CF

In [18]:
rating_mean = rating_matrix.mean(axis=1)
rating_bias = (rating_matrix.T - rating_mean).T

In [19]:
def CF_knn_bias(user_id, movie_id, neighbor_size=0):
    if movie_id in rating_bias.columns:
        sim_scores = user_similarity[user_id].copy()
        movie_ratings = rating_bias[movie_id].copy()
        none_rating_idx = movie_ratings[movie_ratings.isnull()].index
        movie_ratings = movie_ratings.drop(none_rating_idx)
        sim_scores = sim_scores.drop(none_rating_idx)
        
        if neighbor_size == 0:
            prediction = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
            prediction = prediction + rating_mean[user_id]
        else:
            if len(sim_scores) > 1:
                neighbor_size = min(neighbor_size, len(sim_scores))
                sim_scores = np.array(sim_scores)
                movie_ratings = np.array(movie_ratings)
                user_idx = np.argsort(sim_scores)
                sim_scores = sim_scores[user_idx][-neighbor_size:]
                movie_ratings = movie_ratings[user_idx][-neighbor_size:]
                prediction = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
                prediction = prediction + rating_mean[user_id]
            else: 
                prediction = rating_mean[user_id]
    else:
        prediction = rating_mean[user_id]
    
    return prediction

score(CF_knn_bias, 30)

0.8488557330775994

# 5. 그 외의 방법으로 CF 성능 개선

In [20]:
x = ratings.copy()
y = ratings['user_id']

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y)
rating_matrix = X_train.pivot(
    index='user_id',
    columns='movie_id',
    values='rating'
)

matrix_dummy = rating_matrix.copy().fillna(0)
user_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
user_similarity = pd.DataFrame(
    user_similarity,
    index=rating_matrix.index,
    columns=rating_matrix.index
)

rating_mean = rating_matrix.mean(axis=1)


movie_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.389706,-0.610294,0.389706,-0.610294,-0.610294,1.389706,0.389706,-2.610294,1.389706,-0.610294,...,,,,,,,,,,
2,0.290323,,,,,,,,,-1.709677,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,1.125714,0.125714,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,0.734694,,...,,,,,,,,,,
940,,,,-1.457944,,,0.542056,1.542056,-0.457944,,...,,,,,,,,,,
941,0.954545,,,,,,-0.045455,,,,...,,,,,,,,,,
942,,,,,,,,,,,...,,,,,,,,,,
