# 1. Collaborative Filtering

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
u_cols = ['user_id','age','sex','occupation','zip_code']
users = pd.read_csv(
    'u.user',
    sep='|',
    names=u_cols,
    encoding='latin-1'
)
users = users.set_index('user_id')

In [3]:
i_cols = ['movie_id','title','release date','video release date',
          'IMDB URL','unknown','Action','Adventure','Animation',
          'Children\'s','Comedy','Crime','Documentary','Drama','Fantasy',
          'Film-Noir','Horror','Musical','Mystery','Romance','Sci-Fi','Thriller',
          'War','Western']
movies = pd.read_csv(
    'u.item',
    sep='|',
    names=i_cols,
    encoding='latin-1'
)
movies = movies.set_index('movie_id')

In [4]:
r_cols = ['user_id','movie_id','rating','timestamp']
ratings = pd.read_csv(
    'u.data',
    sep='\t',
    names=r_cols,
    encoding='latin-1'
)

In [5]:
def RMSE(y, y_pred):
    return np.sqrt(np.mean((np.array(y) - np.array(y_pred)) ** 2 ))

In [6]:
def score(model):
    id_pairs = zip(X_test['user_id'], X_test['movie_id'])
    y_pred = np.array([model(user, movie) for (user, movie) in id_pairs])
    y = np.array(X_test['rating'])
    return RMSE(y, y_pred)

In [7]:
X = ratings.copy()
y = ratings['user_id']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)
ratings_matrix = X_train.pivot(
    index='user_id',
    columns='movie_id',
    values='rating'
)

In [8]:
# 코사인 유사도 계산하기 
from sklearn.metrics.pairwise import cosine_similarity

# NaN값 에러를 방지하기 위해 결측치 대치 
matrix_dummy = ratings_matrix.copy().fillna(0)
user_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
user_similarity = pd.DataFrame(
    user_similarity,
    index=ratings_matrix.index,
    columns=ratings_matrix.index
)

def CF_simple(user_id, movie_id):
    if movie_id in ratings_matrix.columns:
        sim_scores = user_similarity[user_id].copy()
        movie_ratings = ratings_matrix[movie_id].copy()
        none_rating_idx = movie_ratings[movie_ratings.isnull()].index
        movie_ratings = movie_ratings.dropna()
        sim_scores = sim_scores.drop(none_rating_idx)
        mean_rating = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
    else:
        mean_rating = 3.0
    return mean_rating

score(CF_simple)

1.0163950497075442

In [9]:
user_similarity

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.153509,0.048828,0.050079,0.278099,0.327639,0.353966,0.275666,0.072532,0.326202,...,0.269456,0.082560,0.221390,0.170126,0.199220,0.077378,0.252588,0.114289,0.168051,0.342715
2,0.153509,1.000000,0.085875,0.159460,0.079519,0.222079,0.078299,0.105642,0.164989,0.159877,...,0.104695,0.272502,0.302868,0.305286,0.326243,0.195688,0.214379,0.145219,0.144460,0.086837
3,0.048828,0.085875,1.000000,0.260107,0.000000,0.059021,0.058169,0.065863,0.021446,0.057932,...,0.031258,0.052507,0.105464,0.082393,0.120345,0.018420,0.124316,0.100498,0.100742,0.019350
4,0.050079,0.159460,0.260107,1.000000,0.028522,0.058558,0.071268,0.166722,0.128394,0.078100,...,0.067725,0.046570,0.116118,0.175384,0.118478,0.000000,0.104881,0.130730,0.186872,0.061784
5,0.278099,0.079519,0.000000,0.028522,1.000000,0.191348,0.354336,0.197028,0.041772,0.168086,...,0.284917,0.064654,0.076207,0.025297,0.118816,0.043547,0.201201,0.083997,0.139963,0.270321
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.077378,0.195688,0.018420,0.000000,0.043547,0.103445,0.051958,0.059167,0.049503,0.089860,...,0.071738,0.355517,0.232803,0.184313,0.378896,1.000000,0.039193,0.125780,0.021260,0.085756
940,0.252588,0.214379,0.124316,0.104881,0.201201,0.247517,0.250828,0.214271,0.117342,0.278815,...,0.195458,0.079803,0.140759,0.166011,0.137178,0.039193,1.000000,0.117101,0.220482,0.183072
941,0.114289,0.145219,0.100498,0.130730,0.083997,0.129900,0.041644,0.115575,0.171055,0.096195,...,0.057486,0.173886,0.261904,0.213454,0.299747,0.125780,0.117101,1.000000,0.064442,0.117783
942,0.168051,0.144460,0.100742,0.186872,0.139963,0.283462,0.267777,0.133550,0.112939,0.190011,...,0.217761,0.044191,0.065762,0.153518,0.094629,0.021260,0.220482,0.064442,1.000000,0.152851


# 2. 이웃을 고려한 CF

In [10]:
# 유사집단의 크기를 미리 정하기 위해 기존 score 함수에 인자를 추가 
def score(model, neighbor_size=0):
    id_pairs = zip(X_test['user_id'], X_test['movie_id'])
    y_pred = np.array([model(user, movie, neighbor_size) for (user, movie) in id_pairs])
    y = np.array(X_test['rating'])
    return RMSE(y, y_pred)

In [11]:
X = ratings.copy()
y = ratings['user_id']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)
rating_matrix = X_train.pivot(
    index='user_id',
    columns='movie_id',
    values='rating'
)

In [12]:
def CF_knn(user_id, movie_id, neighbor_size=0):
    if movie_id in rating_matrix.columns:
        sim_scores = user_similarity[user_id].copy()
        movie_ratings = rating_matrix[movie_id].copy()
        none_rating_idx = movie_ratings[movie_ratings.isnull()].index
        movie_ratings = movie_ratings.dropna()
        sim_scores = sim_scores.drop(none_rating_idx)
        
        if neighbor_size == 0:
            mean_rating = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
        else:
            if len(sim_scores) > 1:
                neighbor_size = min(neighbor_size, len(sim_scores))
                sim_scores = np.array(sim_scores)
                movie_ratings = np.array(movie_ratings)
                user_idx = np.argsort(sim_scores)
                sim_scores = sim_scores[user_idx][-neighbor_size:]
                movie_ratings = movie_ratings[user_idx][-neighbor_size:]
                mean_rating = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
            else:
                mean_rating = 3.0
                
    else:
        mean_rating = 3.0
        
    return mean_rating

score(CF_knn, neighbor_size=30)

0.9981025933886849

In [13]:
# 실제 주어진 사용자에 대한 추천 기능 구현 
rating_matrix = ratings.pivot_table(
    values='rating',
    index='user_id',
    columns = 'movie_id'
)
matrix_dummy = rating_matrix.copy().fillna(0)
user_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
user_similarity = pd.DataFrame(
    user_similarity, 
    index=rating_matrix.index, 
    columns=rating_matrix.index
)

def recom_movie(user_id, n_items, neighbor_size=30):
    user_movie = rating_matrix.loc[user_id].copy()
    
    for movie in rating_matrix.columns:
        if pd.notnull(user_movie.loc[movie]):
            user_movie.loc[movie] = 0
        else:
            user_movie.loc[movie] = CF_knn(user_id, movie, neighbor_size)
    
    movie_sort = user_movie.sort_values(ascending=False)[:n_items]
    recom_movies = movies.loc[movie_sort.index]
    recommendations = recom_movies['title']
    return recommendations

recom_movie(user_id=729, n_items=5, neighbor_size=30)

movie_id
1189                      Prefontaine (1997)
1293                         Star Kid (1997)
1467    Saint of Fort Washington, The (1993)
1500               Santa with Muscles (1996)
22                         Braveheart (1995)
Name: title, dtype: object

# 3. 최적의 neightbor size 결정

In [14]:
# 실행 환경을 고려해서 neighbor size를 5씩 증가시키며 확인
for neighbor_size in range(10, 50, 5):
    print(f'n_size: {neighbor_size}, RMSE={score(CF_knn, neighbor_size):.4f}')

n_size: 10, RMSE=0.8045
n_size: 15, RMSE=0.8486
n_size: 20, RMSE=0.8711
n_size: 25, RMSE=0.8860
n_size: 30, RMSE=0.8969
n_size: 35, RMSE=0.9051
n_size: 40, RMSE=0.9118
n_size: 45, RMSE=0.9171


# 4. 사용자 평가경향을 고려한 CF

In [15]:
rating_mean = rating_matrix.mean(axis=1)
rating_bias = (rating_matrix.T - rating_mean).T

In [16]:
def CF_knn_bias(user_id, movie_id, neighbor_size=0):
    if movie_id in rating_bias.columns:
        sim_scores = user_similarity[user_id].copy()
        movie_ratings = rating_bias[movie_id].copy()
        none_rating_idx = movie_ratings[movie_ratings.isnull()].index
        movie_ratings = movie_ratings.drop(none_rating_idx)
        sim_scores = sim_scores.drop(none_rating_idx)
        
        if neighbor_size == 0:
            prediction = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
            prediction = prediction + rating_mean[user_id]
        else:
            if len(sim_scores) > 1:
                neighbor_size = min(neighbor_size, len(sim_scores))
                sim_scores = np.array(sim_scores)
                movie_ratings = np.array(movie_ratings)
                user_idx = np.argsort(sim_scores)
                sim_scores = sim_scores[user_idx][-neighbor_size:]
                movie_ratings = movie_ratings[user_idx][-neighbor_size:]
                prediction = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
                prediction = prediction + rating_mean[user_id]
            else: 
                prediction = rating_mean[user_id]
    else:
        prediction = rating_mean[user_id]
    
    return prediction

score(CF_knn_bias, 30)

0.8266870037506906

# 5. 그 외 CF 성능 개선을 고려할 수 있는 방법

In [17]:
x = ratings.copy()
y = ratings['user_id']

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y)
rating_matrix = X_train.pivot(
    index='user_id',
    columns='movie_id',
    values='rating'
)

matrix_dummy = rating_matrix.copy().fillna(0)
user_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
user_similarity = pd.DataFrame(
    user_similarity,
    index=rating_matrix.index,
    columns=rating_matrix.index
)

rating_mean = rating_matrix.mean(axis=1)
rating_bias = (rating_matrix.T - rating_mean).T

In [18]:
rating_binary_1 = np.array(rating_matrix>0).astype(float)
rating_binary_2 = rating_binary_1.T

counts = np.dot(rating_binary_1, rating_binary_2)
counts = pd.DataFrame(
    counts,
    index=rating_matrix.index,
    columns=rating_matrix.index
).fillna(0)

In [19]:
def CF_knn_bias_sig(user_id,movie_id,neighbor_size=0):
  if movie_id in rating_bias:
    sim_scores = user_similarity[user_id].copy()
    movie_ratings = rating_bias[movie_id].copy()

    no_rating = movie_ratings.isnull()
    common_counts = counts[user_id]
    low_significance = common_counts < SIG_LEVEL
    none_rating_idx = movie_ratings[no_rating | low_significance].index

    movie_ratings = movie_ratings.drop(none_rating_idx)
    sim_scores = sim_scores.drop(none_rating_idx)

    if neighbor_size == 0:
      prediction = np.dot(sim_scores,movie_ratings) / sim_scores.sum()
      prediction = prediction + rating_mean[user_id]

    else:
      if len(sim_scores) > MIN_RATINGS:
        neighbor_size = min(neighbor_size,len(sim_scores))
        sim_scores = np.array(sim_scores)
        movie_ratings = np.array(movie_ratings)
        user_idx = np.argsort(sim_scores)
        sim_scores = sim_scores[user_idx][-neighbor_size:]
        movie_ratings = movie_ratings[user_idx][-neighbor_size:]
        prediction = np.dot(sim_scores,movie_ratings) / sim_scores.sum()
        prediction = prediction + rating_mean[user_id]
      else:
        prediction = rating_mean[user_id]
  else:
    prediction = rating_mean[user_id]
    
  if prediction <=1:
    prediction = 1
  elif prediction>=5:
    prediction = 5

  return prediction

SIG_LEVEL = 3
MIN_RATINGS = 3

score(CF_knn_bias_sig, 30)

0.944791767566305

# 6. 사용자 기반 CF와 아이템 기반 CF

In [22]:
x = ratings.copy()
y = ratings['user_id']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y)
rating_matrix = x_train.pivot(
    index='user_id',
    columns = 'movie_id',
    values='rating'
)

In [24]:
rating_matrix_t = np.transpose(rating_matrix)

matrix_dummy = rating_matrix_t.copy().fillna(0)

item_similarity = cosine_similarity(matrix_dummy,matrix_dummy)
item_similarity = pd.DataFrame(item_similarity,
                               index=rating_matrix_t.index,
                               columns=rating_matrix_t.index)

def score(model):
    id_pairs = zip(x_test['user_id'],x_test['movie_id'])
    y_pred = np.array([model(user,movie) for (user,movie) in id_pairs])
    y = np.array(x_test['rating'])
    return RMSE(y, y_pred)

def CF_IBCF(user_id,movie_id):
  if movie_id in item_similarity.columns:
    sim_scores = item_similarity[movie_id]
    user_rating = rating_matrix_t[user_id]
    none_rating_idx = user_rating[user_rating.isnull()].index
    user_rating = user_rating.dropna()
    sim_scores = sim_scores.drop(none_rating_idx)
    mean_rating = np.dot(sim_scores,user_rating) / sim_scores.sum()
  else:
    mean_rating = 3.0

  return mean_rating

score(CF_IBCF)

1.0101382014718185