# Webtoon Recommendation System
***by Chung-Ang Univ. COCAUIN_Team***

Our team presents a **Webtoon Recommendation System** (based on the NAVER Webtoon service) built on **three different models**.

## **1. Latent factor based Collaborative Filtering**  
## **2. Item based Collaborative Filtering**  
## **3. 'Surprise' based recommendation system**  

In [5]:
# (1) Latent factor based Collaborative Filtering

import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error

def get_rmse(R, P, Q, non_zeros):   # R = rating_matrix
    """Return the RMSE between observed ratings and their factorized predictions.

    Only the cells listed in ``non_zeros`` are scored; every other entry of
    ``R`` is ignored.

    Args:
        R: 2-D rating array, indexed as ``R[row, col]``.
        P: User latent-factor array with one row per row of ``R``.
        Q: Item latent-factor array with one row per column of ``R``.
        non_zeros: Sequence of tuples whose first two items are the
            ``(row, col)`` position of an observed rating in ``R``.

    Returns:
        The root mean squared error over the listed cells.

    Raises:
        ValueError: If ``non_zeros`` is empty (there is nothing to score).
    """
    if len(non_zeros) == 0:
        raise ValueError('non_zeros is empty: no observed ratings to evaluate')

    rows = [non_zero[0] for non_zero in non_zeros]
    cols = [non_zero[1] for non_zero in non_zeros]

    observed = R[rows, cols]
    # Predict only the observed cells (row-wise dot product of the matching
    # factor rows) instead of materializing the whole P @ Q.T matrix.
    predicted = np.sum(P[rows] * Q[cols], axis=1)

    return np.sqrt(np.mean((observed - predicted) ** 2))

def matrix_factorization(R, K , steps = 200, learning_rate = 0.01, r_lambda = 0.01):
    """Factorize a rating array into user and item latent factors with SGD.

    Entries of ``R`` that are greater than zero are treated as observed
    ratings; everything else (zeros, NaN) is skipped during training.
    The RMSE on the observed entries is printed every 40th step.

    Args:
        R: 2-D NumPy array of ratings (rows = users, columns = items).
        K: Number of latent factors.
        steps: Number of passes over the observed ratings.
        learning_rate: Step size of each gradient update.
        r_lambda: L2 regularization coefficient applied to both factor rows.

    Returns:
        Tuple ``(P, Q)`` where ``P`` has shape ``(num_users, K)`` and ``Q`` has
        shape ``(num_items, K)``; ``np.dot(P, Q.T)`` approximates ``R``.
    """
    num_users, num_items = R.shape
    # NOTE: this seeds NumPy's global RNG, so every call starts from the same
    # initial factors.
    np.random.seed(1)
    P = np.random.normal(scale = 1./K, size = (num_users, K))
    Q = np.random.normal(scale = 1./K, size = (num_items, K))

    # Observed cells as (row, col, rating), in row-major order. NaN > 0 is
    # False, so missing ratings never enter the training set.
    non_zeros = [(i, j, R[i, j]) for i, j in np.argwhere(R > 0).tolist()]

    for step in range(steps):
        for i, j, r in non_zeros:
            eij = r - np.dot(P[i,:], Q[j, :].T)
            P[i,:] = P[i,:] + learning_rate*(eij * Q[j,:] - r_lambda*P[i,:])
            # The item update deliberately sees the user row updated just above.
            Q[j,:] = Q[j,:] + learning_rate*(eij * P[i,:] - r_lambda*Q[j,:])

        # RMSE is only reported, never used for control flow, so compute it
        # just on the steps that print it.
        if (step%40) == 0:
            rmse = get_rmse(R,P,Q, non_zeros)
            print('STEP_COUNT: ', step, 'RMSE :', rmse)

    return P, Q

def get_unseen_webtoons(rating_matrix, userId):
    """Return the webtoons the given user has not rated yet.

    A webtoon counts as rated when the user's value in its column is greater
    than zero. A one-line summary of the counts is printed as a side effect.

    Args:
        rating_matrix: DataFrame with one row per user and one column per
            webtoon.
        userId: Row label of the user in ``rating_matrix``.

    Returns:
        List of column labels that are not among the user's rated webtoons,
        in the column order of ``rating_matrix``.
    """
    user_rating = rating_matrix.loc[userId,:]
    already_seen = user_rating[ user_rating > 0].index.tolist()
    webtoons_list = rating_matrix.columns.tolist()

    # Set membership keeps the filter linear instead of scanning the rated
    # list once per column.
    seen_lookup = set(already_seen)
    unseen_list = [ webtoon for webtoon in webtoons_list if webtoon not in seen_lookup]
    print('평점 매긴 영화수:', len(already_seen), '추천대상 영화수:',len(unseen_list), \
          '전체 영화수:',len(webtoons_list))

    return unseen_list

def recomm_webtoons_by_userid(rating_matrix, pred_array, userId, unseen_list, top_n=5):
    """Pick the highest-scoring unseen webtoons for one user.

    Args:
        rating_matrix: DataFrame whose index and columns label the rows and
            columns of ``pred_array``.
        pred_array: 2-D array of predicted scores, same shape as
            ``rating_matrix``.
        userId: Row label of the user.
        unseen_list: Column labels eligible for recommendation.
        top_n: Number of rows to keep after sorting by score, descending.

    Returns:
        DataFrame indexed by webtoon label with a single ``pred_score`` column.
    """
    # Attach the user/webtoon labels to the raw prediction array.
    labelled_scores = pd.DataFrame(
        pred_array,
        index=rating_matrix.index,
        columns=rating_matrix.columns,
    )
    candidate_scores = labelled_scores.loc[userId, unseen_list]
    best_scores = candidate_scores.sort_values(ascending=False)[:top_n]

    return pd.DataFrame(best_scores.values, index=best_scores.index, columns=['pred_score'])


# load data
survey = pd.read_csv('webtoons_survey(preprocessed).csv')
# '없음' marks an unanswered item and becomes NaN. np.nan is used because the
# np.NaN alias no longer exists in NumPy 2.0.
survey.replace('없음', np.nan, inplace=True)
survey=survey.astype('float64')

# Factorize the rating table, then rebuild a dense matrix of predicted scores.
P, Q = matrix_factorization(survey.values, K=30, steps=200, learning_rate=0.01, r_lambda = 0.01)
pred_matrix = np.dot(P, Q.T)


# show recommendation list
find_user = pd.read_csv('webtoons_survey(original_form).csv')

users= ['황예은']  # input name of users as lists

for user in users:
    # Row label of the first respondent with this name; .index[0] raises
    # IndexError when the name is not present.
    # NOTE(review): the label is reused to address rows of `survey`, which
    # assumes both CSV files list respondents in the same order - confirm.
    userid=int(find_user[find_user['이름']==user].index[0])
    unseen_list = get_unseen_webtoons(survey, userid)
    recomm_webtoons=recomm_webtoons_by_userid(survey, pred_matrix, userid, unseen_list, top_n=5)
    
    print('\n', '%%%% {} %%%% 님의'.format(user), '\n')
    print('## 추천 5개 웹툰 ## ', '\n', recomm_webtoons)
    print('='*70)

STEP_COUNT:  0 RMSE : 3.532614660716394
STEP_COUNT:  40 RMSE : 0.4428637719198905
STEP_COUNT:  80 RMSE : 0.3302075532662343
STEP_COUNT:  120 RMSE : 0.2968993703202772
STEP_COUNT:  160 RMSE : 0.28072334925892345
평점 매긴 영화수: 14 추천대상 영화수: 135 전체 영화수: 149

 %%%% 황예은 %%%% 님의 

## 추천 5개 웹툰 ##  
                         pred_score
네이버 수요일 웹툰 [고삼무쌍]         5.522287
네이버 완결 웹툰 [신과 함께]         5.394038
네이버 일요일 웹툰 [마루한-구현동화전]    5.093802
네이버 토요일 웹툰 [회춘]           5.069422
네이버 완결 웹툰 [여중생 A]         5.045176


In [8]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
# Show one respondent's original answers, keeping only the columns they filled in.
find_user = pd.read_csv('webtoons_survey(original_form).csv')
# np.nan instead of np.NaN: the capitalized alias no longer exists in NumPy 2.0.
find_user = find_user.replace('없음', np.nan)
# Drop every column that holds NaN in the selected row(s).
find_user[(find_user['이름']=='황예은')].dropna(how='any',axis=1)

Unnamed: 0,타임스탬프,귀하의 성별은?,"현재까지 감상한 웹툰 작품을 점수(1~5점)를 매겨주세요😃 보지 않으신 작품은 ""없음""에 표시해주세요.",네이버 월요일 웹툰 [신의 탑],네이버 월요일 웹툰 [뷰티풀 군바리],네이버 화요일 웹툰 [여신강림],네이버 화요일 웹툰 [마음의 소리],네이버 수요일 웹툰 [유미의 세포들],네이버 수요일 웹툰 [헬퍼2 : 킬베로스],네이버 금요일 웹툰 [외모지상주의],네이버 금요일 웹툰 [스위트홈],네이버 완결 웹툰 [후레자식],네이버 완결 웹툰 [한번 더 해요],네이버 완결 웹툰 [노블레스],네이버 완결 웹툰 [대학일기],네이버 완결 웹툰 [치즈인더트랩],네이버 완결 웹툰 [패션왕],이름,연락처
8,2020. 1. 4 오후 12:27:43,여,"예, 읽고 확인했습니다.",5,3,2,4,4,5,3,5,4,4,5,5,5,4,황예은,1096558430


In [None]:
# (2) Item based Collaborative Filtering

from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np

def get_rmse_Item(pred, actual):
    """Return the RMSE between ``pred`` and ``actual`` over rated cells only.

    Cells where ``actual`` is zero are excluded from the score; only the
    positions where ``actual`` is non-zero are compared.

    Args:
        pred: Array of predicted ratings, same shape as ``actual``.
        actual: Array of true ratings.

    Returns:
        Root mean squared error over the non-zero positions of ``actual``.
    """
    rated_positions = actual.nonzero()
    scored_pred = pred[rated_positions].flatten()
    scored_actual = actual[rated_positions].flatten()

    mse = mean_squared_error(scored_pred, scored_actual)
    return np.sqrt(mse)

def get_item_sim_df(ratings_matrix):
    """Build the item-to-item cosine similarity table.

    Args:
        ratings_matrix: DataFrame with one column per item.

    Returns:
        Square DataFrame whose index and columns are both the columns of
        ``ratings_matrix`` and whose values are the cosine similarities
        between the corresponding item columns.
    """
    item_labels = ratings_matrix.columns
    # One row per item, so each row is that item's rating vector.
    by_item = ratings_matrix.transpose()
    similarity = cosine_similarity(by_item, by_item)

    return pd.DataFrame(data=similarity, index=item_labels, columns=item_labels)

def predict_rating_topsim(ratings_arr, item_sim_arr, n=10):
    """Predict ratings from each item's ``n`` most similar items.

    For every item column, the ``n`` items with the largest similarity to it
    are selected, and each user's predicted rating is the similarity-weighted
    average of that user's ratings for those items.

    Args:
        ratings_arr: 2-D array of ratings (rows = users, columns = items).
        item_sim_arr: 2-D item-to-item similarity array, indexed by the same
            item positions as the columns of ``ratings_arr``.
        n: Number of most-similar items to use per item.

    Returns:
        Array with the same shape as ``ratings_arr`` holding the predicted
        ratings. A column whose selected similarities sum to zero in absolute
        value is left at 0 rather than becoming NaN.
    """
    pred = np.zeros(ratings_arr.shape)

    for col in range(ratings_arr.shape[1]):
        # Positions of the n largest similarities, most similar first. This is
        # kept as a plain index array: wrapping it in a list makes current
        # NumPy treat it as a 2-D index and return (1, n)-shaped results.
        top_n_items = np.argsort(item_sim_arr[:, col])[:-n-1:-1]
        weights = item_sim_arr[col, :][top_n_items]
        weight_total = np.sum(np.abs(weights))
        if weight_total == 0:
            # Nothing similar to average over; avoid a 0/0 division.
            continue
        # All users at once: (users x n) ratings times the n weights.
        pred[:, col] = ratings_arr[:, top_n_items].dot(weights) / weight_total

    return pred

def get_ratings_pred_matrix(ratings_matrix, top_n):
    """Return the item-based predicted rating array for ``ratings_matrix``.

    Args:
        ratings_matrix: DataFrame of ratings (rows = users, columns = items).
        top_n: Number of most-similar items used for each prediction.

    Returns:
        Array of predicted ratings produced by ``predict_rating_topsim``.
    """
    item_similarity = get_item_sim_df(ratings_matrix).values
    return predict_rating_topsim(ratings_matrix.values, item_similarity, n=top_n)

def get_preferred_top_n(ratings_matrix, userId, top_n):
    """Return the user's own highest-rated items.

    Args:
        ratings_matrix: DataFrame of ratings (rows = users, columns = items).
        userId: Row label of the user.
        top_n: Number of entries to keep after sorting by rating, descending.

    Returns:
        Series of the user's ratings greater than zero, sorted from highest
        to lowest and cut to ``top_n`` entries.
    """
    own_ratings = ratings_matrix.loc[userId, :]
    rated_only = own_ratings[own_ratings > 0]
    ranked = rated_only.sort_values(ascending=False)
    return ranked[:top_n]

def get_unseen_webtoons(ratings_matrix, userId):
    """Return the webtoons the given user has not rated yet.

    A webtoon counts as rated when the user's value in its column is greater
    than zero. A one-line summary of the counts is printed as a side effect.

    Args:
        ratings_matrix: DataFrame with one row per user and one column per
            webtoon.
        userId: Row label of the user in ``ratings_matrix``.

    Returns:
        List of column labels that are not among the user's rated webtoons,
        in the column order of ``ratings_matrix``.
    """
    user_rating = ratings_matrix.loc[userId,:]
    already_seen = user_rating[ user_rating > 0].index.tolist()
    webtoons_list = ratings_matrix.columns.tolist()

    # Set membership keeps the filter linear instead of scanning the rated
    # list once per column.
    seen_lookup = set(already_seen)
    unseen_list = [ webtoon for webtoon in webtoons_list if webtoon not in seen_lookup]
    print('평점 매긴 영화수:', len(already_seen), '추천대상 영화수:', len(unseen_list), \
              '전체 영화수:', len(webtoons_list))
    return unseen_list

def recomm_webtoons_by_userid(rating_matrix, pred_array, userId, unseen_list, top_n=5):
    """Return the top predicted webtoons among those the user has not seen.

    Args:
        rating_matrix: DataFrame supplying the row and column labels for
            ``pred_array``.
        pred_array: 2-D array of predicted scores, same shape as
            ``rating_matrix``.
        userId: Row label of the user.
        unseen_list: Column labels to choose from.
        top_n: Number of rows to keep after sorting by score, descending.

    Returns:
        DataFrame indexed by webtoon label with one column, ``pred_score``.
    """
    scores_by_label = pd.DataFrame(
        data=pred_array,
        index=rating_matrix.index,
        columns=rating_matrix.columns,
    )
    # Restrict to this user's unseen webtoons before ranking.
    unseen_scores = scores_by_label.loc[userId, unseen_list]
    ranked = unseen_scores.sort_values(ascending=False)
    top_scores = ranked[:top_n]

    return pd.DataFrame(data=top_scores.values, index=top_scores.index, columns=['pred_score'])

def show_result_Item(rating_matrix, userId):
    """Compute item-based recommendations for one user.

    Args:
        rating_matrix: DataFrame of ratings (rows = users, columns = webtoons).
        userId: Row label of the user to recommend for.

    Returns:
        Tuple ``(preferred_webtoons, recomm_webtoons)``: the user's own five
        highest-rated webtoons, and the five unseen webtoons with the highest
        predicted score.
    """
    ratings_pred_arr=get_ratings_pred_matrix(rating_matrix, 10)
    # Use the requested user, not a fixed row label.
    preferred_webtoons=get_preferred_top_n(rating_matrix, userId, 5)
    unseen_list = get_unseen_webtoons(rating_matrix, userId)
    # Label the predictions with the matrix that was passed in rather than a
    # module-level global, so the function works for any rating table.
    recomm_webtoons = recomm_webtoons_by_userid(rating_matrix, ratings_pred_arr, userId, unseen_list, top_n=5)

    return preferred_webtoons, recomm_webtoons

# load data
survey = pd.read_csv('webtoons_survey(preprocessed).csv')
# '없음' marks an unanswered item. np.nan is used because the np.NaN alias no
# longer exists in NumPy 2.0.
survey.replace('없음', np.nan, inplace=True)
survey = survey.astype('float64')
# Unrated items become 0 for the similarity and prediction steps.
survey = survey.fillna(0)


# show recommendation list
find_user = pd.read_csv('webtoons_survey(original_form).csv')

users= ['???']  # input name of users as lists

for user in users:
    # Row label of the first respondent with this name; .index[0] raises
    # IndexError when the name is not present.
    userid=int(find_user[find_user['이름']==user].index[0])
    result=show_result_Item(survey, userid)
    
    print('\n', '%%%% {} %%%% 님의'.format(user), '\n')
#     print('## 선호 5개 웹툰 ## ', '\n', result[0], '\n')
    print('## 추천 5개 웹툰 ## ', '\n', result[1])
    print('='*70)

In [None]:
# (3) 'Surprise' based recommendation system

import surprise
import pandas as pd
import numpy as np
from surprise import SVD, Dataset, Reader, accuracy
from surprise.model_selection import train_test_split, cross_validate, GridSearchCV
from surprise.dataset import DatasetAutoFolds

def get_unseen_surprise(ratings, webtoons, userId):
    """Return the webtoons that have no rating row for the given user.

    A one-line summary of the counts is printed as a side effect.

    Args:
        ratings: DataFrame with at least ``userId`` and ``webtoonId`` columns,
            one row per rating.
        webtoons: DataFrame whose column labels (all but the last) are the
            candidate webtoons.
        userId: Value matched against ``ratings['userId']``.

    Returns:
        List of candidate webtoons that do not appear in the user's rating
        rows, in the column order of ``webtoons``.
    """
    seen_webtoons = ratings[ratings['userId']== userId]['webtoonId'].tolist()

    # NOTE(review): the last column of `webtoons` is left out of the candidate
    # list - presumably it is not a webtoon title; confirm against the CSV.
    total_webtoons = webtoons.columns[:-1].tolist()

    # Set membership avoids rescanning the seen list for every candidate.
    seen_lookup = set(seen_webtoons)
    unseen_webtoons= [webtoon for webtoon in total_webtoons if webtoon not in seen_lookup]
    print('평점 매긴 영화수:',len(seen_webtoons), '추천대상 영화수:',len(unseen_webtoons), \
          '전체 영화수:',len(total_webtoons))

    return unseen_webtoons

def recomm_webtoon_by_surprise(algo, userId, unseen_webtoons, top_n=10):
    """Rank unseen webtoons for a user by the model's estimated rating.

    Args:
        algo: Fitted model exposing ``predict(uid, iid)``; each prediction
            must provide ``iid`` and ``est`` attributes.
        userId: User identifier; passed to ``predict`` as ``str(userId)``.
        unseen_webtoons: Iterable of webtoon identifiers; each is passed to
            ``predict`` as ``str(webtoonId)``.
        top_n: Maximum number of results to return.

    Returns:
        List of ``(iid, est)`` tuples sorted by ``est`` from highest to
        lowest, at most ``top_n`` long.
    """
    # NOTE(review): ids are stringified before lookup; if the model was
    # trained on non-string raw ids these may not match - confirm.
    uid = str(userId)
    ranked = sorted(
        (algo.predict(uid, str(webtoonId)) for webtoonId in unseen_webtoons),
        key=lambda pred: pred.est,
        reverse=True,
    )

    return [(pred.iid, pred.est) for pred in ranked[:top_n]]

# load data and model fitting
# `webtoons` supplies the candidate titles (its column labels); `ratings` is
# the per-rating table that Surprise is trained on.
webtoons = pd.read_csv('webtoons_survey(preprocessed).csv')
ratings=pd.read_csv('webtoons_survey(surprise).csv')
# Rows without a rating value cannot be used for training.
ratings=ratings.dropna(subset=['rating'])

reader = Reader(rating_scale=(1.0, 5.0))

# Train on every available rating (no hold-out split).
data_folds = DatasetAutoFolds(df=ratings, reader=reader)
trainset = data_folds.build_full_trainset()

# random_state is fixed so repeated fits give the same model.
algo = SVD(n_epochs=200, n_factors=30, random_state=0)
algo.fit(trainset)

# show recommendation list
find_user = pd.read_csv('webtoons_survey(original_form).csv')

users= ['???']  # input name of users as lists

print('##### Surprise 모델 추천 리스트 #####')
for user in users:
    # Row label of the first respondent with this name; .index[0] raises
    # IndexError when the name is not present.
    # NOTE(review): the model is trained from a DataFrame, while
    # recomm_webtoon_by_surprise looks the user up as str(userid); if the
    # userId column is numeric the ids may not match - confirm.
    userid=int(find_user[find_user['이름']==user].index[0])
    unseen_webtoons = get_unseen_surprise(ratings, webtoons, userid)
    top_webtoon_preds=recomm_webtoon_by_surprise(algo, userid, unseen_webtoons, top_n=5)

    print('\n', '%%%% {} %%%% 님의'.format(user), '\n')
    print('## 추천 5개 웹툰 ## ', '\n')
    # Each entry is a (title, estimated rating) pair.
    for top_webtoon in top_webtoon_preds:
        print(top_webtoon[0], ':', top_webtoon[1])
    print('='*70)

In [248]:
# 5-fold cross-validation of `algo` on `data_folds`, reporting RMSE and MAE;
# verbose=True prints the per-fold table.
cross_validate(algo, data_folds, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.0388  1.0433  0.9849  1.0258  1.0771  1.0340  0.0298  
MAE (testset)     0.8265  0.8192  0.7825  0.8173  0.8551  0.8201  0.0232  
Fit time          0.40    0.39    0.38    0.38    0.38    0.39    0.01    
Test time         0.01    0.01    0.01    0.01    0.01    0.01    0.00    


{'test_rmse': array([1.03882731, 1.04325884, 0.98488007, 1.02577387, 1.07712922]),
 'test_mae': array([0.82649253, 0.819209  , 0.782535  , 0.81727934, 0.85506067]),
 'fit_time': (0.4041006565093994,
  0.3929879665374756,
  0.3849978446960449,
  0.3809778690338135,
  0.3762342929840088),
 'test_time': (0.011968135833740234,
  0.01096653938293457,
  0.01194310188293457,
  0.011946916580200195,
  0.011963367462158203)}

In [244]:
# Candidate SVD settings: every combination of epoch count and factor count.
param_grid = {'n_epochs': [40, 80, 120, 200], 'n_factors': [10, 20, 30, 50, 100] }

# 3-fold cross-validated grid search scored on RMSE and MAE.
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(data_folds)

# Best RMSE found and the parameter combination that achieved it.
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

1.0419455403024
{'n_epochs': 40, 'n_factors': 20}
