In [1]:
from sklearn.metrics.pairwise import cosine_similarity

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# 사용자 / 기사 / 단어에 대한 더미 데이터 생성
users = ['user' + str(i) for i in range(1, 1001)]
articles = ['article' + str(i) for i in range(1, 1001)]
words = ['word' + str(i) for i in range(1, 1001)]

# 사용자 - 단어 학습 횟수 더미 데이터 생성
user_word_count = np.random.randint(0, 11, size = (1000, 1000))
user_word_df = pd.DataFrame(user_word_count, columns = words, index = users)

# 기사 - 단어 TF - IDF 더미 데이터 생성
article_word_tfidf = np.random.random(size = (1000, 1000))
article_word_df = pd.DataFrame(article_word_tfidf, columns = words, index = articles)

In [3]:
user_word_df

Unnamed: 0,word1,word2,word3,word4,word5,word6,word7,word8,word9,word10,...,word991,word992,word993,word994,word995,word996,word997,word998,word999,word1000
user1,10,7,0,2,5,1,0,1,2,9,...,2,3,9,7,6,7,5,6,8,10
user2,10,2,10,5,7,7,9,1,1,7,...,5,6,7,3,4,3,3,1,6,9
user3,4,4,10,9,9,6,1,1,7,1,...,9,10,9,5,9,8,0,5,0,4
user4,5,4,3,9,4,0,5,0,3,8,...,2,1,3,5,3,10,9,7,1,7
user5,2,10,7,4,3,1,3,4,3,10,...,3,6,9,1,7,5,1,7,9,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
user996,2,5,5,1,3,1,8,10,1,7,...,1,9,10,8,5,9,1,10,1,1
user997,7,4,1,4,4,8,5,7,2,8,...,3,7,4,0,5,6,8,8,10,9
user998,1,8,5,9,7,5,6,10,4,8,...,10,2,7,10,1,6,9,1,7,4
user999,6,6,7,7,3,7,3,9,8,9,...,1,3,8,1,6,2,9,5,8,2


In [4]:
article_word_df

Unnamed: 0,word1,word2,word3,word4,word5,word6,word7,word8,word9,word10,...,word991,word992,word993,word994,word995,word996,word997,word998,word999,word1000
article1,0.560375,0.127570,0.375951,0.188707,0.649075,0.322438,0.582118,0.251925,0.547876,0.384269,...,0.882888,0.990175,0.934793,0.328778,0.596971,0.413494,0.170082,0.930967,0.946770,0.840883
article2,0.759788,0.536888,0.317400,0.313354,0.915561,0.819760,0.356475,0.638229,0.501409,0.151345,...,0.358898,0.455696,0.243986,0.715524,0.155790,0.668883,0.837535,0.208373,0.317559,0.651281
article3,0.086533,0.908873,0.111631,0.353321,0.871836,0.781249,0.030468,0.786652,0.475620,0.130343,...,0.592710,0.812278,0.373327,0.954001,0.322412,0.230625,0.183588,0.887107,0.919073,0.793023
article4,0.418025,0.697219,0.943412,0.559001,0.837036,0.879898,0.024832,0.013383,0.298449,0.297294,...,0.628826,0.709000,0.541927,0.227781,0.453275,0.090400,0.693637,0.474238,0.236232,0.558372
article5,0.317190,0.725449,0.489643,0.005560,0.521814,0.598776,0.485303,0.581108,0.042112,0.505743,...,0.015811,0.617332,0.564596,0.367762,0.648301,0.385034,0.553912,0.262057,0.185628,0.481514
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
article996,0.993748,0.842456,0.867987,0.255496,0.380948,0.941409,0.659784,0.291339,0.305653,0.075957,...,0.276755,0.483356,0.885411,0.549180,0.618919,0.834712,0.548275,0.598546,0.231374,0.957374
article997,0.114797,0.039745,0.031399,0.057716,0.795216,0.709361,0.580767,0.562467,0.724495,0.989111,...,0.316390,0.960272,0.241748,0.095536,0.270719,0.931563,0.477572,0.595617,0.113452,0.355608
article998,0.779985,0.129835,0.527720,0.407682,0.352815,0.639991,0.711871,0.214631,0.344801,0.032134,...,0.419444,0.923801,0.356462,0.007214,0.127313,0.081799,0.809994,0.045708,0.399384,0.192265
article999,0.442299,0.084439,0.965508,0.955520,0.327148,0.589276,0.535486,0.352230,0.688816,0.566340,...,0.624543,0.770569,0.708847,0.681259,0.013490,0.594404,0.095394,0.351399,0.445254,0.504003


In [5]:
# 데이터 간 Scale을 맞추기 위하여 Min-Max 정규화
user_word_df_norm = (user_word_df - user_word_df.min()) / (user_word_df.max() - user_word_df.min())
user_word_df_norm

Unnamed: 0,word1,word2,word3,word4,word5,word6,word7,word8,word9,word10,...,word991,word992,word993,word994,word995,word996,word997,word998,word999,word1000
user1,1.0,0.7,0.0,0.2,0.5,0.1,0.0,0.1,0.2,0.9,...,0.2,0.3,0.9,0.7,0.6,0.7,0.5,0.6,0.8,1.0
user2,1.0,0.2,1.0,0.5,0.7,0.7,0.9,0.1,0.1,0.7,...,0.5,0.6,0.7,0.3,0.4,0.3,0.3,0.1,0.6,0.9
user3,0.4,0.4,1.0,0.9,0.9,0.6,0.1,0.1,0.7,0.1,...,0.9,1.0,0.9,0.5,0.9,0.8,0.0,0.5,0.0,0.4
user4,0.5,0.4,0.3,0.9,0.4,0.0,0.5,0.0,0.3,0.8,...,0.2,0.1,0.3,0.5,0.3,1.0,0.9,0.7,0.1,0.7
user5,0.2,1.0,0.7,0.4,0.3,0.1,0.3,0.4,0.3,1.0,...,0.3,0.6,0.9,0.1,0.7,0.5,0.1,0.7,0.9,0.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
user996,0.2,0.5,0.5,0.1,0.3,0.1,0.8,1.0,0.1,0.7,...,0.1,0.9,1.0,0.8,0.5,0.9,0.1,1.0,0.1,0.1
user997,0.7,0.4,0.1,0.4,0.4,0.8,0.5,0.7,0.2,0.8,...,0.3,0.7,0.4,0.0,0.5,0.6,0.8,0.8,1.0,0.9
user998,0.1,0.8,0.5,0.9,0.7,0.5,0.6,1.0,0.4,0.8,...,1.0,0.2,0.7,1.0,0.1,0.6,0.9,0.1,0.7,0.4
user999,0.6,0.6,0.7,0.7,0.3,0.7,0.3,0.9,0.8,0.9,...,0.1,0.3,0.8,0.1,0.6,0.2,0.9,0.5,0.8,0.2


In [6]:
# 추천 방법 1 : Cosine 유사도 기반 Content-Based Filtering Algorithm
# 코사인 유사도 기반으로 사용자와 관련성이 높은 기사를 추천하는 방법

# 사용자 - 기사 간 코사인 유사도 계산
cosine_sim = cosine_similarity(user_word_df_norm, article_word_df)

# 코사인 유사도를 DataFrame으로 변환
cosine_sim_df = pd.DataFrame(cosine_sim, columns = articles, index = users)
cosine_sim_df

Unnamed: 0,article1,article2,article3,article4,article5,article6,article7,article8,article9,article10,...,article991,article992,article993,article994,article995,article996,article997,article998,article999,article1000
user1,0.715469,0.725120,0.727824,0.735908,0.735264,0.722153,0.728287,0.720648,0.721491,0.734535,...,0.734105,0.729002,0.739643,0.743073,0.725821,0.727708,0.722583,0.708814,0.744758,0.719673
user2,0.733338,0.731170,0.722413,0.735979,0.734937,0.750522,0.726879,0.729676,0.710742,0.732029,...,0.739784,0.731579,0.747689,0.721751,0.734740,0.719139,0.730239,0.720806,0.732194,0.736000
user3,0.739492,0.717999,0.735377,0.721271,0.740221,0.737005,0.712469,0.716474,0.729031,0.733764,...,0.721674,0.727868,0.719951,0.720899,0.709712,0.718177,0.716252,0.710238,0.732768,0.719441
user4,0.715912,0.727616,0.725050,0.711578,0.725325,0.717032,0.726716,0.707839,0.738266,0.739559,...,0.721566,0.724280,0.731789,0.733878,0.708269,0.737833,0.727073,0.713610,0.710835,0.726903
user5,0.715629,0.708408,0.737460,0.715645,0.744267,0.730181,0.723384,0.707880,0.702214,0.733065,...,0.702245,0.736839,0.714130,0.735266,0.717065,0.723616,0.743056,0.715381,0.731734,0.717816
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
user996,0.723428,0.711132,0.727419,0.721822,0.740038,0.720453,0.709383,0.739012,0.718622,0.743618,...,0.720930,0.745733,0.710437,0.730802,0.715077,0.733421,0.738845,0.734677,0.725560,0.723494
user997,0.711988,0.721673,0.714932,0.716759,0.725151,0.742386,0.727479,0.715452,0.709982,0.733377,...,0.707651,0.716501,0.731243,0.722594,0.718093,0.716889,0.732106,0.710701,0.731858,0.726828
user998,0.736545,0.734829,0.745009,0.731932,0.742793,0.738250,0.725858,0.719761,0.732802,0.744412,...,0.731407,0.730709,0.717568,0.727822,0.739784,0.744458,0.733610,0.718856,0.743621,0.744499
user999,0.732074,0.732905,0.729700,0.731531,0.732623,0.724837,0.726555,0.735651,0.714014,0.750112,...,0.713903,0.731745,0.737649,0.729356,0.718752,0.727705,0.737935,0.710358,0.721050,0.743489


In [8]:
# 각 사용자에 대한 추천 리스트 생성하여 추천 점수가 높은 기사 추천

# 실제로 구현할 때는 해당 기사가 이미 읽었던 기사인지, 아닌지 검증하는 과정 추가 필요

recommendations = {}
for user in users:
    user_data = cosine_sim_df.loc[user].sort_values(ascending=False)
    recommendations[user] = user_data.index.values.tolist()[:3]

for user in list(recommendations.keys()):
    print(f"{user}: {recommendations[user]}")

user1: ['article750', 'article415', 'article44']
user2: ['article145', 'article832', 'article225']
user3: ['article525', 'article170', 'article221']
user4: ['article632', 'article156', 'article209']
user5: ['article579', 'article111', 'article631']
user6: ['article339', 'article557', 'article131']
user7: ['article657', 'article387', 'article308']
user8: ['article657', 'article867', 'article512']
user9: ['article131', 'article312', 'article819']
user10: ['article979', 'article400', 'article175']
user11: ['article170', 'article819', 'article603']
user12: ['article752', 'article851', 'article674']
user13: ['article478', 'article451', 'article665']
user14: ['article744', 'article814', 'article680']
user15: ['article498', 'article564', 'article851']
user16: ['article521', 'article579', 'article511']
user17: ['article145', 'article765', 'article71']
user18: ['article94', 'article972', 'article680']
user19: ['article852', 'article111', 'article367']
user20: ['article339', 'article726', 'artic

In [9]:
# 단어 추천 방법 : 해당 사용자가 아직 학습하지 않은 단어들 중 해당 사용자와 유사한 사용자들이 많이 학습한 단어를 추천하는 알고리즘

# 사용자 간 유사도 계산
user_similarity = cosine_similarity(user_word_df_norm)

# 유사도를 DataFrame으로 변환
user_similarity_df = pd.DataFrame(user_similarity, index = users, columns = users)
user_similarity_df

Unnamed: 0,user1,user2,user3,user4,user5,user6,user7,user8,user9,user10,...,user991,user992,user993,user994,user995,user996,user997,user998,user999,user1000
user1,1.000000,0.715004,0.695107,0.720281,0.714151,0.707301,0.724401,0.714047,0.706864,0.703834,...,0.727075,0.708680,0.729641,0.681233,0.713875,0.711329,0.704557,0.701832,0.712543,0.713088
user2,0.715004,1.000000,0.703572,0.707468,0.707258,0.707146,0.716728,0.719073,0.713512,0.701919,...,0.703441,0.727817,0.724390,0.706472,0.705915,0.699180,0.730358,0.704603,0.718331,0.722765
user3,0.695107,0.703572,1.000000,0.699676,0.707255,0.694191,0.730672,0.685735,0.717254,0.689838,...,0.719473,0.704861,0.725176,0.710623,0.725066,0.699677,0.689985,0.713293,0.699347,0.716692
user4,0.720281,0.707468,0.699676,1.000000,0.692636,0.714911,0.722364,0.706380,0.707098,0.704873,...,0.722245,0.698925,0.723359,0.720499,0.705961,0.686095,0.692315,0.702466,0.714884,0.710023
user5,0.714151,0.707258,0.707255,0.692636,1.000000,0.703724,0.702592,0.698468,0.718959,0.695727,...,0.696869,0.707441,0.735312,0.701395,0.707679,0.708976,0.700452,0.704392,0.712691,0.701897
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
user996,0.711329,0.699180,0.699677,0.686095,0.708976,0.719042,0.723982,0.715025,0.701543,0.691699,...,0.715867,0.716011,0.713929,0.713870,0.702654,1.000000,0.689702,0.737943,0.698150,0.711993
user997,0.704557,0.730358,0.689985,0.692315,0.700452,0.701271,0.723242,0.705621,0.691195,0.704192,...,0.694606,0.713554,0.696859,0.720836,0.692895,0.689702,1.000000,0.699473,0.699594,0.699405
user998,0.701832,0.704603,0.713293,0.702466,0.704392,0.731879,0.747697,0.708005,0.722778,0.699691,...,0.730229,0.714042,0.718085,0.702560,0.719719,0.737943,0.699473,1.000000,0.725860,0.712573
user999,0.712543,0.718331,0.699347,0.714884,0.712691,0.721519,0.726924,0.706956,0.711254,0.697418,...,0.719244,0.700123,0.716497,0.722736,0.709496,0.698150,0.699594,0.725860,1.000000,0.697426


In [10]:
# 사용자가 아직 학습하지 않은 단어 찾기
unlearned_words = user_word_df_norm[user_word_df_norm == 0]

# 추천 단어 리스트 생성
word_recommendations = {}
for user in users:
    # 사용자와 유사한 사용자들 찾기
    similar_users = user_similarity_df[user].sort_values(ascending=False).index[1:]

    # 유사한 사용자들이 많이 학습한 단어 찾기
    similar_users_words = user_word_df_norm.loc[similar_users].mean().sort_values(ascending=False)

    # 사용자가 아직 학습하지 않은 단어 중에서 유사한 사용자들이 많이 학습한 단어 선택
    user_unlearned_words = unlearned_words.loc[user].dropna()
    recommended_words = similar_users_words.loc[user_unlearned_words.index]

    # 상위 3개 단어 추천
    word_recommendations[user] = recommended_words.sort_values(ascending=False).index[:1].tolist()

In [11]:
print(word_recommendations)

{'user1': ['word83'], 'user2': ['word86'], 'user3': ['word753'], 'user4': ['word83'], 'user5': ['word722'], 'user6': ['word137'], 'user7': ['word554'], 'user8': ['word753'], 'user9': ['word839'], 'user10': ['word76'], 'user11': ['word414'], 'user12': ['word889'], 'user13': ['word646'], 'user14': ['word137'], 'user15': ['word362'], 'user16': ['word910'], 'user17': ['word427'], 'user18': ['word83'], 'user19': ['word39'], 'user20': ['word137'], 'user21': ['word550'], 'user22': ['word889'], 'user23': ['word910'], 'user24': ['word910'], 'user25': ['word889'], 'user26': ['word910'], 'user27': ['word207'], 'user28': ['word362'], 'user29': ['word550'], 'user30': ['word766'], 'user31': ['word427'], 'user32': ['word204'], 'user33': ['word411'], 'user34': ['word83'], 'user35': ['word592'], 'user36': ['word515'], 'user37': ['word432'], 'user38': ['word175'], 'user39': ['word60'], 'user40': ['word86'], 'user41': ['word431'], 'user42': ['word550'], 'user43': ['word554'], 'user44': ['word839'], 'user