In [56]:
import pandas as pd
from gensim.utils import simple_preprocess
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

데이터 로드 및 확인

In [5]:
# Read the comment CSV, keeping only the three columns used in this notebook.
df = pd.read_csv(
    './naver_economy_comment.csv',
    usecols=['user_id', 'comment', 'url'],
)
df

Unnamed: 0,url,comment,user_id
0,https://n.news.naver.com/mnews/article/003/001...,애징간희. 해쳐먹억으면 조용히 좀 살아 욕나오게,2xxKa
1,https://n.news.naver.com/mnews/article/015/000...,이런 기술 상품화 하기전에 법부터 정비해야~~ 악의적인 목소리제조는 엄벌에~,2mrn1
2,https://n.news.naver.com/mnews/article/015/000...,좌파들 정치적으로 가짜조작뉴스 홍수나겠네~~,2mrn1
3,https://n.news.naver.com/mnews/article/015/000...,맞습니다 수능 킬러 문항이 없어야 합니다,4iTs1
4,https://n.news.naver.com/mnews/article/001/001...,비축기지래 별쑈\n민간기업 비축된걸봐라,JbZk
...,...,...,...
1928753,https://n.news.naver.com/mnews/article/014/000...,부동산 최저점이다 지금이영끌타이잉이다\n애기들아 에휴 고점에 처매수하고 최저점에 방...,6VdEs
1928754,https://n.news.naver.com/mnews/article/421/000...,이수페타시스 상한가!!!!,37w0t
1928755,https://n.news.naver.com/mnews/article/421/000...,삼전 떡상이네,15DfK
1928756,https://n.news.naver.com/mnews/article/025/000...,식량위기에 직면한 유럽,bgcJE


In [54]:
# Print a summary of the loaded frame (index range, columns, dtypes, memory usage).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1928758 entries, 0 to 1928757
Data columns (total 3 columns):
 #   Column   Dtype 
---  ------   ----- 
 0   url      object
 1   comment  object
 2   user_id  object
dtypes: object(3)
memory usage: 44.1+ MB


In [55]:
# Remove rows whose comment is 5 characters or shorter.
# The vectorised `.str.len()` gives NaN for a missing comment, and `NaN > 5` is
# False, so such rows are dropped here instead of raising TypeError the way
# `len(x)` inside `apply` would.
df = df[df['comment'].str.len() > 5]
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1887372 entries, 0 to 1928757
Data columns (total 3 columns):
 #   Column   Dtype 
---  ------   ----- 
 0   url      object
 1   comment  object
 2   user_id  object
dtypes: object(3)
memory usage: 57.6+ MB


In [63]:
# Check how many users wrote comments during the 3-month period.
# The result has one row per user_id; the values are that user's non-null
# `url` / `comment` counts.
df.groupby('user_id').count()

Unnamed: 0_level_0,url,comment
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1002O,1,1
1007C,35,35
100IJ,39,39
100Ic,7,7
100Il,2,2
...,...,...
zziX,3,3
zzii,1,1
zzq0,2,2
zzr0,16,16


---
모델링

In [57]:
# Comment texts as a plain list, in the row order of df.
docs = df["comment"].tolist()

# Tokenize the text
tokenized_docs = [simple_preprocess(doc) for doc in docs]

# Build the TaggedDocument objects
tagged_docs = [TaggedDocument(doc, [i]) for i, doc in zip(df['user_id'], tokenized_docs)] # the tag is the user_id!

# Create the Doc2Vec model
model = Doc2Vec(vector_size=300, min_count=1, epochs=40, workers=8) # min_count=2 was not possible, so changed to 1

# Train the model
model.build_vocab(tagged_docs)
model.train(tagged_docs, total_examples=model.corpus_count, epochs=model.epochs)
# Save the trained model to a file in the working directory.
model.save('naver_comment.doc2vec')

---

모델링 여부 확인

In [60]:
# Number of tagged documents that were passed to the model.
len(tagged_docs)

1887372

In [85]:
# Inspect the first ten tagged documents (token list plus the user_id tag).
tagged_docs[:10]

[TaggedDocument(words=['애징간희', '해쳐먹억으면', '조용히', '살아', '욕나오게'], tags=['2xxKa']),
 TaggedDocument(words=['이런', '기술', '상품화', '하기전에', '법부터', '정비해야', '악의적인', '목소리제조는', '엄벌에'], tags=['2mrn1']),
 TaggedDocument(words=['좌파들', '정치적으로', '가짜조작뉴스', '홍수나겠네'], tags=['2mrn1']),
 TaggedDocument(words=['맞습니다', '수능', '킬러', '문항이', '없어야', '합니다'], tags=['4iTs1']),
 TaggedDocument(words=['비축기지래', '별쑈', '민간기업', '비축된걸봐라'], tags=['JbZk']),
 TaggedDocument(words=['딱봐도', '평소재고에', '절반도', '안되네', '얼릉', '사재기', '해놔야겠다'], tags=['7L2jb']),
 TaggedDocument(words=['조선경마에는', '골드쉽', '라이스', '샤워', '사쿠라', '바쿠신', '마지막으로', '동탄맘', '없음', 'ㅜㅜㅜ'], tags=['2BnZO']),
 TaggedDocument(words=['투자사', '프라이머가', '단연', '최고인', '같네요'], tags=['2bKQCv']),
 TaggedDocument(words=['기자님', '제발', '이명박', '이름은', '안나오게', '해주세요', '열받으니까'], tags=['5ElSb']),
 TaggedDocument(words=['아직도', '저러고', '싶냐', '석열대통령', '이건', '아니지', '않나', '명바기한테', '잡힌거', '있나', '이동관', '유인촌', 'ㅠㅠ', '봐라겠노'], tags=['30Iu2'])]

In [58]:
# Sanity check: look up the tags (user_ids) most similar to user '2mrn1'
# among the trained document vectors.
model.dv.most_similar('2mrn1')

[('7UZ9O', 0.47186437249183655),
 ('2bZpKL', 0.46239641308784485),
 ('1hS5c', 0.4574330151081085),
 ('NdEB', 0.4481179714202881),
 ('3bHR6', 0.4438951909542084),
 ('5SBo0', 0.43747130036354065),
 ('33mnY', 0.4366970360279083),
 ('2cS2D', 0.4354141354560852),
 ('1cUTr', 0.4322236180305481),
 ('3Q0AA', 0.4316141903400421)]

In [66]:
# Count the distinct articles (urls) that have at least one comment.
# dropna=False keeps a missing url counted as one value, like len(unique()).
df['url'].nunique(dropna=False)

136490

---

## 유사한 유저가 읽은 기사 추천받기

In [78]:
# For each article, collect the distinct users who commented on it.
# A user who commented on an article is treated as having read it.
df_readers = (
    df.groupby('url')['user_id']
    .apply(lambda users: list(set(users)))
    .reset_index(name='readers')
)
df_readers

Unnamed: 0,url,readers
0,https://n.news.naver.com/mnews/article/001/001...,[3bfHR]
1,https://n.news.naver.com/mnews/article/001/001...,"[3yWUR, 1yaqn]"
2,https://n.news.naver.com/mnews/article/001/001...,"[3VpHz, NZ4k, mdYm, aY6Z6]"
3,https://n.news.naver.com/mnews/article/001/001...,"[CioL, 1gMlZ, 55HyA]"
4,https://n.news.naver.com/mnews/article/001/001...,[2IorO]
...,...,...
136485,https://n.news.naver.com/mnews/article/666/000...,"[5EnCt, 1cKfH, 1t8Td, dinh0, 2qpq1, 4j8d3, 22siB]"
136486,https://n.news.naver.com/mnews/article/666/000...,"[2bYukE, 1mvYo, 1jmKY, 2oLkE, bhORL, 3zpi1, 5J..."
136487,https://n.news.naver.com/mnews/article/666/000...,"[ODGV, 2KGlj, 2LivW]"
136488,https://n.news.naver.com/mnews/article/666/000...,"[1r6QJ, 1WDFO, 2bfE6, q7WC, 1UQ2v, jxB5, 3U4BY..."


In [122]:
# Take the 3 users with the highest similarity to the target user.
target_user = '2mrn1'
similar_users = [user for user, _score in model.dv.most_similar(target_user)[:3]]
similar_users

['7UZ9O', '2bZpKL', '1hS5c']

In [123]:
# Keep only the articles read by at least one of the similar users.
filtered_df = df_readers[
    df_readers['readers'].apply(
        lambda readers: not set(similar_users).isdisjoint(readers)
    )
]
filtered_df

Unnamed: 0,url,readers
2109,https://n.news.naver.com/mnews/article/001/001...,"[6ZXJv, cqSEH, 4ACr3, 3x9z3, bxqVT, 4LW8l, 2J3..."
8968,https://n.news.naver.com/mnews/article/001/001...,"[xIEt, 2tPYY, 1TwJ4, 7UZ9O, 3Jmke, 8CNAj]"
9030,https://n.news.naver.com/mnews/article/001/001...,"[atCGC, 8axk, 5MSzp, 1ltue, 3wzJV, 3BduD, 7jYU..."
49567,https://n.news.naver.com/mnews/article/015/000...,"[1g49m, 3nIx5, 6sLZI, 1W16, 2ciQT, c1Pt1, o5UC..."
50623,https://n.news.naver.com/mnews/article/016/000...,"[VHY4, 14iAR, 2JQk0, 4ZPQr, 4Sv65, 33WkB, 7UZ9O]"
61224,https://n.news.naver.com/mnews/article/018/000...,"[Eo1x, 25t9q, 33b1b, 4ebhn, 2GbuZ, 29sfh, 2cZK..."
62850,https://n.news.naver.com/mnews/article/020/000...,"[qKkO, 3sQWv, 1hS5c, 4uUpj, 2euz5, 39FCZ, 2ZQx..."
65301,https://n.news.naver.com/mnews/article/021/000...,"[7UZ9O, d0fSa]"
65545,https://n.news.naver.com/mnews/article/021/000...,[7UZ9O]
68116,https://n.news.naver.com/mnews/article/023/000...,"[1hS5c, 2JH8l, 2Jg48, ZSum, 5DoIJ, 2mrT6, 3JtU..."


In [124]:
# Exclude the articles that target_user has already read.
filtered_df = filtered_df[
    ~filtered_df['readers'].apply(lambda readers: readers.count(target_user) > 0)
]
filtered_df

Unnamed: 0,url,readers
2109,https://n.news.naver.com/mnews/article/001/001...,"[6ZXJv, cqSEH, 4ACr3, 3x9z3, bxqVT, 4LW8l, 2J3..."
8968,https://n.news.naver.com/mnews/article/001/001...,"[xIEt, 2tPYY, 1TwJ4, 7UZ9O, 3Jmke, 8CNAj]"
9030,https://n.news.naver.com/mnews/article/001/001...,"[atCGC, 8axk, 5MSzp, 1ltue, 3wzJV, 3BduD, 7jYU..."
49567,https://n.news.naver.com/mnews/article/015/000...,"[1g49m, 3nIx5, 6sLZI, 1W16, 2ciQT, c1Pt1, o5UC..."
50623,https://n.news.naver.com/mnews/article/016/000...,"[VHY4, 14iAR, 2JQk0, 4ZPQr, 4Sv65, 33WkB, 7UZ9O]"
61224,https://n.news.naver.com/mnews/article/018/000...,"[Eo1x, 25t9q, 33b1b, 4ebhn, 2GbuZ, 29sfh, 2cZK..."
62850,https://n.news.naver.com/mnews/article/020/000...,"[qKkO, 3sQWv, 1hS5c, 4uUpj, 2euz5, 39FCZ, 2ZQx..."
65301,https://n.news.naver.com/mnews/article/021/000...,"[7UZ9O, d0fSa]"
65545,https://n.news.naver.com/mnews/article/021/000...,[7UZ9O]
68116,https://n.news.naver.com/mnews/article/023/000...,"[1hS5c, 2JH8l, 2Jg48, ZSum, 5DoIJ, 2mrT6, 3JtU..."


---
test 2

In [125]:
# Take the 3 users with the highest similarity to the target user.
target_user = '7UZ9O'
similar_users = [user for user, _score in model.dv.most_similar(target_user)[:3]]
similar_users

['2btwZ', '2Lgxt', '951jp']

In [126]:
# Keep only the articles read by at least one of the similar users.
filtered_df = df_readers[
    df_readers['readers'].apply(
        lambda readers: not set(similar_users).isdisjoint(readers)
    )
]
filtered_df

Unnamed: 0,url,readers
29994,https://n.news.naver.com/mnews/article/009/000...,"[5L4x2, IeTR, 1f2Hn, fWKr, 2AoZP, atqnO, 4oFuG..."
47070,https://n.news.naver.com/mnews/article/015/000...,"[vYha, 1L2AV, 2c5l2G, 1uRf6, 5uAce, 29ue3, 1zX..."
47854,https://n.news.naver.com/mnews/article/015/000...,"[4AH9r, 1WV93, 20ktS, el1Aj, 595lv, bgDpM, 5uA..."
72844,https://n.news.naver.com/mnews/article/028/000...,"[2oHsU, 51Uoh, 6D8Y, 1tVe4, 1d8PF, 4WbYC, 3NLs..."
74846,https://n.news.naver.com/mnews/article/029/000...,"[r8xY, 4lIQb, 1kONe, 1bQMC, 4NaV2, 2Dzfe, 18Jk..."
100226,https://n.news.naver.com/mnews/article/243/000...,"[6jCRx, d8aTG, 1VhOV, 3Okf9, Y13l, 68bYq, 32vZ..."
101683,https://n.news.naver.com/mnews/article/243/000...,"[cMgpl, 14WJ6, 3jQqh, 5qFsj, 1q3WX, 1e9ny, xhk..."
101702,https://n.news.naver.com/mnews/article/243/000...,"[Znzv, d8aTG, 2yGw0, 2oK5h, YvYI, d7wqo, 1I60w..."
105174,https://n.news.naver.com/mnews/article/277/000...,"[48CiX, 3nYYD, 2btwZ]"
106855,https://n.news.naver.com/mnews/article/366/000...,"[qWmh, e8JoF, JKNx, 4Ym36, 2A6ZD, 1eEZ0, 6jyfQ..."


In [127]:
# Exclude the articles that target_user has already read.
filtered_df = filtered_df[
    ~filtered_df['readers'].apply(lambda readers: readers.count(target_user) > 0)
]
filtered_df

Unnamed: 0,url,readers
29994,https://n.news.naver.com/mnews/article/009/000...,"[5L4x2, IeTR, 1f2Hn, fWKr, 2AoZP, atqnO, 4oFuG..."
47070,https://n.news.naver.com/mnews/article/015/000...,"[vYha, 1L2AV, 2c5l2G, 1uRf6, 5uAce, 29ue3, 1zX..."
47854,https://n.news.naver.com/mnews/article/015/000...,"[4AH9r, 1WV93, 20ktS, el1Aj, 595lv, bgDpM, 5uA..."
72844,https://n.news.naver.com/mnews/article/028/000...,"[2oHsU, 51Uoh, 6D8Y, 1tVe4, 1d8PF, 4WbYC, 3NLs..."
74846,https://n.news.naver.com/mnews/article/029/000...,"[r8xY, 4lIQb, 1kONe, 1bQMC, 4NaV2, 2Dzfe, 18Jk..."
100226,https://n.news.naver.com/mnews/article/243/000...,"[6jCRx, d8aTG, 1VhOV, 3Okf9, Y13l, 68bYq, 32vZ..."
101683,https://n.news.naver.com/mnews/article/243/000...,"[cMgpl, 14WJ6, 3jQqh, 5qFsj, 1q3WX, 1e9ny, xhk..."
101702,https://n.news.naver.com/mnews/article/243/000...,"[Znzv, d8aTG, 2yGw0, 2oK5h, YvYI, d7wqo, 1I60w..."
105174,https://n.news.naver.com/mnews/article/277/000...,"[48CiX, 3nYYD, 2btwZ]"
106855,https://n.news.naver.com/mnews/article/366/000...,"[qWmh, e8JoF, JKNx, 4Ym36, 2A6ZD, 1eEZ0, 6jyfQ..."


---
test 3

In [128]:
# Take the 5 users with the highest similarity to the target user.
target_user = '951jp'
similar_users = [user for user, _score in model.dv.most_similar(target_user)[:5]]
similar_users

['banwM', '1BueJ', 'lQ91', '2bTlhK', '4jKor']

In [135]:
# Keep only the articles read by at least one of the similar users.
filtered_df = df_readers[
    df_readers['readers'].apply(
        lambda readers: not set(similar_users).isdisjoint(readers)
    )
]
filtered_df

Unnamed: 0,url,readers
178,https://n.news.naver.com/mnews/article/001/001...,"[1Z7rv, 4jKor, 3ABhD, OIgv, 1jwL1, 1ieUB, 28uN..."
197,https://n.news.naver.com/mnews/article/001/001...,"[1ENdl, ThL6, 2DDL1, OC59, 2yagM, kkt2, BM9v, ..."
12300,https://n.news.naver.com/mnews/article/003/001...,"[6uGA, 109NM, 1nHfd, WGsn, 3jisr, 3qiTD, KUGH,..."
64683,https://n.news.naver.com/mnews/article/021/000...,"[16Ifb, 1NJl0, 23AcB, 2bTlhK, 22I10, 2w1kH, 2s..."
73480,https://n.news.naver.com/mnews/article/029/000...,"[2XDYc, 2PKIj, x3ja, 2ERUb, 2pByx, 31OgR, 4UsI..."
124896,https://n.news.naver.com/mnews/article/422/000...,"[1QxLr, 26xVW, HaSY, 1rhu4, 44bzx, 3skBZ, banw..."


In [136]:
# Exclude the articles that target_user has already read.
filtered_df = filtered_df[
    ~filtered_df['readers'].apply(lambda readers: readers.count(target_user) > 0)
]
filtered_df

Unnamed: 0,url,readers
178,https://n.news.naver.com/mnews/article/001/001...,"[1Z7rv, 4jKor, 3ABhD, OIgv, 1jwL1, 1ieUB, 28uN..."
197,https://n.news.naver.com/mnews/article/001/001...,"[1ENdl, ThL6, 2DDL1, OC59, 2yagM, kkt2, BM9v, ..."
12300,https://n.news.naver.com/mnews/article/003/001...,"[6uGA, 109NM, 1nHfd, WGsn, 3jisr, 3qiTD, KUGH,..."
64683,https://n.news.naver.com/mnews/article/021/000...,"[16Ifb, 1NJl0, 23AcB, 2bTlhK, 22I10, 2w1kH, 2s..."
73480,https://n.news.naver.com/mnews/article/029/000...,"[2XDYc, 2PKIj, x3ja, 2ERUb, 2pByx, 31OgR, 4UsI..."
124896,https://n.news.naver.com/mnews/article/422/000...,"[1QxLr, 26xVW, HaSY, 1rhu4, 44bzx, 3skBZ, banw..."


---

In [137]:
# Refactored code -> use as a reference when creating the .py file after the instructor's feedback
def recommnend_news_by_similar_users(data, model, target_user, topn=5):
    """Recommend articles read by the users most similar to ``target_user``.

    A user counts as a reader of an article when they commented on it.

    Previously this function ignored ``data`` and ``model`` and read the
    notebook globals ``df_readers`` and ``similar_users``; the result therefore
    depended on whichever cell had last been run. Everything is now derived
    from the arguments.

    Args:
        data: DataFrame with at least the columns ``'url'`` and ``'user_id'``
            (one row per comment).
        model: Trained model exposing ``model.dv.most_similar(key, topn=...)``
            that returns ``(user_id, similarity)`` pairs.
        target_user: The user_id to produce recommendations for.
        topn: How many most-similar users to take into account (default 5).

    Returns:
        DataFrame with columns ``'url'`` and ``'readers'`` (list of distinct
        user_ids per article), restricted to articles read by at least one
        similar user and not read by ``target_user``.
    """
    similar_users = {user for user, _score in model.dv.most_similar(target_user, topn=topn)}

    # One row per article with the distinct users who commented on it.
    readers_by_url = (
        data.groupby('url')['user_id']
        .apply(lambda users: list(set(users)))
        .reset_index(name='readers')
    )

    keep = readers_by_url['readers'].apply(
        lambda readers: target_user not in readers
        and not similar_users.isdisjoint(readers)
    )
    # astype(bool) keeps the mask a valid boolean indexer even when it is empty.
    return readers_by_url[keep.astype(bool)]

In [139]:
# Run the refactored helper for user '951jp', passing the comment frame and the trained model.
recommnend_news_by_similar_users(data=df, model=model, target_user='951jp')

Unnamed: 0,url,readers
178,https://n.news.naver.com/mnews/article/001/001...,"[1Z7rv, 4jKor, 3ABhD, OIgv, 1jwL1, 1ieUB, 28uN..."
197,https://n.news.naver.com/mnews/article/001/001...,"[1ENdl, ThL6, 2DDL1, OC59, 2yagM, kkt2, BM9v, ..."
12300,https://n.news.naver.com/mnews/article/003/001...,"[6uGA, 109NM, 1nHfd, WGsn, 3jisr, 3qiTD, KUGH,..."
64683,https://n.news.naver.com/mnews/article/021/000...,"[16Ifb, 1NJl0, 23AcB, 2bTlhK, 22I10, 2w1kH, 2s..."
73480,https://n.news.naver.com/mnews/article/029/000...,"[2XDYc, 2PKIj, x3ja, 2ERUb, 2pByx, 31OgR, 4UsI..."
124896,https://n.news.naver.com/mnews/article/422/000...,"[1QxLr, 26xVW, HaSY, 1rhu4, 44bzx, 3skBZ, banw..."
