<a href="https://colab.research.google.com/github/sh-0620/dacon-recommender-system/blob/main/recommendations_Tfidf(all)_content_tune(%EC%9C%A0%EC%A0%80_%EA%B8%B0%EB%B0%98_%ED%98%91%EC%97%85_%ED%95%84%ED%84%B0%EB%A7%81).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict

# Load the CSV inputs from the mounted Google Drive folder
log_df = pd.read_csv("/content/drive/MyDrive/웹 기사 추천시스템/view_log.csv")  # user view-log data
article__df = pd.read_csv("/content/drive/MyDrive/웹 기사 추천시스템/article_info.csv")  # article data; NOTE(review): not referenced again in the visible cells
article_nouns_df = pd.read_csv("/content/drive/MyDrive/웹 기사 추천시스템/article_nouns.csv")  # article data with an extra Content_Nouns column

In [None]:
# Preview the first five rows of the article table
article_nouns_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,articleID,Title,Content,Format,Language,userID,userCountry,userRegion,Content_Nouns
0,0,ARTICLE_0000,19 Tips For Everyday Git Use,using git full time past year wanted share pra...,HTML,en,USER_0683,,,git time year share tip way git git cheat shee...
1,1,ARTICLE_0001,Intel buys computer vision startup Itseez to i...,intel acquired computer vision machine learnin...,HTML,en,USER_1129,,,computer vision machine learning startup itsee...
2,2,ARTICLE_0002,Practical End-to-End Testing with Protractor,one reason angularjs great work developed arou...,HTML,en,USER_0256,,,reason work idea framework check source core t...
3,3,ARTICLE_0003,Corporate venture growth in Brazil is another ...,despite recent positive news renewed interest ...,HTML,en,USER_1304,,,news interest investor country tsumoney crude ...
4,4,ARTICLE_0004,Cross-channel user experiences with Drupal (aw...,last year around time wrote big reverse web wo...,HTML,en,USER_0336,,,year time web architecture web information per...


In [None]:
# Drop the article-side user columns: log_df supplies its own userID / userRegion /
# userCountry, so keeping these would produce suffixed duplicates in the merge.
# errors='ignore' keeps the cell re-runnable once the columns are already gone
# (the frame is rebound to the same name, so a second run would otherwise raise KeyError).
article_nouns_df = article_nouns_df.drop(
    columns=['userCountry', 'userRegion', 'userID'],
    errors='ignore',
)

In [None]:
# Attach article metadata to every view-log row (inner join on articleID)
merged_df = log_df.merge(article_nouns_df, how='inner', on='articleID')

In [None]:
# Keep only the articles that appear in the view log.
# The result is materialised as a standalone, de-duplicated frame sorted by articleID
# with a fresh 0..n-1 index, for two reasons:
#   * later cells assign into this frame; assigning into a boolean-filtered slice
#     triggers pandas' SettingWithCopyWarning;
#   * LabelEncoder numbers IDs in sorted-unique order and those codes are later used
#     as row positions into the feature matrix, so row i must hold the i-th sorted ID.
viewed_article_ids = merged_df['articleID'].unique()
article_nouns_df = (
    article_nouns_df[article_nouns_df['articleID'].isin(viewed_article_ids)]
    .drop_duplicates(subset='articleID')
    .sort_values('articleID')
    .reset_index(drop=True)
)

In [None]:
# NOTE(review): dead cell — earlier attempt that encoded log_df directly; the
# encoders are fitted on merged_df in the cells that follow instead.
# Create a label encoder and transform article_id
# article_label_encoder = LabelEncoder()
# log_df['articleID'] = article_label_encoder.fit_transform(log_df['articleID'])

# Apply label encoding to the user IDs as well
# user_label_encoder = LabelEncoder()
# log_df['userID'] = user_label_encoder.fit_transform(log_df['userID'])

In [None]:
# Learn the articleID -> integer code mapping from the view log, then apply the
# same mapping to both the merged log and the article table.
article_label_encoder = LabelEncoder().fit(merged_df['articleID'])
merged_df['articleID'] = article_label_encoder.transform(merged_df['articleID'])
article_nouns_df['articleID'] = article_label_encoder.transform(article_nouns_df['articleID'])

In [None]:
# Replace the userID strings in the merged log with integer codes
user_label_encoder = LabelEncoder().fit(merged_df['userID'])
merged_df['userID'] = user_label_encoder.transform(merged_df['userID'])

In [None]:
# One-hot encode the categorical user columns of log_df (region, country)
log_df_categories = log_df[['userRegion', 'userCountry']]

encoder = OneHotEncoder()
encoder.fit(log_df_categories)

# Dense 0/1 matrix with one row per view-log entry
log_df_encoded_categories = encoder.transform(log_df_categories).toarray()

In [None]:
# One-hot encode the categorical article columns (format, language)
article_nouns_df_categories = article_nouns_df[['Format', 'Language']]

encoder = OneHotEncoder()
encoder.fit(article_nouns_df_categories)

# Dense 0/1 matrix with one row per article, in article_nouns_df row order
article_nouns_df_encoded_categories = encoder.transform(article_nouns_df_categories).toarray()

In [None]:
# Article text features: TF-IDF over "title + body"
tfidf = TfidfVectorizer(
    stop_words='english',   # drop common English function words
    max_features=10000,     # cap the vocabulary size
    ngram_range=(1, 2),     # unigrams and bigrams
    max_df=0.8,             # ignore terms present in more than 80% of articles
    min_df=3,               # ignore terms present in fewer than 3 articles
    sublinear_tf=True,      # use 1 + log(tf) instead of the raw term count
    smooth_idf=True
)

# A missing Title or Content would turn the concatenation into NaN, which
# TfidfVectorizer rejects as an invalid document; treat missing text as empty.
article_text = article_nouns_df['Title'].fillna('') + ' ' + article_nouns_df['Content'].fillna('')
tfidf_matrix = tfidf.fit_transform(article_text)

In [None]:
# Per-article feature matrix: dense TF-IDF columns followed by the one-hot
# Format/Language columns (the log_df one-hot matrix is not part of it).
combined_features = np.concatenate(
    [tfidf_matrix.toarray(), article_nouns_df_encoded_categories],
    axis=1,
)


In [None]:
# Reading history per user: encoded userID -> list of encoded articleIDs, in log order.
# Walking the two columns in lockstep avoids iterrows(), which builds a full
# Series for every row just to read two values.
user_read_articles = defaultdict(list)
for user_id, article_id in zip(merged_df['userID'], merged_df['articleID']):
    user_read_articles[user_id].append(article_id)

In [None]:
# Content-profile recommender: rank articles by similarity to what the user has read
def recommend_articles(userID, top_n=5):
    """Recommend up to ``top_n`` articles closest to a user's reading profile.

    The profile is the mean feature vector of the articles listed for the user
    in ``user_read_articles``; every article is then ranked by cosine
    similarity to that profile.

    Relies on module-level state: ``user_read_articles`` (userID -> list of
    integer row indices into ``combined_features``), ``combined_features`` and
    ``article_nouns_df`` (indexed positionally with the same row indices).

    Args:
        userID: Key into ``user_read_articles``.
        top_n: Maximum number of articles to return; values below 0 are
            treated as 0.

    Returns:
        DataFrame with the 'Title' and 'Content' columns of the selected
        articles, most similar first. A user with no recorded history gets a
        random sample of articles instead. Articles the user has already read
        are not excluded.
    """
    top_n = max(top_n, 0)

    # .get() does not insert a key, so a defaultdict is not grown by lookups
    read_articles = user_read_articles.get(userID)
    if not read_articles:
        # Cold start: random articles, never asking for more than exist
        n_random = min(top_n, len(article_nouns_df))
        return article_nouns_df.sample(n=n_random)[['Title', 'Content']]

    # Mean feature vector of everything the user has read, as a 1 x n_features row
    read_articles_features = combined_features[read_articles]
    avg_features = np.mean(read_articles_features, axis=0).reshape(1, -1)

    # Similarity of the profile to every article
    similarities = cosine_similarity(avg_features, combined_features).flatten()

    # Reverse first, then truncate: slicing with [-top_n:] returns *all*
    # articles when top_n == 0 because -0 is 0.
    similar_articles_indices = similarities.argsort()[::-1][:top_n]

    recommendations = article_nouns_df.iloc[similar_articles_indices]
    return recommendations[['Title', 'Content']]

In [None]:
# One {userID, articleID} record per recommended article, using the original string IDs
user_recommendations = []

for encoded_user_id in merged_df['userID'].unique():
    recommended_articles = recommend_articles(encoded_user_id)

    # Map the integer user code back to the original user ID
    original_user_id = user_label_encoder.inverse_transform([encoded_user_id])[0]

    # Look up the encoded article IDs of the recommended rows, then decode them in one call
    encoded_article_ids = [
        article_nouns_df.loc[row_label, 'articleID']
        for row_label in recommended_articles.index
    ]
    user_recommendations.extend(
        {'userID': original_user_id, 'articleID': decoded_article_id}
        for decoded_article_id in article_label_encoder.inverse_transform(encoded_article_ids)
    )

In [None]:
# Show the record count and a short preview; printing the full list floods the cell output
print(f"{len(user_recommendations)} recommendations generated")
user_recommendations[:10]

In [None]:
# Tabulate the recommendation records and write them out as CSV (no index column)
recommendations_df = pd.DataFrame(data=user_recommendations)
recommendations_df.to_csv(
    'user_recommendations_(title+content)_content_tune.csv',
    index=False,
)