In [None]:
import pandas as pd
from gensim.models import Word2Vec
from konlpy.tag import Okt
import string
from sklearn.metrics.pairwise import cosine_similarity
import requests

In [None]:
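# Define the paper API URL here: fetch_paper_data() reads it from the
# module-level name `api_url`, so assign that name before calling it.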

def fetch_paper_data(url=None, timeout=10):
    """Fetch one paper record from the paper API and return its text fields.

    Args:
        url: Endpoint to query. When omitted, the module-level ``api_url``
            is used, which must have been defined before the call.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled server cannot hang the caller indefinitely.

    Returns:
        A ``(title, abstract, keywords)`` tuple read from the JSON object in
        the response. Missing or null fields become ``''``, ``''`` and ``[]``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection failures or timeouts.
    """
    response = requests.get(api_url if url is None else url, timeout=timeout)
    # Fail loudly on HTTP errors instead of parsing an error payload as a paper.
    response.raise_for_status()
    data = response.json()
    # The payload is expected to be a dict with 'title', 'abstract' and
    # 'keywords' keys. `or` also covers JSON null values, which would
    # otherwise break the string concatenation done by the callers.
    return (
        data.get('title') or '',
        data.get('abstract') or '',
        data.get('keywords') or [],
    )

# Module-level Okt tagger instance; preprocess_text() calls its nouns() method.
okt = Okt()

def preprocess_text(text):
    """Remove ASCII punctuation from *text* and extract its nouns with Okt.

    Every character listed in ``string.punctuation`` is deleted (not replaced
    by a space) before the cleaned string is handed to ``okt.nouns``.

    Args:
        text: The raw string to tokenize.

    Returns:
        Whatever ``okt.nouns`` returns for the cleaned text.
    """
    # A translation table that maps each punctuation character to None
    # deletes that character during translate().
    punctuation_remover = str.maketrans(dict.fromkeys(string.punctuation))
    cleaned = text.translate(punctuation_remover)
    return okt.nouns(cleaned)

# Word2Vec model (skip-gram)
def train_word2vec_model():
    """Train a Word2Vec model on the text of one fetched paper.

    The title, abstract and space-joined keywords returned by
    ``fetch_paper_data`` are concatenated into a single document, tokenized
    with ``preprocess_text`` and used as the only training sentence.

    Returns:
        The ``Word2Vec`` model built with ``vector_size=100``, ``window=5``,
        ``sg=1`` (skip-gram) and ``min_count=1``.
    """
    title, abstract, keywords = fetch_paper_data()
    combined = ' '.join((title, abstract, ' '.join(keywords)))

    corpus = pd.DataFrame({'text_column': [combined]})
    corpus['processed_text'] = corpus['text_column'].apply(preprocess_text)

    training_options = dict(vector_size=100, window=5, sg=1, min_count=1)
    return Word2Vec(sentences=corpus['processed_text'], **training_options)

def get_document_vector(doc, model):
    """Average the vectors of the tokens in *doc* that the model knows.

    Args:
        doc: Iterable of tokens.
        model: Object whose ``wv`` attribute supports ``token in wv`` and
            ``wv[token]`` lookups.

    Returns:
        The sum of the found vectors divided by their count, or ``None`` when
        no token of *doc* is present in ``model.wv``.
    """
    found = []
    for token in doc:
        # Tokens missing from the vocabulary are skipped.
        if token in model.wv:
            found.append(model.wv[token])

    if found:
        return sum(found) / len(found)
    return None

def find_similar_papers(input_document, model, top_n=10, num_candidates=10):
    """Rank fetched papers by cosine similarity to *input_document*.

    Args:
        input_document: Raw query text; it is tokenized with
            ``preprocess_text`` before being vectorized.
        model: Trained model handed to ``get_document_vector``.
        top_n: Maximum number of results to return.
        num_candidates: Number of ``fetch_paper_data`` calls made to collect
            candidate papers (this used to be hard-coded to 10).

    Returns:
        A list of ``(title, similarity)`` tuples sorted by descending
        similarity and truncated to ``top_n`` entries, or ``None`` when
        ``get_document_vector`` yields no vector for the query.
    """
    query_vector = get_document_vector(preprocess_text(input_document), model)
    if query_vector is None:
        return None

    # NOTE(review): every candidate comes from the same fetch_paper_data()
    # call; presumably the API returns a different paper each time -- confirm.
    candidates = [fetch_paper_data() for _ in range(num_candidates)]

    scored = []
    for title, abstract, keywords in candidates:
        candidate_text = ' '.join((title, abstract, ' '.join(keywords)))
        candidate_vector = get_document_vector(preprocess_text(candidate_text), model)
        if candidate_vector is None:
            # No in-vocabulary tokens: this candidate cannot be compared.
            continue
        score = cosine_similarity([query_vector], [candidate_vector])[0][0]
        scored.append((title, score))

    scored.sort(key=lambda pair: pair[1], reverse=True)

    # Slicing already clamps to the list length, so no explicit min() is needed.
    return scored[:top_n]

# Test area: train a model, then print the papers most similar to the query.
input_text =  "키워드 입력."
trained_model = train_word2vec_model()
similar_papers = find_similar_papers(input_text, trained_model)

# find_similar_papers() returns None when the query produces no document
# vector; iterating over None would raise TypeError, so guard first.
if similar_papers is None:
    print("유사한 논문을 찾지 못했습니다.")
else:
    print("유사한 논문들:")
    for title, similarity in similar_papers:
        print(f"유사도 {similarity:.4f}\nTitle: {title}\n{'-'*30}")