In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from keybert import KeyBERT
from soykeyword.lasso import LassoKeywordExtractor

def preprocess_data(sentences):
    """Convert raw sentences into a TF-IDF document-term matrix.

    A new ``TfidfVectorizer`` with default settings is fitted on
    ``sentences`` and the transformed matrix is returned. The fitted
    vectorizer is not kept, so the vocabulary is not available to callers.

    Args:
        sentences: Iterable of text documents to vectorize.

    Returns:
        The matrix produced by ``TfidfVectorizer.fit_transform``.
    """
    return TfidfVectorizer().fit_transform(sentences)

def cluster_sentences(sentences, num_clusters, random_state=None):
    """Group sentences into clusters with K-means on TF-IDF features.

    Args:
        sentences: Iterable of text documents to cluster.
        num_clusters: Number of clusters passed to ``KMeans``.
        random_state: Optional seed forwarded to ``KMeans``. Pass an int to
            get the same labels on every run; the default ``None`` keeps
            the previous unseeded behaviour.

    Returns:
        The ``labels_`` attribute of the fitted ``KMeans`` model, i.e. one
        cluster label per input sentence.
    """
    features = preprocess_data(sentences)

    # K-means initialisation is random; exposing the seed lets callers make
    # the label assignment reproducible.
    kmeans = KMeans(n_clusters=num_clusters, random_state=random_state)
    kmeans.fit(features)

    return kmeans.labels_

def get_cluster_keywords(sentences, cluster_labels, num_keywords):
    """Extract up to ``num_keywords`` keywords for each cluster.

    The extractor is trained once on all ``sentences``; each cluster is then
    described by the terms that distinguish its documents from the rest.

    Args:
        sentences: Sequence of text documents.
        cluster_labels: One integer label per sentence, aligned by position
            with ``sentences``.
        num_keywords: Maximum number of keywords to keep per cluster.

    Returns:
        A list indexed by cluster label (``0 .. max(cluster_labels)``). Each
        entry is a list of at most ``num_keywords`` items as returned by
        ``LassoKeywordExtractor.extract_from_docs``. Clusters that contain no
        documents, or all documents, yield an empty list because there is
        nothing to contrast them against. An empty ``cluster_labels`` yields
        an empty list.
    """
    total_docs = len(cluster_labels)
    if total_docs == 0:
        return []

    # NOTE(review): soykeyword's LassoKeywordExtractor takes a ``tokenize``
    # callable rather than a ``tokenizer='noun'`` name, and exposes
    # ``train`` / ``extract_from_docs`` rather than
    # ``extract_from_sentences`` -- confirm against the installed version.
    # The library's default tokenizer is used here; pass a noun tokenizer
    # callable via ``tokenize=`` if noun-only keywords are required.
    # ``min_df=1`` keeps terms that occur in a single document, which is
    # needed for very small corpora.
    extractor = LassoKeywordExtractor(min_tf=1, min_df=1, verbose=False)
    extractor.train(sentences)

    keywords = []
    for cluster_id in range(int(max(cluster_labels)) + 1):
        member_indices = [
            doc_idx
            for doc_idx, label in enumerate(cluster_labels)
            if label == cluster_id
        ]

        # The extractor contrasts member documents with the remaining ones,
        # so both groups must be non-empty.
        if not member_indices or len(member_indices) == total_docs:
            keywords.append([])
            continue

        extracted = extractor.extract_from_docs(
            member_indices, min_num_of_keywords=num_keywords
        )
        # ``min_num_of_keywords`` is a lower bound, so trim to the requested
        # count.
        keywords.append(list(extracted[:num_keywords]))

    return keywords

def get_similarity_scores(sentences, query):
    """Score each sentence by cosine similarity to ``query``.

    Sentences and query are embedded with the
    ``'distilbert-base-nli-mean-tokens'`` model loaded through KeyBERT, and
    the cosine similarity between each sentence vector and the query vector
    is returned.

    Args:
        sentences: Sequence of text documents to score.
        query: Text to compare every sentence against.

    Returns:
        A list of floats, one per sentence and in the same order. Returns an
        empty list when ``sentences`` is empty (no model is loaded then).
    """
    import numpy as np

    candidates = list(sentences)
    if not candidates:
        return []

    model = KeyBERT('distilbert-base-nli-mean-tokens')

    # KeyBERT has no ``score`` method; embed through its backend instead.
    # NOTE(review): ``model.model.embed`` relies on KeyBERT's backend wrapper
    # -- confirm against the installed KeyBERT version.
    embeddings = np.asarray(model.model.embed(candidates + [query]), dtype=float)
    sentence_vecs = embeddings[:-1]
    query_vec = embeddings[-1]

    denominators = np.linalg.norm(sentence_vecs, axis=1) * np.linalg.norm(query_vec)
    # A zero vector has a zero dot product too; avoid 0/0 so it scores 0.0.
    denominators[denominators == 0] = 1.0

    return ((sentence_vecs @ query_vec) / denominators).tolist()

def get_lowest_similarity_sentences(sentences, query, num_sentences):
    """Return the sentences that are least similar to ``query``.

    Scores come from ``get_similarity_scores``; positions are ordered by
    ascending score and the first ``num_sentences`` of them are mapped back
    to their sentences.

    Args:
        sentences: Sequence of text documents.
        query: Text every sentence is compared against.
        num_sentences: How many of the lowest-scoring sentences to return.

    Returns:
        A list of sentences, lowest score first.
    """
    scores = get_similarity_scores(sentences, query)

    # Order positions by score, lowest first.
    order = list(range(len(scores)))
    order.sort(key=lambda idx: scores[idx])

    picked = []
    for idx in order[:num_sentences]:
        picked.append(sentences[idx])
    return picked

# ---- Inputs ----------------------------------------------------------------
# Sentences supplied by the user.
sentences = [
    '은행 인턴 경험 있습니다',
    '실무에서 고객 데이터 분석을 통해 인사이트를 도출했다',
    '미래에셋 공모전 우수상',
    '머신러닝 알고리즘을 활용해서 금융 데이터 분석을 했다',
]
num_clusters = 2            # number of clusters
num_keywords = 3            # keywords to extract per cluster
query = '데이터 분석 경험'
num_sentences = 2           # how many least-similar sentences to report

# ---- Pipeline --------------------------------------------------------------
# Cluster the user sentences.
cluster_labels = cluster_sentences(sentences, num_clusters)

# Extract keywords for each cluster.
cluster_keywords = get_cluster_keywords(sentences, cluster_labels, num_keywords)

# Pick the sentences with the lowest similarity to the query.
lowest_similarity_sentences = get_lowest_similarity_sentences(
    sentences, query, num_sentences
)

# ---- Report ----------------------------------------------------------------
print('군집 레이블:', cluster_labels)
print('군집 키워드:', cluster_keywords)
print('유사도가 가장 낮은 문장:', lowest_similarity_sentences)

ModuleNotFoundError: No module named 'soykeyword'