In [4]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import numpy as np
import re
import ast


In [5]:
# Read the recommendations CSV (path is relative to the notebook's working directory)
df = pd.read_csv(filepath_or_buffer='../data/recommendations.csv')

In [6]:
# Function to clean and convert string embeddings to numpy arrays
def clean_and_convert_embeddings(embedding_str):
    """Parse the text form of an embedding vector into a numpy array.

    Handles whitespace-separated text such as ``"[0.1 -0.2 3.]"`` (numbers may
    end in a bare dot, be negative, or use exponent notation) as well as text
    that is already comma-separated (``"[0.1, -0.2]"``). Missing outer
    brackets are added before parsing.

    Parameters
    ----------
    embedding_str : str or numpy.ndarray or list or tuple
        Text to parse. A value that is already an array, list or tuple is
        converted with ``np.array`` and returned, so applying this function a
        second time to the same data does not fail.

    Returns
    -------
    numpy.ndarray
        The parsed vector. An empty array is returned (and a message printed)
        when the input cannot be parsed.
    """
    # Already-parsed input: nothing to clean, just hand back an array.
    if isinstance(embedding_str, (np.ndarray, list, tuple)):
        return np.array(embedding_str)

    # Anything else that is not text (e.g. a missing value) cannot be parsed;
    # report it the same way as a malformed string.
    if not isinstance(embedding_str, str):
        print(f"Error parsing embedding: expected a string, got {type(embedding_str).__name__}")
        return np.array([])

    # Insert commas between numbers using regular expressions.
    # The lookbehind accepts a digit OR a dot so that floats written as "1."
    # are recognised as the end of a number; the lookahead accepts a sign, a
    # digit, or a dot followed by a digit as the start of the next number.
    cleaned_str = re.sub(
        r'(?<=[\d.])\s+(?=[-+\d]|\.\d)',
        ', ',
        embedding_str.strip(),
    )

    # Ensure the string is properly formatted as a list
    if not cleaned_str.startswith('['):
        cleaned_str = '[' + cleaned_str
    if not cleaned_str.endswith(']'):
        cleaned_str = cleaned_str + ']'

    # Convert the cleaned string to a list using ast.literal_eval
    try:
        embedding_list = ast.literal_eval(cleaned_str)
    except (SyntaxError, ValueError) as e:
        # Best-effort: report the problem and fall back to an empty vector.
        print(f"Error parsing embedding: {e}")
        embedding_list = []

    # Convert the list to a numpy array
    return np.array(embedding_list)

In [7]:
# Parse every embedding that is still in its raw text form. Entries that are
# already numpy arrays are kept as they are, so re-running this cell after the
# column has been converted does not raise.
df['embedding'] = df['embedding'].apply(
    lambda value: value if isinstance(value, np.ndarray) else clean_and_convert_embeddings(value)
)

In [8]:

# Build one numpy array from the per-row vectors stored in df['embedding']
embeddings = np.array(df['embedding'].tolist())

# Number of clusters to fit (tune for your data)
n_clusters = 2

# Fit K-Means with a fixed seed and keep the cluster index assigned to each row
kmeans = KMeans(random_state=42, n_clusters=n_clusters)
cluster_labels = kmeans.fit_predict(X=embeddings)

# Mean silhouette coefficient over all rows for this clustering
silhouette_avg = silhouette_score(X=embeddings, labels=cluster_labels)

print(f"The average silhouette score is: {silhouette_avg}")

The average silhouette score is: 0.20208774605643434


In [10]:
# Top to bottom, this is the first cell that reads `random_articles`, so the
# fixed-seed 5-row sample is drawn here; otherwise a fresh-kernel
# "Run All" stops on this line with a NameError.
random_articles = df.sample(n=5, random_state=33)
random_articles.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, 40597 to 11342
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   url               5 non-null      object
 1   title             5 non-null      object
 2   publication_date  5 non-null      object
 3   content           5 non-null      object
 4   platform_id       5 non-null      object
 5   entities          5 non-null      object
 6   sentence_vector   5 non-null      object
 7   content_vector    5 non-null      object
 8   embedding         5 non-null      object
 9   rec_url           5 non-null      object
 10  rec_title         5 non-null      object
dtypes: object(11)
memory usage: 480.0+ bytes


In [19]:
# Turn the stringified lists in 'rec_title' into real Python lists. Values that
# are already a list/tuple are left alone: ast.literal_eval only accepts text
# (or AST nodes), so without this check a second run of the cell would raise.
random_articles['rec_title'] = random_articles['rec_title'].apply(
    lambda value: value if isinstance(value, (list, tuple)) else ast.literal_eval(value)
)

In [20]:
# Turn the stringified lists in 'rec_url' into real Python lists. Values that
# are already a list/tuple are left alone: ast.literal_eval only accepts text
# (or AST nodes), so without this check a second run of the cell would raise.
random_articles['rec_url'] = random_articles['rec_url'].apply(
    lambda value: value if isinstance(value, (list, tuple)) else ast.literal_eval(value)
)

In [9]:
# Fixed-seed sample of five rows taken directly from df. The values are copied
# as they are in df, so rec_title / rec_url in the result are whatever df holds
# (any parsing done on a previous `random_articles` is not carried over).
random_articles = df.sample(random_state=33, n=5)

In [22]:
# Print each sampled article followed by its numbered recommendations.
for row in random_articles.itertuples():
    print(f"Title: {row.title}, URL: {row.url}")

    # rec_title / rec_url may still be stringified lists (for instance when
    # random_articles has just been re-sampled from df). Iterating a raw
    # string would print one character per "recommendation", so parse any
    # text value here; values that are already lists are used as they are.
    rec_titles, rec_urls = (
        ast.literal_eval(value) if isinstance(value, str) else value
        for value in (row.rec_title, row.rec_url)
    )

    # URLs are looked up by position, so a missing URL still raises IndexError
    # instead of being silently dropped.
    for position, rec_title in enumerate(rec_titles):
        print(f"   Recommended Article {position + 1}: {rec_title}, URL: {rec_urls[position]}")
    print("")

Title: [2024 상반기 히트상품] 롯데이노베이트, 비지니스 혁신 이끌 자체 AI 플랫폼, URL: https://v.daum.net/v/20240626113117734
   Recommended Article 1: AI 속여서 정보 빼간다?…높아지는 LLM 취약점, "특화 보안 서비스 필요", URL: https://n.news.naver.com/mnews/article/138/0002176651
   Recommended Article 2: [전문가기고] 외산 AI 넘어서는 '국산 AI 생존 전략', URL: https://n.news.naver.com/mnews/article/138/0002177492
   Recommended Article 3: "감탄했습니다"…옴디아 수석 애널리스트, SKT AI 전략에 '엄지척', URL: https://v.daum.net/v/20240627104151358
   Recommended Article 4: "지적재산제도, AI 효용·안전장치 동시에 잡아야", URL: https://v.daum.net/v/20240820161314809
   Recommended Article 5: [전문가기고] AX 시대, AI 거버넌스가 중요하다, URL: https://v.daum.net/v/20240805160116295

Title: 일본 진출 시동 건 토종 ‘AI 마케팅’ 스타트업 브이캣, URL: https://v.daum.net/v/20240809091200260
   Recommended Article 1: 2030이 이끄는 AI 시대의 IP 혁신…2.9조 기업가치 '스토리', URL: https://v.daum.net/v/20240822125446147
   Recommended Article 2: [Interview] 고이쿠배터리 타바타 아키라 CEO·타바타 이지 COO | GS가 찜한 日 스타트업…10초 만에 전기차 배터리 잔량 진단, URL: https://v.daum.net/v/202406031350037