In [None]:
import chromadb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

# Plot styling.
# seaborn's style presets carry their own 'font.family' entry, so the style is
# applied first and the font is chosen afterwards; in the opposite order the
# font setting would be overwritten by set_style().
# NOTE(review): DejaVu Sans is not expected to contain Hangul glyphs; switch to
# a Korean-capable font here if plot labels are ever written in Korean.
sns.set_style('whitegrid')
plt.rcParams['font.family'] = 'DejaVu Sans'

## 1. ChromaDB 연결 및 기본 정보

In [None]:
# Open the persistent ChromaDB store located at ./chromadb.
client = chromadb.PersistentClient(path="./chromadb")

# List the available collections with their document counts.
# Depending on the installed chromadb release, list_collections() yields either
# Collection objects or bare collection names (0.6.x); names are resolved to
# Collection objects so that .name and .count() work in both cases.
collections = client.list_collections()
print(f"총 {len(collections)}개의 컬렉션이 있습니다:\n")
for col in collections:
    if isinstance(col, str):
        col = client.get_collection(name=col)
    print(f"  - {col.name}: {col.count()} documents")

In [None]:
# Choose the collection to explore (default: iclr_neurips_2021_2025).
# Edit the name below to inspect a different collection.
collection_name = "iclr_neurips_2021_2025"
collection = client.get_collection(name=collection_name)

# Report the collection's name, its size and the metadata attached to it.
print(f"컬렉션: {collection_name}")
n_docs = collection.count()
print(f"총 문서 수: {n_docs}")
collection_meta = collection.metadata
print(f"메타데이터: {collection_meta}")

## 2. 데이터 샘플 확인

In [None]:
# Fetch the first 10 records (text and metadata) for a quick look.
results = collection.get(include=['documents', 'metadatas'], limit=10)

# Build one row per record so the sample renders as a readable table;
# only the first 100 characters of each document are shown.
sample_columns = ['id', 'type', 'forum_id', 'content_preview']
sample_rows = [
    {
        'id': rec_id,
        'type': meta['type'],
        'forum_id': meta['forum_id'],
        'content_preview': text[:100] + '...',
    }
    for rec_id, meta, text in zip(
        results['ids'], results['metadatas'], results['documents']
    )
]
df_sample = pd.DataFrame(sample_rows, columns=sample_columns)

print("\n샘플 데이터:")
df_sample

## 3. 전체 데이터 통계

In [None]:
# Load every record once. The metadata drives the counts below; the document
# texts are fetched as well because the length analysis reads
# all_data['documents'].
all_data = collection.get(include=['metadatas', 'documents'])

# Count how many records there are of each document type.
types = [meta['type'] for meta in all_data['metadatas']]
type_counts = Counter(types)

print("\n문서 타입별 분포:")
for type_name, count in type_counts.items():
    print(f"  {type_name}: {count}")

# One 'abstract' record stands for one paper; a Counter yields 0 for a type
# that never occurred.
num_papers = type_counts['abstract']
num_reviews = type_counts['review']
if num_papers > 0:
    avg_reviews_per_paper = num_reviews / num_papers
else:
    # No papers at all: report 0 rather than dividing by zero.
    avg_reviews_per_paper = 0

print(f"\n총 논문 수: {num_papers}")
print(f"총 리뷰 수: {num_reviews}")
print(f"논문당 평균 리뷰 수: {avg_reviews_per_paper:.2f}")

## 4. 데이터 시각화

In [None]:
# Visualise the document-type distribution as a pie chart and a bar chart.
# The dict views are materialised as lists first: matplotlib converts its
# inputs with NumPy, which does not treat dict_keys/dict_values as sequences.
type_labels = list(type_counts.keys())
type_sizes = list(type_counts.values())

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Pie chart: share of each document type.
axes[0].pie(type_sizes, labels=type_labels, autopct='%1.1f%%', startangle=90)
axes[0].set_title('Document Type Distribution', fontsize=14, fontweight='bold')

# Bar chart: absolute count of each document type.
axes[1].bar(type_labels, type_sizes, color=['#2ecc71', '#3498db'])
axes[1].set_xlabel('Document Type', fontsize=12)
axes[1].set_ylabel('Count', fontsize=12)
axes[1].set_title('Document Type Count', fontsize=14, fontweight='bold')
axes[1].grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Distribution of the number of reviews each paper received.
# forum_id ties a review to its paper.
forum_ids = [m['forum_id'] for m in all_data['metadatas'] if m['type'] == 'review']

# Every paper (one 'abstract' record each) starts at 0 so that papers without
# any review are part of the distribution; this keeps the statistics below
# consistent with the reviews-per-paper average derived from the type counts.
reviews_per_paper = Counter(
    {m['forum_id']: 0 for m in all_data['metadatas'] if m['type'] == 'abstract'}
)
reviews_per_paper.update(forum_ids)
review_counts = list(reviews_per_paper.values())

if not review_counts:
    # min()/max() and the plots below are undefined for an empty data set.
    print("\n리뷰 데이터가 없어 분포를 표시할 수 없습니다.")
else:
    lowest, highest = min(review_counts), max(review_counts)

    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Histogram with one bin per integer review count (hence the +2 on the
    # upper edge so the maximum value gets its own bin).
    axes[0].hist(review_counts, bins=range(lowest, highest + 2),
                 edgecolor='black', alpha=0.7, color='#e74c3c')
    axes[0].set_xlabel('Number of Reviews per Paper', fontsize=12)
    axes[0].set_ylabel('Number of Papers', fontsize=12)
    axes[0].set_title('Distribution of Reviews per Paper', fontsize=14, fontweight='bold')
    axes[0].grid(axis='y', alpha=0.3)

    # Box plot (vertical is matplotlib's default orientation, so the
    # deprecated `vert` keyword is not passed).
    axes[1].boxplot(review_counts)
    axes[1].set_ylabel('Number of Reviews', fontsize=12)
    axes[1].set_title('Reviews per Paper - Box Plot', fontsize=14, fontweight='bold')
    axes[1].grid(axis='y', alpha=0.3)

    plt.tight_layout()
    plt.show()

    print(f"\n리뷰 수 통계:")
    print(f"  최소: {lowest}")
    print(f"  최대: {highest}")
    print(f"  평균: {np.mean(review_counts):.2f}")
    print(f"  중앙값: {np.median(review_counts):.1f}")
    print(f"  표준편차: {np.std(review_counts):.2f}")

In [None]:
# Distribution of document lengths, measured in characters.
# A single pass collects the overall lengths and the per-type lengths.
doc_lengths = []
abstract_lengths = []
review_lengths = []
for doc, meta in zip(all_data['documents'], all_data['metadatas']):
    n_chars = len(doc)
    doc_lengths.append(n_chars)
    if meta['type'] == 'abstract':
        abstract_lengths.append(n_chars)
    elif meta['type'] == 'review':
        review_lengths.append(n_chars)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Histogram of all document lengths.
axes[0].hist(doc_lengths, bins=50, edgecolor='black', alpha=0.7, color='#9b59b6')
axes[0].set_xlabel('Document Length (characters)', fontsize=12)
axes[0].set_ylabel('Frequency', fontsize=12)
axes[0].set_title('Document Length Distribution', fontsize=14, fontweight='bold')
axes[0].grid(axis='y', alpha=0.3)

# Box plot per document type. The tick labels are set on the axis instead of
# through boxplot's `labels` keyword, which newer matplotlib releases
# deprecate in favour of `tick_labels`; this form works on old and new versions.
axes[1].boxplot([abstract_lengths, review_lengths])
axes[1].set_xticklabels(['Abstract', 'Review'])
axes[1].set_ylabel('Document Length (characters)', fontsize=12)
axes[1].set_title('Length by Document Type', fontsize=14, fontweight='bold')
axes[1].grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

print(f"\n문서 길이 통계:")
print(f"  Abstract 평균: {np.mean(abstract_lengths):.0f} characters")
print(f"  Review 평균: {np.mean(review_lengths):.0f} characters")

## 5. 유사도 검색 테스트

In [None]:
# Run a single sample query against the collection and show the top 5 hits.
query = "What are the main weaknesses of this paper?"
results = collection.query(n_results=5, query_texts=[query])

print(f"Query: {query}\n")
print("=" * 80)

# Every result field holds one list per query; only one query was sent,
# so the hits live at index 0.
hits = zip(
    results['documents'][0],
    results['metadatas'][0],
    results['distances'][0],
)
for rank, (doc, meta, distance) in enumerate(hits, start=1):
    print(f"\n[Result {rank}] (Distance: {distance:.4f})")
    print(f"Type: {meta['type']}")
    print(f"Forum ID: {meta['forum_id']}")
    # Show only the first 300 characters of the matched document.
    print(f"Content: {doc[:300]}...")
    print("-" * 80)

In [None]:
# Try several different queries and show the best match for each.
queries = [
    "experimental validation and empirical results",
    "theoretical contribution and novelty",
    "reproducibility and code availability",
    "limitations and future work"
]

# All queries are sent in one batched call; each result field then holds one
# list of hits per query, in the same order as `queries`.
results = collection.query(
    query_texts=queries,
    n_results=3
)

for query, docs, metas, dists in zip(
    queries,
    results['documents'],
    results['metadatas'],
    results['distances']
):
    print(f"\nQuery: '{query}'")
    if docs:
        # Hits are ordered by distance, so index 0 is the closest match.
        print(f"Top result distance: {dists[0]:.4f}")
        print(f"Type: {metas[0]['type']}")
        print(f"Preview: {docs[0][:150]}...")
    else:
        # An empty collection (or filter) yields no hits; avoid indexing [0].
        print("No results")
    print("-"*80)

## 6. 특정 논문의 모든 리뷰 조회

In [None]:
# Look up the forum_id of the first paper (first 'abstract' record).
# next() stops at the first match instead of building the full list.
first_abstract = next(
    (m for m in all_data['metadatas'] if m['type'] == 'abstract'),
    None
)
if first_abstract is None:
    # Same exception type as indexing an empty list, but with a clear message.
    raise IndexError("no 'abstract' documents found in the collection")
forum_id = first_abstract['forum_id']

print(f"Forum ID: {forum_id}\n")

# Fetch the abstract and every review that share this forum_id.
paper_data = collection.get(
    where={"forum_id": forum_id},
    include=['documents', 'metadatas']
)

# Split the records by type in one pass; abstracts are printed before reviews
# regardless of the order in which the store returned them.
abstract_docs = []
review_records = []
for doc, meta in zip(paper_data['documents'], paper_data['metadatas']):
    if meta['type'] == 'abstract':
        abstract_docs.append(doc)
    elif meta['type'] == 'review':
        review_records.append((doc, meta))

# Print the abstract(s).
for doc in abstract_docs:
    print("[ABSTRACT]")
    print(doc)
    print("\n" + "="*80 + "\n")

# Print the reviews, numbered from 1.
for review_no, (doc, meta) in enumerate(review_records, start=1):
    print(f"[REVIEW {review_no}]")
    # note_id is optional metadata; fall back to 'N/A' when it is absent.
    print(f"Note ID: {meta.get('note_id', 'N/A')}")
    print(doc)
    print("\n" + "-"*80 + "\n")

review_count = len(review_records)
print(f"\n총 {review_count}개의 리뷰")

## 7. 커스텀 검색 (필터링)

In [None]:
# Similarity search restricted to review documents.
query = "model architecture and design choices"

# The metadata filter keeps only records whose type is 'review'.
review_filter = {"type": "review"}
results = collection.query(query_texts=[query], n_results=3, where=review_filter)

print(f"Query: {query} (Reviews only)\n")
print("=" * 80)

# One query was sent, so the hits for it are at index 0 of every field.
docs, metas, dists = (
    results[field][0] for field in ('documents', 'metadatas', 'distances')
)
for rank, (doc, meta, distance) in enumerate(zip(docs, metas, dists), start=1):
    print(f"\n[Review {rank}] (Distance: {distance:.4f})")
    print(f"Forum ID: {meta['forum_id']}")
    # Preview is limited to the first 250 characters.
    print(f"Content: {doc[:250]}...")
    print("-" * 80)

In [None]:
# Similarity search restricted to abstract documents.
query = "machine learning and deep neural networks"

# The metadata filter keeps only records whose type is 'abstract'.
abstract_filter = {"type": "abstract"}
results = collection.query(query_texts=[query], n_results=3, where=abstract_filter)

print(f"Query: {query} (Abstracts only)\n")
print("=" * 80)

# One query was sent, so the hits for it are at index 0 of every field.
top_docs = results['documents'][0]
top_metas = results['metadatas'][0]
top_dists = results['distances'][0]
for rank, (doc, meta, distance) in enumerate(zip(top_docs, top_metas, top_dists), start=1):
    print(f"\n[Abstract {rank}] (Distance: {distance:.4f})")
    print(f"Forum ID: {meta['forum_id']}")
    # Preview is limited to the first 300 characters.
    print(f"Content: {doc[:300]}...")
    print("-" * 80)