# 論文ラベルごとの頻出単語分析

論文のラベル（Primary, Secondary, Missing）ごとに頻出単語を分析し、特徴的な単語を抽出します。

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import japanize_matplotlib
import seaborn as sns
from pathlib import Path
import re
import fitz  # PyMuPDF
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import warnings
warnings.filterwarnings('ignore')

# Download required NLTK data
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('wordnet', quiet=True)

# Set up plotting
plt.style.use('default')
sns.set_palette('husl')
plt.rcParams['figure.figsize'] = (12, 8)
japanize_matplotlib.japanize()
%matplotlib inline

## データの読み込み

In [2]:
# Load training labels
train_labels = pd.read_csv('../dataset/train_labels.csv')
print(f"Training labels shape: {train_labels.shape}")
print("\nLabel distribution:")
print(train_labels['type'].value_counts())
print(f"\nPercentages:")
print((train_labels['type'].value_counts() / len(train_labels) * 100).round(2))

Training labels shape: (1028, 3)

Label distribution:
type
Secondary    449
Missing      309
Primary      270
Name: count, dtype: int64

Percentages:
type
Secondary    43.68
Missing      30.06
Primary      26.26
Name: count, dtype: float64


In [3]:
# Set up data paths
train_dir = Path('../dataset/train')
pdf_dir = train_dir / 'PDF'

print(f"PDF files: {len(list(pdf_dir.glob('*.pdf'))) if pdf_dir.exists() else 0}")

PDF files: 524


## テキスト抽出関数の定義

In [1]:
def extract_text_from_pdf(pdf_path):
    """PDFファイルからテキストを抽出"""
    try:
        # PyMuPDFでPDFを開く（アノテーションエラーを抑制）
        doc = fitz.open(pdf_path)
        text_parts = []
        
        # 各ページからテキストを抽出
        for page_num in range(len(doc)):
            try:
                page = doc.load_page(page_num)
                # アノテーションエラーを回避するため、シンプルなテキスト抽出を使用
                text = page.get_text("text", flags=fitz.TEXTFLAGS_TEXT)
                if text.strip():  # 空でない場合のみ追加
                    text_parts.append(text)
            except Exception as page_error:
                # ページレベルのエラーをスキップして次のページを処理
                print(f"Warning: Error processing page {page_num} in {pdf_path}: {page_error}")
                continue
        
        doc.close()
        return ' '.join(text_parts)
    
    except Exception as e:
        print(f"Error processing {pdf_path}: {e}")
        return ""

def preprocess_text(text):
    """テキストの前処理"""
    if not text:
        return []
    
    # 小文字化
    text = text.lower()
    
    # 数字、特殊文字を除去
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # トークン化
    tokens = word_tokenize(text)
    
    # ストップワード除去
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words and len(token) > 2]
    
    # レンマ化
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    return tokens

## 論文からテキストを抽出してラベル別に分類

In [5]:
# ラベル別のテキストを格納する辞書
label_texts = {
    'Primary': [],
    'Secondary': [],
    'Missing': []
}

# 各論文のテキストを抽出
processed_count = 0

for _, row in train_labels.iterrows():
    article_id = row['article_id']
    label = row['type']
    
    # PDFファイルのパスを構築
    pdf_path = pdf_dir / f"{article_id}.pdf"
    
    if pdf_path.exists():
        text = extract_text_from_pdf(pdf_path)
        if text:
            label_texts[label].append(text)
            processed_count += 1

print(f"\nTotal processed articles: {processed_count}")
print("\nTexts per label:")
for label, texts in label_texts.items():
    print(f"  {label}: {len(texts)} articles")

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: uns

## ラベル間の特徴的な単語の分析

In [6]:
# TF-IDFスコアを計算して特徴的な単語を抽出
from sklearn.feature_extraction.text import TfidfVectorizer

# 各ラベルの全テキストを結合
label_documents = {}
for label, texts in label_texts.items():
    if texts:
        # 前処理済みのテキストを結合
        processed_texts = [' '.join(preprocess_text(text)) for text in texts]
        label_documents[label] = ' '.join(processed_texts)

# TF-IDFベクトライザーを作成
if label_documents:
    documents = list(label_documents.values())
    labels = list(label_documents.keys())
    
    vectorizer = TfidfVectorizer(max_features=1000, min_df=2)
    tfidf_matrix = vectorizer.fit_transform(documents)
    
    # 特徴語の抽出
    feature_names = vectorizer.get_feature_names_out()

In [7]:
# 各ラベルの特徴的な単語を抽出
for i, label in enumerate(labels):
    # TF-IDFスコアを取得
    tfidf_scores = tfidf_matrix[i].toarray()[0]
    
    # スコアでソート
    word_scores = list(zip(feature_names, tfidf_scores))
    word_scores.sort(key=lambda x: x[1], reverse=True)
    
    print(f"\n=== {label} ラベルの特徴的な単語 (TF-IDF) ===")
    for word, score in word_scores[:10]:
        print(f"  {word}: {score:.4f}")


=== Primary ラベルの特徴的な単語 (TF-IDF) ===
  data: 0.2739
  model: 0.1840
  fig: 0.1661
  study: 0.1619
  using: 0.1601
  specie: 0.1578
  analysis: 0.1446
  figure: 0.1411
  cell: 0.1406
  doi: 0.1215

=== Secondary ラベルの特徴的な単語 (TF-IDF) ===
  gene: 0.2423
  protein: 0.2389
  data: 0.2356
  using: 0.1728
  genome: 0.1723
  cell: 0.1707
  alaska: 0.1632
  analysis: 0.1624
  specie: 0.1614
  strain: 0.1495

=== Missing ラベルの特徴的な単語 (TF-IDF) ===
  gene: 0.3166
  data: 0.2115
  cell: 0.2068
  study: 0.1845
  analysis: 0.1770
  using: 0.1648
  fig: 0.1594
  file: 0.1416
  additional: 0.1384
  model: 0.1319


## TFIDFスコアのみで分類してみる

In [None]:
# 論文分類のための関数を定義（DOIチェック機能付き）
def classify_text_by_tfidf(text, tfidf_matrix, labels, feature_names, use_doi_check=True):
    # DOI関連パターンを定義
    doi_patterns = [
        r'10\.\d{4,9}/[-._;()/:A-Z0-9]+',  # DOI
        r'doi\.org/[\w\./\-]+',            # DOI URL
        r'https?://[\w\./\-]+',            # URL
        r'data(?:set|base)?\s+(?:available|deposited|stored|archived)',  # データセット言及
        r'repository\s+(?:at|available)',  # リポジトリ言及
        r'supplementary\s+(?:data|material|information)',  # 補足データ
        r'accession\s+(?:number|code|id)',  # アクセッション番号
        r'github\.com/[\w\./\-]+',         # GitHub
        r'figshare\.com/[\w\./\-]+',       # Figshare
        r'zenodo\.org/[\w\./\-]+',         # Zenodo
        r'dryad\.org/[\w\./\-]+',          # Dryad
        r'genbank',                        # GenBank
        r'bioproject',                     # BioProject
        r'GSE\d+|SR[APRX]\d+|PRJ[NAED][A-Z]?\d+|E-[A-Z]+-\d+',  # GEO/SRA/ENA
    ]
    
    # DOIチェック機能が有効な場合
    if use_doi_check:
        import re
        has_doi_match = False
        
        for pattern in doi_patterns:
            if re.search(pattern, text, re.IGNORECASE):
                has_doi_match = True
                break
        
        # DOI関連文字列がない場合は確定でMissing判定
        if not has_doi_match:
            return "Missing", 1.0, {"Primary": 0.0, "Secondary": 0.0, "Missing": 1.0}
    
    # テキストを前処理
    processed_words = preprocess_text(text)
    
    # 各ラベルのスコアを初期化
    label_scores = {label: 0.0 for label in labels}
    
    # 各単語についてTF-IDFスコアを累積
    for word in processed_words:
        if word in feature_names:
            # feature_namesでのインデックスを取得
            word_index = np.where(feature_names == word)[0]
            if len(word_index) > 0:
                word_index = word_index[0]
                # 各ラベルのTF-IDFスコアを取得して加算
                for i, label in enumerate(labels):
                    tfidf_score = tfidf_matrix[i, word_index]
                    label_scores[label] += tfidf_score
    
    # 単語数で正規化
    if len(processed_words) > 0:
        for label in label_scores:
            label_scores[label] /= len(processed_words)
    
    # 最も高いスコアのラベルを返す
    predicted_label = max(label_scores.items(), key=lambda x: x[1])[0]
    confidence = max(label_scores.values())
    
    return predicted_label, confidence, label_scores

# 分類性能を評価する関数
def evaluate_classification(y_true, y_pred):
    """
    分類性能を評価
    """
    from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
    from sklearn.metrics import classification_report, confusion_matrix
    
    accuracy = accuracy_score(y_true, y_pred)
    f1_macro = f1_score(y_true, y_pred, average='macro')
    f1_weighted = f1_score(y_true, y_pred, average='weighted')
    precision = precision_score(y_true, y_pred, average='macro')
    recall = recall_score(y_true, y_pred, average='macro')
    
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score (macro): {f1_macro:.4f}")
    print(f"F1 Score (weighted): {f1_weighted:.4f}")
    print(f"Precision (macro): {precision:.4f}")
    print(f"Recall (macro): {recall:.4f}")
    
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred))
    
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_true, y_pred))

分類関数を定義しました（DOIチェック機能付き）


In [9]:
from tqdm import tqdm

tfidf_submission = {'article_id': [], 'predicted_label': [], 'true_label': [], 'label_scores': []}
# sample = train_labels.sample(n=100, random_state=42)  # サンプルを取得
# for _, row in tqdm(sample.iterrows(), total=sample.shape[0], desc="Processing articles"):
for _, row in tqdm(train_labels.iterrows(), total=train_labels.shape[0], desc="Processing articles"):
    article_id = row['article_id']
    label = row['type']
    
    # PDFファイルのパスを構築
    pdf_path = pdf_dir / f"{article_id}.pdf"
    
    if pdf_path.exists():
        text = extract_text_from_pdf(pdf_path)
        if text:
            predicted_label, confidence, label_scores = classify_text_by_tfidf(text, tfidf_matrix, labels, feature_names)
            tfidf_submission['article_id'].append(article_id)
            tfidf_submission['predicted_label'].append(predicted_label)
            tfidf_submission['true_label'].append(label)
            tfidf_submission['label_scores'].append(label_scores)

# 性能評価
y_true = tfidf_submission['true_label']
y_pred = tfidf_submission['predicted_label']
evaluate_classification(y_true, y_pred)

Processing articles:  62%|██████▏   | 635/1028 [20:42<06:14,  1.05it/s] 

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: unsupported error: cannot create appearance stream for  annotations

MuPDF error: uns

Processing articles: 100%|██████████| 1028/1028 [33:52<00:00,  1.98s/it]

Accuracy: 0.7626
F1 Score (macro): 0.7596
F1 Score (weighted): 0.7675
Precision (macro): 0.7698
Recall (macro): 0.7768

Classification Report:
              precision    recall  f1-score   support

     Missing       0.73      0.77      0.75       309
     Primary       0.62      0.86      0.72       270
   Secondary       0.96      0.70      0.81       449

    accuracy                           0.76      1028
   macro avg       0.77      0.78      0.76      1028
weighted avg       0.80      0.76      0.77      1028


Confusion Matrix:
[[238  62   9]
 [ 33 233   4]
 [ 56  80 313]]





In [10]:
tfidf_submission_df = pd.DataFrame(tfidf_submission)
tfidf_submission_df.head()

Unnamed: 0,article_id,predicted_label,true_label,label_scores
0,10.1002_2017jc013030,Primary,Primary,"{'Primary': 0.01830200243619581, 'Secondary': ..."
1,10.1002_anie.201916483,Missing,Missing,"{'Primary': 0.014237204037258847, 'Secondary':..."
2,10.1002_anie.202005531,Missing,Missing,"{'Primary': 0.011695045450711728, 'Secondary':..."
3,10.1002_anie.202007717,Missing,Missing,"{'Primary': 0.01470705977951414, 'Secondary': ..."
4,10.1002_chem.201902131,Missing,Missing,"{'Primary': 0.010463712002582694, 'Secondary':..."
