In [1]:
!pip install spacy ginza sudachipy sudachidict_core pandas tqdm datasets pysbd --quiet
!python -m spacy download ja_core_news_lg

   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.7/1.7 MB 32.7 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 72.1/72.1 MB 11.9 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 71.1/71.1 kB 3.0 MB/s eta 0:00:00
Collecting ja-core-news-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/ja_core_news_lg-3.8.0/ja_core_news_lg-3.8.0-py3-none-any.whl (555.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 555.3/555.3 MB 874.5 kB/s eta 0:00:00
Installing collected packages: ja-core-news-lg
Successfully installed ja-core-news-lg-3.8.0
✔ Download and installation successful
You can now load the package via spacy.load('ja_core_news_lg')
⚠ Restart to reload dependencies
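If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.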

In [5]:
import pandas as pd
import re
import random
from tqdm import tqdm
import spacy
from typing import List, Dict, Optional
import requests
import pysbd

# 1. Explicit Indicator Splitter
class ExplicitSplitter:
    """Rule-based splitter that cuts a text at its first explicit contrast marker.

    The marker list holds Japanese adversative / concessive expressions
    (e.g. 'しかし', 'でも', 'ただし'). Matching is a plain substring search, so a
    short marker such as 'が' also matches when it occurs inside a longer word.
    When no usable marker split exists, the text is split on punctuation
    instead ("、、、", "。。。" or "。").
    """

    def __init__(self):
        self.explicit_switchers = [
            'だが', 'ただ', 'ただし', 'とはいえ', 'といっても', 'なのに', 'それなのに', 'にもかかわらず',
            'それにもかかわらず', 'ものの', 'ながら', 'ながらも', 'かかわらず', 'とはいうものの', 'そうはいうものの',
            'けど', 'けれど', 'けれども', 'しかし', 'でも', 'が', 'ところが', 'そうは言っても', 'とは言え',
            'とは言っても', 'にしても', 'にしろ', 'にせよ', 'そうとは言え', 'それでも', 'その反面', 'その一方で',
            '反対に', '逆に', 'それが'
        ]
        # Implicit/sentiment-based splitting: discuss with the team

    def find_first_explicit(self, text: str):
        """Return the marker whose first occurrence is earliest in ``text``.

        Returns ``None`` when no marker occurs. When several markers start at
        the same position (one is then a prefix of the other, e.g. 'ただ' and
        'ただし'), tuple ordering picks the lexicographically smallest one,
        i.e. the shorter marker.
        """
        # `sw in text` guarantees find() never returns -1, so no filtering is needed.
        hits = [(text.find(sw), sw) for sw in self.explicit_switchers if sw in text]
        if not hits:
            return None
        return min(hits)[1]

    @staticmethod
    def _split_on_punctuation(text: str) -> List[str]:
        """Split on the first delimiter (in priority order) that occurs in ``text``.

        Every piece except the last gets the delimiter re-appended; empty
        pieces are dropped together with their delimiter.
        """
        for delim in ("、、、", "。。。", "。"):
            if delim in text:
                # Split once and reuse the result instead of re-splitting per element.
                pieces = text.split(delim)
                last = len(pieces) - 1
                return [piece + delim if i < last else piece
                        for i, piece in enumerate(pieces) if piece]
        return [text]

    def split(self, text: str) -> List[str]:
        """Split ``text`` into segments.

        If an explicit marker is found and there is non-blank text on both
        sides of its first occurrence, returns two stripped segments: the text
        before the marker, and the marker followed by the rest. Otherwise
        falls back to punctuation splitting; returns ``[text]`` when nothing
        applies.
        """
        sw = self.find_first_explicit(text)
        if sw:
            # The marker is known to be present, so partition always finds it.
            head, _, tail = text.partition(sw)
            head, tail = head.strip(), tail.strip()
            if head and tail:
                return [head, sw + tail]
        # No usable explicit indicator: split on "。", "、、、", "。。。"
        # Implicit/sentiment-based splitting: discuss with the team
        return self._split_on_punctuation(text)

# Module-level splitter instance; main() passes it to create_ground_truth().
explicit_splitter = ExplicitSplitter()

# 2. Data Loading (WRIME or fallback)
def load_wrime_data(url="https://raw.githubusercontent.com/ids-cv/wrime/refs/heads/master/wrime-ver1.tsv") -> List[str]:
    """Load sentences from a WRIME-style TSV, or fall back to a tiny built-in sample.

    Args:
        url: Location of a tab-separated file with a "Sentence" column
            (anything ``pandas.read_csv`` accepts: URL or local path).

    Returns:
        The non-null sentences whose length is between 10 and 300 characters
        inclusive. If reading or parsing fails for any reason, a
        ``RuntimeWarning`` describing the failure is emitted and five
        hard-coded example sentences are returned instead.
    """
    import warnings

    try:
        df = pd.read_csv(url, sep="\t", encoding='utf-8')
        sents = df["Sentence"].dropna().astype(str).tolist()
        return [s for s in sents if 10 <= len(s) <= 300]
    except Exception as exc:
        # Best-effort fallback, but make it visible: otherwise an evaluation on
        # five sentences is indistinguishable from a run on the real corpus.
        warnings.warn(
            f"Could not load WRIME data from {url!r} ({type(exc).__name__}: {exc}); "
            "using the 5-sentence built-in fallback sample instead.",
            RuntimeWarning,
            stacklevel=2,
        )
        return [
            "見た目は素晴らしいです。ただ、値段が高すぎると思います。",
            "サービスは最高でした。ところが待ち時間が長すぎます。",
            "料理は美味しいです。そうは言っても、量が少なすぎます。",
            "音質は素晴らしいです。ただし、重量が重すぎます。",
            "このアプリは便利。でも、広告が多すぎる。"
        ]

# 3. Ground Truth Creation (Explicit & Punct Only)
def create_ground_truth(sentences: List[str], splitter: ExplicitSplitter, sample_size=200) -> Dict[str, List[str]]:
    """Build reference splits for a random sample of ``sentences``.

    Draws ``min(sample_size, len(sentences))`` sentences without replacement
    via ``random.sample`` and maps each one to ``splitter.split(sentence)``.
    Because the result is keyed by sentence text, identical sentences in the
    sample collapse into a single entry.
    """
    n_pick = min(sample_size, len(sentences))
    chosen = random.sample(sentences, n_pick)
    return {
        sentence: splitter.split(sentence)
        for sentence in tqdm(chosen, desc="Ground truth")
    }

# 4. Splitter Implementations
def spacy_split(text: str) -> List[str]:
    """Split ``text`` into sentences with the spaCy ``ja_core_news_lg`` pipeline.

    The pipeline is loaded on first use and cached on the function as
    ``spacy_split.nlp``. If loading fails, the failure is cached as well
    (``spacy_split.nlp = None``) and a ``RuntimeWarning`` is emitted once, so
    the load is not retried on every call and the degraded mode is visible.

    Returns:
        The stripped, non-empty sentence texts, or ``[text]`` when the model
        is unavailable, processing raises, or no sentence is produced.
    """
    if not hasattr(spacy_split, "nlp"):
        try:
            spacy_split.nlp = spacy.load("ja_core_news_lg")
        except Exception as exc:
            import warnings

            # Remember the failure so later calls skip the load attempt.
            spacy_split.nlp = None
            warnings.warn(
                f"spaCy model 'ja_core_news_lg' could not be loaded "
                f"({type(exc).__name__}: {exc}); spacy_split will return texts unsplit.",
                RuntimeWarning,
                stacklevel=2,
            )

    nlp = spacy_split.nlp
    if nlp is None:
        return [text]
    try:
        doc = nlp(text)
        stripped = (sent.text.strip() for sent in doc.sents)
        return [s for s in stripped if s] or [text]
    except Exception:
        # Deliberate best-effort: a text the pipeline cannot process is left unsplit.
        return [text]

def punctuation_split(text: str) -> List[str]:
    """Split ``text`` on the first matching delimiter among "、、、", "。。。", "。".

    Delimiters are tried in that priority order and only the first one that
    occurs in ``text`` is used. Each piece except the last has the delimiter
    re-appended; empty pieces are dropped (together with their delimiter).
    Returns ``[text]`` when no delimiter occurs.
    """
    for delim in ("、、、", "。。。", "。"):
        if delim in text:
            # Split once and reuse it; re-splitting per element was accidental
            # quadratic work.
            pieces = text.split(delim)
            last = len(pieces) - 1
            return [piece + delim if i < last else piece
                    for i, piece in enumerate(pieces) if piece]
    return [text]

def advanced_regex_split(text: str) -> List[str]:
    """Placeholder for a richer regex splitter.

    Currently delegates to ``punctuation_split`` and returns its result
    unchanged. (Any advanced logic: discuss with the team.)
    """
    segments = punctuation_split(text)
    return segments

# A clause ends at a conjunction/particle immediately followed by "、".
# "." does not match a newline, so a match never spans a line break.
_CLAUSE_BOUNDARY_RE = re.compile(
    r'(.*?(?:が|けど|けれど|のに|しかし|そして|または|それで|だから|ところが|ので|から)、)'
)


def clause_boundary_split(text: str) -> List[str]:
    """Split ``text`` after every "conjunction + 、" occurrence.

    Each segment runs from the end of the previous segment up to and including
    the "、" that follows a conjunction; whatever remains after the last match
    becomes the final segment. Segments are stripped of surrounding whitespace.

    Segments are cut by match position rather than by re-searching for the
    matched string, so text that a match had to skip (e.g. a line before a
    newline) stays attached to the following segment instead of being dropped
    or misaligned.

    Returns:
        The list of segments if there are at least two, otherwise ``[text]``.
    """
    segments = []
    cursor = 0
    for match in _CLAUSE_BOUNDARY_RE.finditer(text):
        segments.append(text[cursor:match.end()].strip())
        cursor = match.end()
    tail = text[cursor:].strip()
    if tail:
        segments.append(tail)
    return segments if len(segments) > 1 else [text]

def pysbd_split(text: str) -> List[str]:
    """Split ``text`` into sentences with PySBD's Japanese segmenter.

    The ``pysbd.Segmenter`` is built on first use and cached on the function
    as ``pysbd_split.segmenter`` instead of being rebuilt for every text.

    Returns:
        The stripped, non-empty segments, or ``[text]`` if none are produced.
    """
    seg = getattr(pysbd_split, "segmenter", None)
    if seg is None:
        seg = pysbd_split.segmenter = pysbd.Segmenter(language="ja", clean=False)
    sents = seg.segment(text)
    return [s.strip() for s in sents if s.strip()] or [text]

# 5. Evaluation
def calculate_split_similarity(predicted: List[str], true: List[str]) -> Dict[str, float]:
    """Score a predicted segmentation against a reference segmentation.

    A boundary is the cumulative character length of the segments before a
    cut, so a list of k segments yields k-1 boundaries.

    Returns a dict with:
        exact_match: whether the two lists are equal.
        count_similarity: 1 / (1 + |len(predicted) - len(true)|).
        boundary_precision / boundary_recall / boundary_f1: set-overlap scores
            of the boundary offsets. Conventions for empty sets: both empty ->
            all 1.0; only predicted empty -> all 0.0; only reference empty ->
            precision 0.0, recall 1.0, F1 0.0.
    """
    def cut_offsets(segments):
        # Running total of segment lengths, recorded after every segment but the last.
        offsets = set()
        running = 0
        for idx in range(len(segments) - 1):
            running += len(segments[idx])
            offsets.add(running)
        return offsets

    pred_cuts = cut_offsets(predicted)
    true_cuts = cut_offsets(true)

    if pred_cuts and true_cuts:
        shared = len(pred_cuts & true_cuts)
        precision = shared / len(pred_cuts)
        recall = shared / len(true_cuts)
        denom = precision + recall
        f1 = (2 * precision * recall) / denom if denom > 0 else 0.0
    elif pred_cuts:
        # Predicted cuts exist but the reference has none.
        precision, recall, f1 = 0.0, 1.0, 0.0
    elif true_cuts:
        # The reference has cuts but none were predicted.
        precision = recall = f1 = 0.0
    else:
        precision = recall = f1 = 1.0

    size_gap = abs(len(predicted) - len(true))
    return {
        'exact_match': predicted == true,
        'count_similarity': 1.0 / (1.0 + size_gap),
        'boundary_precision': precision,
        'boundary_recall': recall,
        'boundary_f1': f1
    }

def evaluate_splitter(splitter_func, ground_truth: Dict[str, List[str]], name: str) -> Dict:
    """Run ``splitter_func`` over every reference sentence and average the scores.

    Args:
        splitter_func: Callable taking a sentence and returning a list of segments.
        ground_truth: Mapping from sentence to its reference segments.
        name: Label used in the progress bar and in the returned record.

    Returns:
        A dict with the splitter name, the exact-match ratio, the mean boundary
        precision / recall / F1, the mean count similarity and the number of
        sentences scored. All averages are 0 when ``ground_truth`` is empty.
    """
    per_sentence = []
    progress = tqdm(ground_truth.items(), desc=f"Evaluating {name}")
    for sentence, reference in progress:
        prediction = splitter_func(sentence)
        per_sentence.append(calculate_split_similarity(prediction, reference))

    n_scored = len(per_sentence)
    divisor = max(n_scored, 1)  # avoids division by zero on empty input

    def average(metric):
        return sum(score[metric] for score in per_sentence) / divisor

    exact_hits = sum(int(score['exact_match']) for score in per_sentence)
    return {
        'name': name,
        'exact_match_ratio': exact_hits / divisor,
        'avg_boundary_precision': average('boundary_precision'),
        'avg_boundary_recall': average('boundary_recall'),
        'avg_boundary_f1': average('boundary_f1'),
        'avg_count_similarity': average('count_similarity'),
        'total_sentences': n_scored
    }

# 6. Main Execution
def main():
    """Run the full comparison: load data, build reference splits, score each splitter.

    Seeds ``random`` with 42, samples up to 10,000 sentences for the reference
    set, evaluates every registered splitter against it and prints the summary
    table followed by a note on the scope of the reference splits.
    """
    banner = "="*60
    print(banner)
    print("Japanese Sentence Split Evaluation (Explicit Only)")
    print(banner)

    random.seed(42)
    sentences = load_wrime_data()
    print(f"Loaded {len(sentences)} sentences.")

    ground_truth = create_ground_truth(sentences, explicit_splitter, sample_size=10_000)

    # Evaluation order follows insertion order of this mapping.
    splitters = {
        "spaCy ja_core_news_lg": spacy_split,
        "Punctuation Split": punctuation_split,
        "Advanced Regex Split": advanced_regex_split,
        "Clause Boundary Split": clause_boundary_split,
        "PySBD Japanese": pysbd_split,
    }

    results = [
        evaluate_splitter(func, ground_truth, name)
        for name, func in splitters.items()
    ]

    # Show results
    report_columns = [
        'name',
        'exact_match_ratio',
        'avg_boundary_f1',
        'avg_boundary_precision',
        'avg_boundary_recall',
        'avg_count_similarity',
        'total_sentences',
    ]
    df = pd.DataFrame(results)
    print(df[report_columns])

    print("\nNOTE: Only explicit indicators and Japanese punctuation (。、、、, etc) used.")
    print("      Implicit/sentiment-based splitting will be discussed with the team. May use LLM for future improvement.")

# Run the evaluation only when this code executes as the main program
# (a notebook cell or a script), not when it is imported as a module.
if __name__ == "__main__":
    main()


Japanese Sentence Split Evaluation (Explicit Only)
Loaded 39162 sentences.


Ground truth: 100%|██████████| 10000/10000 [00:00<00:00, 206013.14it/s]
Evaluating spaCy ja_core_news_lg: 100%|██████████| 9992/9992 [01:31<00:00, 109.73it/s]
Evaluating Punctuation Split: 100%|██████████| 9992/9992 [00:00<00:00, 353472.66it/s]
Evaluating Advanced Regex Split: 100%|██████████| 9992/9992 [00:00<00:00, 300784.34it/s]
Evaluating Clause Boundary Split: 100%|██████████| 9992/9992 [00:00<00:00, 22267.28it/s]
Evaluating PySBD Japanese: 100%|██████████| 9992/9992 [00:03<00:00, 3100.97it/s]

                    name  exact_match_ratio  avg_boundary_f1  \
0  spaCy ja_core_news_lg           0.412230         0.432917   
1      Punctuation Split           0.504604         0.506448   
2   Advanced Regex Split           0.504604         0.506448   
3  Clause Boundary Split           0.399520         0.401488   
4         PySBD Japanese           0.408627         0.431225   

   avg_boundary_precision  avg_boundary_recall  avg_count_similarity  \
0                0.429893             0.506388              0.754579   
1                0.505903             0.508006              0.799128   
2                0.505903             0.508006              0.799128   
3                0.401471             0.407926              0.745472   
4                0.427978             0.506682              0.752450   

   total_sentences  
0             9992  
1             9992  
2             9992  
3             9992  
4             9992  

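NOTE: Only explicit indicators and Japanese punctuation (。、、、, etc) used.
      Implicit/sentiment-based splitting will be discussed with the team. May use LLM for future improvement.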


