In [None]:
# !pip install bertopic llama-cpp-python sentence-transformers umap-learn hdbscan datamapplot

import os
import pickle

import torch
import numpy as np
import pandas as pd
import nltk

from huggingface_hub import hf_hub_download
from pathlib import Path
from llama_cpp import Llama
from bertopic.representation import KeyBERTInspired, LlamaCPP
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic import BERTopic
import datamapplot





# Configure matplotlib so Japanese text renders in plots
import matplotlib.pyplot as plt

plt.rcParams.update({
    "font.family": "Noto Sans CJK JP",
    "axes.unicode_minus": False,
})

# Choose the torch device: Apple's MPS backend when it is available, otherwise the CPU
if torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print("Using device:", device)

# Download the NLTK stopword corpus once; quiet=True suppresses the progress output
stopword_corpus = "stopwords"
nltk.download(stopword_corpus, quiet=True)


# Load the Japanese processor and topic modeler (defined manually in multiling_helpers.py)
import sys

# A notebook kernel does not define __file__, so the helpers are located
# relative to the current working directory.
script_dir = os.getcwd()
print("Script directory:", script_dir)
multilingual_dir = os.path.join(script_dir, 'MULTILINGUAL')
# Only append once, so re-running this cell does not stack duplicate sys.path entries
if multilingual_dir not in sys.path:
    sys.path.append(multilingual_dir)
print(f"Added {multilingual_dir} to Python path")
from multiling_helpers import JapaneseProcessor, TopicModeler

# Create the Japanese processor and choose its sentence-embedding checkpoint.
# Other checkpoints kept for reference:
#   - paraphrase-multilingual-mpnet-base-v2
#   - oshizo/sbert-jsnli-luke-japanese-base-lite
#   - cl-tohoku/bert-base-japanese-v3
ja_embedding_checkpoint = "sonoisa/sentence-bert-base-ja-mean-tokens"
japanese_processor = JapaneseProcessor()
japanese_processor.sentence_transformer_model = ja_embedding_checkpoint




### Download and configure Japanese Llama 3 (ELYZA)

https://huggingface.co/elyza/Llama-3-ELYZA-JP-8B-GGUF


In [None]:

# ── CONFIG ──
# Hugging Face repository and file name of the quantized (q4_k_m) GGUF model
repo_id   = "elyza/Llama-3-ELYZA-JP-8B-GGUF"
filename  = "Llama-3-ELYZA-JP-8B-q4_k_m.gguf"
# NOTE(review): machine-specific absolute path; adjust when running on another machine
cache_dir = "/Users/rbeaute/Projects/MOSAIC/MULTILINGUAL/models/elyza_Llama-3-ELYZA-JP-8B"
os.makedirs(cache_dir, exist_ok=True)

# ── DOWNLOAD ──
# `force_filename` is deliberately not passed: huggingface_hub deprecated it and
# later dropped it, so supplying it warns or fails depending on the installed
# version. The returned path is used downstream, so the on-disk layout under
# cache_dir does not matter.
model_path = hf_hub_download(
    repo_id=repo_id,
    filename=filename,
    cache_dir=cache_dir,
    repo_type="model",
)
print("Quantized GGUF model saved to:", model_path)


### Load innerspeech dataset

In [None]:
# Pickled inner-speech reports.
# NOTE(review): machine-specific absolute path; adjust when running on another machine
local_pkl = "/Users/rbeaute/Projects/MOSAIC/DATA/multilingual/japanese/innerspeech/innerspeech_reports.pkl"

# SECURITY: pickle.load can execute arbitrary code while deserialising —
# only open pickle files that come from a trusted source.
with open(local_pkl, "rb") as f:
    raw_reports = pickle.load(f)

print("Raw reports type:", type(raw_reports))
# Test the concrete type instead of probing for `.head`: a pandas Series also
# has `.head` but no "text" column, and would fail in the DataFrame branch.
if isinstance(raw_reports, pd.DataFrame):
    display(raw_reports.head(3))
    docs = raw_reports["text"].astype(str).tolist()
else:
    # Any other iterable: stringify each item
    docs = [str(x) for x in raw_reports]
    print("Sample docs:", docs[:3])

print(f"Total documents: {len(docs)}")


### Preprocess dataset

In [None]:
# Very light clean-up of each report (extra spaces etc.), followed by sentence splitting
cleaned_docs = list(map(japanese_processor.preprocess_text, docs))
sentences, doc_map = japanese_processor.split_sentences(cleaned_docs)
print(f"Split into {len(sentences)} sentences from {len(cleaned_docs)} docs")

# Spot-check the beginning of each stage
print("First 3 raw doc:", docs[:3])
# print("First 5 cleaned docs:", cleaned_docs[:10])
print("First 5 sentences:", sentences[:5])
print("doc_map for those 5 sentences:", doc_map[:5])


In [None]:
# Raw vs. cleaned text for the first five reports, printed as a markdown table
# (pandas is already imported as `pd` in the notebook's first cell)
preview_columns = {
    "raw": docs[:5],
    "cleaned": cleaned_docs[:5],
}
sample_df = pd.DataFrame(preview_columns)
print(sample_df.to_markdown())

# Numbered listing of the first five sentences
for position, sentence in zip(range(5), sentences):
    print(f"{position:02d}:", sentence)


In [None]:
# Print reprs so hidden whitespace and escape characters become visible
print("RAW docs[0]:", f"{docs[0]!r}")
print("CLEANED docs[0]:", f"{cleaned_docs[0]!r}")
for position, sentence in enumerate(sentences[:5]):
    print(f"  sentence[{position}]:", f"{sentence!r}")


###  Instantiate LLM & embedding model

In [None]:
# Quantized Llama-3 loaded through llama.cpp.
# n_gpu_layers=-1 requests offloading of all layers to whatever GPU backend
# llama-cpp-python was built with (no CUDA is assumed on this machine).
# No `stop` argument is passed: it is not a constructor parameter and would be
# silently swallowed by Llama's **kwargs. Stop sequences for labelling are
# supplied at generation time through `pipeline_kwargs`.
llm = Llama(
    model_path=model_path,
    n_ctx=4096,
    n_gpu_layers=-1,
    verbose=False
)

# Sentence-embedding model placed on the selected device (MPS or CPU)
embed_model = SentenceTransformer(
    model_name_or_path=japanese_processor.sentence_transformer_model,
    device=device,
)

### Compute Embeddings & 2D Viz Coordinates

In [None]:
# Encode every sentence once
embeddings = embed_model.encode(sentences, show_progress_bar=True)
print("Embeddings shape:", embeddings.shape)

# Separate 2-D UMAP projection of the embeddings, kept as plot coordinates
viz_reducer = UMAP(
    n_components=2,
    n_neighbors=10,
    min_dist=0.5,
    metric="cosine",
    random_state=42,
)
viz_embeddings = viz_reducer.fit_transform(embeddings)


### Define UMAP/HDBSCAN & Representation Models

In [None]:
# Dimensionality reduction handed to BERTopic (10 components, cosine metric, fixed seed)
umap_settings = dict(
    n_neighbors=15,
    n_components=10,
    min_dist=0.01,  # kept > 0 to avoid overfitting / collapsing points too tightly
    metric="cosine",
    random_state=42,
)
umap_model = UMAP(**umap_settings)

# Clustering handed to BERTopic
hdbscan_settings = dict(
    min_cluster_size=10,            # a topic can be formed from 10 sentences
    min_samples=5,
    metric="euclidean",
    cluster_selection_epsilon=0.1,  # helps merge clusters that lie close together
    prediction_data=True,
)
hdbscan_model = HDBSCAN(**hdbscan_settings)



# prompt = """
# Q:  
# トピックのキーワード: [KEYWORDS]  
# トピックのドキュメント: [DOCUMENTS]  

# 以下の条件を満たす科学的なラベルを生成してください:  
# 1. **タイトルケース**（例: 「Adhd 診断 併存」）  
# 2. **2～4語以内**（名詞のみ、動詞・形容詞は不可）  
# 3. キーワード/ドキュメントの**明示的な用語のみ反映**（解釈や抽象表現は禁止）  
# 4. 句読点、例文、説明文は一切含まない  

# A:  
# """


# Few-shot labelling prompt (written in Japanese) handed to LlamaCPP as `prompt`.
# Layout: a "Q:" instruction, one worked sample (keywords + document -> label),
# the real task with the [KEYWORDS] / [DOCUMENTS] markers, four numbered label
# rules (title case; 2-4 words, nouns only; explicit terms only; no
# punctuation, examples or explanations), and a trailing "A:" for the answer.
# NOTE(review): [KEYWORDS] and [DOCUMENTS] are presumably filled in per topic by
# BERTopic's LlamaCPP representation — confirm against the BERTopic docs.
prompt = """
Q:
以下の情報をもとに、科学的な「トピックラベル」を生成してください。

――――――

サンプル:
トピックのキーワード: [「内言」「映像」「味」]
トピックのドキュメント: 
「頭の中では日本語で考えているとき、文字か音声かではなく映像や味も感じることがある。」

生成ラベル: 「内言 感覚結合」

――――――

本番:
トピックのキーワード: [KEYWORDS]
トピックのドキュメント: [DOCUMENTS]

以下の条件を満たす科学的なラベルを生成してください:
1. タイトルケース（例: 「Adhd 診断 併存」）
2. 2～4語以内（名詞のみ、動詞・形容詞は不可）
3. キーワード/ドキュメントの明示的な用語のみ反映（解釈や抽象表現は禁止）
4. 句読点、例文、説明文は一切含まない

A:
"""

# Generation settings passed to LlamaCPP via `pipeline_kwargs`
pipeline_kwargs = dict(
    max_tokens=6,              # cap the label length (4 words + margin)
    temperature=0.1,           # low randomness (stick to the keywords)
    top_p=0.6,                 # sample only from high-probability tokens
    repeat_penalty=1.2,        # reduce repeated words
    stop=["\n", "。", "、"],    # emit a single line only
)

# representation_model = {
#     "KeyBERT": KeyBERTInspired(),
#     "LLM-JP": LlamaCPP(llm, prompt=prompt)
# }


# Two representations per topic: KeyBERT-inspired keywords and an LLM-generated label
keybert_representation = KeyBERTInspired()
llm_representation = LlamaCPP(
    llm,
    prompt=prompt,
    nr_docs=10,  # show 10 sentences per topic
    pipeline_kwargs=pipeline_kwargs,
    diversity=0.2,
)
representation_model = {
    "KeyBERT": keybert_representation,
    "LLM": llm_representation,
}

In [None]:
# Show the stopword collection held by the Japanese processor as a list
list(japanese_processor.stopwords)

### Fit BERTopic

In [None]:
# Dry run: check which vocabulary a CountVectorizer would extract from the sentences.
# CountVectorizer is imported in this cell because it is used here before the
# later cell that imports it; without this a fresh-kernel "Run All" raises NameError.
from sklearn.feature_extraction.text import CountVectorizer

temp_vectorizer = CountVectorizer(
    ngram_range=(1,2),                                # unigrams and bigrams
    stop_words=list(japanese_processor.stopwords),    # drop the processor's stopwords
    max_df=0.85                                       # skip terms found in more than 85% of sentences
)
temp_vectorizer.fit(sentences)
print("Temp vocabulary size :", len(temp_vectorizer.vocabulary_))
print("Sample words :", list(temp_vectorizer.vocabulary_.keys())[:20])


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Assemble BERTopic from the components built in the earlier cells.
# Disabled option kept for reference — a custom vectorizer:
#   vectorizer_model=CountVectorizer(
#       ngram_range=(1,2),  # bigrams
#       stop_words=list(japanese_processor.stopwords),
#       max_df=0.85,        # lower to drop excessively common words
#       min_df=2),          # avoid getting topics on very rare words
bertopic_settings = dict(
    embedding_model=embed_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    representation_model=representation_model,
    language="japanese",
    top_n_words=10,
    nr_topics="auto",  # automatically reduce the number of topics by merging similar ones
    verbose=True,
)
topic_model = BERTopic(**bertopic_settings)


# Fit on the sentences, reusing the embeddings computed earlier
topics, probs = topic_model.fit_transform(sentences, embeddings)

# Count distinct topic ids, leaving out -1 (the outlier topic)
distinct_topic_ids = set(topics)
distinct_topic_ids.discard(-1)
print("Number of topics:", len(distinct_topic_ids))


In [None]:
# Print the raw labels extracted by Llama: for each topic, the first element
# of the first entry of its "LLM" representation
llm_representations = topic_model.get_topics(full=True)["LLM"]
raw_llm_labels = [entries[0][0] for entries in llm_representations.values()]
print(raw_llm_labels)

# Separate list, so later clean-up leaves the raw labels untouched
llm_labels = list(raw_llm_labels)

In [None]:
# if need more cleaning (as we did for english), can use:

# import re 

# llm_labels = [label[0][0].replace('\nThe topic is labeled as:','').replace('\n', '').replace('Label:', '').replace('"', '') for label in topic_model.get_topics(full=True)["LLM"].values()]
# llm_labels
# llm_labels = [re.sub(r'\W+', ' ', label[0][0].split("\n")[0].replace('"', '')) for label in topic_model.get_topics(full=True)["LLM"].values()]
# llm_labels = [label if label else "Unlabelled" for label in llm_labels]
# all_labels = [llm_labels[topic+topic_model._outliers] if topic != -1 else "Unlabelled" for topic in topics] 

# filtered_labels = [label for label in all_labels if label != "Unlabelled"] #remove -1 topics (outliers)

# llm_labels

### Visualise topics

In [None]:
# Summary table of all topics (including the '-1' outliers)
info = topic_model.get_topic_info()
display(info)

# Topic id -> topic name lookup
name_map = {topic_id: topic_name for topic_id, topic_name in zip(info.Topic, info.Name)}

# Name of the assigned topic for every sentence, then the count per name
bertopic_labels = list(map(name_map.__getitem__, topics))
np.unique(bertopic_labels, return_counts=True)

In [None]:
# Prepare to wrap datamapplot.create_plot with a Japanese-font variant.
import importlib
import datamapplot

# force reload so create_plot is back to its original
# (this cell later reassigns datamapplot.create_plot; reloading first keeps a
# re-run of the cell from capturing the wrapper itself as the "original")
datamapplot = importlib.reload(datamapplot)

# Reference to the unpatched function; must be taken after the reload above
_orig_create = datamapplot.create_plot


# Artist classes used to find the text elements of the figure
from matplotlib.text import Text, Annotation

def create_jp_plot(*args, font="Hiragino Sans", **kwargs):
    """Draw a plot with the saved original ``create_plot`` and restyle its text.

    All positional and keyword arguments except ``font`` are forwarded
    unchanged to ``_orig_create``. Afterwards every ``Text`` / ``Annotation``
    artist found in the figure gets its font name set to ``font``.

    Returns the ``(fig, ax)`` pair produced by ``_orig_create``.
    """
    fig, ax = _orig_create(*args, **kwargs)

    def _is_text_artist(candidate):
        return isinstance(candidate, (Text, Annotation))

    for text_artist in fig.findobj(match=_is_text_artist):
        try:
            text_artist.set_fontname(font)
        except Exception:
            # Best effort: an artist that rejects the font keeps its current one
            continue
    return fig, ax

# Monkeypatch: from here on, datamapplot.create_plot(...) calls the font-patching wrapper
datamapplot.create_plot = create_jp_plot



In [None]:
# Sanity check: how many LLM labels were collected
len(llm_labels)

In [None]:
# Plot the 2-D sentence map, labelled with the BERTopic topic names.
# The outlier topic (-1) is passed as `noise_label` so datamapplot treats it as
# noise instead of an ordinary labelled cluster. When there are no outliers the
# library default ("Unlabelled") is kept.
fig, ax = datamapplot.create_plot(
    viz_embeddings,
    bertopic_labels,
    noise_label=name_map.get(-1, "Unlabelled"),
    label_font_size=14,
    # title="Japanese BERTopic (Llama-3-ELYZA-JP-8B)",
    # sub_title="Labels by quantized Llama-3-ELYZA",
    label_wrap_width=15,
    use_medoids=False,
    figsize=(30, 25)
)


plt.show()
