# Japanese Text Analysis

In [None]:
import json
import pandas as pd
import requests

from sudachipy import tokenizer
from sudachipy import dictionary
from sudachipy import config

from io import StringIO
import re
from scipy import stats

import unicodedata

## Example of using sudachi for parsing Japanese text.

See the [GitHub page](https://github.com/WorksApplications/Sudachi) of Sudachi, a Japanese morphological analyzer. ![Sudachi](images/Sudachi.png)

In [None]:
# Read the JSON settings file that config.SETTINGFILE points to and build the
# tokenizer from a dictionary configured with those settings.
with open(config.SETTINGFILE, mode="r", encoding="utf-8") as f:
    settings = json.loads(f.read())
tokenizer_obj = dictionary.Dictionary(settings).create()

In [None]:
config.SETTINGFILE

In [None]:
!cat {config.SETTINGFILE}

Multi-granular tokenization.
(The following results were obtained with `system_full.dic`;
you may not be able to replicate this particular example with `system_core.dic`.)

In [None]:
mode = tokenizer.Tokenizer.SplitMode.C
[m.surface() for m in tokenizer_obj.tokenize(mode, "医薬品安全管理責任者")]
# => ['医薬品', '安全', '管理責任者']

In [None]:
mode = tokenizer.Tokenizer.SplitMode.B
[m.surface() for m in tokenizer_obj.tokenize(mode, "医薬品安全管理責任者")]
# => ['医薬品', '安全', '管理', '責任者']

In [None]:
mode = tokenizer.Tokenizer.SplitMode.A
[m.surface() for m in tokenizer_obj.tokenize(mode, "医薬品安全管理責任者")]
# => ['医薬', '品', '安全', '管理', '責任', '者']

In [None]:
s = '複数粒度の分割結果に基づく日本語単語分散表現'
mode = tokenizer.Tokenizer.SplitMode.C
# Tokenize once and read both attributes from each morpheme; the analyzer was
# previously run twice over the same string to build the two zipped lists.
# Each printed item is still a (surface, reading) tuple.
for t in [(m.surface(), m.reading_form()) for m in tokenizer_obj.tokenize(mode, s)]:
    print(t)

In [None]:
s = '分散表現の構築手法'
mode = tokenizer.Tokenizer.SplitMode.C
# Tokenize once and read both attributes from each morpheme; the analyzer was
# previously run twice over the same string to build the two zipped lists.
# Each printed item is still a (surface, reading) tuple.
for t in [(m.surface(), m.reading_form()) for m in tokenizer_obj.tokenize(mode, s)]:
    print(t)

In [None]:
# Morpheme information

m = tokenizer_obj.tokenize(mode, "食べ")[0]

In [None]:
m.surface() # => '食べ'

In [None]:
m.dictionary_form() # => '食べる'

In [None]:
m.reading_form() # => 'タベ'

In [None]:
def get_reading_form(word):
    """Return the reading form of ``word`` (e.g. 'タベ' for '食べ').

    The word is tokenized with the notebook-level ``tokenizer_obj`` using the
    current notebook-level split ``mode``. The readings of all resulting
    morphemes are concatenated, so a word that the analyzer splits into
    several morphemes gets its complete reading instead of only the first
    piece. A word that yields no morphemes (such as an empty string) gives ''
    rather than raising IndexError.
    """
    morphemes = tokenizer_obj.tokenize(mode, word)
    return ''.join(m.reading_form() for m in morphemes)

In [None]:
m.part_of_speech() # => ['動詞', '一般', '*', '*', '下一段-バ行', '連用形-一般']

In [None]:
# Normalization

In [None]:
tokenizer_obj.tokenize(mode, "附属")[0].normalized_form()
# => '付属'

In [None]:
tokenizer_obj.tokenize(mode, "SUMMER")[0].normalized_form()
# => 'サマー'

In [None]:
tokenizer_obj.tokenize(mode, "シュミレーション")[0].normalized_form()
# => 'シミュレーション'

## Japanese Text Mining

![Japanese Text Mining](images/japanese_text_mining.jpg)
Check out the [Emory University workshop blog](https://scholarblogs.emory.edu/japanese-text-mining/) on Japanese Text Mining. The notebook cells below repeat the steps of Mark Ravina's [tutorial](http://history.emory.edu/RAVINA/JF_text_mining/Guides/Jtextmining_intro_part1.html), using Python instead of R.

In [None]:
# Download the corpus. The timeout keeps the cell from hanging forever on an
# unresponsive server, and raise_for_status() stops the notebook on an HTTP
# error instead of letting the later cells parse an error page as data.
response = requests.get(
    'http://history.emory.edu/RAVINA/JF_text_mining/Guides/data/meiroku_zasshi.txt',
    timeout=30,
)
response.raise_for_status()

In [None]:
response.encoding = 'utf-8'

In [None]:
data = [t.split('" "') for t in response.text.split('\n')]

In [None]:
data[0]

In [None]:
data = [[d.replace('"', '') for d in row] for row in data]

In [None]:
d = data[1][0].split()
d.extend(data[1][1:])

In [None]:
# Rebuild every record after data[0] (presumably the header line): the first
# chunk is split on whitespace into its leading fields and the remaining
# chunks are appended unchanged.
rows = []
for d in data[1:]:
    row = [*d[0].split(), *d[1:]]
    rows.append(row)

In [None]:
df = pd.DataFrame(rows, columns = ['index', 'year', 'issue', 'title', 'author', 'text'])

In [None]:
df = df.drop(df.index[-1])

In [None]:
authors = list(df.author)
no_boxes_per_line = 5
# Show the authors in rows of `no_boxes_per_line`. The number of rows follows
# from len(authors) instead of a hard-coded 31, so no author is cut off and no
# empty trailing rows are produced when the corpus size changes.
[authors[start:start + no_boxes_per_line] for start in range(0, len(authors), no_boxes_per_line)]

In [None]:
set(authors)

In [None]:
mask = df.author == '西周'
df[mask][['index', 'title']].head()

## Tale of Genji: Significant Terms and Word Clusters.

![源氏物語歌合](images/200014735/image/200014735_00014.jpg)


In [None]:
# The Tale of Genji consists of 54 chapters. Each chapter is broken into sections and each section into a list of text blocks.
# The file is opened as UTF-8 explicitly so that loading the Japanese text does
# not depend on the platform's default encoding.
with open('../data/raw/genji_data.json', encoding='utf-8') as fp:
    genji_data = json.load(fp)

In [None]:
chapter_names = list(genji_data.keys())
print(len(chapter_names))

In [None]:
# Section keys for a chapter.
chapter = chapter_names[0]
print(chapter, genji_data[chapter].keys())

In [None]:
wikipedia_genji = pd.read_html('https://en.wikipedia.org/wiki/The_Tale_of_Genji', attrs={"class": "wikitable"})[0]

In [None]:
wikipedia_genji

In [None]:
chapter_names[:1]

In [None]:
mode = tokenizer.Tokenizer.SplitMode.C
text_length = 0
chapter_boundaries = []
lemmatized_blocks = []
for chapter in chapter_names:
    # Index of the chapter's first word token within the concatenated text.
    chapter_boundaries.append(text_length)
    for section in genji_data[chapter].keys():
        for block in genji_data[chapter][section]:
            wordlist = [m.dictionary_form() for m in tokenizer_obj.tokenize(mode, block)]
            block_text = ' '.join(wordlist)
            lemmatized_blocks.append(block_text)
            # Count r'\w+' tokens, i.e. the same units that are extracted from
            # all_text afterwards, so that chapter boundaries line up with
            # token positions (counting raw morphemes would also count
            # punctuation and make the boundaries drift).
            text_length += len(re.findall(r'\w+', block_text))
            print('\r{}'.format(text_length), end='')
# Join the blocks with a space: without a separator the last word of one block
# was fused with the first word of the next. A single join also avoids
# repeated string concatenation.
all_text = ' '.join(lemmatized_blocks)

In [None]:
# Raw string: '\w' inside a plain string literal is an invalid escape sequence.
tokens = re.findall(r'\w+', all_text)

In [None]:
len(tokens)

In [None]:
# Write the lemmatized corpus as UTF-8 explicitly; the platform default
# encoding may be unable to represent Japanese text.
with open('../data/processed/all_text.txt', 'w', encoding='utf-8') as fp:
    fp.write(all_text)

In [None]:
# sents = !ruby pragmatic_segmenter_test.rb all_text.txt

In [None]:
from src.models import WordLevelStatistics

In [None]:
# class WordLevelStatistics():
#     # Copyright 2014 Shubhanshu Mishra. All rights reserved.
#     #
#     # This library is free software; you can redistribute it and/or
#     # modify it under the same terms as Python itself.
#     def __init__(self, word_pos=None, corpus_file=None, percentile_C=95):
#         '''This package is a port of the perl module Algorithm::WordLevelStatistics by
#         Francesco Nidito which can be found at:
#         http://search.cpan.org/~nids/Algorithm-WordLevelStatistics-0.03/

#         The code is an implementation of the spatial statistics described in
#         the following paper:
#         @article{carpena2009level,
#           title={Level statistics of words: Finding keywords in literary texts and symbolic sequences},
#           author={Carpena, P and Bernaola-Galv{\'a}n, P and Hackenberg, M and Coronado, AV and Oliver, JL},
#           journal={Physical Review E},
#           volume={79},
#           number={3},
#           pages={035102},
#           year={2009},
#           publisher={APS}
#         }

#         Author: Shubhanshu Mishra
#         Published: December 29, 2014
#         License: GPL3
#         '''
#         if percentile_C is not None:
#             self.percentile_C = percentile_C

#         if word_pos is not None:
#             self.word_pos = word_pos
#         elif corpus_file is not None:
#             self.word_pos = dict()
#             self.pos_counter = 0
#             if isinstance(corpus_file, list):
#                 for c in corpus_file:
#                     self.gen_word_pos(c)
#             else:
#                 self.gen_word_pos(corpus_file)

#     def gen_word_pos(self, corpus_file):
#         # with open(corpus_file, encoding='utf-8') as fp:
#         text = corpus_file.read()  # .lower()
#         tokens = re.findall('\w+', text)
#         for t in tokens:
#             if t not in self.word_pos:
#                 self.word_pos[t] = []
#             self.word_pos[t].append(self.pos_counter)
#             self.pos_counter += 1

#     def compute_spectra(self):
#         if self.word_pos is None or len(self.word_pos.keys()) < 1:
#             return None
#         # Count total words in the text.
#         self.tot_words = sum([len(self.word_pos[k]) for k in self.word_pos.keys()])

#         # Compute level statistics of all terms
#         self.level_stat = []
#         for k in self.word_pos.keys():
#             ls = self.compute_spectrum(k)
#             self.level_stat.append(ls)

#         # Sort level_stat frequency, use index in this list for vocab.
#         self.level_stat = sorted(self.level_stat,
#                                  key=lambda x: x['count'],
#                                  reverse=True)

#         # Add index to keep track of vocab: higher freq <-> lower index.
#         for n, vocab_entry in enumerate(self.level_stat):
#             vocab_entry['vocab_index'] = n

#         self.threshold = stats.scoreatpercentile(    ## TODO: Compute this directly, dont import extra lib.
#             [t['C'] for t in self.level_stat], self.percentile_C)
#         self.level_stat_thresholded = [t for t in self.level_stat if t['C'] > self.threshold]

#         # Significant terms
#         self.significant_terms = [t['word'] for t in self.level_stat_thresholded]

#     def compute_spectrum(self, word):
#         positions = self.word_pos[word]
#         n = len(positions)
#         ls = {'word': word, 'count': n, 'C': 0, 'sigma_nor': 0}
#         if n > 3:
#             # position -> distance from preceding element in text
#             tmp = [positions[i+1] - positions[i] for i in range(n-1)]
#             # len(tmp) = n-1
#             avg = sum(tmp)*1.0/(n-1)
#             sigma = sum([(k-avg)**2 for k in tmp])*1.0/(n-1)
#             sigma = (sigma**(0.5))/avg

#             p = n*1.0/self.tot_words
#             ls['sigma_nor'] = sigma/((1.0-p)**.5)

#             ls['C'] = (ls['sigma_nor'] - (2.0*n-1.0)/(2.0*n+2.0))\
#                        * ((n**0.5) * (1.0+2.8*n**-0.865))
#         return ls

In [None]:
fp = StringIO(all_text)
word_level_statistics = WordLevelStatistics(corpus_file=fp, percentile_C=98)
word_level_statistics.compute_spectra()

lvls_df = pd.DataFrame(word_level_statistics.level_stat_thresholded)
significant_terms = word_level_statistics.significant_terms
print('Threshold: {}, ({} percentile) find {} significant terms.'.format(
                             word_level_statistics.threshold,
                             word_level_statistics.percentile_C,
                             len(significant_terms)))

In [None]:
lvls_df = lvls_df.sort_values(by='sigma_nor', ascending=False)
lvls_df['reading form'] = lvls_df.word.map(get_reading_form)
lvls_df.head(100)

In [None]:
block.split(' ')

In [None]:
word = '斎宮'
for n, chapter in enumerate(chapter_names):
    for section in genji_data[chapter].keys():
        for block in genji_data[chapter][section]:
            if word in block:
                print(n+1, chapter, section, block)
                print('='*40)

In [None]:
def text_length(text):
    """Return the number of whitespace-separated tokens in ``text``.

    Args:
        text: The string to measure.

    Returns:
        The token count. Values that are not strings (``None``, NaN, ...)
        count as 0 so the function can be mapped over a column with gaps.
    """
    if not isinstance(text, str):
        return 0
    # str.split() with no argument splits on any run of whitespace; wrapping
    # the string in a StringIO first added nothing.
    return len(text.split())

In [None]:
def keywords(text, percentile=90, top_n=5):
    """Return the top significant terms of ``text`` joined with '|'.

    A ``WordLevelStatistics`` object is built over ``text`` and its
    ``level_stat_thresholded`` entries are ranked by ``sigma_nor`` in
    descending order.

    Args:
        text: The text to analyse.
        percentile: Passed to ``WordLevelStatistics`` as ``percentile_C``.
        top_n: Maximum number of terms to return.

    Returns:
        Up to ``top_n`` words joined with '|', or '' when the thresholded
        list has no 'sigma_nor' column to sort on (the text is then printed
        for inspection).
    """
    wls = WordLevelStatistics(corpus_file=StringIO(text), percentile_C=percentile)
    wls.compute_spectra()

    terms_df = pd.DataFrame(wls.level_stat_thresholded)
    try:
        terms_df = terms_df.sort_values(by='sigma_nor', ascending=False)
    except KeyError:
        # An empty thresholded list yields a frame without a 'sigma_nor'
        # column. Only this case is handled, so unrelated errors and
        # KeyboardInterrupt are no longer swallowed.
        print(text)
        return ''
    return '|'.join(terms_df['word'].head(top_n))

In [None]:
df['text length'] = df['text'].map(text_length)
df['keywords'] = df['text'].map(keywords)

In [None]:
mask = df.keywords.str.contains('スパルタ')
df[mask]

In [None]:
df.iloc[55].text

In [None]:
mask = df.issue == str(13)
df[mask]

In [None]:
from ipywidgets import HTML, Image, Layout, Button, Label
from ipywidgets import HBox, VBox, Box

In [None]:
import plotly.graph_objs as go

In [None]:
no_terms = 30
word_list = list(lvls_df['word'].head(no_terms))
positions = [word_level_statistics.word_pos[word] for word in word_list]
# Context window of up to five tokens (two on either side) for every token
# position. The start index is clamped at 0: a negative start is interpreted
# as "from the end" and left the windows of the first two tokens empty.
keywords_in_context = [' '.join(word_level_statistics.tokens[max(n-2, 0):n+3]) for n,w in enumerate(word_level_statistics.tokens)]

# Reverse both lists together so words and positions stay paired.
word_list.reverse()
positions.reverse()

fig1 = go.FigureWidget()
for w, p in zip(word_list, positions):
    # Trace properties are passed to add_scatter directly. Since plotly 4 the
    # add_* methods return the figure rather than the new trace, so assigning
    # attributes on the return value no longer configures the trace.
    fig1.add_scatter(
        x=p,
        y=[w]*len(p),
        mode='markers',
        marker=dict(symbol='line-ns-open', color='grey'),
        name=w,
        hovertext=[keywords_in_context[n] for n in p],
        hoverinfo='text',
    )

# One label per number 1..54: even numbers are shown as text, odd numbers get
# an empty string.
ticklabels = []
for n in range(1, 55):
    ticklabels.append(str(n) if n % 2 == 0 else '')

layout = go.Layout(
    title='Word Distributions for Top {} Significant Terms'.format(no_terms),
    showlegend=False,
    autosize=True,
#     width=1000,
    height=700,
    margin=go.layout.Margin(
        l=50,
        r=50,
        b=100,
        t=100,
        pad=4
    ),
    hovermode='closest',
#     paper_bgcolor='#7f7f7f',
#     plot_bgcolor='#c7c7c7',
    xaxis=dict(
        title=None,
        titlefont=dict(
            family='Arial, sans-serif',
            size=18,
            color='lightgrey'
        ),
        showticklabels=True,
#         ticks='outside',
        tickangle=45,
        tickfont=dict(
            family='Old Standard TT, serif',
            size=14,
            color='black'
        ),
        tickvals=chapter_boundaries,
        ticktext=ticklabels,
        automargin=True,
        showgrid=True,
        zeroline=False,
        showline=False,
    ),
    yaxis=dict(
        title=None,
        titlefont=dict(
            family='Arial, sans-serif',
            size=18,
            color='lightgrey'
        ),
        showticklabels=True,
        automargin=True,
        tickangle=0,
        tickfont=dict(
            family='Old Standard TT, serif',
            size=14,
            color='black'
        ),
        tickvals=word_list,
        showgrid=True,
        zeroline=False,
        showline=False,
    )
)

fig1.layout = layout

In [None]:
fig1

In [None]:
lvls_df.head(100)

## Word Clusters

In [None]:
import pymagnitude
import hdbscan
import numpy as np
from collections import Counter
try:
    import umap
    print("Using: umap")
except ImportError:
    import bhtsne

In [None]:
from src.models import enrich_significant_terms, topic_exemplars, display_topics, topic_order_index, hdbscan_parameter_search, enumerate_exemplars
from IPython.core.display import display, HTML

In [None]:
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()
formatter = logging.Formatter('%(asctime)s %(message)s',"%b-%d-%Y %H:%M:%S")
logger.handlers[0].setFormatter(formatter)
logging.getLogger('joblib').setLevel(logging.ERROR)

In [None]:
# background_model = '../data/external/wiki-news-300d-1M.magnitude'
background_model = '/Users/ray/data/models/cc.ja.300.magnitude'
background_vectors = pymagnitude.Magnitude(background_model)

In [None]:
# local_vectors = '../models/hobbit/wordvectors_rare15_spl_window5_bag_hash0_dim200_sqrt_cca_pseudo0_ce0P75_se0.magnitude'
# local_vectors = pymagnitude.Magnitude(local_vectors)

In [None]:
# vectors = pymagnitude.Magnitude(local_vectors, background_vectors)
# vectors = local_vectors
vectors = background_vectors

In [None]:
significant_terms = list(lvls_df['word'])
significant_vectors = vectors.query(significant_terms)

In [None]:
try:
    # Two-stage reduction: a 10-D embedding that later cells consume as
    # vec_10d, then a 2-D embedding of it as vec_2d.
    fit = umap.UMAP(n_neighbors=15, n_components=10, metric='euclidean')
    vec_10d = fit.fit_transform(significant_vectors)
    fit = umap.UMAP(n_neighbors=15, n_components=2, metric='euclidean')
    vec_2d = fit.fit_transform(vec_10d)
except Exception as ex:
    logging.error("Trying bhtsne. Got exception {}".format(ex))
    # Imported here because the import cell only loads bhtsne when umap is
    # missing; if umap imported fine but failed above, the name is unbound.
    import bhtsne
    # np.asfarray was removed in NumPy 2.0; np.asarray with an explicit
    # float dtype performs the same conversion.
    vec_2d = bhtsne.tsne(np.asarray(significant_vectors, dtype='float64'), dimensions=2)
    # Later cells need vec_10d as well; without UMAP the 2-D embedding is the
    # only reduced representation available, so reuse it.
    vec_10d = vec_2d

In [None]:
significant_terms_enriched = enrich_significant_terms(lvls_df, vec_10d, vec_2d, 'leaf')
exemplar_scores, hovers = topic_exemplars(significant_terms_enriched)
summary = pd.DataFrame([h.split(':') for h in hovers], columns=['topic', 'terms'])

In [None]:
len(significant_terms_enriched[significant_terms_enriched['topic']==-1])

In [None]:
mask = (lvls_df.word == '器量')
lvls_df[mask]

**TODO:** Cache (full) results from dictionary lookups.

In [None]:
def jisho_lookup(word):
    """Return the English definitions of the first sense of ``word`` on jisho.org.

    Queries the jisho.org word-search API and joins the
    ``english_definitions`` of the first sense of the first result with ' | '.

    Args:
        word: The keyword to look up.

    Returns:
        The joined definitions, or '' when the response contains no such
        entry.

    Raises:
        requests.RequestException: On network failure or timeout.
        ValueError: If the response body is not valid JSON.
    """
    # `params` lets requests percent-encode the keyword, so characters such as
    # '&' or '#' cannot corrupt the query string. The timeout prevents one
    # stalled request from blocking a whole batch of lookups.
    response = requests.get(
        'https://jisho.org/api/v1/search/words',
        params={'keyword': word},
        timeout=10,
    )
    word_def = response.json()
    try:
        return ' | '.join(word_def['data'][0]['senses'][0]['english_definitions'])
    except (KeyError, IndexError, TypeError):
        # No result, or a response laid out differently than expected:
        # report "no definition" rather than failing the lookup.
        return ''

In [None]:
lvls_df['definition'] = list(map(jisho_lookup, lvls_df['word']))

In [None]:
word_topic_map = dict(lvls_df[['word', 'topic']].values)

In [None]:
word = 'やっかい'
# The query word itself heads the list with score 1.0, followed by the 15
# entries returned by most_similar_approx on the background vectors.
neighbors = [(word, 1.0)]
neighbors.extend(background_vectors.most_similar_approx(word, topn=15))
for w, s in neighbors:
    # '_' marks words that have no entry in word_topic_map.
    t = word_topic_map.get(w, '_')
    print(w, t, s, jisho_lookup(w))

In [None]:
from sklearn.neighbors import NearestNeighbors

In [None]:
nbrs = NearestNeighbors(n_neighbors=15, algorithm='ball_tree').fit(vec_10d)
distances, indices = nbrs.kneighbors(vec_10d)

In [None]:
index_no = 5
# Print the neighbours of term `index_no` whose distance is below 0.6,
# together with their topic ('_' when unknown) and dictionary definition.
for n, word_num in enumerate(indices[index_no]):
    if not distances[index_no][n] < 0.6:
        continue
    w = significant_terms[word_num]
    t = word_topic_map.get(w, '_')
    print(w, t, distances[index_no][n], jisho_lookup(w))

In [None]:
topic_no = 7
mask = (lvls_df.topic == topic_no)
lvls_df[mask]

In [None]:
topics, top_columns = display_topics(significant_terms_enriched, n_rows=20, n_cols=35)
topics = topics.fillna('')
print('{} topics'.format(significant_terms_enriched['topic'].max()))
display(HTML(topics.to_html(index=False)))

In [None]:
word_list = list(lvls_df[mask]['word'])
positions = [word_level_statistics.word_pos[word] for word in word_list]
# Context window of up to five tokens (two on either side) for every token
# position. The start index is clamped at 0: a negative start is interpreted
# as "from the end" and left the windows of the first two tokens empty.
keywords_in_context = [' '.join(word_level_statistics.tokens[max(n-2, 0):n+3]) for n,w in enumerate(word_level_statistics.tokens)]

# Reverse both lists together so words and positions stay paired.
word_list.reverse()
positions.reverse()

fig = go.FigureWidget()
for w, p in zip(word_list, positions):
    # Trace properties are passed to add_scatter directly. Since plotly 4 the
    # add_* methods return the figure rather than the new trace, so assigning
    # attributes on the return value no longer configures the trace.
    fig.add_scatter(
        x=p,
        y=[w]*len(p),
        mode='markers',
        marker=dict(symbol='line-ns-open', color='grey'),
        name=w,
        hovertext=[keywords_in_context[n] for n in p],
        hoverinfo='text',
    )

# One label per number 1..54: even numbers are shown as text, odd numbers get
# an empty string.
ticklabels = []
for n in range(1, 55):
    ticklabels.append(str(n) if n % 2 == 0 else '')

layout = go.Layout(
    title='Word Distributions for Topic {}'.format(topic_no),
    showlegend=False,
    autosize=True,
#     width=1000,
    height=700,
    margin=go.layout.Margin(
        l=50,
        r=50,
        b=100,
        t=100,
        pad=4
    ),
#     paper_bgcolor='#7f7f7f',
#     plot_bgcolor='#c7c7c7',
    xaxis=dict(
        title=None,
        titlefont=dict(
            family='Arial, sans-serif',
            size=18,
            color='lightgrey'
        ),
        showticklabels=True,
#         ticks='outside',
        tickangle=45,
        tickfont=dict(
            family='Old Standard TT, serif',
            size=14,
            color='black'
        ),
        tickvals=chapter_boundaries,
        ticktext=ticklabels,
        automargin=True,
        showgrid=True,
        zeroline=False,
        showline=False,
    ),
    yaxis=dict(
        title=None,
        titlefont=dict(
            family='Arial, sans-serif',
            size=18,
            color='lightgrey'
        ),
        showticklabels=True,
        automargin=True,
        tickangle=0,
        tickfont=dict(
            family='Old Standard TT, serif',
            size=14,
            color='black'
        ),
        tickvals=word_list,
        showgrid=True,
        zeroline=False,
        showline=False,
    )
)

fig.layout = layout
fig.layout.hovermode = 'closest'

In [None]:
fig

In [None]:
lvls_df[mask]