### Requirements
1. Google Driveをマウントする
2. GPUランタイムで実行する

### 必要なライブラリをインストール

In [1]:
!pip install transformers fugashi ipadic tensorboardX

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/f9/54/5ca07ec9569d2f232f3166de5457b63943882f7950ddfcc887732fc7fb23/transformers-4.3.3-py3-none-any.whl (1.9MB)
[K     |████████████████████████████████| 1.9MB 7.9MB/s 
[?25hCollecting fugashi
[?25l  Downloading https://files.pythonhosted.org/packages/55/9c/009da34dd111e84f54eef833c84afb5c744a0306af8546014a958e1967a0/fugashi-1.1.0-cp37-cp37m-manylinux1_x86_64.whl (486kB)
[K     |████████████████████████████████| 491kB 35.8MB/s 
[?25hCollecting ipadic
[?25l  Downloading https://files.pythonhosted.org/packages/e7/4e/c459f94d62a0bef89f866857bc51b9105aff236b83928618315b41a26b7b/ipadic-1.0.0.tar.gz (13.4MB)
[K     |████████████████████████████████| 13.4MB 243kB/s 
[?25hCollecting tensorboardX
[?25l  Downloading https://files.pythonhosted.org/packages/af/0c/4f41bcd45db376e6fe5c619c01100e9b7531c55791b7244815bac6eac32c/tensorboardX-2.1-py2.py3-none-any.whl (308kB)
[K     |██████████████████████████████

### データを読み込むための処理

1. レポジトリ内に`data`ディレクトリを作成

In [2]:
import pathlib
import os

In [3]:
repository_dir = '/content/drive/MyDrive/research/brigade-visualizer/source_code/'
repository_dir = pathlib.Path(repository_dir)

In [4]:
# レポジトリのディレクトリに移動
os.chdir(str(repository_dir))
print("current_dir: %s" % os.getcwd())

# dataディレクトリを作成
pathlib.Path('data').mkdir(exist_ok=True)

current_dir: /content/drive/MyDrive/research/brigade-visualizer/source_code


2. crawlingをしたデータを`data`ディレクトリ直下に置く
  - crawling data: https://drive.google.com/drive/folders/1CJM4aGf9h6shnBxsK24Ylagdiv5TXGBc?usp=sharing
  - `exact_match/contentData` ディレクトリの内容を用いる

In [5]:
# 確認する
data_dir = repository_dir / "data" / "CodeForBrigade" / "exact_match" / "contentData"
print("crawling data Path: %s" % data_dir)
print("crawling data Exists: %s" % data_dir.exists())

crawling data Path: /content/drive/MyDrive/research/brigade-visualizer/source_code/data/CodeForBrigade/exact_match/contentData
crawling data Exists: True


3. レポジトリ内の`bert`ディレクトリに移動

In [6]:
# bertディレクトリに移動
os.chdir(str(repository_dir / "bert"))
print("current_dir: %s" % os.getcwd())

current_dir: /content/drive/MyDrive/research/brigade-visualizer/source_code/bert


### BERTモデルを読み込む
- 東北大学 乾・鈴木研究室が公開している[訓練済み日本語BERTモデル](https://www.nlp.ecei.tohoku.ac.jp/news-release/3284/)を使用
  - https://github.com/cl-tohoku/bert-japanese

In [7]:
from transformers import (BertJapaneseTokenizer, BertModel)

tokenizer = BertJapaneseTokenizer.from_pretrained('cl-tohoku/bert-base-japanese-whole-word-masking')
model = BertModel.from_pretrained('cl-tohoku/bert-base-japanese-whole-word-masking')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=257706.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=479.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=445021143.0, style=ProgressStyle(descri…




In [8]:
import torch

if torch.cuda.is_available():
  print("We will use GPU: %s" % torch.cuda.get_device_name(0))
  device = torch.device('cuda:0')
else:
  print("No GPU available, use CPU instead.")
  device = torch.device('cpu')

model.to(device)
print(model.device)

We will use GPU: Tesla T4
cuda:0


### 文のベクトルを取得する
- 最後から2つ目の隠れ層を取得し平均したものを文章の埋め込みベクトルとする
- bert-as-serviceを参考
  - https://github.com/hanxiao/bert-as-service#q-how-do-you-get-the-fixed-representation-did-you-do-pooling-or-something
  

In [9]:
import unicodedata

def preprocess_text(text):
	# 先頭・末尾の空白を削除
  text = text.strip()
	
  # 改行を空白に置換
  text = text.replace("\n", " ")

  # NFKCで文字の正規化
  text = unicodedata.normalize("NFKC", text)
	
  return text

In [10]:
def sentence_representation(text):
  text = preprocess_text(text)

  encoded = tokenizer.encode_plus(
      text, 
      max_length=512,
      padding="max_length",
      truncation=True,
      return_tensors='pt').to(device)
  
  encoded["return_dict"] = True
  encoded["output_hidden_states"] = True

  with torch.no_grad():
    result = model(**encoded)
    return torch.mean(result.hidden_states[-2], 1).to(torch.device('cpu'))

### クローリング結果を読み込む

In [11]:
data_dir

PosixPath('/content/drive/MyDrive/research/brigade-visualizer/source_code/data/CodeForBrigade/exact_match/contentData')

In [37]:
cos = torch.nn.CosineSimilarity(dim=1)

In [46]:
import json 
from collections import OrderedDict

brigade_rep_dic = OrderedDict()

for crawlingData_path in [f for f in data_dir.glob("*.json") if f.is_file()]:
  with open(crawlingData_path, encoding="utf_8_sig") as f:
    brigade_name = crawlingData_path.name[:-5]
    
    if brigade_name == "Code_for_Mitaka__Musashino":
      brigade_name = "Code for Mitaka / Musashino"
    elif brigade_name == "Code_for_Shiga__Biwako":
      brigade_name = "Code for Shiga / Biwako"
    else:
      brigade_name = brigade_name.replace("_", " ")    

    reader = json.load(f)
    crawlingData = reader["data"]

    brigade_rep = torch.zeros(0)
    
    for contents_id, crawlingContents in crawlingData.items():
      context = crawlingContents["context"]
      if context is None:
        continue
      
      context_rep = sentence_representation(context)
      brigade_rep = torch.cat((brigade_rep, context_rep))
    
    brigade_rep = torch.mean(torch.unsqueeze(brigade_rep, 0), 1)
    brigade_rep_dic[brigade_name] = brigade_rep

In [55]:
brigade_distance_dic = {
    "civicname1": [],
    "civicname2": [],
    "distance": []
}

brigade_rep_list = list(brigade_rep_dic.items())

for brigade1_idx, (brigade1_name, brigade1_sentence_rep) in enumerate(brigade_rep_list):
  for brigade2_idx, (brigade2_name, brigade2_sentence_rep) in enumerate(brigade_rep_list[brigade1_idx+1:]):
    brigade_distance_dic["civicname1"].append(brigade1_name)
    brigade_distance_dic["civicname2"].append(brigade2_name)
    
    distance = cos(brigade1_sentence_rep, brigade2_sentence_rep)
    brigade_distance_dic["distance"].append(distance.clone().numpy()[0])

In [56]:
import pandas as pd

df_brigade_distance = pd.DataFrame.from_dict(brigade_distance_dic).sort_values('distance', ascending=False)
display(df_brigade_distance)

Unnamed: 0,civicname1,civicname2,distance
180,Code for Fukuoka,Code for Sabae,0.999242
173,Code for Fukuoka,Code for Kusatsu,0.999198
181,Code for Fukuoka,Code for Saga,0.999102
399,Code for Kusatsu,Code for MIKAWA,0.999034
525,Code for Sabae,Code for Saga,0.998978
...,...,...,...
358,Code for Kobe,Code for Muroran,0.959523
475,Code for Muroran,Code for 中野,0.958961
146,Code for Fuchu,Code for Muroran,0.957956
471,Code for Muroran,Code for TODA,0.956891


#### CODE for GIFU, Code for AICHIとの距離を列挙

In [57]:
display(df_brigade_distance[(df_brigade_distance['civicname1'] == "CODE for GIFU") | (df_brigade_distance['civicname2'] == "CODE for GIFU")].sort_values('distance', ascending=False).head(5))

display(df_brigade_distance[(df_brigade_distance['civicname1'] == "Code for AICHI") | (df_brigade_distance['civicname2'] == "Code for AICHI")].sort_values('distance', ascending=False).head(5))

Unnamed: 0,civicname1,civicname2,distance
203,CODE for GIFU,Code for MIKAWA,0.998851
206,CODE for GIFU,Code for NAGAREYAMA,0.998021
220,CODE for GIFU,Code for Toyota,0.997998
202,CODE for GIFU,Code for Kusatsu,0.997842
216,CODE for GIFU,Code for Suginami,0.99743


Unnamed: 0,civicname1,civicname2,distance
31,Code for AICHI,Code for Toyota,0.998467
5,Code for AICHI,CODE for GIFU,0.997373
18,Code for AICHI,Code for Nagoya,0.996982
12,Code for AICHI,Code for Kumagaya,0.996925
0,Code for AICHI,CODE for AIZU,0.996778


In [61]:
text_list = []

for row in df_brigade_distance.itertuples():
  text_list.append(
      {
        "civicname1": row.civicname1,
        "civicname2": row.civicname2,
        "distance": row.distance
       }
  )

with open("../json/brigade-distance-bert_crawling.json", 'w') as f:
  json.dump(text_list, f, ensure_ascii=False, indent=3)