In [1]:
import pandas as pd
import numpy as np
import string
import re

from nltk.tokenize import word_tokenize
from collections import Counter
from functools import reduce

In [3]:
lay = pd.read_json(path_or_buf='data.jsonl', lines=True)

In [4]:
lay.head()

Unnamed: 0,id_pair,id_premis,premis_text,hypothesis_text,label,data_split,annotator_type,sentence_size
0,101000,10100,"Selain itu, ia juga memiliki andil dari rumah ...",Ia memiliki teman.,e,train,lay,single
1,101001,10100,"Selain itu, ia juga memiliki andil dari rumah ...",Teman-temannya memiliki bagian juga.,n,train,lay,single
2,101003,10100,"Selain itu, ia juga memiliki andil dari rumah ...",Ia bermain di rumah sandiwara.,e,train,lay,single
3,101010,10101,Kualitas yang rendah ini dapat mengakibatkan k...,Komunikasi terganggu akibat kualitas tinggi.,n,train,lay,single
4,101011,10101,Kualitas yang rendah ini dapat mengakibatkan k...,Komunikasi terganggu akibat kualitas rendah.,n,train,lay,single


In [5]:
class BaseTokenizer:
  """Regex-based word tokenizer.

  Every whitespace run in the input is collapsed to one space and the text is
  trimmed; the token pattern is then applied with ``findall`` and the
  zero-length matches it produces are dropped.
  """

  # One or more consecutive whitespace characters.
  _whitespace_pattern = r"\s+"
  # A single capture group of ordered alternatives (the first that matches wins):
  #   digits followed by "-an"; an optionally signed number with an optional
  #   "," or "." before its last digit run; capital + lowercase + "."; dotted
  #   sequences that start with capitals; "..."; "--"; a word directly before
  #   "n't"; "n't"; a word directly before an apostrophe clitic; the clitic
  #   together with the whitespace character after it; any single character
  #   that is not a word character, whitespace or "+"; finally a (possibly
  #   empty) run of word characters and hyphens.
  # NOTE(review): "[m|s]" is a character class, so it accepts a literal "|" as
  # well as "m"/"s" -- confirm whether "[ms]" was intended.
  # NOTE(review): "+" is excluded from the single-character alternative, so a
  # "+" that is not part of a number only produces an empty match and is
  # dropped from the output -- confirm this is wanted.
  _tokenize_pattern = r'([0-9]+\-an|[+-]?[0-9]*[,.]?[0-9]+|[A-Z][a-z]\.|(?:[A-Z]+\.)(?:[A-Za-z]+\.){1,}|\.\.\.|\-\-|\w+(?=n\'t)|n\'t|\w+(?=\'[m|s]\s)|\'[m|s]\s|[^\w\s+]|(?:[\w-]{0,}))'

  def __init__(self):
    """Compile both patterns once so every ``tokenize`` call reuses them."""
    self.whitespace_regex = re.compile(self._whitespace_pattern)
    self.regex = re.compile(self._tokenize_pattern)

  def tokenize(self, sent):
    """Return the non-empty tokens of ``sent`` in their original order.

    Args:
      sent: text to split.

    Returns:
      list of token strings; empty matches of the pattern are filtered out.
    """
    normalised = self.whitespace_regex.sub(" ", sent).strip()
    return list(filter(None, self.regex.findall(normalised)))

  def __getspaceafterflag(self, tokens):
    """Flag the non-empty tokens that are glued to the token after them.

    ``tokens`` is expected to be the raw ``findall`` result, i.e. still
    containing the empty strings. Element ``k`` of the result is True when no
    empty string lies between the k-th and (k+1)-th non-empty tokens; the last
    element is always False. Not called by ``tokenize``.
    """
    glued_to_previous = []
    gap_seen = True  # the first token counts as preceded by a gap
    for piece in tokens:
      if piece == '':
        gap_seen = True
        continue
      glued_to_previous.append(not gap_seen)
      gap_seen = False

    # Shift by one so each flag describes what follows the token.
    return glued_to_previous[1:] + [False]
  
tokens = BaseTokenizer()

In [6]:
def get_seq_counter(*seq):
  """Wrap every positional sequence in a ``Counter`` and return them as a list."""
  return list(map(Counter, seq))

def intersect_counter(*seq):
  """Fold ``&`` over the given counters and return the result.

  Works on a copy of the first argument, so none of the inputs is modified.
  """
  first, others = seq[0], seq[1:]
  shared = first.copy()
  for other in others:
    shared &= other
  return shared

def union_counter(*seq):
  """Fold ``|`` over the given counters and return the result.

  Works on a copy of the first argument, so none of the inputs is modified.
  """
  first, others = seq[0], seq[1:]
  merged = first.copy()
  for other in others:
    merged |= other
  return merged

def count(counter):
  """Return how many distinct elements iterating over ``counter`` yields."""
  distinct = set(counter)
  return len(distinct)

def jaccard_sim(*seq, verb=False):
  """Jaccard similarity of the distinct items of the given sequences.

  Only distinct items matter (repeats are ignored), so plain sets are used.

  Args:
    *seq: one or more iterables of hashable items (e.g. token lists).
    verb: when True, print the intersection and the union before returning.

  Returns:
    ``len(intersection) / len(union)``, or NaN when every sequence is empty
    (the ratio is undefined; a plain division would raise ZeroDivisionError).

  Raises:
    IndexError: if no sequence is given.
  """
  item_sets = [set(s) for s in seq]
  shared = item_sets[0].intersection(*item_sets[1:])
  combined = item_sets[0].union(*item_sets[1:])

  if verb:
    print('intersect:', shared)
    print('union:', combined)

  if not combined:
    # 0 / 0: report the value as missing rather than crashing a whole .apply().
    return float('nan')
  return len(shared) / len(combined)

def intersection_len(*seq, premis=True, verb=False):
  """Share of one sequence's distinct items that occur in all sequences.

  seq[0] is the premis
  seq[1] is the hypothesis

  Args:
    *seq: iterables of hashable items (e.g. token lists).
    premis: when True the denominator is the number of distinct items in
      seq[0]; otherwise the number of distinct items in seq[1].
    verb: when True, print the intersection and the denominator.

  Returns:
    ``len(intersection) / denominator``, or NaN when the denominator is 0
    (the ratio is undefined; a plain division would raise ZeroDivisionError).

  Raises:
    IndexError: if the sequence that supplies the denominator is missing.
  """
  item_sets = [set(s) for s in seq]
  shared = item_sets[0].intersection(*item_sets[1:])
  denom = len(item_sets[0]) if premis else len(item_sets[1])

  if verb:
    print('intersect:', shared)
    print('denom:', denom)

  if denom == 0:
    # 0 / 0: report the value as missing rather than crashing a whole .apply().
    return float('nan')
  return len(shared) / denom

def cosine_ochiai(*seq):
  """Ochiai (set-cosine) coefficient of the distinct items of the sequences.

  The number of distinct items common to all sequences is divided by the
  geometric mean of the per-sequence distinct-item counts (for two sequences:
  ``|A & B| / sqrt(|A| * |B|)``).

  Args:
    *seq: one or more iterables of hashable items (e.g. token lists).

  Returns:
    The coefficient as a float, or NaN when any sequence is empty (the
    denominator would be 0 and a plain division would raise ZeroDivisionError).

  Raises:
    IndexError: if no sequence is given.
  """
  item_sets = [set(s) for s in seq]
  shared = item_sets[0].intersection(*item_sets[1:])
  size_product = reduce(lambda x, y: x * y, (len(s) for s in item_sets))
  if size_product == 0:
    # At least one sequence has no items: the coefficient is undefined.
    return float('nan')
  return len(shared) / pow(size_product, 1.0 / len(item_sets))

def show_matrix(m):
  """Print ``m`` row by row, every cell as an integer right-aligned in 4 columns.

  The width of the first row is used for every row.
  """
  n_rows = len(m)
  n_cols = len(m[0])
  for r in range(n_rows):
    line = "".join("%4d" % m[r][c] for c in range(n_cols))
    print(line)
    
def _lcs_tokens(text, punct, tokeniser):
  # Turn one raw sentence into the lower-cased token list that lcs compares.
  if tokeniser != 'nltk' and not callable(tokeniser):
    # Any other value selects the module-level regex tokenizer; `punct` has
    # no effect on this path.
    # NOTE(review): the text is lower-cased before it reaches the regex
    # tokenizer, whose abbreviation alternatives require capital letters, so
    # they cannot match here -- confirm this is intended.
    return tokens.tokenize(text.lower())
  cleaned = re.sub(r'[\.\?\'"!,]', '', text) if punct else text
  split = word_tokenize if tokeniser == 'nltk' else tokeniser
  return list(split(cleaned.lower()))


def lcs(*seq, mode=0, punct=False, tokeniser='nltk'):
  """Longest-common-subsequence ratio between two sentences, at token level.

  Args:
    *seq: seq[0] and seq[1] are the two raw strings to compare.
    mode: falsy -> divide by the token count of seq[0];
      truthy -> divide by the token count of seq[1].
    punct: when True the characters . ? ' " ! , are REMOVED before
      tokenising (despite the name, False keeps the punctuation). Ignored by
      the regex-tokenizer path.
    tokeniser: 'nltk' for ``word_tokenize``, a callable mapping a string to a
      sequence of tokens, or any other value for the module-level ``tokens``
      regex tokenizer.

  Returns:
    LCS length divided by the selected token count, or NaN when that count is
    0 (returned explicitly instead of through a numpy 0/0 RuntimeWarning).
  """
  s1 = _lcs_tokens(seq[0], punct, tokeniser)
  s2 = _lcs_tokens(seq[1], punct, tokeniser)
  n1, n2 = len(s1), len(s2)

  denom = n2 if mode else n1
  if denom == 0:
    return float('nan')

  # mat[i, j] = LCS length of s1[:i] and s2[:j]; row 0 and column 0 stay 0.
  mat = np.zeros((n1 + 1, n2 + 1), dtype=np.int64)
  for i in range(1, n1 + 1):
    for j in range(1, n2 + 1):
      if s1[i - 1] == s2[j - 1]:
        mat[i, j] = mat[i - 1, j - 1] + 1
      else:
        mat[i, j] = max(mat[i - 1, j], mat[i, j - 1])

  return mat[n1, n2] / denom



In [7]:
# Token-overlap features for the lay-annotated pairs.
# Each text is lower-cased and NLTK-tokenised once; the set-based metrics below
# reuse these lists instead of re-tokenising for every column.
lay_prem_tokens = lay.premis_text.map(lambda text: word_tokenize(text.lower()))
lay_hyp_tokens = lay.hypothesis_text.map(lambda text: word_tokenize(text.lower()))
lay_token_pairs = list(zip(lay_prem_tokens, lay_hyp_tokens))
lay_text_pairs = list(zip(lay.premis_text, lay.hypothesis_text))

lay['jaccard_sim'] = [jaccard_sim(p, h) for p, h in lay_token_pairs]
# lay['intersection_premis'] = [intersection_len(p, h) for p, h in lay_token_pairs]
lay['intersection_hypothesis'] = [intersection_len(p, h, premis=False) for p, h in lay_token_pairs]
# lcs tokenises internally; mode=1 normalises by the hypothesis length.
# Column naming: 'lcs_punct' keeps punctuation (punct=False), 'lcs_nopunct'
# strips it (punct=True).
lay['lcs_punct'] = [lcs(p, h, mode=1) for p, h in lay_text_pairs]
lay['lcs_nopunct'] = [lcs(p, h, mode=1, punct=True) for p, h in lay_text_pairs]
# lay['cosine_ochiai'] = [cosine_ochiai(p, h) for p, h in lay_token_pairs]
lay['lcs_aksara'] = [lcs(p, h, mode=1, tokeniser='aksara') for p, h in lay_text_pairs]

In [8]:
lay.head()

Unnamed: 0,id_pair,id_premis,premis_text,hypothesis_text,label,data_split,annotator_type,sentence_size,jaccard_sim,intersection_hypothesis,lcs_punct,lcs_nopunct,lcs_aksara
0,101000,10100,"Selain itu, ia juga memiliki andil dari rumah ...",Ia memiliki teman.,e,train,lay,single,0.1875,0.75,0.75,0.666667,0.75
1,101001,10100,"Selain itu, ia juga memiliki andil dari rumah ...",Teman-temannya memiliki bagian juga.,n,train,lay,single,0.25,0.8,0.4,0.25,0.4
2,101003,10100,"Selain itu, ia juga memiliki andil dari rumah ...",Ia bermain di rumah sandiwara.,e,train,lay,single,0.3125,0.833333,0.666667,0.6,0.666667
3,101010,10101,Kualitas yang rendah ini dapat mengakibatkan k...,Komunikasi terganggu akibat kualitas tinggi.,n,train,lay,single,0.15,0.5,0.333333,0.2,0.333333
4,101011,10101,Kualitas yang rendah ini dapat mengakibatkan k...,Komunikasi terganggu akibat kualitas rendah.,n,train,lay,single,0.210526,0.666667,0.5,0.4,0.5


In [9]:
# Pairs whose LCS ratio depends on the tokenizer (NLTK vs. the regex tokenizer),
# with both tokenisations side by side for inspection.
tokenizer_mismatch = lay['lcs_punct'].ne(lay['lcs_aksara'])
# Single .loc call plus an explicit copy: the new columns are added to an
# independent frame, not to a chained-indexing result of `lay`.
difference = lay.loc[tokenizer_mismatch, ['premis_text', 'hypothesis_text']].copy()
difference['nltk_prem'] = difference.premis_text.map(lambda text: word_tokenize(text.lower()))
difference['nltk_hyp'] = difference.hypothesis_text.map(lambda text: word_tokenize(text.lower()))
difference['aksara_prem'] = difference.premis_text.map(lambda text: tokens.tokenize(text.lower()))
difference['aksara_hyp'] = difference.hypothesis_text.map(lambda text: tokens.tokenize(text.lower()))
difference.head(10)

Unnamed: 0,premis_text,hypothesis_text,nltk_prem,nltk_hyp,aksara_prem,aksara_hyp
78,Kain mori pernah pula amat populer sebagai bah...,Kain mori tidak populer pada era 1970-an.,"[kain, mori, pernah, pula, amat, populer, seba...","[kain, mori, tidak, populer, pada, era, 1970-a...","[kain, mori, pernah, pula, amat, populer, seba...","[kain, mori, tidak, populer, pada, era, 1970-a..."
197,Dokter menyebut pemulihan Ada Zanusso yang ber...,Pemulihan Ada Zanusso adalah harapan baik.,"[dokter, menyebut, pemulihan, ada, zanusso, ya...","[pemulihan, ada, zanusso, adalah, harapan, bai...","[dokter, menyebut, pemulihan, ada, zanusso, ya...","[pemulihan, ada, zanusso, adalah, harapan, bai..."
199,Dokter menyebut pemulihan Ada Zanusso yang ber...,Harapan baik seluruh dunia adalah Ada Zanusso.,"[dokter, menyebut, pemulihan, ada, zanusso, ya...","[harapan, baik, seluruh, dunia, adalah, ada, z...","[dokter, menyebut, pemulihan, ada, zanusso, ya...","[harapan, baik, seluruh, dunia, adalah, ada, z..."
303,"Untuk pasar domestik Jepang, mesin untuk Auris...","Untuk pasar internasional, mesin untuk Auris h...","[untuk, pasar, domestik, jepang, ,, mesin, unt...","[untuk, pasar, internasional, ,, mesin, untuk,...","[untuk, pasar, domestik, jepang, ,, mesin, unt...","[untuk, pasar, internasional, ,, mesin, untuk,..."
304,"Untuk pasar domestik Jepang, mesin untuk Auris...","Untuk pasar domestik Jepang, mesin untuk Auris...","[untuk, pasar, domestik, jepang, ,, mesin, unt...","[untuk, pasar, domestik, jepang, ,, mesin, unt...","[untuk, pasar, domestik, jepang, ,, mesin, unt...","[untuk, pasar, domestik, jepang, ,, mesin, unt..."
305,"Untuk pasar domestik Jepang, mesin untuk Auris...","Untuk pasar domestik Jepang, mesin untuk Auris...","[untuk, pasar, domestik, jepang, ,, mesin, unt...","[untuk, pasar, domestik, jepang, ,, mesin, unt...","[untuk, pasar, domestik, jepang, ,, mesin, unt...","[untuk, pasar, domestik, jepang, ,, mesin, unt..."
306,"Untuk pasar domestik Jepang, mesin untuk Auris...",Mesin untuk Auris di Jepang hanya 1.5 liter 1N...,"[untuk, pasar, domestik, jepang, ,, mesin, unt...","[mesin, untuk, auris, di, jepang, hanya, 1.5, ...","[untuk, pasar, domestik, jepang, ,, mesin, unt...","[mesin, untuk, auris, di, jepang, hanya, 1.5, ..."
307,"Untuk pasar domestik Jepang, mesin untuk Auris...",1.5 liter 1NZ-FE dibutuhkan untuk Auris.,"[untuk, pasar, domestik, jepang, ,, mesin, unt...","[1.5, liter, 1nz-fe, dibutuhkan, untuk, auris, .]","[untuk, pasar, domestik, jepang, ,, mesin, unt...","[1.5, liter, 1, nz-fe, dibutuhkan, untuk, auri..."
308,"Untuk pasar domestik Jepang, mesin untuk Auris...",Auris butuh 2ZR-FE.,"[untuk, pasar, domestik, jepang, ,, mesin, unt...","[auris, butuh, 2zr-fe, .]","[untuk, pasar, domestik, jepang, ,, mesin, unt...","[auris, butuh, 2, zr-fe, .]"
316,Jejaring Pendidikan Nasional (JARDIKNAS) merup...,Sekola bukan institusi pendidikan.,"[jejaring, pendidikan, nasional, (, jardiknas,...","[sekola, bukan, institusi, pendidikan, .]","[jejaring, pendidikan, nasional, (, jardiknas,...","[sekola, bukan, institusi, pendidikan, .]"


In [10]:
# Tokens produced by only one of the two tokenizers:
# *_p columns compare the premise tokenisations, *_h columns the hypothesis ones.
difference['nltk_p'] = [
  set(nltk_toks) - set(aksara_toks)
  for nltk_toks, aksara_toks in zip(difference.nltk_prem, difference.aksara_prem)
]
difference['aksara_p'] = [
  set(aksara_toks) - set(nltk_toks)
  for nltk_toks, aksara_toks in zip(difference.nltk_prem, difference.aksara_prem)
]
difference['nltk_h'] = [
  set(nltk_toks) - set(aksara_toks)
  for nltk_toks, aksara_toks in zip(difference.nltk_hyp, difference.aksara_hyp)
]
difference['aksara_h'] = [
  set(aksara_toks) - set(nltk_toks)
  for nltk_toks, aksara_toks in zip(difference.nltk_hyp, difference.aksara_hyp)
]

In [11]:
difference[['premis_text', 'nltk_p', 'aksara_p']].to_csv('difference.csv', encoding='utf-16', sep='\t')

In [13]:
lay.groupby('label')[['jaccard_sim', 'intersection_hypothesis', 'lcs_punct', 'lcs_nopunct', 'lcs_aksara']].median()

Unnamed: 0_level_0,jaccard_sim,intersection_hypothesis,lcs_punct,lcs_nopunct,lcs_aksara
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
c,0.277778,0.75,0.666667,0.6,0.666667
e,0.32,0.833333,0.714286,0.666667,0.727273
n,0.222222,0.666667,0.555556,0.5,0.571429


In [15]:
lay.groupby('sentence_size')[['jaccard_sim', 'intersection_hypothesis', 'lcs_punct', 'lcs_nopunct', 'lcs_aksara']].median()

Unnamed: 0_level_0,jaccard_sim,intersection_hypothesis,lcs_punct,lcs_nopunct,lcs_aksara
sentence_size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
double,0.238095,0.75,0.625,0.571429,0.625
multiple,0.157895,0.777778,0.666667,0.625,0.666667
single,0.285714,0.75,0.666667,0.6,0.666667


In [16]:
lay.groupby('data_split')[['jaccard_sim', 'intersection_hypothesis', 'lcs_punct', 'lcs_nopunct', 'lcs_aksara']].median()

Unnamed: 0_level_0,jaccard_sim,intersection_hypothesis,lcs_punct,lcs_nopunct,lcs_aksara
data_split,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
dev,0.277778,0.75,0.647059,0.6,0.666667
test,0.272727,0.75,0.666667,0.6,0.666667
train,0.269231,0.75,0.666667,0.6,0.666667


In [17]:
expert = pd.read_json(path_or_buf='expert.jsonl', lines=True)
expert.head()

Unnamed: 0,id_pair,id_premis,author_label,annotation_round,label,premis_text,hypothesis_text,data_split,annotator_type,sentence_size,source,topics
0,31100,3110,e,1,e,Filipina terkenal dengan pertanian padi bukitn...,Suku Batad bertani sejak sekitar 2000 tahun si...,test,expert,single,wiki,geografi
1,31101,3110,n,2,n,Filipina terkenal dengan pertanian padi bukitn...,Filipina mengekspor 2000 ton padi tahun lalu,test,expert,single,wiki,geografi
2,31102,3110,n,1,n,Filipina terkenal dengan pertanian padi bukitn...,Suku Batad terkenal di Filipina.,test,expert,single,wiki,geografi
3,31103,3110,e,2,e,Filipina terkenal dengan pertanian padi bukitn...,Sawah padi Batad berusia ribuan tahun.,test,expert,single,wiki,geografi
4,31104,3110,c,2,c,Filipina terkenal dengan pertanian padi bukitn...,Suku Batad tinggal di pantai.,test,expert,single,wiki,geografi


In [22]:
expert.topics.value_counts()

olahraga               313
film                   209
margasatwa             203
geografi               191
person                 190
bencana                188
politik                179
teknologi              164
ekonomi                161
budaya                 145
entertainment          137
soshum pendidikan      136
saintek                131
hukum/undang-undang    122
kesehatan              118
agama                  113
profil perusahaan      111
profil sekolah          89
homepage pemerintah     84
Name: topics, dtype: int64

In [23]:
# Token-overlap features for the expert-annotated pairs.
# Each text is lower-cased and NLTK-tokenised once; the set-based metrics below
# reuse these lists instead of re-tokenising for every column.
expert_prem_tokens = expert.premis_text.map(lambda text: word_tokenize(text.lower()))
expert_hyp_tokens = expert.hypothesis_text.map(lambda text: word_tokenize(text.lower()))
expert_token_pairs = list(zip(expert_prem_tokens, expert_hyp_tokens))
expert_text_pairs = list(zip(expert.premis_text, expert.hypothesis_text))

expert['jaccard_sim'] = [jaccard_sim(p, h) for p, h in expert_token_pairs]
# expert['intersection_premis'] = [intersection_len(p, h) for p, h in expert_token_pairs]
expert['intersection_hypothesis'] = [intersection_len(p, h, premis=False) for p, h in expert_token_pairs]
# lcs tokenises internally; mode=1 normalises by the hypothesis length.
# Column naming: 'lcs_punct' keeps punctuation (punct=False), 'lcs_nopunct'
# strips it (punct=True).
expert['lcs_punct'] = [lcs(p, h, mode=1) for p, h in expert_text_pairs]
expert['lcs_nopunct'] = [lcs(p, h, mode=1, punct=True) for p, h in expert_text_pairs]
# expert['cosine_ochiai'] = [cosine_ochiai(p, h) for p, h in expert_token_pairs]
expert['lcs_aksara'] = [lcs(p, h, mode=1, tokeniser='aksara') for p, h in expert_text_pairs]

In [24]:
expert.head()

Unnamed: 0,id_pair,id_premis,author_label,annotation_round,label,premis_text,hypothesis_text,data_split,annotator_type,sentence_size,source,topics,jaccard_sim,intersection_hypothesis,lcs_punct,lcs_nopunct,lcs_aksara
0,31100,3110,e,1,e,Filipina terkenal dengan pertanian padi bukitn...,Suku Batad bertani sejak sekitar 2000 tahun si...,test,expert,single,wiki,geografi,0.238095,0.555556,0.333333,0.25,0.333333
1,31101,3110,n,2,n,Filipina terkenal dengan pertanian padi bukitn...,Filipina mengekspor 2000 ton padi tahun lalu,test,expert,single,wiki,geografi,0.263158,0.714286,0.571429,0.571429,0.571429
2,31102,3110,n,1,n,Filipina terkenal dengan pertanian padi bukitn...,Suku Batad terkenal di Filipina.,test,expert,single,wiki,geografi,0.277778,0.833333,0.5,0.4,0.5
3,31103,3110,e,2,e,Filipina terkenal dengan pertanian padi bukitn...,Sawah padi Batad berusia ribuan tahun.,test,expert,single,wiki,geografi,0.2,0.571429,0.428571,0.333333,0.428571
4,31104,3110,c,2,c,Filipina terkenal dengan pertanian padi bukitn...,Suku Batad tinggal di pantai.,test,expert,single,wiki,geografi,0.15,0.5,0.5,0.4,0.5


In [25]:
expert.groupby('label')[['jaccard_sim', 'intersection_hypothesis', 'lcs_punct', 'lcs_nopunct', 'lcs_aksara']].median()

Unnamed: 0_level_0,jaccard_sim,intersection_hypothesis,lcs_punct,lcs_nopunct,lcs_aksara
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
c,0.208333,0.714286,0.625,0.571429,0.625
e,0.210526,0.7,0.6,0.533333,0.6
n,0.150758,0.538462,0.444444,0.375,0.444444


In [26]:
expert.label.value_counts()

e    1041
c     999
n     944
Name: label, dtype: int64

In [27]:
expert.groupby('sentence_size')[['jaccard_sim', 'intersection_hypothesis', 'lcs_punct', 'lcs_nopunct', 'lcs_aksara']].median()

Unnamed: 0_level_0,jaccard_sim,intersection_hypothesis,lcs_punct,lcs_nopunct,lcs_aksara
sentence_size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
double,0.166667,0.666667,0.571429,0.5,0.571429
multiple,0.12,0.666667,0.555556,0.5,0.555556
single,0.235294,0.625,0.545455,0.5,0.545455


In [28]:
expert.sentence_size.value_counts()

single      1534
double      1043
multiple     407
Name: sentence_size, dtype: int64

In [29]:
expert.groupby('source')[['jaccard_sim', 'intersection_hypothesis', 'lcs_punct', 'lcs_nopunct', 'lcs_aksara']].median()

Unnamed: 0_level_0,jaccard_sim,intersection_hypothesis,lcs_punct,lcs_nopunct,lcs_aksara
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
news,0.206623,0.666667,0.545455,0.5,0.545455
web,0.166667,0.666667,0.583333,0.555556,0.6
wiki,0.185185,0.666667,0.555556,0.5,0.555556
wiki/news,0.158947,0.707143,0.571429,0.5,0.571429


In [30]:
expert.source.value_counts()

news         1376
wiki         1066
web           284
wiki/news     258
Name: source, dtype: int64

In [31]:
expert.groupby('topics')[['jaccard_sim', 'intersection_hypothesis', 'lcs_punct', 'lcs_nopunct', 'lcs_aksara']].median()

Unnamed: 0_level_0,jaccard_sim,intersection_hypothesis,lcs_punct,lcs_nopunct,lcs_aksara
topics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
agama,0.15,0.714286,0.571429,0.5,0.571429
bencana,0.2,0.630682,0.550505,0.5,0.550505
budaya,0.163265,0.666667,0.571429,0.5,0.571429
ekonomi,0.291667,0.764706,0.666667,0.642857,0.666667
entertainment,0.225806,0.666667,0.6,0.555556,0.6
film,0.142857,0.666667,0.583333,0.5,0.6
geografi,0.24,0.666667,0.555556,0.5,0.555556
homepage pemerintah,0.205882,0.7,0.630682,0.585714,0.636364
hukum/undang-undang,0.15625,0.6,0.5,0.440972,0.5
kesehatan,0.2,0.63961,0.525575,0.483333,0.545455


In [33]:
expert.topics.value_counts()

olahraga               313
film                   209
margasatwa             203
geografi               191
person                 190
bencana                188
politik                179
teknologi              164
ekonomi                161
budaya                 145
entertainment          137
soshum pendidikan      136
saintek                131
hukum/undang-undang    122
kesehatan              118
agama                  113
profil perusahaan      111
profil sekolah          89
homepage pemerintah     84
Name: topics, dtype: int64

In [34]:
# Expert pairs whose LCS ratio depends on the tokenizer (NLTK vs. the regex
# tokenizer), with both tokenisations side by side for inspection.
expert_mismatch = expert['lcs_punct'].ne(expert['lcs_aksara'])
# Single .loc call plus an explicit copy: the new columns are added to an
# independent frame, not to a chained-indexing result of `expert`.
expert_diff = expert.loc[expert_mismatch, ['premis_text', 'hypothesis_text']].copy()
expert_diff['nltk_prem'] = expert_diff.premis_text.map(lambda text: word_tokenize(text.lower()))
expert_diff['nltk_hyp'] = expert_diff.hypothesis_text.map(lambda text: word_tokenize(text.lower()))
expert_diff['aksara_prem'] = expert_diff.premis_text.map(lambda text: tokens.tokenize(text.lower()))
expert_diff['aksara_hyp'] = expert_diff.hypothesis_text.map(lambda text: tokens.tokenize(text.lower()))
expert_diff.head(10)

Unnamed: 0,premis_text,hypothesis_text,nltk_prem,nltk_hyp,aksara_prem,aksara_hyp
54,Lokasi kota Solok sangat strategis karena terl...,Jalan antar provinsi melewati kota Solok.,"[lokasi, kota, solok, sangat, strategis, karen...","[jalan, antar, provinsi, melewati, kota, solok...","[lokasi, kota, solok, sangat, strategis, karen...","[jalan, antar, provinsi, melewati, kota, solok..."
55,Lokasi kota Solok sangat strategis karena terl...,Kota Solok ramai dilalui bus antar kota.,"[lokasi, kota, solok, sangat, strategis, karen...","[kota, solok, ramai, dilalui, bus, antar, kota...","[lokasi, kota, solok, sangat, strategis, karen...","[kota, solok, ramai, dilalui, bus, antar, kota..."
56,Lokasi kota Solok sangat strategis karena terl...,Jalan antar kabupaten/kota menghubungkan Solok...,"[lokasi, kota, solok, sangat, strategis, karen...","[jalan, antar, kabupaten/kota, menghubungkan, ...","[lokasi, kota, solok, sangat, strategis, karen...","[jalan, antar, kabupaten, /, kota, menghubungk..."
57,Lokasi kota Solok sangat strategis karena terl...,Jalan antar provinsi sepanjang 90 kilometer me...,"[lokasi, kota, solok, sangat, strategis, karen...","[jalan, antar, provinsi, sepanjang, 90, kilome...","[lokasi, kota, solok, sangat, strategis, karen...","[jalan, antar, provinsi, sepanjang, 90, kilome..."
58,Lokasi kota Solok sangat strategis karena terl...,Lokasi kota Solok strategis sebagai kota pelab...,"[lokasi, kota, solok, sangat, strategis, karen...","[lokasi, kota, solok, strategis, sebagai, kota...","[lokasi, kota, solok, sangat, strategis, karen...","[lokasi, kota, solok, strategis, sebagai, kota..."
90,Bupati Kotawaringin Timur saat ini adalah H. S...,H. Supian Hadi bukan bupati Kotawaringin Timur...,"[bupati, kotawaringin, timur, saat, ini, adala...","[h., supian, hadi, bukan, bupati, kotawaringin...","[bupati, kotawaringin, timur, saat, ini, adala...","[h, ., supian, hadi, bukan, bupati, kotawaring..."
501,Harry Potter adalah seri tujuh novel fantasi y...,Novel Harry Potter ditulis oleh J.K. Rowling.,"[harry, potter, adalah, seri, tujuh, novel, fa...","[novel, harry, potter, ditulis, oleh, j.k., ro...","[harry, potter, adalah, seri, tujuh, novel, fa...","[novel, harry, potter, ditulis, oleh, j, ., k,..."
503,Harry Potter adalah seri tujuh novel fantasi y...,Ronald Weasley bersahabat dengan J.K. Rowling,"[harry, potter, adalah, seri, tujuh, novel, fa...","[ronald, weasley, bersahabat, dengan, j.k., ro...","[harry, potter, adalah, seri, tujuh, novel, fa...","[ronald, weasley, bersahabat, dengan, j, ., k,..."
505,Harry Potter adalah seri tujuh novel fantasi y...,J.K. Rowling suka bertualang.,"[harry, potter, adalah, seri, tujuh, novel, fa...","[j.k., rowling, suka, bertualang, .]","[harry, potter, adalah, seri, tujuh, novel, fa...","[j, ., k, ., rowling, suka, bertualang, .]"
519,Setiap novel mengisahkan tentang satu tahun ke...,Setiap novel mengisahkan kehidupan Harry antar...,"[setiap, novel, mengisahkan, tentang, satu, ta...","[setiap, novel, mengisahkan, kehidupan, harry,...","[setiap, novel, mengisahkan, tentang, satu, ta...","[setiap, novel, mengisahkan, kehidupan, harry,..."


In [36]:
# Tokens produced by only one of the two tokenizers:
# *_p columns compare the premise tokenisations, *_h columns the hypothesis ones.
expert_diff['nltk_p'] = [
  set(nltk_toks) - set(aksara_toks)
  for nltk_toks, aksara_toks in zip(expert_diff.nltk_prem, expert_diff.aksara_prem)
]
expert_diff['aksara_p'] = [
  set(aksara_toks) - set(nltk_toks)
  for nltk_toks, aksara_toks in zip(expert_diff.nltk_prem, expert_diff.aksara_prem)
]
expert_diff['nltk_h'] = [
  set(nltk_toks) - set(aksara_toks)
  for nltk_toks, aksara_toks in zip(expert_diff.nltk_hyp, expert_diff.aksara_hyp)
]
expert_diff['aksara_h'] = [
  set(aksara_toks) - set(nltk_toks)
  for nltk_toks, aksara_toks in zip(expert_diff.nltk_hyp, expert_diff.aksara_hyp)
]

In [37]:
expert_diff[['premis_text', 'nltk_p', 'aksara_p']].to_csv('expert_difference.csv', encoding='utf-16', sep='\t')

In [39]:
len(expert)

2984

In [40]:
expert.describe()

Unnamed: 0,id_pair,id_premis,annotation_round,jaccard_sim,intersection_hypothesis,lcs_punct,lcs_nopunct,lcs_aksara
count,2984.0,2984.0,2984.0,2984.0,2984.0,2984.0,2984.0,2984.0
mean,39714.800268,3971.231568,1.24866,0.242849,0.647621,0.571489,0.518496,0.573621
std,5737.780456,573.779051,0.544837,0.180223,0.201935,0.207796,0.232306,0.207696
min,31100.0,3110.0,1.0,0.019608,0.083333,0.076923,0.0,0.076923
25%,34400.75,3440.0,1.0,0.125,0.5,0.416667,0.333333,0.416667
50%,39210.5,3921.0,1.0,0.1875,0.666667,0.555556,0.5,0.555556
75%,44283.25,4428.0,1.0,0.290323,0.8125,0.714286,0.666667,0.723485
max,50475.0,5047.0,3.0,1.0,1.0,1.0,1.0,1.0


In [33]:
# Side-by-side tokenisation of one pair (row 303): raw text, regex tokenizer, NLTK.
# NOTE(review): this cell used the name `data`, which is not defined anywhere in
# the visible notebook; the sentences it printed match `lay` row 303, so `lay`
# is used here -- confirm that `data` was an earlier name for the same frame.
# The existing module-level `tokens` instance is reused rather than re-created.
a, b = lay.iloc[303].premis_text, lay.iloc[303].hypothesis_text
for text in (a, b):
  print(text)
  print(tokens.tokenize(text))
  print(word_tokenize(text))

Untuk pasar domestik Jepang, mesin untuk Auris hanya 1.5 liter 1NZ-FE dan 1.8 liter 2ZR-FE.
['Untuk', 'pasar', 'domestik', 'Jepang', ',', 'mesin', 'untuk', 'Auris', 'hanya', '1.5', 'liter', '1', 'NZ-FE', 'dan', '1.8', 'liter', '2', 'ZR-FE', '.']
['Untuk', 'pasar', 'domestik', 'Jepang', ',', 'mesin', 'untuk', 'Auris', 'hanya', '1.5', 'liter', '1NZ-FE', 'dan', '1.8', 'liter', '2ZR-FE', '.']
Untuk pasar internasional, mesin untuk Auris hanya 1.5 liter 1NZ-FE dan 1.8 liter 2ZR-FE.
['Untuk', 'pasar', 'internasional', ',', 'mesin', 'untuk', 'Auris', 'hanya', '1.5', 'liter', '1', 'NZ-FE', 'dan', '1.8', 'liter', '2', 'ZR-FE', '.']
['Untuk', 'pasar', 'internasional', ',', 'mesin', 'untuk', 'Auris', 'hanya', '1.5', 'liter', '1NZ-FE', 'dan', '1.8', 'liter', '2ZR-FE', '.']


In [7]:
# Ochiai coefficient of the first pair.
# NOTE(review): this cell used the name `data`, which is not defined anywhere in
# the visible notebook; its recorded output matches `lay` row 0, so `lay` is
# used here -- confirm that `data` was an earlier name for the same frame.
a, b = lay.iloc[0].premis_text, lay.iloc[0].hypothesis_text
cosine_ochiai(word_tokenize(a.lower()), word_tokenize(b.lower()))

0.3872983346207417

In [63]:
# Verbose walkthrough of every set-based metric on the first five pairs.
# NOTE(review): this cell used the name `data`, which is not defined anywhere in
# the visible notebook; the sentences it printed match the first rows of `lay`,
# so `lay` is used here -- confirm that `data` was an earlier name for it.
for i in range(5):
  row = lay.iloc[i]
  p, h = row.premis_text, row.hypothesis_text
  # Tokenise once per row; every metric below works on the same token lists.
  p_tokens, h_tokens = word_tokenize(p.lower()), word_tokenize(h.lower())
  print('Premis:', p)
  print('Hypothesis:', h, '\n')
  print('Jaccard:', jaccard_sim(p_tokens, h_tokens, verb=True), '\n')
  print('Intersection premis:', intersection_len(p_tokens, h_tokens, verb=True), '\n')
  print('Intersection hypothesis:', intersection_len(p_tokens, h_tokens, premis=False, verb=True), '\n')
  print('Cosine ochiai:', cosine_ochiai(p_tokens, h_tokens))
  print('\n============================================\n')

Premis: Selain itu, ia juga memiliki andil dari rumah sandiwara tempat ia dan teman-temannya bermain.
Hypothesis: Ia memiliki teman. 

intersect: {'memiliki', '.', 'ia'}
union: {'memiliki', ',', 'dari', 'sandiwara', 'dan', '.', 'juga', 'selain', 'teman-temannya', 'tempat', 'teman', 'andil', 'bermain', 'ia', 'rumah', 'itu'}
Jaccard: 0.1875 

intersect: {'memiliki', '.', 'ia'}
denom: 15
Intersection premis: 0.2 

intersect: {'memiliki', '.', 'ia'}
denom: 4
Intersection hypothesis: 0.75 

Cosine ochiai: 0.3872983346207417


Premis: Selain itu, ia juga memiliki andil dari rumah sandiwara tempat ia dan teman-temannya bermain.
Hypothesis: Teman-temannya memiliki bagian juga. 

intersect: {'memiliki', 'teman-temannya', '.', 'juga'}
union: {'memiliki', ',', 'dari', 'sandiwara', 'dan', '.', 'bagian', 'juga', 'selain', 'teman-temannya', 'tempat', 'andil', 'bermain', 'ia', 'rumah', 'itu'}
Jaccard: 0.25 

intersect: {'memiliki', 'teman-temannya', '.', 'juga'}
denom: 15
Intersection premis: 0.26666

In [19]:
print(word_tokenize(re.sub(r'[^\w\s]','',a).lower()))
print(word_tokenize(re.sub(r'[^\w\s]','',b).lower()))
print(a)
print(a.translate(str.maketrans('', '', string.punctuation)))
print(re.sub(r'[^\w\s]','',a))

['selain', 'itu', 'ia', 'juga', 'memiliki', 'andil', 'dari', 'rumah', 'sandiwara', 'tempat', 'ia', 'dan', 'temantemannya', 'bermain']
['ia', 'memiliki', 'teman']
Selain itu, ia juga memiliki andil dari rumah sandiwara tempat ia dan teman-temannya bermain.
Selain itu ia juga memiliki andil dari rumah sandiwara tempat ia dan temantemannya bermain
Selain itu ia juga memiliki andil dari rumah sandiwara tempat ia dan temantemannya bermain


In [78]:
# LCS ratio of the second pair, with a preview of both sentences after
# removing . ? ' " and , characters.
# NOTE(review): this cell used the name `data`, which is not defined anywhere in
# the visible notebook; the sentences it printed match `lay` row 1, so `lay` is
# used here -- confirm that `data` was an earlier name for the same frame.
a, b = lay.iloc[1].premis_text, lay.iloc[1].hypothesis_text

print("s1 = " + a)
print("s2 = " + b)

# Preview only: lcs is called with punct=False below, so it keeps punctuation.
print(re.sub(r'[\.\?\'",]','',a))
print(re.sub(r'[\.\?\'",]','',b))

long_seq = lcs(a, b, mode=1, punct=False)
print("\nLCS = " + str(long_seq))


s1 = Selain itu, ia juga memiliki andil dari rumah sandiwara tempat ia dan teman-temannya bermain.
s2 = Teman-temannya memiliki bagian juga.
Selain itu ia juga memiliki andil dari rumah sandiwara tempat ia dan teman-temannya bermain
Teman-temannya memiliki bagian juga

LCS = 0.4


In [69]:
data.groupby('data_split')[['jaccard_sim', 'intersection_premis', 'intersection_hypothesis', 'cosine_ochiai']].median()

Unnamed: 0_level_0,jaccard_sim,intersection_premis,intersection_hypothesis,cosine_ochiai
data_split,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
dev,0.277778,0.3125,0.75,0.483046
test,0.272727,0.307692,0.75,0.474342
train,0.269231,0.3,0.75,0.472456


In [70]:
data.groupby('label')[['jaccard_sim', 'intersection_premis', 'intersection_hypothesis', 'cosine_ochiai']].median()

Unnamed: 0_level_0,jaccard_sim,intersection_premis,intersection_hypothesis,cosine_ochiai
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
c,0.277778,0.3125,0.75,0.472456
e,0.32,0.352941,0.833333,0.536875
n,0.222222,0.25,0.666667,0.40452


In [71]:
data.groupby('sentence_size')[['jaccard_sim', 'intersection_premis', 'intersection_hypothesis', 'cosine_ochiai']].median()

Unnamed: 0_level_0,jaccard_sim,intersection_premis,intersection_hypothesis,cosine_ochiai
sentence_size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
double,0.238095,0.259259,0.75,0.442522
multiple,0.157895,0.165423,0.777778,0.363012
single,0.285714,0.315789,0.75,0.486664


In [5]:
print(f'jaccard_sim.mean(): {data.jaccard_sim.mean()}')
print(f'cosine_sim.mean(): {data.cosine_sim.mean()}')


jaccard_sim.mean(): 0.3992650096239511
cosine_sim.mean(): 0.6121265777697864


In [6]:
print(f'jaccard_sim.std(): {data.jaccard_sim.std()}')
print(f'cosine_sim.std(): {data.cosine_sim.std()}')

jaccard_sim.std(): 0.16857486902490587
cosine_sim.std(): 0.13449692328288737


In [15]:
data.label.value_counts()

e    5091
c    4952
n    4685
Name: label, dtype: int64

In [17]:
data.sentence_size.value_counts()

single      11988
double       2060
multiple      680
Name: sentence_size, dtype: int64

In [18]:
data.data_split.value_counts()

train    10330
test      2201
dev       2197
Name: data_split, dtype: int64

In [19]:
a, b, c, d = 'test', 'text', 'saya makan nasi.', 'saya makan ikan.'

def print_metric(distance, text1, text2):
  """Print the four scores of one named metric for a pair of texts.

  Looks ``distance`` up in ``distance_class`` and prints, in order, the result
  of the model's ``distance``, ``normalized_distance``, ``similarity`` and
  ``normalized_similarity`` methods, followed by a blank line.
  """
  # NOTE(review): `distance_class` is not defined in the visible part of the
  # notebook -- presumably a mapping from metric name to a distance model;
  # confirm where it comes from.
  model = distance_class.get(distance)

  print(f'Text1: {text1}\nText2: {text2}')
  raw_distance = model.distance(text1, text2)
  print(f'{distance}.distance: {raw_distance}')
  scaled_distance = model.normalized_distance(text1, text2)
  print(f'{distance}.normalized_distance: {scaled_distance}')
  raw_similarity = model.similarity(text1, text2)
  print(f'{distance}.similarity: {raw_similarity}')
  scaled_similarity = model.normalized_similarity(text1, text2)
  print(f'{distance}.normalized_similarity: {scaled_similarity}')
  print()
  
print_metric('cosine', c, d)
print_metric('jaccard', c, d)

Text1: saya makan nasi.
Text2: saya makan ikan.
cosine.distance: 0.0625
cosine.normalized_distance: 0.0625
cosine.similarity: 0.9375
cosine.normalized_similarity: 0.9375

Text1: saya makan nasi.
Text2: saya makan ikan.
jaccard.distance: 0.11764705882352944
jaccard.normalized_distance: 0.11764705882352944
jaccard.similarity: 0.8823529411764706
jaccard.normalized_similarity: 0.8823529411764706



In [11]:
data.to_json(path_or_buf='distance.jsonl', orient='records', lines=True)