In [15]:
import pandas as pd
import numpy as np
import string
import re
import csv

from nltk.tokenize import word_tokenize
from collections import Counter
from functools import reduce

from pathlib import Path


In [2]:
def get_seq_counter(*seq):
  """Build one ``Counter`` per input sequence, in argument order."""
  counters = []
  for items in seq:
    counters.append(Counter(items))
  return counters

def intersect_counter(*seq):
  """
  Combine all arguments with ``&`` and return the result.

  A copy of the first argument is narrowed in turn by every remaining one,
  so none of the inputs is modified.
  """
  common = seq[0].copy()
  for position in range(1, len(seq)):
    common &= seq[position]
  return common

def union_counter(*seq):
  """
  Combine all arguments with ``|`` and return the result.

  A copy of the first argument is widened in turn by every remaining one,
  so none of the inputs is modified.
  """
  merged = seq[0].copy()
  for position in range(1, len(seq)):
    merged |= seq[position]
  return merged

def count(counter):
  """Return the number of distinct elements obtained by iterating ``counter``."""
  distinct = {element for element in counter}
  return len(distinct)

def jaccard_sim(*seq, verb=False):
  """
  Jaccard similarity of the distinct items of the given sequences:
  (number of items present in every sequence) / (number of items present in any sequence).

  seq  : one or more iterables of hashable items (e.g. token lists)
  verb : when True, print the intersection and the union sets

  Returns a float between 0 and 1. When every sequence is empty the ratio is 0/0;
  NaN is returned instead of raising ZeroDivisionError, so one empty pair does not
  abort a whole DataFrame.apply and NaN-skipping aggregates (such as pandas' median)
  simply ignore it.

  Raises IndexError when called without any sequence.
  """
  if not seq:
    raise IndexError('jaccard_sim() needs at least one sequence')

  # Only the distinct items matter for the ratio, so plain sets are sufficient.
  token_sets = [set(s) for s in seq]
  shared = set.intersection(*token_sets)
  combined = set.union(*token_sets)

  if verb:
    print('intersect:', shared)
    print('union:', combined)

  if not combined:
    return float('nan')
  return len(shared) / len(combined)

def intersection_len(*seq, premis=True, verb=False):
  """
  Share of the distinct items of one sequence that occur in every given sequence.

  seq[0] is the premise
  seq[1] is the hypothesis
  premis : when True divide by the number of distinct premise items,
           otherwise by the number of distinct hypothesis items
  verb   : when True, print the intersection set and the denominator

  Returns a float between 0 and 1. When the normalising sequence is empty the
  ratio is 0/0; NaN is returned instead of raising ZeroDivisionError, so one
  empty text does not abort a whole DataFrame.apply and NaN-skipping aggregates
  (such as pandas' median) simply ignore it.

  Raises IndexError when called without any sequence, or with a single
  sequence and premis=False.
  """
  if not seq:
    raise IndexError('intersection_len() needs at least one sequence')

  # Only the distinct items matter for the ratio, so plain sets are sufficient.
  token_sets = [set(s) for s in seq]
  shared = set.intersection(*token_sets)
  denom = len(token_sets[0] if premis else token_sets[1])

  if verb:
    print('intersect:', shared)
    print('denom:', denom)

  if denom == 0:
    return float('nan')
  return len(shared) / denom

def show_matrix(m):
  """
  Print a matrix of integers, one row per line, each cell right-aligned in a
  field of width 4. The column count is taken from the first row.
  """
  n_rows = len(m)
  n_cols = len(m[0])
  for r in range(n_rows):
    cells = ["%4d" % m[r][c] for c in range(n_cols)]
    print("".join(cells))
    
def lcs(*seq, mode=0, punct=False, tokeniser='nltk'):
  """
  Length of the longest common token subsequence of two texts, divided by the
  token count of one of them.

  seq[0], seq[1] : the two texts; both are lower-cased before tokenising
  mode      : falsy  -> divide by the number of tokens of seq[0]
              truthy -> divide by the number of tokens of seq[1]
  punct     : when True (nltk tokeniser only) the characters . ? ' " ! , are
              removed from both texts before tokenising
  tokeniser : 'nltk' tokenises with word_tokenize; any other value calls
              tokens.tokenize
  """
  first_text, second_text = seq[0], seq[1]

  if tokeniser == 'nltk':
    if punct:
      first_text = re.sub(r'[\.\?\'"!,]', '', first_text)
      second_text = re.sub(r'[\.\?\'"!,]', '', second_text)
    tokenise = word_tokenize
  else:
    # NOTE(review): `tokens` is not defined in the visible part of this notebook;
    # unless it is created elsewhere this branch raises NameError - confirm.
    tokenise = tokens.tokenize

  s1 = tokenise(first_text.lower())
  s2 = tokenise(second_text.lower())
  n1, n2 = len(s1), len(s2)

  # mat[i, j] holds the LCS length of s1[:i] and s2[:j]; row 0 and column 0
  # stay at the zeros they were initialised with.
  mat = np.zeros((n1 + 1, n2 + 1), dtype=np.int64)
  for i, left_token in enumerate(s1, start=1):
    for j, right_token in enumerate(s2, start=1):
      if left_token == right_token:
        mat[i, j] = mat[i - 1, j - 1] + 1
      else:
        mat[i, j] = max(mat[i - 1, j], mat[i, j - 1])

  # show_matrix(mat)  # uncomment to inspect the table

  denom = n2 if mode else n1
  return mat[n1, n2] / denom

def convert_label(label):
  """
  Abbreviate a label to its first letter.

  A string starting with 'c', 'e' or 'n' is mapped to that single letter
  (e.g. 'contradiction' -> 'c'). Every other value is returned unchanged:
  other strings such as '-', the empty string, and non-string values such as
  None or NaN. The last two cases previously raised IndexError / TypeError.
  """
  # Nothing to abbreviate for missing or empty labels; pass them through like
  # any other unrecognised label.
  if not isinstance(label, str) or not label:
    return label

  initial = label[0]
  return initial if initial in ('c', 'e', 'n') else label

def new_words_rate(*seq):
  """
  Fraction of the hypothesis' distinct tokens that do not occur in the premise.

  seq[0] is the premise
  seq[1] is the hypothesis

  Both texts are lower-cased and tokenised with word_tokenize. When the
  hypothesis yields no tokens the ratio is 0/0; NaN is returned instead of
  raising ZeroDivisionError, so one empty text does not abort a whole
  DataFrame.apply and NaN-skipping aggregates (such as pandas' median) simply
  ignore it.
  """
  premise_vocab = set(word_tokenize(seq[0].lower()))
  hypothesis_vocab = set(word_tokenize(seq[1].lower()))

  if not hypothesis_vocab:
    return float('nan')

  new_words = hypothesis_vocab - premise_vocab
  return len(new_words) / len(hypothesis_vocab)

In [14]:
import os

basepath = 'data/'
output = 'instance_similarity_new.xlsx'

# Column layouts this cell understands, checked in this order:
# (id column, label column, premise column, hypothesis column).
schemas = [
  ('pair_id', 'label', 'premise', 'hypothesis'),
  ('pairid', 'gold_label', 'sentence1', 'sentence2'),
]
metric_columns = ['jaccard', 'intersection', 'lcs', 'new_words_rate']


def _add_similarity_columns(df, label_col, premise_col, hypothesis_col):
  """
  Abbreviate the labels of `df` in place and add the four similarity columns
  ('jaccard', 'intersection', 'lcs', 'new_words_rate') computed from the
  premise / hypothesis text columns. Returns `df`.
  """
  df[label_col] = df[label_col].apply(convert_label)

  # Tokenise every text once and share the tokens between the set-based metrics.
  premise_tokens = [word_tokenize(text.lower()) for text in df[premise_col]]
  hypothesis_tokens = [word_tokenize(text.lower()) for text in df[hypothesis_col]]
  token_pairs = list(zip(premise_tokens, hypothesis_tokens))
  # lcs and new_words_rate tokenise internally, so they receive the raw texts.
  text_pairs = list(zip(df[premise_col], df[hypothesis_col]))

  df['jaccard'] = [jaccard_sim(p, h) for p, h in token_pairs]
  df['intersection'] = [intersection_len(p, h, premis=False) for p, h in token_pairs]
  df['lcs'] = [lcs(p, h, mode=1) for p, h in text_pairs]
  df['new_words_rate'] = [new_words_rate(p, h) for p, h in text_pairs]
  return df


with os.scandir(basepath) as entries:
  # Sort so that files are processed (and sheets created) in a deterministic order.
  for entry in sorted(entries, key=lambda e: e.name):
    if not entry.is_dir():
      continue
    with os.scandir(entry) as subdirs:
      for file in sorted(subdirs, key=lambda e: e.name):
        if not file.is_file():
          continue
        df = pd.read_json(path_or_buf=file, lines=True)
        print(f'{entry.name}/{file.name}')

        schema = next((s for s in schemas if s[0] in df.columns), None)
        if schema is None:
          continue  # unknown layout: nothing is written for this file
        id_col, label_col, premise_col, hypothesis_col = schema
        _add_similarity_columns(df, label_col, premise_col, hypothesis_col)

        # Append mode needs an existing workbook, so the first sheet is written
        # in 'w' mode and later ones are appended.
        # NOTE: re-running against an existing workbook still appends, as before;
        # delete the file first to start from scratch.
        writer_mode = 'a' if os.path.exists(output) else 'w'
        with pd.ExcelWriter(output, engine='openpyxl', mode=writer_mode) as writer:
          df[[id_col, label_col] + metric_columns].to_excel(writer, index=False, sheet_name=f'{entry.name} {file.name}')
              

indo_nli/test.jsonl
indo_nli/test_expert.jsonl
indo_nli/test_lay.jsonl
indo_nli/train.jsonl
indo_nli/val.jsonl
indo_xnli/dev.jsonl
indo_xnli/dev_matched.jsonl
indo_xnli/dev_mismatched.jsonl
indo_xnli/train.jsonl
indo_xnli_small/dev.jsonl
indo_xnli_small/train.jsonl


In [4]:
# with pd.ExcelWriter('output.xlsx', engine='openpyxl', mode='a') as writer:
#   df[['pairid', 'sentence1', 'sentence2']].to_excel(writer, index=False, sheet_name='Sheet_name_3')

In [3]:
# Open the workbook whose sheets are read by the summary cells below.
# NOTE(review): the cell that computes the metrics writes to
# 'instance_similarity_new.xlsx', while this reads 'instance_similarity_new2.xlsx'
# - presumably a renamed copy; confirm the file name.
xls = pd.ExcelFile('instance_similarity_new2.xlsx')
# Indo-nli train, indo-nli test, indo-xnli-small train


In [16]:
# Per-label medians of the similarity metrics in sheet 'indo_nli test.jsonl'.
df1 = pd.read_excel(xls, sheet_name='indo_nli test.jsonl')
df1.groupby('label')[['jaccard', 'intersection', 'lcs', 'new_words_rate']].agg('median')

Unnamed: 0_level_0,jaccard,intersection,lcs,new_words_rate
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
c,0.25,0.75,0.642857,0.25
e,0.25,0.75,0.666667,0.25
n,0.175,0.571429,0.5,0.428571


In [17]:
# Per-label medians of the similarity metrics in sheet 'indo_nli test_expert.jsonl'.
df2 = pd.read_excel(xls, sheet_name='indo_nli test_expert.jsonl')
df2.groupby('label')[['jaccard', 'intersection', 'lcs', 'new_words_rate']].agg('median')

Unnamed: 0_level_0,jaccard,intersection,lcs,new_words_rate
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
c,0.208333,0.714286,0.625,0.285714
e,0.210526,0.7,0.6,0.3
n,0.150758,0.538462,0.444444,0.461538


In [18]:
# Per-label medians of the similarity metrics in sheet 'indo_nli test_lay.jsonl'.
df3 = pd.read_excel(xls, sheet_name='indo_nli test_lay.jsonl')
df3.groupby('label')[['jaccard', 'intersection', 'lcs', 'new_words_rate']].agg('median')

Unnamed: 0_level_0,jaccard,intersection,lcs,new_words_rate
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
c,0.285714,0.75,0.666667,0.25
e,0.318182,0.833333,0.714286,0.166667
n,0.210526,0.625,0.545455,0.375


In [19]:
# Per-label medians of the similarity metrics in sheet 'indo_nli train.jsonl'.
df4 = pd.read_excel(xls, sheet_name='indo_nli train.jsonl')
df4.groupby('label')[['jaccard', 'intersection', 'lcs', 'new_words_rate']].agg('median')

Unnamed: 0_level_0,jaccard,intersection,lcs,new_words_rate
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
c,0.272727,0.75,0.666667,0.25
e,0.319091,0.833333,0.727273,0.166667
n,0.222222,0.666667,0.571429,0.333333


In [20]:
# Per-label medians of the similarity metrics in sheet 'indo_nli val.jsonl'.
df5 = pd.read_excel(xls, sheet_name='indo_nli val.jsonl')
df5.groupby('label')[['jaccard', 'intersection', 'lcs', 'new_words_rate']].agg('median')

Unnamed: 0_level_0,jaccard,intersection,lcs,new_words_rate
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
c,0.291667,0.727273,0.666667,0.272727
e,0.321429,0.833333,0.733333,0.166667
n,0.210526,0.625,0.545455,0.375


In [4]:
# Per-label medians of the similarity metrics in sheet 'indo_xnli dev.jsonl'.
df6 = pd.read_excel(xls, sheet_name='indo_xnli dev.jsonl')
df6.groupby('gold_label')[['jaccard', 'intersection', 'lcs', 'new_words_rate']].agg('median')

Unnamed: 0_level_0,jaccard,intersection,lcs,new_words_rate
gold_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-,0.210526,0.5,0.428571,0.5
c,0.2,0.466667,0.4,0.533333
e,0.28,0.6,0.5,0.4
n,0.166667,0.4,0.333333,0.6


In [5]:
# Number of rows per gold label in df6, including the '-' label
# (presumably pairs without an agreed label - confirm against the dataset docs).
df6.loc[:, 'gold_label'].value_counts()

e    6942
c    6453
n    6252
-     353
Name: gold_label, dtype: int64

In [6]:
# Per-label medians of the similarity metrics in sheet 'indo_xnli dev_matched.jsonl'.
df7 = pd.read_excel(xls, sheet_name='indo_xnli dev_matched.jsonl')
df7.groupby('gold_label')[['jaccard', 'intersection', 'lcs', 'new_words_rate']].agg('median')

Unnamed: 0_level_0,jaccard,intersection,lcs,new_words_rate
gold_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-,0.208333,0.466667,0.428571,0.533333
c,0.1875,0.444444,0.4,0.555556
e,0.259259,0.571429,0.5,0.428571
n,0.16,0.375,0.333333,0.625


In [7]:
# Number of rows per gold label in df7, including the '-' label
# (presumably pairs without an agreed label - confirm against the dataset docs).
df7.loc[:, 'gold_label'].value_counts()

e    3479
c    3213
n    3123
-     185
Name: gold_label, dtype: int64

In [8]:
# Per-label medians of the similarity metrics in sheet 'indo_xnli dev_mismatched.jsonl'.
df8 = pd.read_excel(xls, sheet_name='indo_xnli dev_mismatched.jsonl')
df8.groupby('gold_label')[['jaccard', 'intersection', 'lcs', 'new_words_rate']].agg('median')

Unnamed: 0_level_0,jaccard,intersection,lcs,new_words_rate
gold_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-,0.218254,0.5,0.428571,0.5
c,0.206897,0.5,0.428571,0.5
e,0.3,0.625,0.529412,0.375
n,0.181818,0.4,0.35,0.6


In [9]:
# Number of rows per gold label in df8, including the '-' label
# (presumably pairs without an agreed label - confirm against the dataset docs).
df8.loc[:, 'gold_label'].value_counts()

e    3463
c    3240
n    3129
-     168
Name: gold_label, dtype: int64

In [27]:
# Per-label medians of the similarity metrics in sheet 'indo_xnli train.jsonl'.
df9 = pd.read_excel(xls, sheet_name='indo_xnli train.jsonl')
df9.groupby('gold_label')[['jaccard', 'intersection', 'lcs', 'new_words_rate']].agg('median')

Unnamed: 0_level_0,jaccard,intersection,lcs,new_words_rate
gold_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
c,0.181818,0.444444,0.384615,0.555556
e,0.266667,0.6,0.5,0.4
n,0.162162,0.375,0.333333,0.625


In [10]:
# Per-label medians of the similarity metrics in sheet 'indo_xnli_small dev.jsonl'.
df10 = pd.read_excel(xls, sheet_name='indo_xnli_small dev.jsonl')
df10.groupby('gold_label')[['jaccard', 'intersection', 'lcs', 'new_words_rate']].agg('median')

Unnamed: 0_level_0,jaccard,intersection,lcs,new_words_rate
gold_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-,0.244048,0.527778,0.470769,0.472222
c,0.190476,0.466667,0.4,0.533333
e,0.290323,0.6,0.5,0.4
n,0.166667,0.384615,0.333333,0.615385


In [11]:
# Number of rows per gold label in df10, including the '-' label
# (presumably pairs without an agreed label - confirm against the dataset docs).
df10.loc[:, 'gold_label'].value_counts()

e    746
n    708
c    704
-     40
Name: gold_label, dtype: int64

In [26]:
# Per-label medians of the similarity metrics in sheet 'indo_xnli_small train.jsonl'.
df11 = pd.read_excel(xls, sheet_name='indo_xnli_small train.jsonl')
df11.groupby('gold_label')[['jaccard', 'intersection', 'lcs', 'new_words_rate']].agg('median')

Unnamed: 0_level_0,jaccard,intersection,lcs,new_words_rate
gold_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
c,0.181818,0.444444,0.384615,0.555556
e,0.266667,0.6,0.5,0.4
n,0.166667,0.391304,0.333333,0.608696


In [22]:
# df4 = df4.sort_values(by=['intersection'])
# df4.to_json(path_or_buf='indo_nli_train_sorted.jsonl', orient='records', lines=True)

In [9]:
# Load the tab-separated file 'indonli-roundtrip.tsv'. QUOTE_NONE makes the
# parser treat quote characters as ordinary text instead of field delimiters.
round_trip = pd.read_csv(
  'indonli-roundtrip.tsv',
  sep='\t',
  quoting=csv.QUOTE_NONE,
  encoding='utf-8',
)

In [14]:
# Row-wise similarity metrics between the 'Premises_text' and 'Hipotesis_teks'
# columns. The dict order is the order in which the columns are added.
round_trip_metrics = {
  'jaccard': lambda row: jaccard_sim(
    word_tokenize(row.Premises_text.lower()),
    word_tokenize(row.Hipotesis_teks.lower()),
  ),
  'intersection': lambda row: intersection_len(
    word_tokenize(row.Premises_text.lower()),
    word_tokenize(row.Hipotesis_teks.lower()),
    premis=False,
  ),
  'lcs': lambda row: lcs(row.Premises_text, row.Hipotesis_teks, mode=1),
  'new_words_rate': lambda row: new_words_rate(row.Premises_text, row.Hipotesis_teks),
}

for metric_name, row_metric in round_trip_metrics.items():
  round_trip[metric_name] = round_trip.apply(row_metric, axis=1)

In [17]:
# Save the frame, including the added metric columns, without the index column.
round_trip_output = 'round_trip_sim.xlsx'
round_trip.to_excel(excel_writer=round_trip_output, index=False)

In [18]:
# Per-label medians of the similarity metrics for the round-trip data.
round_trip.groupby('label')[['jaccard', 'intersection', 'lcs', 'new_words_rate']].agg('median')

Unnamed: 0_level_0,jaccard,intersection,lcs,new_words_rate
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
c,0.238095,0.642857,0.555556,0.357143
e,0.277778,0.727273,0.625,0.272727
n,0.2,0.571429,0.5,0.428571
