<a href="https://colab.research.google.com/github/rahmanidashti/ACQSurvey/blob/main/semantic_representation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install sentence_transformers
!pip install -U kaleido

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 2.3 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 62.2 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 19.5 MB/s 
[?25hCollecting huggingface-hub>=0.4.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 76.1 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 54.1 MB/s 
Building wheels for collected pa

In [None]:
# import packages
import json
import os
import pandas as pd
import math
import plotly.express as px
from tqdm import tqdm

# importing random module
import random

from sentence_transformers import SentenceTransformer

from scipy.spatial.distance import pdist,squareform
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

import numpy as np
from sklearn.manifold import TSNE

Load the ACQ Datasets

In [None]:
# Registry of the supported datasets. For every dataset key:
#   'format'  - file extension of the published file(s)
#   'domains' - per-domain file suffixes when the dataset is split into several
#               files (<dataset>_<domain>.<format>); empty for a single file.
# Each key must appear only once: a repeated key silently overrides the earlier
# entry. 'mimics' used to be listed twice ('tsv', then 'txt'); only the
# effective 'txt' entry is kept.
acq_datasets = {
    'qulac': {'format': 'txt', 'domains': []},
    'clariq': {'format': 'tsv', 'domains': []},
    'clarq': {'format': 'txt', 'domains': []},
    'clariqfw': {'format': 'txt', 'domains': []},
    'claqua': {'format': 'txt', 'domains': []},
    'sharc': {'format': 'txt', 'domains': []},
    'mimics': {'format': 'txt', 'domains': []},
    'mimicsduo': {'format': 'txt', 'domains': []},
    'amazoncq': {'format': 'txt', 'domains': []},
    'msdialoge': {'format': 'txt', 'domains': []},
    'mantis': {'format': 'txt', 'domains': []},
    'tavakolicq': {'format': 'txt', 'domains': []},
    'raocq': {
        'format': 'txt',
        'domains': ['askubuntu', 'superuser', 'unixstackexchange'],
    },
}

In [None]:
# Selection of the datasets to process. Keep exactly one `ds_names`
# assignment active and leave the alternatives commented out.
#
# Every dataset:
# ds_names = ['qulac', 'clariq', 'mimicsduo', 'clariqfw', 'mantis', 'msdialoge', 'mimics', 'tavakolicq', 'amazoncq', 'raocq', 'claqua', 'clarq']
#
# Conversational search datasets only:
# ds_names = ['qulac', 'clariq', 'mimicsduo', 'clariqfw', 'mantis', 'msdialoge', 'mimics', 'tavakolicq']
#
# Conversational QA datasets only (active selection):
ds_names = [
    'amazoncq',
    'raocq',
    'claqua',
    'clarq',
]

In [None]:
# download the files of every selected dataset
def download_datasets():
  """Download the files of every dataset listed in `ds_names` into `datasets/<dataset>/`.

  For each dataset the target directory is created when missing. A dataset with
  entries in `acq_datasets[dataset]['domains']` is fetched as one file per domain
  (`<dataset>_<domain>.<format>`); otherwise a single `<dataset>.<format>` file is
  fetched. Files are retrieved with `wget -nc`, so existing files are not
  downloaded again.

  A success message is printed only when wget reports success or the target file
  is present afterwards; otherwise a failure message with the exit status is
  printed. Returns None.
  """
  ds_root_path = "datasets/"
  base_url = "https://raw.githubusercontent.com/rahmanidashti/ACQDatasetsSurvey/main/cqs"

  for dataset in ds_names:
    dataset_path = os.path.join(ds_root_path, dataset)
    if not os.path.isdir(dataset_path):
      os.makedirs(dataset_path)
      print("Directory '%s' is created." % dataset_path)
    else:
      print("Directory '%s' is exist." % dataset_path)

    file_format = acq_datasets[dataset]['format']
    domains = acq_datasets[dataset]['domains']

    # One file per domain when domains are listed, a single file otherwise.
    if len(domains) > 0:
      file_stems = [f"{dataset}_{domain}" for domain in domains]
    else:
      file_stems = [dataset]

    for stem in file_stems:
      file_name = f"{stem}.{file_format}"
      # -nc: skip downloads that would download to existing files.
      status = os.system(f"wget -P {dataset_path} -nc {base_url}/{dataset}/{file_name}")

      # os.system does not raise when wget fails; it only returns the exit status.
      # NOTE(review): some wget versions may exit non-zero when -nc skips an
      # existing file, so an already-present file also counts as success - confirm.
      if status == 0 or os.path.isfile(os.path.join(dataset_path, file_name)):
        print(f"{stem}: The dataset is downloaded.")
      else:
        print(f"{stem}: The download failed (wget exit status {status}).")

In [None]:
# Fetch the files of every dataset selected in `ds_names` into the local `datasets/` folder.
download_datasets()

Directory 'datasets/amazoncq' is created.
amazoncq: The dataset is downloaded.
Directory 'datasets/raocq' is created.
raocq_askubuntu: The dataset is downloaded.
raocq_superuser: The dataset is downloaded.
raocq_unixstackexchange: The dataset is downloaded.
Directory 'datasets/claqua' is created.
claqua: The dataset is downloaded.
Directory 'datasets/clarq' is created.
clarq: The dataset is downloaded.


In [None]:
def CQ_reader(dataset_name="qulac"):
  """Read the clarifying questions (CQs) of one dataset from the local `datasets/` folder.

  Args:
    dataset_name: Key of the dataset to read (e.g. "qulac", "clariq", "raocq").

  Returns:
    A list with one entry per CQ.
    - "clariq": the values of the 'question' column of
      `datasets/clariq/clariq.tsv`.
    - "raocq": the lines of every `datasets/raocq/raocq_<domain>.txt` file, in the
      order of `acq_datasets['raocq']['domains']`.
    - Any other supported dataset: the lines of
      `datasets/<dataset_name>/<dataset_name>.txt`.
    Lines are returned exactly as read, i.e. including their trailing newline.
    An unsupported `dataset_name` yields an empty list.

  Raises:
    FileNotFoundError: if the expected file has not been downloaded.
  """
  # Datasets stored as a single text file with one CQ per line.
  plain_text_datasets = (
      "qulac", "mimics", "amazoncq", "sharc", "mantis", "tavakolicq",
      "clariqfw", "clarq", "mimicsduo", "msdialoge", "claqua",
  )

  # ClariQ is the only tabular dataset: the CQs are in its 'question' column.
  if dataset_name == "clariq":
    clariq_table = pd.read_csv("datasets/clariq/clariq.tsv", sep='\t')
    return list(clariq_table['question'])

  if dataset_name == "raocq":
    # RaoCQ is split into one file per domain.
    file_names = [f"raocq_{domain}.txt" for domain in acq_datasets['raocq']['domains']]
  elif dataset_name in plain_text_datasets:
    file_names = [f"{dataset_name}.txt"]
  else:
    # Unsupported name: nothing to read, the result stays empty.
    file_names = []

  sentences = []
  for file_name in file_names:
    # `with` closes the file once its lines have been collected.
    with open(f"datasets/{dataset_name}/{file_name}", 'r') as dataset_file:
      sentences.extend(dataset_file)

  return sentences

Sentence Embeddings using Sentence-BERT

In [None]:
# Display name of each dataset key; these strings become the labels of the
# embedded CQs and therefore the legend entries of the plot.
ds_real_names = {'qulac': 'Qulac',
                 'clariq': 'ClariQ',
                 'mimicsduo': 'MIMICS-Duo',  # was misspelt 'MIMICS-Dou'
                 'clariqfw': 'ClariQ-FKw',
                 'mantis': 'MANtIS',
                 'msdialoge': 'MSDialog',
                 'mimics': 'MIMICS',
                 'tavakolicq': 'TavakoliCQ',
                 'amazoncq': 'AmazonCQ',
                 'raocq': 'RaoCQ',
                 'claqua': 'CLAQUA',
                 'clarq': 'ClarQ',
                 # 'sharc' is a readable dataset as well, so it needs a label too
                 'sharc': 'ShARC'}

In [None]:
def get_embeddings(sentences, dataset_name=None, model=None):
  """Encode sentences with Sentence-BERT and build one dataset label per sentence.

  Args:
    sentences: List of sentences (CQs) to encode.
    dataset_name: Key of the dataset the sentences belong to; it is looked up in
      `ds_real_names` to obtain the label. When omitted, the global
      `DATASET_NAME` is used, which keeps existing callers working.
    model: Optional object exposing `encode(sentences)`. When omitted, a
      `SentenceTransformer('all-distilroberta-v1')` is loaded. Passing a model
      avoids reloading it on every call.

  Returns:
    A tuple `(embeddings, labels)`: `embeddings` is whatever `model.encode`
    returns for `sentences`, `labels` is a NumPy array holding the display name
    of the dataset repeated `len(sentences)` times.

  Raises:
    KeyError: if the dataset key has no entry in `ds_real_names`.
  """
  if dataset_name is None:
    # Backward-compatible fallback to the notebook-level loop variable.
    dataset_name = DATASET_NAME

  print(f" > get_embeddings start processing for {dataset_name}.")
  if model is None:
    model = SentenceTransformer('all-distilroberta-v1')

  # Sentences are encoded by calling model.encode()
  embeddings = model.encode(sentences)
  labels = np.array([ds_real_names[dataset_name]] * len(sentences))

  print(f" > get_embeddings finished for {dataset_name}.\n")
  return embeddings, labels

In [None]:
### Main Part ###
# For every selected dataset: read its CQs, draw a fixed-size random sample,
# embed the sample and collect the embeddings together with their labels.

# Number of CQs sampled per dataset.
CQ_sample_number = 435 # Equal to the number of CQs in MANtIS

# Dedicated, seeded generator so that the drawn sample is reproducible.
cq_sampler = random.Random(123)

# Start from an empty (0, 768) float array so the stacked result keeps the
# expected width even when no dataset contributes anything.
embedding_chunks = [np.ndarray((0, 768))]
label_chunks = []

for DATASET_NAME in ds_names:
  print(f"----- Dataset Name: {DATASET_NAME} -----\n")

  # read the CQs (or sentences) of the relevant dataset
  sentences = CQ_reader(dataset_name=DATASET_NAME)
  print(f"CQ_sample_number: {CQ_sample_number}")

  if not sentences:
    # Nothing to sample or embed for this dataset.
    print(f"No CQs found for {DATASET_NAME}; the dataset is skipped.")
    continue

  # Sample WITHOUT replacement so that no CQ is embedded twice; a dataset with
  # fewer CQs than requested contributes all of its CQs.
  sentences = cq_sampler.sample(sentences, k=min(CQ_sample_number, len(sentences)))

  # get the embedding of the CQs (or sentences)
  embeddings, labels = get_embeddings(sentences=sentences)

  embedding_chunks.append(embeddings)
  label_chunks.append(labels)

# Stack once at the end instead of re-allocating with np.append on every iteration.
all_embeddings = np.concatenate(embedding_chunks, axis=0)
all_labels = np.concatenate(label_chunks) if label_chunks else np.ndarray((0))

----- Dataset Name: amazoncq -----

CQ_sample_number: 435
 > get_embeddings start processing for amazoncq.
 > get_embeddings finished for amazoncq.

----- Dataset Name: raocq -----

CQ_sample_number: 435
 > get_embeddings start processing for raocq.
 > get_embeddings finished for raocq.

----- Dataset Name: claqua -----

CQ_sample_number: 435
 > get_embeddings start processing for claqua.
 > get_embeddings finished for claqua.

----- Dataset Name: clarq -----

CQ_sample_number: 435
 > get_embeddings start processing for clarq.
 > get_embeddings finished for clarq.



In [None]:
# Sanity check: shape of the stacked embedding matrix and the number of labels
# (the first dimension of the shape should equal the label count).
all_embeddings.shape, len(all_labels)

((1740, 768), 1740)

In [None]:
# NaN check: a single NaN anywhere in the matrix turns the overall sum into NaN.
array_sum = all_embeddings.sum()
array_has_nan = np.isnan(array_sum)
print(array_has_nan)

False


In [None]:
# Indices of the NaN entries, if any exist (NaN is the only value that is not equal to itself)
np.argwhere(all_embeddings!=all_embeddings)

array([], shape=(0, 2), dtype=int64)

In [None]:
# Project the sentence embeddings onto two t-SNE components
# (random_state is fixed; verbose=1 prints the progress log).
tsne = TSNE(
    n_components=2,
    random_state=123,
    verbose=1,
)
z = tsne.fit_transform(all_embeddings)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 1740 samples in 0.002s...
[t-SNE] Computed neighbors for 1740 samples in 0.134s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1740
[t-SNE] Computed conditional probabilities for sample 1740 / 1740
[t-SNE] Mean sigma: 0.320651
[t-SNE] KL divergence after 250 iterations with early exaggeration: 75.279709
[t-SNE] KL divergence after 1000 iterations: 1.537627


In [None]:
# Plot table: one row per CQ with its dataset label and its two t-SNE coordinates.
df = pd.DataFrame({
    "Dataset": all_labels,
    "Comp-1": z[:, 0],
    "Comp-2": z[:, 1],
})

In [None]:
# Alternative palettes, kept for reference (uncomment one to use it instead):
# color_seq = ['#AA0DFE', '#3283FE', '#85660D', '#782AB6', '#565656', '#1C8356', '#16FF32',
#              '#F7E1A0', '#E2E2E2', '#1CBE4F', '#C4451C', '#DEA0FD', '#FE00FA', '#325A9B',
#              '#FEAF16', '#F8A19F', '#90AD1C', '#F6222E', '#1CFFCE', '#2ED9FF', '#B10DA1',
#              '#C075A6', '#FC1CBF', '#B00068', '#FBE426', '#FA0087']
#
# color_seq = ['#C0E78C', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd',
#              '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']

# Active palette: the colours passed to the scatter plot as its discrete colour sequence.
color_seq = [
    '#666EF2', '#DE6046', '#67CDA6', '#A167F2',
    '#F2A667', '#63D0EF', '#ED7092', '#C0E78C',
    '#F09CF9', '#F6CD68', '#AF0D78', '#FA0087',
]

# Scatter plot of the two t-SNE components, coloured by dataset.
fig = px.scatter(
    df,
    x='Comp-1',
    y='Comp-2',
    color='Dataset',
    color_discrete_sequence=color_seq,
    labels={'color': 'Dataset'},
)
fig.update_layout(showlegend=True, legend={'font': {'size': 16}})
fig.show()

In [None]:
# Export the figure to a PDF file using the kaleido engine.
fig.write_image("tsne_convqa_datasets.pdf", engine="kaleido")