# Processing text queries
- the code in this notebook predominantly analyzes the text queries submitted by participants to solve the KIS tasks of VBS 2023
- among others, this notebook can replicate the content of Table 2 and 3 as well as Figures 13-15

In [None]:
import sys
# If the notebook is started from inside the `notebooks` folder, switch to the
# parent directory (presumably so that the `notebooks.utils` / `common.load`
# imports and the config path below resolve — confirm against the repo layout).
import os
# os.path.basename honours the platform's path separator, whereas splitting on
# '/' never matches on Windows-style paths.
if os.path.basename(os.getcwd()) == 'notebooks':
    os.chdir('..')
import sys

# imports
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import math
from notebooks.utils import compute_user_penalty, get_team_values_df
from common.load import load_competition_data, process_team_logs

# Do not truncate long cell contents when DataFrames are displayed.
pd.set_option('display.max_colwidth', None)
# NOTE(review): the two rank constants below are not referenced in any cell
# visible in this notebook — confirm whether they are still needed.
unknownRankLimit = 1000
unknownRankValue = 2000

# Import common data

In [None]:
# Configuration file handed to the competition-data loader.
config = 'config_vbs2023.yaml'

# load competition data from dres files and auxiliary data (FPSs, sequences)
comp_data = load_competition_data(config)

# load the preprocessed query data
# NOTE(review): unpickling can execute arbitrary code — only load files that
# were produced by this project's own preprocessing step.
dataset = pd.read_pickle(comp_data["config"]["processed_logs_outdir"] + '/text_query_dataset.pkl')

# valid teams
# NOTE(review): team_order is not used by any cell visible in this notebook.
team_order = ['vibro', 'VISIONE',  'vitrivr-VR', 'CVHunter', 'Verge']
#team_order = ['vibro', 'VISIONE', 'VIREO' 'vitrivr-VR', 'CVHunter', 'vitrivr', 'Verge']

# (rows, columns) of the loaded query table
dataset.shape

### Creating auxiliary variables
- Query length and volume of words per query
- Maybe also store information on whether the query is temporal? Only HTW and VISIONE have obvious temporal queries
- Define visual vs textual tasks

In [None]:
# Task type: "textual" when the task name contains "kis-t", "visual" otherwise.
is_textual_task = dataset.task.str.contains("kis-t")
dataset["task_type"] = "visual"
dataset.loc[is_textual_task, "task_type"] = "textual"

# Query type: "Text" for joint-embedding text queries, "Other" for the rest.
dataset["QT"] = "Other"
dataset.loc[dataset.is_joint_embedding_text_query, "QT"] = "Text"

# Character and word counts are filled in only for rows of category "TEXT";
# all other rows keep the placeholder -1.
is_text_category = dataset["category"] == "TEXT"
text_values = dataset.loc[is_text_category, "value"]
dataset["QueryLen"] = -1
dataset["QueryWords"] = -1
dataset.loc[is_text_category, "QueryLen"] = text_values.str.len()
dataset.loc[is_text_category, "QueryWords"] = text_values.str.split().str.len()

dataset.head()

# Work with text embeddings

In [None]:
# List the columns available after adding the auxiliary variables.
dataset.columns

In [None]:
# Display the query table.
dataset

In [None]:
# Distinct query-type labels present in the data ("Text" / "Other" are the values assigned above).
dataset.QT.unique()

In [None]:
import json  # NOTE(review): json is not used in any cell visible in this notebook
# Take the stored embedding of every "Text" query ...
stdf = dataset.loc[dataset.QT=="Text",'joint_text_embedding'].values
# ... and stack them along a new first axis into a single array.
stdf = np.stack(stdf)

In [None]:
# Shape of the stacked embedding array; the second dimension is used below as the number of feature columns.
stdf.shape

In [None]:
# One column name per embedding dimension: f_0, f_1, ...
embedColnames = ["f_"+str(i) for i in range(stdf.shape[1])]
# Reuse the index of the "Text" rows so the embeddings can be aligned with the query table.
dfEmbeds = pd.DataFrame(stdf, columns=embedColnames, index=dataset.loc[dataset.QT=="Text"].index)

In [None]:
# Column-wise concatenation: the "Text" query rows followed by their embedding columns.
jointDF = pd.concat([dataset.loc[dataset.QT=="Text"], dfEmbeds], axis=1)
jointDF

In [None]:
# Tasks for which at least one text query exists.
jointDF.task.unique()

In [None]:
from sklearn.metrics import pairwise_distances
def upper_tri_indexing(A):
    """Return the entries of ``A`` that lie strictly above the main diagonal.

    The entries come back as a 1-D array in the order produced by
    ``np.triu_indices`` with offset 1; the size of the index set is taken
    from the first dimension of ``A``.
    """
    above_diagonal = np.triu_indices(A.shape[0], k=1)
    return A[above_diagonal]

def ILD(dataset, columns):
    """Pairwise cosine distances among the rows of ``dataset``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Rows to compare with each other.
    columns : list
        Names of the feature columns used to compute the distances.

    Returns
    -------
    tuple
        ``(distances, mean)``. ``distances`` is a 1-D array holding one
        cosine distance per unordered pair of distinct rows (self-distances
        are dropped) and ``mean`` is its average. With no rows the result is
        ``(empty (0, 0) array, 0)``; with a single row there are no pairs,
        so the result is ``(empty 1-D array, NaN)``.
    """
    dt = dataset[columns].values
    if len(dt)==0:
        # Kept as-is for backward compatibility with existing callers.
        return (np.empty(shape=(0, 0)), 0)
    if len(dt)==1:
        # A single row has no pairs. Return the same values the computation
        # below would produce, without triggering numpy's
        # "Mean of empty slice" RuntimeWarning.
        return (np.empty(shape=(0,)), np.nan)
    distMatrix = pairwise_distances(dt,metric="cosine")
    #remove distances to self
    distMatrix = upper_tri_indexing(distMatrix)
    return (distMatrix,distMatrix.mean())


In [None]:
# Sanity check: for this 3x3 matrix the entries strictly above the diagonal are 2, 3 and 6.
upper_tri_indexing(np.array([[1,2,3],[4,5,6],[7,8,9]]))

### How do query distances differ for individual tasks?

In [None]:
# For every task, compute the pairwise cosine distances between all text-query
# embeddings submitted for that task (all teams and users pooled) and keep
# the flattened distances keyed by task name.
dMats = {}
for t in jointDF.task.unique():    
    distMatrix, meanVal = ILD(jointDF.loc[jointDF["task"]==t],embedColnames)
    dMats[t] = distMatrix.ravel()
    print (t, meanVal)
# A Series allows selecting several tasks at once with a list of names.
dMats = pd.Series(dMats)

In [None]:
# Split the task names into three groups according to their naming pattern:
# "kis-t" -> textual, "kis-v-" -> marine, remaining "kis-v" -> visual.
taskNames = jointDF.task.unique()
textTasks = [name for name in taskNames if "kis-t" in name]
marineTasks = [name for name in taskNames if "kis-v-" in name]
visualTasks = [name for name in taskNames if "kis-v" in name and "kis-v-" not in name]

### Textual tasks have smaller between-query distances than both visual ones
- marine tasks also have slightly smaller distances than the V3C1 visual ones

In [None]:
from scipy.stats import ttest_ind

# Pool the pairwise distances of all tasks belonging to each group.
textDists = np.concatenate(dMats[textTasks].values)
visualDists = np.concatenate(dMats[visualTasks].values)
marineDists = np.concatenate(dMats[marineTasks].values)

# Mean distance per group: textual, visual, marine.
for pooled in (textDists, visualDists, marineDists):
    print(pooled.mean())

# Independent two-sample t-tests between the groups.
for first, second in (
    (textDists, visualDists),
    (textDists, marineDists),
    (marineDists, visualDists),
):
    print(ttest_ind(first, second))

# Long-format table (value + group label) for plotting.
txt = pd.DataFrame({"v": textDists}).assign(type="Textual")
vis = pd.DataFrame({"v": visualDists}).assign(type="Visual")
mar = pd.DataFrame({"v": marineDists}).assign(type="Marine Visual")
dfPlot = pd.concat([txt, vis, mar])

sns.boxenplot(y=dfPlot["v"],x=dfPlot["type"])

## Team-wise differences

In [None]:
# Same computation as per task, but separately for every (task, team) pair.
# Combinations without any query yield an empty array and a printed mean of 0;
# combinations with a single query yield an empty array and a NaN mean.
dMats = {}
for t in jointDF.task.unique():
    for tm in jointDF.team.unique():
        distMatrix, meanVal = ILD(jointDF.loc[((jointDF["task"]==t)&(jointDF["team"]==tm))],embedColnames)
        dMats[(t,tm)] = distMatrix.ravel()
        print (t, tm, meanVal)
# Tuple keys turn into a (task, team) MultiIndex.
dMats = pd.Series(dMats)

In [None]:
# Mean pairwise distance per team, pooled over all tasks.
for tm in jointDF.team.unique():
    # np.broadcast pairs every task name with the scalar team name,
    # producing the (task, team) keys of this team.
    keys = list(np.broadcast(jointDF.task.unique(),tm))
    print(tm, np.concatenate(dMats[keys].values).mean())

### Both vibro and VISIONE had more consistent per-task queries than CVHunter
- comparison with other teams omitted due to missing data

In [None]:
def _pooled_team_distances(team):
    # All per-task distances of one team, concatenated into a single array.
    team_keys = list(np.broadcast(jointDF.task.unique(), team))
    return np.concatenate(dMats[team_keys].values)


# Independent two-sample t-tests between the pooled distances of team pairs.
for first_team, second_team in (
    ("vibro", "CVHunter"),
    ("VISIONE", "CVHunter"),
    ("vibro", "VISIONE"),
):
    print(ttest_ind(_pooled_team_distances(first_team), _pooled_team_distances(second_team)))

## Comparison of both users per team and task

In [None]:
# Number of text-query rows with a non-null task per (team, user).
jointDF.groupby(["team","user"]).count()["task"]

- removing verge as no distinction between users is available
- several times, vitrivr-VR does not have any records from one of the users (candidate for removal)

In [None]:
# Exclude the team "Verge" from the per-user comparison.
jointDFNoVerge = jointDF.loc[jointDF.team != "Verge"]
# Shapes before and after the exclusion.
jointDF.shape,jointDFNoVerge.shape

In [None]:
def ILD_pair(dataset1, dataset2, columns):
    """Cosine distances between every row of ``dataset1`` and every row of ``dataset2``.

    Parameters
    ----------
    dataset1, dataset2 : pandas.DataFrame
        The two groups of rows to compare against each other.
    columns : list
        Names of the feature columns used to compute the distances.

    Returns
    -------
    tuple
        ``(distances, mean)`` where ``distances`` is the flattened
        cross-distance matrix and ``mean`` its average. If either group has
        no rows, ``(empty (0, 0) array, 0)`` is returned.
    """
    first = dataset1[columns].values
    second = dataset2[columns].values
    if not len(first) or not len(second):
        return (np.empty(shape=(0, 0)), 0)
    cross = pairwise_distances(first, second, metric="cosine")
    return (cross.flatten(), cross.mean())

In [None]:
# Between-user distances: for every (task, team), compare the queries of
# user 0 with the queries of user 1.
# NOTE: this overwrites the dMats computed in the team-wise section.
dMats = {}
for t in jointDFNoVerge.task.unique():
    for tm in jointDFNoVerge.team.unique():
        dt1 = jointDFNoVerge.loc[((jointDFNoVerge["task"]==t)&(jointDFNoVerge["team"]==tm)&(jointDFNoVerge["user"]==0))]
        dt2 = jointDFNoVerge.loc[((jointDFNoVerge["task"]==t)&(jointDFNoVerge["team"]==tm)&(jointDFNoVerge["user"]==1))] 
        
        # Empty result (and printed mean 0) when one of the two users has no query.
        distMatrix, meanVal = ILD_pair(dt1, dt2, embedColnames)
        dMats[(t,tm)] = distMatrix.ravel()
        print (t, tm, meanVal)
dMats = pd.Series(dMats)

In [None]:
# Mean between-user distance per team, pooled over all tasks.
for tm in jointDFNoVerge.team.unique():
    # (task, team) keys of this team
    keys = list(np.broadcast(jointDFNoVerge.task.unique(),tm))
    print(tm, np.concatenate(dMats[keys].values).mean())

- not so much different from the results of the overall distances (just a bit higher values)

In [None]:
# Within-user distances: pairwise distances among the queries of a single
# user, for every (task, team, user) combination.
dMats2 = {}
for t in jointDFNoVerge.task.unique():
    for tm in jointDFNoVerge.team.unique():
        for u in jointDFNoVerge.user.unique():
            distMatrix, meanVal = ILD(jointDFNoVerge.loc[((jointDFNoVerge["task"]==t)&(jointDFNoVerge["team"]==tm)&(jointDFNoVerge["user"]==u))],embedColnames)
            dMats2[(t,tm,u)] = distMatrix.ravel()
            print (t, tm,u, meanVal)
dMats2 = pd.Series(dMats2)

- quite a few NaNs due to having only a single query per user

In [None]:
# Mean within-user distance per team, pooled over all tasks and users 0 and 1.
for tm in jointDFNoVerge.team.unique():
    teamPairs = list(np.broadcast(jointDFNoVerge.task.unique(),tm))
    users = [0]*len(teamPairs)+[1]*len(teamPairs)
    # All (task, team, 0) keys first, followed by all (task, team, 1) keys.
    keys = [(task, team, user) for user in (0, 1) for (task, team) in teamPairs]
    withinUser = np.concatenate(dMats2[keys].values)
    print(tm, withinUser.mean())

## Much smaller differences in within-user query distance as compared to between users (in the same team)

In [None]:
# Per team: t-test of between-user distances (dMats) against within-user
# distances (dMats2).
for tm in jointDFNoVerge.team.unique():
    keys1 = list(np.broadcast(jointDFNoVerge.task.unique(),tm))
    users = [0]*len(keys1)+[1]*len(keys1)
    # All (task, team, 0) keys first, followed by all (task, team, 1) keys.
    keys = [(task, team, user) for user in (0, 1) for (task, team) in keys1]

    betweenUsers = np.concatenate(dMats[keys1].values)
    withinUser = np.concatenate(dMats2[keys].values)
    print(tm,ttest_ind(betweenUsers, withinUser))

# Differences in sequences of query reformulations
- only users & tasks, where at least **4** text queries were made

In [None]:
# NOTE(review): `sortedData` (with its QueryRank / MaxQueryRank columns) is not
# created in any cell visible in this notebook; a top-to-bottom run raises
# NameError here unless it is defined elsewhere — confirm.
# Keep rows with QueryRank up to 4 from groups whose MaxQueryRank is at least 4.
dt = sortedData.loc[((sortedData.QueryRank <= 4)&(sortedData.MaxQueryRank >= 4))]
#cannot be done for verge as only one user is present
dt = dt.loc[dt.team != "Verge"]
dt.columns,dt.shape

In [None]:
# NOTE(review): `f` is not defined in any cell visible in this notebook, so this
# cell raises NameError on a clean run — looks like leftover debugging; confirm or remove.
jointDF[f]

In [None]:
# Rebuild the embedding table for the selected rows only (overwrites stdf / dfEmbeds).
stdf = dt.loc[dt.QT=="Text",'joint_text_embedding'].values
stdf = np.stack(stdf)
# embedColnames is reused from the earlier embedding cell.
dfEmbeds = pd.DataFrame(stdf, columns=embedColnames, index=dt.loc[dt.QT=="Text"].index)
seqDFEmbeds = pd.concat([dt.loc[dt.QT=="Text"], dfEmbeds], axis=1)
seqDFEmbeds.shape

In [None]:
def ILD_noRemove(dataset, columns):
    """Full pairwise cosine-distance matrix among the rows of ``dataset``.

    Unlike ``ILD``, the square matrix is returned unchanged, so the zero
    self-distances on the diagonal are kept and also enter the mean.

    Returns ``(matrix, mean)``; with no rows, ``(empty (0, 0) array, 0)``.
    """
    features = dataset[columns].values
    if not len(features):
        return (np.empty(shape=(0, 0)), 0)
    square = pairwise_distances(features, metric="cosine")
    return (square, square.mean())


In [None]:
# For every (task, team, user), compute the full cosine-distance matrix between
# that user's selected queries, ordered by QueryRank.
# NOTE: the loop rebinds the global `dt` to the last per-user subset.
dMats3 = {}
for t in seqDFEmbeds.task.unique():
    for tm in seqDFEmbeds.team.unique():
        for u in seqDFEmbeds.user.unique():
            dt = seqDFEmbeds.loc[((seqDFEmbeds["task"]==t)&(seqDFEmbeds["team"]==tm)&(seqDFEmbeds["user"]==u))]
            dt = dt.sort_values("QueryRank")
            #print(dt.QueryRank)
            distMatrix, meanVal = ILD_noRemove(dt,embedColnames)
            # skip combinations without any query
            if len(distMatrix)>0:
                dMats3[(t,tm,u)] = distMatrix
            print (t, tm,u, meanVal)
dMats3 = pd.Series(dMats3)
# np.stack requires equally shaped matrices — presumably every kept group has
# exactly 4 rows; verify against how QueryRank is computed.
sequentialResultsArray = np.stack(dMats3.values)

In [None]:
# Average over all groups of row 0 of each matrix, i.e. the distance from the
# lowest-ranked query to every query position.
sequentialResultsArray[:,0,:].mean(axis=0)#distances to first query

In [None]:
#distances to subsequent queries
print(
    sequentialResultsArray[:,0,1].mean(),
    sequentialResultsArray[:,1,2].mean(),
    sequentialResultsArray[:,2,3].mean()
)

In [None]:
# One box per consecutive step (1st->2nd, 2nd->3rd, 3rd->4th).
plt.boxplot([sequentialResultsArray[:,0,1],sequentialResultsArray[:,1,2],sequentialResultsArray[:,2,3]])

## While the distance to the initial query rises over time (unsurprisingly), the step size between consecutive queries remains roughly the same and rather small

In [None]:
from Levenshtein import ratio
def LevenshteinNormDist(dataset, txtCol):
    """Pairwise ``1 - Levenshtein.ratio`` distances between the texts in ``dataset[txtCol]``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Rows whose texts are compared with each other.
    txtCol : str
        Name of the column holding the texts.

    Returns
    -------
    tuple
        ``(matrix, mean)`` where ``matrix[i, j]`` is ``1 - ratio(text_i, text_j)``
        for every ordered pair (self-comparisons on the diagonal included)
        and ``mean`` averages the whole matrix. With no rows,
        ``(empty (0, 0) array, 0)`` is returned.
    """
    texts = dataset[txtCol].values
    count = len(texts)
    if count == 0:
        return (np.empty(shape=(0, 0)), 0)
    distMatrix = np.zeros((count, count))
    # Fill one row at a time: distances from `left` to every text.
    for row, left in enumerate(texts):
        distMatrix[row, :] = [1 - ratio(left, right) for right in texts]

    return (distMatrix, distMatrix.mean())


In [None]:
# Same per-(task, team, user) loop as for the embeddings, but measuring the
# text distance (1 - Levenshtein ratio) between the query strings in "value".
# NOTE: the loop rebinds the global `dt` to the last per-user subset.
dMats4 = {}
for t in seqDFEmbeds.task.unique():
    for tm in seqDFEmbeds.team.unique():
        for u in seqDFEmbeds.user.unique():
            dt = seqDFEmbeds.loc[((seqDFEmbeds["task"]==t)&(seqDFEmbeds["team"]==tm)&(seqDFEmbeds["user"]==u))]
            dt = dt.sort_values("QueryRank")
            #print(dt.QueryRank)
            distMatrix, meanVal = LevenshteinNormDist(dt,"value")
            # skip combinations without any query
            if len(distMatrix)>0:
                dMats4[(t,tm,u)] = distMatrix
            print (t, tm,u, meanVal)
dMats4 = pd.Series(dMats4)
# np.stack requires equally shaped matrices — presumably 4x4 for every kept group; verify.
sequentialResultsLevensteinArray = np.stack(dMats4.values)

In [None]:
# Mean text distance from the lowest-ranked query to every query position.
sequentialResultsLevensteinArray[:,0,:].mean(axis=0)#distances to first query

In [None]:
#distances to subsequent queries
print(
    sequentialResultsLevensteinArray[:,0,1].mean(),
    sequentialResultsLevensteinArray[:,1,2].mean(),
    sequentialResultsLevensteinArray[:,2,3].mean()
)

In [None]:
# Independent two-sample t-test: is the 1st->2nd step different from the 2nd->3rd step?
ttest_ind(sequentialResultsLevensteinArray[:,0,1], sequentialResultsLevensteinArray[:,1,2])

## The results for the Levenshtein distance support those for the embedding distance. Subsequent changes seem to be a bit smaller for later reformulations, but the difference is not statistically significant.

# Differences in sequences of query reformulations
- only users & tasks, where at least **3** text queries were made

In [None]:
# NOTE(review): `sortedData` is not created in any cell visible in this notebook — confirm where it comes from.
# Keep rows with QueryRank up to 3 from groups whose MaxQueryRank is at least 3.
dt = sortedData.loc[((sortedData.QueryRank <= 3)&(sortedData.MaxQueryRank >= 3))]
#cannot be done for verge as only one user is present
dt = dt.loc[dt.team != "Verge"]

# Rebuild the embedding table for the selected rows (overwrites stdf / dfEmbeds / seqDFEmbeds).
stdf = dt.loc[dt.QT=="Text",'joint_text_embedding'].values
stdf = np.stack(stdf)
dfEmbeds = pd.DataFrame(stdf, columns=embedColnames, index=dt.loc[dt.QT=="Text"].index)
seqDFEmbeds = pd.concat([dt.loc[dt.QT=="Text"], dfEmbeds], axis=1)
seqDFEmbeds.shape

In [None]:
# Full cosine-distance matrix between each user's selected queries, per
# (task, team, user), ordered by QueryRank.
# NOTE: the loop rebinds the global `dt` to the last per-user subset.
dMats3 = {}
for t in seqDFEmbeds.task.unique():
    for tm in seqDFEmbeds.team.unique():
        for u in seqDFEmbeds.user.unique():
            dt = seqDFEmbeds.loc[((seqDFEmbeds["task"]==t)&(seqDFEmbeds["team"]==tm)&(seqDFEmbeds["user"]==u))]
            dt = dt.sort_values("QueryRank")
            #print(dt.QueryRank)
            distMatrix, meanVal = ILD_noRemove(dt,embedColnames)
            # skip combinations without any query
            if len(distMatrix)>0:
                dMats3[(t,tm,u)] = distMatrix
            print (t, tm,u, meanVal)
dMats3 = pd.Series(dMats3)
# np.stack requires equally shaped matrices — presumably 3x3 for every kept group; verify.
sequentialResultsArray = np.stack(dMats3.values)

In [None]:
# Mean embedding distance from the lowest-ranked query to every position.
print(sequentialResultsArray[:,0,:].mean(axis=0)) #distances to first query

# Mean step between consecutive positions: 1st->2nd, 2nd->3rd.
print(#distances to subsequent queries
    sequentialResultsArray[:,0,1].mean(),
    sequentialResultsArray[:,1,2].mean()
)

In [None]:
# Independent two-sample t-test: 1st->2nd step versus 2nd->3rd step (embedding distance).
ttest_ind(sequentialResultsArray[:,0,1], sequentialResultsArray[:,1,2])

In [None]:
# Text distance (1 - Levenshtein ratio) between each user's selected query
# strings, per (task, team, user), ordered by QueryRank.
# NOTE: the loop rebinds the global `dt` to the last per-user subset.
dMats4 = {}
for t in seqDFEmbeds.task.unique():
    for tm in seqDFEmbeds.team.unique():
        for u in seqDFEmbeds.user.unique():
            dt = seqDFEmbeds.loc[((seqDFEmbeds["task"]==t)&(seqDFEmbeds["team"]==tm)&(seqDFEmbeds["user"]==u))]
            dt = dt.sort_values("QueryRank")
            #print(dt.QueryRank)
            distMatrix, meanVal = LevenshteinNormDist(dt,"value")
            # skip combinations without any query
            if len(distMatrix)>0:
                dMats4[(t,tm,u)] = distMatrix
            print (t, tm,u, meanVal)
dMats4 = pd.Series(dMats4)
# np.stack requires equally shaped matrices — presumably 3x3 for every kept group; verify.
sequentialResultsLevensteinArray = np.stack(dMats4.values)

In [None]:
# Mean text distance from the lowest-ranked query to every position.
print(sequentialResultsLevensteinArray[:,0,:].mean(axis=0)) #distances to first query

# Mean step between consecutive positions: 1st->2nd, 2nd->3rd.
print(#distances to subsequent queries
    sequentialResultsLevensteinArray[:,0,1].mean(),
    sequentialResultsLevensteinArray[:,1,2].mean()
)

In [None]:
# Independent two-sample t-test: 1st->2nd step versus 2nd->3rd step (text distance).
ttest_ind(sequentialResultsLevensteinArray[:,0,1], sequentialResultsLevensteinArray[:,1,2])

# Differences in sequences of query reformulations
- only users & tasks, where at least **5** text queries were made

In [None]:
# NOTE(review): `sortedData` is not created in any cell visible in this notebook — confirm where it comes from.
# Keep rows with QueryRank up to 5 from groups whose MaxQueryRank is at least 5.
dt = sortedData.loc[((sortedData.QueryRank <= 5)&(sortedData.MaxQueryRank >= 5))]
#cannot be done for verge as only one user is present
dt = dt.loc[dt.team != "Verge"]

# Rebuild the embedding table for the selected rows (overwrites stdf / dfEmbeds / seqDFEmbeds).
stdf = dt.loc[dt.QT=="Text",'joint_text_embedding'].values
stdf = np.stack(stdf)
dfEmbeds = pd.DataFrame(stdf, columns=embedColnames, index=dt.loc[dt.QT=="Text"].index)
seqDFEmbeds = pd.concat([dt.loc[dt.QT=="Text"], dfEmbeds], axis=1)
seqDFEmbeds.shape

In [None]:
# Full cosine-distance matrix between each user's selected queries, per
# (task, team, user), ordered by QueryRank.
# NOTE: the loop rebinds the global `dt` to the last per-user subset.
dMats3 = {}
for t in seqDFEmbeds.task.unique():
    for tm in seqDFEmbeds.team.unique():
        for u in seqDFEmbeds.user.unique():
            dt = seqDFEmbeds.loc[((seqDFEmbeds["task"]==t)&(seqDFEmbeds["team"]==tm)&(seqDFEmbeds["user"]==u))]
            dt = dt.sort_values("QueryRank")
            #print(dt.QueryRank)
            distMatrix, meanVal = ILD_noRemove(dt,embedColnames)
            # skip combinations without any query
            if len(distMatrix)>0:
                dMats3[(t,tm,u)] = distMatrix
            print (t, tm,u, meanVal)
dMats3 = pd.Series(dMats3)
# np.stack requires equally shaped matrices — presumably 5x5 for every kept group; verify.
sequentialResultsArray = np.stack(dMats3.values)

In [None]:
# Mean embedding distance from the lowest-ranked query to every position.
print(sequentialResultsArray[:,0,:].mean(axis=0)) #distances to first query

# Mean step between consecutive positions: 1st->2nd up to 4th->5th.
print(#distances to subsequent queries
    sequentialResultsArray[:,0,1].mean(),
    sequentialResultsArray[:,1,2].mean(),
    sequentialResultsArray[:,2,3].mean(),
    sequentialResultsArray[:,3,4].mean()
)

In [None]:
# Text distance (1 - Levenshtein ratio) between each user's selected query
# strings, per (task, team, user), ordered by QueryRank.
# NOTE: the loop rebinds the global `dt` to the last per-user subset.
dMats4 = {}
for t in seqDFEmbeds.task.unique():
    for tm in seqDFEmbeds.team.unique():
        for u in seqDFEmbeds.user.unique():
            dt = seqDFEmbeds.loc[((seqDFEmbeds["task"]==t)&(seqDFEmbeds["team"]==tm)&(seqDFEmbeds["user"]==u))]
            dt = dt.sort_values("QueryRank")
            #print(dt.QueryRank)
            distMatrix, meanVal = LevenshteinNormDist(dt,"value")
            # skip combinations without any query
            if len(distMatrix)>0:
                dMats4[(t,tm,u)] = distMatrix
            print (t, tm,u, meanVal)
dMats4 = pd.Series(dMats4)
# np.stack requires equally shaped matrices — presumably 5x5 for every kept group; verify.
sequentialResultsLevensteinArray = np.stack(dMats4.values)

In [None]:
# Mean text distance from the lowest-ranked query to every position.
print(sequentialResultsLevensteinArray[:,0,:].mean(axis=0)) #distances to first query

# Mean step between consecutive positions: 1st->2nd up to 4th->5th.
print(#distances to subsequent queries
    sequentialResultsLevensteinArray[:,0,1].mean(),
    sequentialResultsLevensteinArray[:,1,2].mean(),
    sequentialResultsLevensteinArray[:,2,3].mean(),
    sequentialResultsLevensteinArray[:,3,4].mean()
)

# Differences in sequences of query reformulations
- only users & tasks, where at least **4** text queries were made
- selecting last 4 queries

In [None]:
# NOTE(review): `sortedData` (with DiffFromMaxQueryRank / MaxQueryRank) is not
# created in any cell visible in this notebook — confirm where it comes from.
# Keep rows whose DiffFromMaxQueryRank is -3 or higher, from groups whose MaxQueryRank is at least 4.
dt = sortedData.loc[((sortedData.DiffFromMaxQueryRank >= -3)&(sortedData.MaxQueryRank >= 4))]
#cannot be done for verge as only one user is present
dt = dt.loc[dt.team != "Verge"]

# Rebuild the embedding table for the selected rows (overwrites stdf / dfEmbeds / seqDFEmbeds).
stdf = dt.loc[dt.QT=="Text",'joint_text_embedding'].values
stdf = np.stack(stdf)
dfEmbeds = pd.DataFrame(stdf, columns=embedColnames, index=dt.loc[dt.QT=="Text"].index)
seqDFEmbeds = pd.concat([dt.loc[dt.QT=="Text"], dfEmbeds], axis=1)
seqDFEmbeds.shape

In [None]:
# Full cosine-distance matrix between each user's selected queries, per
# (task, team, user), ordered by QueryRank.
# NOTE: the loop rebinds the global `dt` to the last per-user subset.
dMats3 = {}
for t in seqDFEmbeds.task.unique():
    for tm in seqDFEmbeds.team.unique():
        for u in seqDFEmbeds.user.unique():
            dt = seqDFEmbeds.loc[((seqDFEmbeds["task"]==t)&(seqDFEmbeds["team"]==tm)&(seqDFEmbeds["user"]==u))]
            dt = dt.sort_values("QueryRank")
            #print(dt.QueryRank)
            distMatrix, meanVal = ILD_noRemove(dt,embedColnames)
            # skip combinations without any query
            if len(distMatrix)>0:
                dMats3[(t,tm,u)] = distMatrix
            print (t, tm,u, meanVal)
dMats3 = pd.Series(dMats3)
# np.stack requires equally shaped matrices — presumably 4x4 for every kept group; verify.
sequentialResultsArray = np.stack(dMats3.values)

In [None]:
# NOTE(review): this indexes position 2 (the third of the selected queries in
# QueryRank order), not the first one — confirm which reference query is intended.
print(sequentialResultsArray[:,2,:].mean(axis=0)) #distances to the query at position 2

# Mean step between consecutive positions among the selected queries.
print(#distances to subsequent queries
    sequentialResultsArray[:,0,1].mean(),
    sequentialResultsArray[:,1,2].mean(),
    sequentialResultsArray[:,2,3].mean()
)

In [None]:
# Independent two-sample t-test between the first two consecutive steps of the selected queries.
ttest_ind(sequentialResultsArray[:,0,1], sequentialResultsArray[:,1,2])

In [None]:
# One box per consecutive step among the selected queries.
plt.boxplot([sequentialResultsArray[:,0,1],sequentialResultsArray[:,1,2],sequentialResultsArray[:,2,3]])

In [None]:
# Text distance (1 - Levenshtein ratio) between each user's selected query
# strings, per (task, team, user), ordered by QueryRank.
# NOTE: the loop rebinds the global `dt` to the last per-user subset.
dMats4 = {}
for t in seqDFEmbeds.task.unique():
    for tm in seqDFEmbeds.team.unique():
        for u in seqDFEmbeds.user.unique():
            dt = seqDFEmbeds.loc[((seqDFEmbeds["task"]==t)&(seqDFEmbeds["team"]==tm)&(seqDFEmbeds["user"]==u))]
            dt = dt.sort_values("QueryRank")
            #print(dt.QueryRank)
            distMatrix, meanVal = LevenshteinNormDist(dt,"value")
            # skip combinations without any query
            if len(distMatrix)>0:
                dMats4[(t,tm,u)] = distMatrix
            print (t, tm,u, meanVal)
dMats4 = pd.Series(dMats4)
# np.stack requires equally shaped matrices — presumably 4x4 for every kept group; verify.
sequentialResultsLevensteinArray = np.stack(dMats4.values)

In [None]:
# NOTE(review): this indexes position 2 (the third of the selected queries in
# QueryRank order), not the first one — confirm which reference query is intended.
print(sequentialResultsLevensteinArray[:,2,:].mean(axis=0)) #distances to the query at position 2

# Mean text distance between consecutive positions. The third step (2 -> 3) is
# reported as well, matching the embedding summary of this section and the
# Levenshtein summary of the first-four-queries section.
print(#distances to subsequent queries
    sequentialResultsLevensteinArray[:,0,1].mean(),
    sequentialResultsLevensteinArray[:,1,2].mean(),
    sequentialResultsLevensteinArray[:,2,3].mean()
)

In [None]:
# Independent two-sample t-test between the first two consecutive steps of the selected queries (text distance).
ttest_ind(sequentialResultsLevensteinArray[:,0,1], sequentialResultsLevensteinArray[:,1,2])