# Processing text queries
- The code in this notebook predominantly analyzes the text queries submitted by participants to solve the KIS tasks of VBS 2023
- among others, this notebook can replicate the content of Table 2 and 3 as well as Figures 13-15

In [None]:
import sys
import os
if os.getcwd().split('/')[-1] == 'notebooks':
    os.chdir('..')

# imports
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import math
from notebooks.utils import compute_user_penalty, get_team_values_df
from common.load import load_competition_data, process_team_logs

# do not truncate long cell values (e.g. full query texts) when displaying frames
pd.set_option('display.max_colwidth', None)
# ranks greater than unknownRankLimit are replaced by the sentinel unknownRankValue
# further below in this notebook
unknownRankLimit = 1000
unknownRankValue = 2000

# Import common data

In [None]:
config = 'config_vbs2023.yaml'

# load competition data from dres files and auxiliary data (FPSs, sequences)
comp_data = load_competition_data(config)

# load the preprocessed query data
# NOTE(review): unpickling can execute arbitrary code, so the file in
# processed_logs_outdir must come from a trusted source.
dataset = pd.read_pickle(comp_data["config"]["processed_logs_outdir"] + '/text_query_dataset.pkl')

# valid teams
team_order = ['vibro', 'VISIONE',  'vitrivr-VR', 'CVHunter', 'Verge']
# NOTE(review): the disabled variant below lacks a comma after 'VIREO'; add it before re-enabling.
#team_order = ['vibro', 'VISIONE', 'VIREO' 'vitrivr-VR', 'CVHunter', 'vitrivr', 'Verge']

# (rows, columns) of the loaded query dataset
dataset.shape

### Creating auxiliary variables
- Query length and volume of words per query
- Maybe also store whether the query is temporal? Only HTW and VISIONE have obviously temporal queries
- Define visual vs textual tasks

In [None]:
# Task type: every task is "visual" unless its name contains "kis-t"
is_textual_task = dataset.task.str.contains("kis-t")
dataset["task_type"] = "visual"
dataset.loc[is_textual_task, "task_type"] = "textual"

# Query type: "Text" for joint-embedding text queries, "Other" for everything else
dataset["QT"] = "Other"
dataset.loc[dataset.is_joint_embedding_text_query, "QT"] = "Text"

# Query length in characters and in whitespace-separated words;
# rows whose category is not "TEXT" keep the placeholder -1
dataset["QueryLen"] = -1
dataset["QueryWords"] = -1
text_rows = dataset["category"] == "TEXT"
text_values = dataset.loc[text_rows, "value"]
dataset.loc[text_rows, "QueryLen"] = text_values.str.len()
dataset.loc[text_rows, "QueryWords"] = text_values.str.split().str.len()

dataset.head()

In [None]:
# Replace ranks beyond unknownRankLimit by the dummy value unknownRankValue
# (these have to be labelled properly in the charts)
for rank_col in ("rank_video", "rank_shot_margin_0", "rank_shot_margin_5"):
    dataset.loc[dataset[rank_col] > unknownRankLimit, rank_col] = unknownRankValue


In [None]:
(dataset.loc[dataset.QT == "Other"]).is_temporal_query.unique()

In [None]:
textData = dataset.loc[dataset["QT"]=="Text"]

In [None]:
dataset.shape, textData.shape

In [None]:
textualMaxTime = 420000
visualMaxTime = 300000

# Rows without a correct submission time get the maximum time of their task type
no_correct_submission = dataset.correct_submission_time_ms.isna()
for kind, max_time in (("textual", textualMaxTime), ("visual", visualMaxTime)):
    dataset.loc[no_correct_submission & (dataset.task_type == kind), "correct_submission_time_ms"] = max_time

In [None]:
bin_width_ms = 60 * 1000
# bin boundaries every 60 seconds from 0 to 420 s, expressed in milliseconds
boundaries = [sec * 1000 for sec in (0, 60, 120, 180, 240, 300, 360, 420)]
valid_bins = []
dataset["hist_bin"] = 0

for b in boundaries:
    if b > 0:
        # checking whether the team was available throughout the whole period of the bin
        # (the column is named after the start of the bin that ends at b)
        valid_col = f"valid_{b - bin_width_ms}"
        dataset[valid_col] = dataset["correct_submission_time_ms"] >= b
        valid_bins.append(valid_col)
    # every row ends up with the largest boundary it has reached
    dataset.loc[dataset.elapsed_since_task_start_ms >= b, "hist_bin"] = b

dataset.tail()

# Work with text embeddings

In [None]:
dataset.columns

In [None]:
dataset

In [None]:
dataset.QT

In [None]:
import json
# stack the embedding vectors of all text queries into a single array
text_embeddings = dataset.loc[dataset.QT == "Text", 'joint_text_embedding']
stdf = np.stack(text_embeddings.values)

In [None]:
stdf.shape

In [None]:
# one column f_0, f_1, ... per embedding dimension, indexed like the text-query rows
n_features = stdf.shape[1]
embedColnames = [f"f_{i}" for i in range(n_features)]
text_index = dataset.loc[dataset.QT == "Text"].index
dfEmbeds = pd.DataFrame(stdf, columns=embedColnames, index=text_index)

In [None]:
jointDF = pd.concat([dataset.loc[dataset.QT=="Text"], dfEmbeds], axis=1)
jointDF

In [None]:
jointDF.task.unique()

In [None]:
from sklearn.metrics import pairwise_distances
def upper_tri_indexing(A):
    """Return the entries of ``A`` lying strictly above its main diagonal.

    The positions are taken from ``np.triu_indices(A.shape[0], 1)``, so the
    result is a 1-D array ordered row by row.
    """
    rows, cols = np.triu_indices(A.shape[0], k=1)
    return A[rows, cols]

def ILD(dataset, columns):
    """Pairwise cosine distances between the rows of ``dataset[columns]``.

    Returns a tuple ``(distances, mean)``. ``distances`` is the 1-D array of
    distances taken from the strict upper triangle of the distance matrix
    (so distances to self are dropped) and ``mean`` is their average.
    For an input without rows, an empty ``(0, 0)`` array and a mean of 0
    are returned.
    """
    vectors = dataset[columns].values
    if len(vectors) == 0:
        return (np.empty(shape=(0, 0)), 0)
    # remove distances to self by keeping only the strict upper triangle
    pair_dists = upper_tri_indexing(pairwise_distances(vectors, metric="cosine"))
    return (pair_dists, pair_dists.mean())


In [None]:
upper_tri_indexing(np.array([[1,2,3],[4,5,6],[7,8,9]]))

### How do query distances differ for individual tasks?

In [None]:
# pairwise query distances per task (all teams pooled), keyed by task name
per_task = {}
for t in jointDF.task.unique():
    task_queries = jointDF.loc[jointDF["task"] == t]
    distMatrix, meanVal = ILD(task_queries, embedColnames)
    per_task[t] = distMatrix.ravel()
    print(t, meanVal)
dMats0 = pd.Series(per_task)

In [None]:
# split the task names into three groups by the substring they contain
all_tasks = jointDF.task.unique()
textTasks = [name for name in all_tasks if "kis-t" in name]
# "kis-v" without the trailing dash
visualTasks = [name for name in all_tasks if "kis-v" in name and "kis-v-" not in name]
# "kis-v-" (with the trailing dash)
marineTasks = [name for name in all_tasks if "kis-v-" in name]

### Textual tasks have smaller between-query distances than both visual ones
- also, marine tasks have slightly smaller distances than the V3C1 visual ones

In [None]:
# mean pairwise distance over all tasks of a group: textual, visual, marine
for task_group in (textTasks, visualTasks, marineTasks):
    print(np.concatenate(dMats0[task_group].values).mean())

In [None]:
np.concatenate(dMats0[textTasks].values).shape

In [None]:
from scipy.stats import ttest_ind
# pooled distances per task group, compared pairwise with an independent two-sample t-test
text_dists = np.concatenate(dMats0[textTasks].values)
visual_dists = np.concatenate(dMats0[visualTasks].values)
marine_dists = np.concatenate(dMats0[marineTasks].values)

print(ttest_ind(text_dists, visual_dists))
print(ttest_ind(text_dists, marine_dists))
print(ttest_ind(marine_dists, visual_dists))

In [None]:
# long-format frame for plotting: one row per distance ("v") with its task-group label ("type")
txt = pd.DataFrame({"v": np.concatenate(dMats0[textTasks].values)}).assign(type="Textual")
vis = pd.DataFrame({"v": np.concatenate(dMats0[visualTasks].values)}).assign(type="Visual")
mar = pd.DataFrame({"v": np.concatenate(dMats0[marineTasks].values)}).assign(type="Marine Visual")
dfPlot = pd.concat([txt, vis, mar])

In [None]:
sns.boxenplot(y=dfPlot["v"],x=dfPlot["type"])

In [None]:
mar["v"].mean()

## Team-wise differences

In [None]:
# pairwise query distances for every (task, team) combination
per_task_team = {}
for t in jointDF.task.unique():
    task_rows = jointDF.loc[jointDF["task"] == t]
    for tm in jointDF.team.unique():
        team_rows = task_rows.loc[task_rows["team"] == tm]
        distMatrix, meanVal = ILD(team_rows, embedColnames)
        per_task_team[(t, tm)] = distMatrix.ravel()
        print(t, tm, meanVal)
dMats = pd.Series(per_task_team)

In [None]:

# mean distance per team, pooled over all tasks
for tm in jointDF.team.unique():
    keys = [(t, tm) for t in jointDF.task.unique()]
    team_dists = np.concatenate(dMats[keys].values)
    print(tm, team_dists.mean())

In [None]:
# long-format frame with one row per individual distance of every (task, team) pair
rows = []
for t in jointDF.task.unique():
    for tm in jointDF.team.unique():
        rows.extend((t, tm, d) for d in dMats[(t, tm)])
dfGraph2 = pd.DataFrame(rows, columns=["task", "team", "distance"])
dfGraph2.head()

In [None]:
# order the rows by task name before plotting
dfGraph2 = dfGraph2.sort_values("task")
dfGraph2
# dotted lines: one series per team (hue="team")
sns.pointplot(data=dfGraph2, x="task", y="distance", hue="team", markers=["o","v","s","p","*"], errorbar=None, linestyles="dotted")
# second call without hue: a single series over the distances of all teams
sns.pointplot(data=dfGraph2, x="task", y="distance", markers=["o","v","s","p","*"], errorbar=None)

# task names are long, so print the tick labels vertically
plt.xticks(rotation=90)

In [None]:
# one row per (task, team) pair holding the mean of its pairwise distances
rows = [
    (t, tm, dMats[(t, tm)].mean())
    for t in jointDF.task.unique()
    for tm in jointDF.team.unique()
]
dfGraph = pd.DataFrame(rows, columns=["task", "team", "mean distance"])
dfGraph.head()

In [None]:
dfGraph = dfGraph.sort_values("task")
dfGraph

In [None]:
# shorten the task names for the plot; "kis-v-m" has to be handled before "kis-v"
label_replacements = (
    ("vbs23-", ""),
    ("kis-v-m", "KIS-V-M"),
    ("kis-v", "KIS-V"),
    ("kis-t", "KIS-T"),
)
for old, new in label_replacements:
    dfGraph["task"] = dfGraph["task"].str.replace(old, new)

In [None]:
import itertools

In [None]:
hue_order = ['vibro', 'VISIONE', "vitrivr-VR", "CVHunter", 'Verge']
# all (task, team) keys of each task group, in task-major order
txtTasks, marTasks, visTasks = (
    list(itertools.product(task_group, hue_order))
    for task_group in (textTasks, marineTasks, visualTasks)
)
#val = np.concatenate(dMats[txtTasks].values).mean()

In [None]:
# Pool the distances of all (task, team) keys of the textual tasks.
res = []
for k in txtTasks:
    try:
        res.extend(dMats[k])
    except KeyError:
        # only a missing (task, team) key is tolerated; any other error should surface
        print("not found key",k)
txtMean = np.mean(res)
txtAll = res
print(len(res),txtMean)

In [None]:
# Pool the distances of all (task, team) keys of the marine tasks.
res = []
for k in marTasks:
    try:
        res.extend(dMats[k])
    except KeyError:
        # only a missing (task, team) key is tolerated; any other error should surface
        print("not found key",k)
marMean = np.mean(res)
marAll = res
print(len(res),marMean)

In [None]:
# Pool the distances of all (task, team) keys of the visual tasks.
res = []
for k in visTasks:
    try:
        res.extend(dMats[k])
    except KeyError:
        # only a missing (task, team) key is tolerated; any other error should surface
        print("not found key",k)
visMean = np.mean(res)
visAll = res
print(len(res),visMean)

In [None]:
print(ttest_ind(txtAll,marAll))
print(ttest_ind(txtAll,visAll))
print(ttest_ind(marAll,visAll))

In [None]:
# One row per team: (team name, line/marker colour, matplotlib marker symbol).
# Keeping both attributes in one table guarantees that the two lookup
# dictionaries below always cover the same teams in the same order.
_team_styles = [
    ('vibro',      '#1f77b4', 'D'),
    ('VISIONE',    '#ff7f0e', 'X'),
    ('VIREO',      '#2ca02c', 'o'),
    ('vitrivr-VR', '#d62728', '*'),
    ('CVHunter',   '#9467bd', 'd'),
    ('vitrivr',    '#8c564b', 'P'),
    ('Verge',      '#e377c2', 's'),
]

team2color = {name: color for name, color, _ in _team_styles}

team2marker = {name: marker for name, _, marker in _team_styles}

In [None]:
import matplotlib.patches as mpatches
# Figure: mean query distance per task and team, with reference lines; saved as distances.pdf
plt.subplots(1, 1, figsize=(8, 4))
dfGraph = dfGraph.sort_values("task")
hue_order = ['vibro', 'VISIONE', "vitrivr-VR", "CVHunter", 'Verge']
custom_palette = sns.color_palette("Set1", 5)

# thin vertical separators (presumably between the three task-type groups on the x axis)
for x_sep in (6.5, 12.5):
    plt.axvline(x=x_sep, color="grey", lw=0.5)

sns.scatterplot(data=dfGraph, x="task", y="mean distance", hue="team",
                hue_order=hue_order, style="team", palette=custom_palette)
plt.xticks(rotation=90)

# dashed black lines: mean distance per task type, each spanning a fraction of the
# axes width; only the first one carries the legend label
# (alternative source for these means: np.concatenate(dMats0[<group>].values).mean())
type_segments = [(txtMean, 0.01, 0.36), (marMean, 0.38, 0.66), (visMean, 0.68, 0.99)]
for pos, (mean_val, x_from, x_to) in enumerate(type_segments):
    legend_kw = {"label": "Mean distance per task type"} if pos == 0 else {}
    plt.axhline(y=mean_val, xmin=x_from, xmax=x_to, color="black", linestyle='--', **legend_kw)

# zero-width line that only contributes the dotted entry to the legend
plt.axhline(y=0, xmin=0.00, xmax=0.00, color="black", linestyle=':', label="Mean distance per team")

# dotted lines: mean distance of each team over all tasks, in the team's palette colour
for tm, team_color in zip(hue_order, custom_palette):
    keys = list(np.broadcast(jointDF.task.unique(), tm))
    val = np.concatenate(dMats[keys].values).mean()
    plt.axhline(y=val, xmin=0.01, xmax=0.99, color=team_color, linestyle=':')

plt.legend(bbox_to_anchor=(1.05, 1.0), loc='upper left')
plt.tight_layout()
plt.savefig("distances.pdf")

In [None]:
import matplotlib.patches as mpatches
# Figure: as above, but with fixed team colours/markers and per-task-type team means;
# saved as distances_v2.pdf
plt.subplots(1, 1, figsize=(8, 4))
dfGraph = dfGraph.sort_values("task")
hue_order = ['vibro', 'VISIONE', "vitrivr-VR", "CVHunter", 'Verge']
custom_palette = sns.color_palette("Set1", 5)

# thin vertical separators (presumably between the three task-type groups on the x axis)
for x_sep in (6.5, 12.5):
    plt.axvline(x=x_sep, color="grey", lw=0.5)

sns.scatterplot(data=dfGraph, x="task", y="mean distance", hue="team",
                hue_order=hue_order, style="team", palette=team2color, markers=team2marker)
plt.xticks(rotation=90)

# dashed black lines: mean distance per task type; only the first one is labelled
type_segments = [(txtMean, 0.01, 0.36), (marMean, 0.38, 0.66), (visMean, 0.68, 0.99)]
for pos, (mean_val, x_from, x_to) in enumerate(type_segments):
    legend_kw = {"label": "Mean distance per task type"} if pos == 0 else {}
    plt.axhline(y=mean_val, xmin=x_from, xmax=x_to, color="black", linestyle='--', **legend_kw)

# zero-width line that only contributes the dotted entry to the legend
plt.axhline(y=0, xmin=0.00, xmax=0.00, color="black", linestyle=':', label="Mean distance per team")

# dotted lines: mean distance of each team within each task group,
# drawn over that group's horizontal extent (axes fractions)
posStart = [0.01, 0.385, 0.685]
posEnd = [0.365, 0.665, 0.99]
for tasks, x_from, x_to in zip([textTasks, marineTasks, visualTasks], posStart, posEnd):
    print(tasks)
    for tm in hue_order:
        keys = list(np.broadcast(tasks, tm))
        val = np.concatenate(dMats[keys].values).mean()
        plt.axhline(y=val, xmin=x_from, xmax=x_to, color=team2color[tm], linestyle=':')

plt.legend(bbox_to_anchor=(1.05, 1.0), loc='upper left')
plt.tight_layout()
plt.savefig("distances_v2.pdf")

### Both vibro and VISIONE had more consistent per-task queries than CVHunter
- comparison with other teams omitted due to missing data

In [None]:
def _team_distances(team):
    """Concatenate the per-task distance arrays stored in dMats for one team."""
    keys = list(np.broadcast(jointDF.task.unique(), team))
    return np.concatenate(dMats[keys].values)


# pairwise t-tests between the pooled distances of vibro, VISIONE and CVHunter
print(ttest_ind(_team_distances("vibro"), _team_distances("CVHunter")))
print(ttest_ind(_team_distances("VISIONE"), _team_distances("CVHunter")))
print(ttest_ind(_team_distances("vibro"), _team_distances("VISIONE")))

## Comparison of both users per team and task

In [None]:
jointDF.groupby(["team","user"]).count()["task"]

- removing verge as no distinction between users is available
- several times, vitrivr-VR does not have any records from one of the users (candidate for removal)

In [None]:
jointDFNoVerge = jointDF.loc[jointDF.team != "Verge"]
jointDF.shape,jointDFNoVerge.shape

In [None]:
def ILD_pair(dataset1, dataset2, columns):
    """Cosine distances between every row of ``dataset1`` and every row of ``dataset2``.

    Only the given ``columns`` are used. Returns ``(distances, mean)`` where
    ``distances`` is the flattened cross-distance matrix and ``mean`` its
    average. If either input has no rows, an empty ``(0, 0)`` array and a
    mean of 0 are returned.
    """
    left = dataset1[columns].values
    right = dataset2[columns].values
    if len(left) == 0 or len(right) == 0:
        return (np.empty(shape=(0, 0)), 0)
    cross_dists = pairwise_distances(left, right, metric="cosine")
    return (cross_dists.flatten(), cross_dists.mean())

In [None]:
# distances between the queries of user 0 and user 1 of the same team, per (task, team)
between_users = {}
for t in jointDFNoVerge.task.unique():
    for tm in jointDFNoVerge.team.unique():
        team_rows = jointDFNoVerge.loc[(jointDFNoVerge["task"] == t) & (jointDFNoVerge["team"] == tm)]
        dt1 = team_rows.loc[team_rows["user"] == 0]
        dt2 = team_rows.loc[team_rows["user"] == 1]

        distMatrix, meanVal = ILD_pair(dt1, dt2, embedColnames)
        between_users[(t, tm)] = distMatrix.ravel()
        print(t, tm, meanVal)
# note: this rebinds dMats, which so far held the within-team distances
dMats = pd.Series(between_users)

In [None]:
# mean between-user distance per team (printed) and over all teams (cell output)
vals = []
for tm in jointDFNoVerge.team.unique():
    keys = [(t, tm) for t in jointDFNoVerge.task.unique()]
    team_dists = np.concatenate(dMats[keys].values)
    print(tm, team_dists.mean())
    vals.extend(team_dists)
np.mean(vals)

- not so much different from the results of the overall distances (just a bit higher values)

In [None]:
# distances among the queries of a single user, per (task, team, user)
within_user = {}
for t in jointDFNoVerge.task.unique():
    for tm in jointDFNoVerge.team.unique():
        team_rows = jointDFNoVerge.loc[(jointDFNoVerge["task"] == t) & (jointDFNoVerge["team"] == tm)]
        for u in jointDFNoVerge.user.unique():
            user_rows = team_rows.loc[team_rows["user"] == u]
            distMatrix, meanVal = ILD(user_rows, embedColnames)
            within_user[(t, tm, u)] = distMatrix.ravel()
            print(t, tm, u, meanVal)
dMats2 = pd.Series(within_user)

In [None]:
# mean within-user distance per team (printed) and over all teams (cell output)
vals = []
nv_tasks = jointDFNoVerge.task.unique()
for tm in jointDFNoVerge.team.unique():
    # keys of this team: every task for user 0 first, then every task for user 1
    keys = [(t, tm, u) for u in (0, 1) for t in nv_tasks]
    team_dists = np.concatenate(dMats2[keys].values)
    print(tm, team_dists.mean())
    vals.extend(team_dists)
np.mean(vals)

- quite a few NaNs due to having only a single query per user

In [None]:
# distances between the queries of two different teams on the same task,
# keyed by (task, team, other team)
cross_team = {}
nv_teams = jointDFNoVerge.team.unique()
for t in jointDFNoVerge.task.unique():
    task_rows = jointDFNoVerge.loc[jointDFNoVerge["task"] == t]
    for tm in nv_teams:
        dt1 = task_rows.loc[task_rows["team"] == tm]

        for tm2 in nv_teams:
            if tm2 == tm:
                continue
            dt2_1 = task_rows.loc[task_rows["team"] == tm2]
            distMatrix, meanVal = ILD_pair(dt1, dt2_1, embedColnames)
            cross_team[(t, tm, tm2)] = distMatrix.ravel()
            print(t, tm, tm2, meanVal)
dMats3 = pd.Series(cross_team)

In [None]:
# Mean cross-team distance for every ordered team pair, per team, and overall.
#
# dMats3 is indexed by (task, team, other team) only. The previous version of
# this cell appended a user id (0/1) to each key, producing 4-tuples that do
# not match that 3-level index: depending on the pandas version the lookup is
# rejected or the extra element is ignored and every entry is read twice.
valsA = []
nv_tasks = jointDFNoVerge.task.unique()
nv_teams = jointDFNoVerge.team.unique()
for tm in nv_teams:
    vals = []
    for tm2 in nv_teams:
        if tm == tm2:
            continue
        keys = [(t, tm, tm2) for t in nv_tasks]
        pair_dists = np.concatenate(dMats3[keys].values)
        print(tm, tm2, pair_dists.mean())

        vals.extend(pair_dists)
        valsA.extend(pair_dists)
    # mean distance of this team's queries to the queries of all other teams
    print(tm,np.mean(vals))
    print()

# overall mean over all ordered team pairs
print(np.mean(valsA))

## Much smaller differences in within-user query distance as compared to between users (in the same team)

In [None]:
# per team: t-test of between-user distances (dMats) against within-user distances (dMats2)
nv_tasks = jointDFNoVerge.task.unique()
for tm in jointDFNoVerge.team.unique():
    between_keys = [(t, tm) for t in nv_tasks]
    # every task for user 0 first, then every task for user 1
    within_keys = [(t, tm, u) for u in (0, 1) for t in nv_tasks]

    between_dists = np.concatenate(dMats[between_keys].values)
    within_dists = np.concatenate(dMats2[within_keys].values)
    print(tm, ttest_ind(between_dists, within_dists))

# Differences in sequences of query reformulations
- only users & tasks, where at least **4** text queries were made

In [None]:
#TODO: only for textual reformulations
sortedData = textData.sort_values(["task","team","user","timestamp"])

# Number the queries of every (task, team, user) run 1, 2, 3, ... in timestamp order.
# The rows are sorted, so a change of the (task, team, user) triple starts a new run.
ranks = []
lastRank = 0
lastRecord = (0,0,0)
for record in zip(sortedData["task"], sortedData["team"], sortedData["user"]):
    lastRank = lastRank + 1 if record == lastRecord else 1
    lastRecord = record
    ranks.append(lastRank)

sortedData["QueryRank"] = ranks

#record how long was the interaction for each task and user
querySeriesLen = (
    sortedData.groupby(["task","team","user"])[["QueryRank"]]
    .max()
    .rename(columns={"QueryRank": "MaxQueryRank"})
)
sortedData = sortedData.join(querySeriesLen, on=["task","team","user"])
# 0 for the last query of a run, -1 for the one before it, ...
sortedData["DiffFromMaxQueryRank"] = sortedData.QueryRank - sortedData.MaxQueryRank
sortedData.head()

In [None]:
# first 4 queries of every run that consists of at least 4 text queries
in_window = (sortedData.QueryRank <= 4) & (sortedData.MaxQueryRank >= 4)
#cannot be done for verge as only one user is present
dt = sortedData.loc[in_window & (sortedData.team != "Verge")]
dt.columns,dt.shape

In [None]:
# attach the embedding dimensions as separate columns to the selected text queries
text_rows = dt.loc[dt.QT == "Text"]
stdf = np.stack(text_rows['joint_text_embedding'].values)
dfEmbeds = pd.DataFrame(stdf, columns=embedColnames, index=text_rows.index)
seqDFEmbeds = pd.concat([text_rows, dfEmbeds], axis=1)
seqDFEmbeds.shape

In [None]:
def ILD_noRemove(dataset, columns):
    """Full cosine distance matrix between the rows of ``dataset[columns]``.

    Unlike ``ILD``, the square matrix is returned as is, i.e. including the
    diagonal and both triangles. Returns ``(matrix, mean)``; for an input
    without rows the result is an empty ``(0, 0)`` array and a mean of 0.
    """
    vectors = dataset[columns].values
    if len(vectors) == 0:
        return (np.empty(shape=(0, 0)), 0)
    full_matrix = pairwise_distances(vectors, metric="cosine")
    return (full_matrix, full_matrix.mean())


In [None]:
# full distance matrix between the selected queries of every (task, team, user) run
run_matrices = {}
for t in seqDFEmbeds.task.unique():
    for tm in seqDFEmbeds.team.unique():
        team_mask = (seqDFEmbeds["task"] == t) & (seqDFEmbeds["team"] == tm)
        for u in seqDFEmbeds.user.unique():
            # rows are ordered by QueryRank so that matrix index i is the i-th query of the window
            run = seqDFEmbeds.loc[team_mask & (seqDFEmbeds["user"] == u)].sort_values("QueryRank")
            distMatrix, meanVal = ILD_noRemove(run, embedColnames)
            if len(distMatrix) > 0:
                run_matrices[(t, tm, u)] = distMatrix
            print(t, tm, u, meanVal)
dMats3 = pd.Series(run_matrices)
# stack the per-run matrices along a new first axis
sequentialResultsArray = np.stack(dMats3.values)

In [None]:
sequentialResultsArray[:,0,:].mean(axis=0)#distances to first query

In [None]:
#distances to subsequent queries
print(
    sequentialResultsArray[:,0,1].mean(),
    sequentialResultsArray[:,1,2].mean(),
    sequentialResultsArray[:,2,3].mean()
)

In [None]:
plt.boxplot([sequentialResultsArray[:,0,1],sequentialResultsArray[:,1,2],sequentialResultsArray[:,2,3]])

## While the distance to the initial query rises over time (unsurprisingly), the step size between consecutive queries remains roughly the same and rather small

In [None]:
from Levenshtein import ratio
def LevenshteinNormDist(dataset, txtCol):
    """Pairwise ``1 - ratio(a, b)`` between all texts in ``dataset[txtCol]``.

    Returns ``(matrix, mean)`` where ``matrix[i, j]`` is one minus the
    Levenshtein ``ratio`` of the i-th and j-th text and ``mean`` is the
    average over the whole square matrix. For an input without rows the
    result is an empty ``(0, 0)`` array and a mean of 0.
    """
    texts = dataset[txtCol].values
    if len(texts) == 0:
        return (np.empty(shape=(0, 0)), 0)
    distMatrix = np.array(
        [[1 - ratio(first, second) for second in texts] for first in texts],
        dtype=float,
    )

    return (distMatrix, distMatrix.mean())


In [None]:
# normalised Levenshtein distance matrix between the selected queries of every run
run_matrices = {}
for t in seqDFEmbeds.task.unique():
    for tm in seqDFEmbeds.team.unique():
        team_mask = (seqDFEmbeds["task"] == t) & (seqDFEmbeds["team"] == tm)
        for u in seqDFEmbeds.user.unique():
            # rows are ordered by QueryRank so that matrix index i is the i-th query of the window
            run = seqDFEmbeds.loc[team_mask & (seqDFEmbeds["user"] == u)].sort_values("QueryRank")
            distMatrix, meanVal = LevenshteinNormDist(run, "value")
            if len(distMatrix) > 0:
                run_matrices[(t, tm, u)] = distMatrix
            print(t, tm, u, meanVal)
dMats4 = pd.Series(run_matrices)
# stack the per-run matrices along a new first axis
sequentialResultsLevensteinArray = np.stack(dMats4.values)

In [None]:
sequentialResultsLevensteinArray[:,0,:].mean(axis=0)#distances to first query

In [None]:
#distances to subsequent queries
print(
    sequentialResultsLevensteinArray[:,0,1].mean(),
    sequentialResultsLevensteinArray[:,1,2].mean(),
    sequentialResultsLevensteinArray[:,2,3].mean()
)

In [None]:
ttest_ind(sequentialResultsLevensteinArray[:,0,1], sequentialResultsLevensteinArray[:,1,2])

## Results of the Levenshtein distance support those of the embedding distance. It seems that subsequent changes are a bit smaller for later reformulations, but the difference is not statistically significant.

# Differences in sequences of query reformulations
- only users & tasks, where at least **3** text queries were made

In [None]:
# first 3 queries of every run that consists of at least 3 text queries
in_window = (sortedData.QueryRank <= 3) & (sortedData.MaxQueryRank >= 3)
#cannot be done for verge as only one user is present
dt = sortedData.loc[in_window & (sortedData.team != "Verge")]

# attach the embedding dimensions as separate columns
text_rows = dt.loc[dt.QT == "Text"]
stdf = np.stack(text_rows['joint_text_embedding'].values)
dfEmbeds = pd.DataFrame(stdf, columns=embedColnames, index=text_rows.index)
seqDFEmbeds = pd.concat([text_rows, dfEmbeds], axis=1)
seqDFEmbeds.shape

In [None]:
# full distance matrix between the selected queries of every (task, team, user) run
run_matrices = {}
for t in seqDFEmbeds.task.unique():
    for tm in seqDFEmbeds.team.unique():
        team_mask = (seqDFEmbeds["task"] == t) & (seqDFEmbeds["team"] == tm)
        for u in seqDFEmbeds.user.unique():
            # rows are ordered by QueryRank so that matrix index i is the i-th query of the window
            run = seqDFEmbeds.loc[team_mask & (seqDFEmbeds["user"] == u)].sort_values("QueryRank")
            distMatrix, meanVal = ILD_noRemove(run, embedColnames)
            if len(distMatrix) > 0:
                run_matrices[(t, tm, u)] = distMatrix
            print(t, tm, u, meanVal)
dMats3 = pd.Series(run_matrices)
# stack the per-run matrices along a new first axis
sequentialResultsArray = np.stack(dMats3.values)

In [None]:
print(sequentialResultsArray[:,0,:].mean(axis=0)) #distances to first query

print(#distances to subsequent queries
    sequentialResultsArray[:,0,1].mean(),
    sequentialResultsArray[:,1,2].mean()
)

In [None]:
ttest_ind(sequentialResultsArray[:,0,1], sequentialResultsArray[:,1,2])

In [None]:
# normalised Levenshtein distance matrix between the selected queries of every run
run_matrices = {}
for t in seqDFEmbeds.task.unique():
    for tm in seqDFEmbeds.team.unique():
        team_mask = (seqDFEmbeds["task"] == t) & (seqDFEmbeds["team"] == tm)
        for u in seqDFEmbeds.user.unique():
            # rows are ordered by QueryRank so that matrix index i is the i-th query of the window
            run = seqDFEmbeds.loc[team_mask & (seqDFEmbeds["user"] == u)].sort_values("QueryRank")
            distMatrix, meanVal = LevenshteinNormDist(run, "value")
            if len(distMatrix) > 0:
                run_matrices[(t, tm, u)] = distMatrix
            print(t, tm, u, meanVal)
dMats4 = pd.Series(run_matrices)
# stack the per-run matrices along a new first axis
sequentialResultsLevensteinArray = np.stack(dMats4.values)

In [None]:
print(sequentialResultsLevensteinArray[:,0,:].mean(axis=0)) #distances to first query

print(#distances to subsequent queries
    sequentialResultsLevensteinArray[:,0,1].mean(),
    sequentialResultsLevensteinArray[:,1,2].mean()
)

In [None]:
ttest_ind(sequentialResultsLevensteinArray[:,0,1], sequentialResultsLevensteinArray[:,1,2])

# Differences in sequences of query reformulations
- only users & tasks, where at least **5** text queries were made

In [None]:
# first 5 queries of every run that consists of at least 5 text queries
in_window = (sortedData.QueryRank <= 5) & (sortedData.MaxQueryRank >= 5)
#cannot be done for verge as only one user is present
dt = sortedData.loc[in_window & (sortedData.team != "Verge")]

# attach the embedding dimensions as separate columns
text_rows = dt.loc[dt.QT == "Text"]
stdf = np.stack(text_rows['joint_text_embedding'].values)
dfEmbeds = pd.DataFrame(stdf, columns=embedColnames, index=text_rows.index)
seqDFEmbeds = pd.concat([text_rows, dfEmbeds], axis=1)
seqDFEmbeds.shape

In [None]:
# full distance matrix between the selected queries of every (task, team, user) run
run_matrices = {}
for t in seqDFEmbeds.task.unique():
    for tm in seqDFEmbeds.team.unique():
        team_mask = (seqDFEmbeds["task"] == t) & (seqDFEmbeds["team"] == tm)
        for u in seqDFEmbeds.user.unique():
            # rows are ordered by QueryRank so that matrix index i is the i-th query of the window
            run = seqDFEmbeds.loc[team_mask & (seqDFEmbeds["user"] == u)].sort_values("QueryRank")
            distMatrix, meanVal = ILD_noRemove(run, embedColnames)
            if len(distMatrix) > 0:
                run_matrices[(t, tm, u)] = distMatrix
            print(t, tm, u, meanVal)
dMats3 = pd.Series(run_matrices)
# stack the per-run matrices along a new first axis
sequentialResultsArray = np.stack(dMats3.values)

In [None]:
print(sequentialResultsArray[:,0,:].mean(axis=0)) #distances to first query

print(#distances to subsequent queries
    sequentialResultsArray[:,0,1].mean(),
    sequentialResultsArray[:,1,2].mean(),
    sequentialResultsArray[:,2,3].mean(),
    sequentialResultsArray[:,3,4].mean(),
    #sequentialResultsArray[:,4,5].mean()
)

In [None]:
# normalised Levenshtein distance matrix between the selected queries of every run
run_matrices = {}
for t in seqDFEmbeds.task.unique():
    for tm in seqDFEmbeds.team.unique():
        team_mask = (seqDFEmbeds["task"] == t) & (seqDFEmbeds["team"] == tm)
        for u in seqDFEmbeds.user.unique():
            # rows are ordered by QueryRank so that matrix index i is the i-th query of the window
            run = seqDFEmbeds.loc[team_mask & (seqDFEmbeds["user"] == u)].sort_values("QueryRank")
            distMatrix, meanVal = LevenshteinNormDist(run, "value")
            if len(distMatrix) > 0:
                run_matrices[(t, tm, u)] = distMatrix
            print(t, tm, u, meanVal)
dMats4 = pd.Series(run_matrices)
# stack the per-run matrices along a new first axis
sequentialResultsLevensteinArray = np.stack(dMats4.values)

In [None]:
print(sequentialResultsLevensteinArray[:,0,:].mean(axis=0)) #distances to first query

print(#distances to subsequent queries
    sequentialResultsLevensteinArray[:,0,1].mean(),
    sequentialResultsLevensteinArray[:,1,2].mean(),
    sequentialResultsLevensteinArray[:,2,3].mean(),
    sequentialResultsLevensteinArray[:,3,4].mean(),
    #sequentialResultsLevensteinArray[:,4,5].mean()
    
)

# Differences in sequences of query reformulations
- only users & tasks, where at least **4** text queries were made
- selecting last 4 queries

In [None]:
# last 4 queries (rank offsets -3..0 from the final one) of every run with at least 4 text queries
in_window = (sortedData.DiffFromMaxQueryRank >= -3) & (sortedData.MaxQueryRank >= 4)
#cannot be done for verge as only one user is present
dt = sortedData.loc[in_window & (sortedData.team != "Verge")]

# attach the embedding dimensions as separate columns
text_rows = dt.loc[dt.QT == "Text"]
stdf = np.stack(text_rows['joint_text_embedding'].values)
dfEmbeds = pd.DataFrame(stdf, columns=embedColnames, index=text_rows.index)
seqDFEmbeds = pd.concat([text_rows, dfEmbeds], axis=1)
seqDFEmbeds.shape

In [None]:
# full distance matrix between the selected queries of every (task, team, user) run
run_matrices = {}
for t in seqDFEmbeds.task.unique():
    for tm in seqDFEmbeds.team.unique():
        team_mask = (seqDFEmbeds["task"] == t) & (seqDFEmbeds["team"] == tm)
        for u in seqDFEmbeds.user.unique():
            # rows are ordered by QueryRank so that matrix index i is the i-th query of the window
            run = seqDFEmbeds.loc[team_mask & (seqDFEmbeds["user"] == u)].sort_values("QueryRank")
            distMatrix, meanVal = ILD_noRemove(run, embedColnames)
            if len(distMatrix) > 0:
                run_matrices[(t, tm, u)] = distMatrix
            print(t, tm, u, meanVal)
dMats3 = pd.Series(run_matrices)
# stack the per-run matrices along a new first axis
sequentialResultsArray = np.stack(dMats3.values)

In [None]:
# index 3 is the last query of the last-4 window, so this prints the mean
# distance of every query in the window to the LAST query (not the first)
print(sequentialResultsArray[:,3,:].mean(axis=0)) #distances to last query

print(#distances to subsequent queries
    sequentialResultsArray[:,0,1].mean(),
    sequentialResultsArray[:,1,2].mean(),
    sequentialResultsArray[:,2,3].mean()
)

In [None]:
ttest_ind(sequentialResultsArray[:,0,1], sequentialResultsArray[:,1,2])

In [None]:
plt.boxplot([sequentialResultsArray[:,0,1],sequentialResultsArray[:,1,2],sequentialResultsArray[:,2,3]])

In [None]:
# normalised Levenshtein distance matrix between the selected queries of every run
run_matrices = {}
for t in seqDFEmbeds.task.unique():
    for tm in seqDFEmbeds.team.unique():
        team_mask = (seqDFEmbeds["task"] == t) & (seqDFEmbeds["team"] == tm)
        for u in seqDFEmbeds.user.unique():
            # rows are ordered by QueryRank so that matrix index i is the i-th query of the window
            run = seqDFEmbeds.loc[team_mask & (seqDFEmbeds["user"] == u)].sort_values("QueryRank")
            distMatrix, meanVal = LevenshteinNormDist(run, "value")
            if len(distMatrix) > 0:
                run_matrices[(t, tm, u)] = distMatrix
            print(t, tm, u, meanVal)
dMats4 = pd.Series(run_matrices)
# stack the per-run matrices along a new first axis
sequentialResultsLevensteinArray = np.stack(dMats4.values)

In [None]:
# index 0 is the first query of the last-4 window (i.e. the 4th-from-last query
# of the run), not the first query of the whole run
# NOTE(review): the embedding counterpart of this cell uses index 3 (last query) — confirm which is intended
print(sequentialResultsLevensteinArray[:,0,:].mean(axis=0)) #distances to first query

# NOTE(review): only two of the three consecutive steps are printed here; [:,2,3] is not shown
print(#distances to subsequent queries
    sequentialResultsLevensteinArray[:,0,1].mean(),
    sequentialResultsLevensteinArray[:,1,2].mean()
)

In [None]:
ttest_ind(sequentialResultsLevensteinArray[:,0,1], sequentialResultsLevensteinArray[:,1,2])