# Preprocess log data and create a pickle file for further analysis

In [None]:
import sys
# If the notebook is started from inside the "notebooks" folder, move up to the
# parent directory (presumably so that the project-local imports and the config
# file below resolve relative to the repository root -- TODO confirm).
import os
# os.path.basename handles the platform's path separator, unlike split('/').
if os.path.basename(os.getcwd()) == 'notebooks':
    os.chdir('..')
import sys

# imports
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import math
from notebooks.utils import compute_user_penalty, get_team_values_df
from common.load import load_competition_data, process_team_logs

# Do not truncate column contents when dataframes are displayed.
pd.set_option('display.max_colwidth', None)
# NOTE(review): neither constant is referenced in the visible part of this
# notebook; presumably a rank threshold and a placeholder rank for items whose
# rank is unknown -- TODO confirm against the downstream analysis code.
unknownRankLimit = 1000
unknownRankValue = 2000

# Import common data

In [None]:
config = 'config_vbs2023.yaml'

# Load competition data from DRES files plus auxiliary data (FPSs, sequences).
comp_data = load_competition_data(config)

# Compute the team logs as dataframes (or load them if they already exist).
logs = process_team_logs(config, comp_data, force=False, remove_none=False)

# `teams` is the very list stored in the config, so dropping VideoCLIP here also
# changes comp_data['config']['teams'].  The membership check keeps the cell
# from raising ValueError when the entry is absent (or was already removed).
teams = comp_data['config']['teams']
if 'VideoCLIP' in teams:
    teams.remove('VideoCLIP')

# FIXME: badly patch the name of HTW into vibro
teams_adj = ['vibro' if t == 'HTW' else t for t in teams]

# Valid teams, in the order used for sorting the final table.
team_order = ['vibro', 'VISIONE', 'vitrivr-VR', 'CVHunter',  'Verge']
# Full list: ['vibro', 'VISIONE', 'VIREO', 'vitrivr-VR', 'CVHunter', 'vitrivr', 'Verge']

# Merge team data into one dataframe

In [None]:
# Display the team names whose logs are merged in the next cell.
teams

In [None]:
# One events dataframe per team, each with its index turned into a column.
dfs = [logs[team].get_events_dataframe().reset_index() for team in teams]

# Stack all teams into one table; keep an untouched copy so that later cells
# can restart from the raw merged data.
dataset = pd.concat(dfs, axis=0).reset_index()
dataset_original = dataset.copy()
dataset.head()

# Data checks
Displaying special log information that concerns only a few teams.

### all category and type combination per team

In [None]:
# Count the logged events per (team, category, type) combination and export
# the counts to an Excel sheet.
combination_counts = (
    dataset[["team","category","type"]]
    .groupby(["team","category","type"])
    .size()
    .reset_index(name='count')
)
combination_counts.to_excel('output_file.xlsx', index=False)

In [None]:
# Print every (team, category, type) combination found in the logs, one per line.
combination_groups = dataset[["team","category","type"]].groupby(["team","category","type"]).groups
print('\n'.join(str(combination) for combination in combination_groups))

### only one sketch query

In [None]:
# All events logged in the SKETCH category.
dataset[dataset["category"] == "SKETCH"]

### only one filter query

In [None]:
# All events logged in the FILTER category.
dataset[dataset["category"] == "FILTER"]

### only one visualtextcoembedding query

In [None]:
# All events whose category is "visualtextcoembedding".
dataset[dataset["category"] == "visualtextcoembedding"]

### vitrivr's value column is empty

In [None]:
# All events logged by team vitrivr.
dataset[dataset["team"] == "vitrivr"]

### VIREO's value column is empty

In [None]:
# All events logged by team VIREO.
dataset[dataset["team"] == "VIREO"]

### vitrivr-VR types and category columns are swapped

In [None]:
# First events logged by team vitrivr-VR.
dataset[dataset["team"] == "vitrivr-VR"].head()

### verge concept type query has no values

In [None]:
# First Verge events whose type is "concept".
is_verge_concept = (dataset["team"] == "Verge") & (dataset["type"] == "concept")
dataset[is_verge_concept].head()

### HTW temporal category has only text queries

In [None]:
# First events logged in the TEMPORAL category.
dataset[dataset["category"] == "TEMPORAL"].head()

### CVHunter has a "CLIP: " or "Temporal CLIP: " prefix in the value column for text queries

In [None]:
# First events logged by team CVHunter.
dataset[dataset["team"] == "CVHunter"].head()

### CVHunter has some browsing category entries but they do not change the ranking order

In [None]:
# CVHunter BROWSING events: print the distinct types, then show the rows.
cvhunter_browsing = dataset[(dataset["team"] == "CVHunter") & (dataset["category"] == "BROWSING")]
print(cvhunter_browsing["type"].unique())
cvhunter_browsing

### CVHunter has submit entries which are not important for the ranking or text analysis

In [None]:
# First CVHunter events in the SUBMIT category.
is_cvhunter_submit = (dataset["team"] == "CVHunter") & (dataset["category"] == "SUBMIT")
dataset[is_cvhunter_submit].head()

### CVHunter has triggered every text query twice

In [None]:
# First CVHunter events in the TEXT category.
is_cvhunter_text = (dataset["team"] == "CVHunter") & (dataset["category"] == "TEXT")
dataset[is_cvhunter_text].head()

### VISIONE has two LocalizedObjectAndColor queries

In [None]:
# VISIONE rows whose type mentions "LocalizedObjectAndColor"; the next cell
# reuses `object_and_color_queries` to look up their timestamps.
from_visione = dataset["team"] == "VISIONE"
has_object_color_type = dataset["type"].str.contains('LocalizedObjectAndColor')
object_and_color_queries = dataset.loc[from_visione & has_object_color_type]
object_and_color_queries.head()

In [None]:
# Show every row that shares a timestamp with one of the object-and-color queries.
object_color_timestamps = object_and_color_queries["timestamp"]
dataset[dataset["timestamp"].isin(object_color_timestamps)].head()
# Observation: the only object-and-color query of VISIONE is a combined query with a textual query.

#### Counting temporal queries


In [None]:
# --- VISIONE ---
# VISIONE logs one row per query component; rows sharing a timestamp belong to
# the same query, so the number of distinct timestamps is the number of queries.
visione_logs = dataset.loc[dataset["team"] == "VISIONE"]
same_timestamp_counts = visione_logs.groupby("timestamp").size().reset_index(name="count")
visione_query_counts = len(same_timestamp_counts)

# Timestamps with more than one row are combined queries (temporal or not).
combined_query_timestamps = same_timestamp_counts.loc[same_timestamp_counts["count"] > 1]

# Temporal queries: combined queries whose value mentions "Temporal".  Both
# masks are built from `visione_logs` so they share its index, and na=False
# treats rows without a value as non-matching.
is_combined = visione_logs["timestamp"].isin(combined_query_timestamps["timestamp"])
mentions_temporal = visione_logs["value"].str.contains('Temporal', na=False)
temporal_query_logs = visione_logs.loc[is_combined & mentions_temporal]
temporal_query_counts = temporal_query_logs["timestamp"].nunique()

# Textual queries (temporal or not): distinct timestamps with a jointEmbedding type.
# NOTE(review): the match is case-sensitive while the pre-processing cell also
# looks for 'JointEmbedding' -- confirm which spelling the raw logs use.
is_visione_text = visione_logs["type"].str.contains('jointEmbedding', na=False)
textual_query_count = visione_logs.loc[is_visione_text, "timestamp"].nunique()

print(f"VISIONE has {visione_query_counts} queries ({textual_query_count} textual queries):  {temporal_query_counts} TEMPORAL Queries  and {visione_query_counts-temporal_query_counts} non-temporal queries")

# --- vibro (still logged under the name HTW at this point) ---
vibro_logs = dataset.loc[dataset["team"] == "HTW"]

# Textual queries excluding temporal ones: distinct timestamps with a jointEmbedding type.
is_vibro_text = vibro_logs["type"].str.contains('jointEmbedding', na=False)
vibro_textual_query_count = vibro_logs.loc[is_vibro_text, "timestamp"].nunique()

# Temporal queries: distinct timestamps whose category mentions TEMPORAL.
is_vibro_temporal = vibro_logs["category"].str.contains('TEMPORAL', na=False)
temporal_query_count = vibro_logs.loc[is_vibro_temporal, "timestamp"].nunique()

print(f"VIBRO has {temporal_query_count+vibro_textual_query_count} textual queries: {temporal_query_count} temporal text queries and  {vibro_textual_query_count} other textual queries")




### NOTES
- vitrivr-vr has a category called visualtextcoembedding, but only text information is present
- vitrivr-vr category and type columns are swapped
- vitrivr-vr additional temporal information '>'
- vitrivr-vr has submissions for only 16 of 19 tasks
- vitrivr value column is always empty
- vitrivr has a single sketch row
- vireo does not have category or type values 
- verge type=concept has empty or NaN value column
- verge has a single filter row
- verge can use temporal search only with concepts; there is no way to identify in the log whether temporal search was used
- HTW category=temporal has only text->text queries
- HTW should be renamed to vibro
- CVHunter has some prefixes in the query which were not typed by the user (CLIP: / Temporal CLIP:); they are removed in pre-processing
- CVHunter SUBMIT category always has a video rank of INF
- CVHunter has several BROWSING types in the log, all have a video rank of INF
- CVHunter sends every query twice (with a 2-3s delay)
- VISIONE sends two queries if temporal, defined by "value":"...Temporal_query..." and the same timestamp 
- VISIONE additional information in the value column after the first '>'
- VISIONE in the value column "textualMode=all" means an ensemble of CLIP, Aladin, VideoClip

# Filtering and pre-processing
- remove category SUBMIT 
- remove VISIONE and HTW suffixes from "value"
- remove CVHunter prefixes from "value"
- remove some of CVHunter browsing types (those that do not alter the ranking)
- swap content of column category and type for vitrivr-vr 
- remove vitrivr and vireo since they do not have any usable data

#### remove vitrivr and VIREO  and remove all query duplicates 

In [None]:
# Restart from the untouched merged logs so this cell can be re-run.
dataset = dataset_original.copy()

# Remove vitrivr and VIREO for now because they carry no text query information.
dataset = dataset.loc[~dataset["team"].isin(["vitrivr", "VIREO"])]

# Columns that together identify a repeated query.  Defined once so that the
# preview of the dropped rows and the actual removal cannot drift apart.
duplicate_key_columns = [
    "task", "team", "user",
    "rank_video", "rank_shot_margin_0", "rank_shot_margin_5",
    "category", "type", "value",
]

# Sort first so that keep='first' retains the row with the smallest 'index' value.
dataset = dataset.sort_values(by='index')

# Preview what is about to be removed.  Printed explicitly because a bare
# expression in the middle of a cell is not displayed.
dropped_rows = dataset[dataset.duplicated(subset=duplicate_key_columns, keep='first')]
print(dropped_rows["team"].value_counts())
print(dropped_rows[["task", "team", "user", "value", "rank_video"]])

# Remove all duplicates.
# (An earlier, disabled variant de-duplicated only CVHunter TEXT rows and
# ignored the rank columns.)
dataset = dataset.drop_duplicates(subset=duplicate_key_columns, keep='first')
dataset.shape

#### Add temporal column and apply the team-specific pre-processing

In [None]:

# Flag marking temporal queries.  It starts as False everywhere and is switched
# on team by team below, because every team encodes temporal search differently.
# All text matches use na=False so rows without a value/type never match.
dataset["is_temporal_query"] = False


# --- Verge ---
# Only Verge has concept queries, but they come without values.  Instead of
# removing them, relabel their category as CONCEPT so that they are counted in
# the "other-than-text" queries.
dataset.loc[dataset["type"] == "concept", "category"] = "CONCEPT"
# There is no way to identify temporal search in the Verge log (it can only be
# used with concept queries), so no Verge row is flagged as temporal.


# --- CVHunter ---
# Remove the SUBMIT and BROWSING categories.
is_cvhunter_noise = (dataset["team"] == "CVHunter") & dataset["category"].isin(["SUBMIT", "BROWSING"])
# .copy() detaches the filtered frame so the assignments below write to it directly.
dataset = dataset.loc[~is_cvhunter_noise].copy()

is_cvhunter_text = (dataset["team"] == "CVHunter") & (dataset["category"] == "TEXT")

# Mark the temporal queries before the prefix that identifies them is stripped.
mentions_temporal = dataset["value"].str.contains("Temporal", na=False, regex=False)
dataset.loc[is_cvhunter_text & mentions_temporal, "is_temporal_query"] = True

# Remove the text prefixes of CVHunter queries.  The longer prefix goes first,
# otherwise "Temporal " would be left behind.  The same mask is used on both
# sides so the rows read are exactly the rows written.
for prefix in ("Temporal CLIP: ", "CLIP: "):
    dataset.loc[is_cvhunter_text, "value"] = (
        dataset.loc[is_cvhunter_text, "value"].str.replace(prefix, "", regex=False)
    )


# --- vibro ---
# Rename HTW to vibro.
dataset.loc[dataset["team"] == "HTW", "team"] = "vibro"

# Mark the temporal queries.
dataset.loc[(dataset["team"] == "vibro") & (dataset["category"] == "TEMPORAL"), "is_temporal_query"] = True


# --- vitrivr-VR ---
# Swap the contents of the type and category columns.
is_vitrivr_vr = dataset["team"] == "vitrivr-VR"
dataset.loc[is_vitrivr_vr, ["type", "category"]] = dataset.loc[is_vitrivr_vr, ["category", "type"]].values

# Mark the temporal queries (category is read after the swap above).
# TODO: how to distinguish vitrivr-VR temporal queries from non-temporal queries?
has_separator = dataset["value"].str.contains(">", na=False, regex=False)
dataset.loc[is_vitrivr_vr & (dataset["category"] == "TEXT") & has_separator, "is_temporal_query"] = True


# --- VISIONE ---
is_visione = dataset["team"] == "VISIONE"

# Mark the temporal queries.
has_temporal_marker = dataset["value"].str.contains("Temporal_query", na=False, regex=False)
dataset.loc[is_visione & has_temporal_marker, "is_temporal_query"] = True

# Remove the suffix of VISIONE queries: keep only the text before the first '>'.
# This covers every VISIONE row, so a separate pass over the TEXT rows is not needed.
dataset.loc[is_visione, "value"] = dataset.loc[is_visione, "value"].str.split('>').str[0]

# Normalise the joint-embedding type name; both the 'JointEmbedding' and the
# 'jointEmbedding' spelling are matched.
is_joint_embedding_type = dataset["type"].str.contains("[Jj]ointEmbedding", na=False)
dataset.loc[is_visione & (dataset["category"] == "TEXT") & is_joint_embedding_type, "type"] = "jointEmbedding"
dataset.shape



####  merge VISIONE's two row temporal queries

In [None]:
# VISIONE logs a temporal query as several rows that share task, team, user,
# timestamp, type and category.  Collapse each such group into a single row
# whose value is the row values joined with ' > '.
merge_keys = ["task", "team", "user", "timestamp", "type", "category"]

is_visione = dataset["team"] == "VISIONE"
visione = dataset.loc[is_visione]

# Joined value per group ...
joined_values = visione.groupby(merge_keys).agg({'value': ' > '.join}).reset_index()

# ... attached to every row of the group, after which only the first row of
# each group is kept.
dataset_visione = (
    visione.drop(columns=['value'])
    .merge(joined_values, on=merge_keys, how='left')
    .drop_duplicates(subset=merge_keys)
)

# The merged frame carries a fresh 0..n index that would collide with the
# labels of the other teams' rows, so rebuild a unique index while stacking.
dataset = pd.concat([dataset.loc[~is_visione], dataset_visione], ignore_index=True)


print(dataset.shape)
print('***Temporal****')
print(' \n'.join(map(str, dataset.loc[dataset.is_temporal_query][["team","category","type"]].groupby(["team","category","type"]).groups.keys())))
print('*** NON Temporal****')
print('\n'.join(map(str, dataset.loc[~ dataset.is_temporal_query][["team","category","type"]].groupby(["team","category","type"]).groups.keys())))


### add a "is joint embedding text query" column

In [None]:
# Rows that are NOT joint-embedding text queries: object-and-color queries
# (recognised by their type) and the CONCEPT / IMAGE / FILTER categories.
# na=False keeps rows without a type from producing an invalid (NaN) mask.
is_object_color = dataset["type"].str.contains("LocalizedObjectAndColors", na=False)
is_non_text_category = dataset["category"].isin(["CONCEPT", "IMAGE", "FILTER"])

# Every row counts as a joint-embedding text query unless excluded above.
dataset["is_joint_embedding_text_query"] = True
dataset.loc[is_object_color | is_non_text_category, "is_joint_embedding_text_query"] = False

# Object-and-color queries are additionally moved to the "Other" category.
dataset.loc[is_object_color, "category"] = "Other"

print('\n'.join(map(str, dataset.loc[dataset.is_joint_embedding_text_query][["team","category","type"]].groupby(["team","category","type"]).groups.keys())))

### Sort table rows by a specific team order

In [None]:
# Position of each team in `team_order`; teams missing from the list map to NaN.
team_position = {name: position for position, name in enumerate(team_order)}
dataset = dataset.sort_values(by='team', key=lambda team_column: team_column.map(team_position))
dataset.team.unique()

### add CLIP feature column to every text query

In [None]:
import requests

def computeCLIP(text, timeout=30):
    """Fetch the CLIP text feature for *text* from the navigu.net feature service.

    Args:
        text: Query text, sent as the ``text`` form field.
        timeout: Seconds to wait for the service before giving up, so that a
            hanging connection cannot stall the notebook indefinitely.

    Returns:
        A read-only ``np.int8`` array built from the 768 bytes that precede the
        final byte of the response body (fewer if the body is shorter), or
        ``None`` if the request failed or did not return HTTP 200.
    """
    url = 'https://navigu.net/feature/clipfv'
    data = {
        'text': text,
        'networks': 'CLIP-Textual'
    }

    try:
        response = requests.post(url, data=data, timeout=timeout)
    except requests.RequestException as err:
        # Network problems are reported the same way as HTTP errors so that one
        # failing query does not abort the processing of the whole table.
        print(f"Error: {err}")
        return None

    if response.status_code != 200:
        # There was an error
        print(f"Error: {response.status_code}")
        return None

    # Request was successful.
    # NOTE(review): presumably the payload ends with the 768-dimensional feature
    # followed by one trailing byte -- confirm against the service's format.
    return np.frombuffer(response.content[-769:-1], dtype=np.int8)
        
# Compute the CLIP feature for every joint-embedding text query.
is_text_query = dataset["is_joint_embedding_text_query"]
query_texts = dataset.loc[is_text_query, "value"]

# The same text is typically queried many times, so call the remote service
# once per distinct text and reuse the result for all rows with that text.
embedding_by_text = {text: computeCLIP(text) for text in query_texts.dropna().unique()}

dataset["joint_text_embedding"] = None
# Rows without a value (and texts whose request failed) are left as None.
dataset.loc[is_text_query, "joint_text_embedding"] = query_texts.map(embedding_by_text.get)
dataset["joint_text_embedding"][0]

# Create pickle file

In [None]:
# Write the processed table to the output directory named in the config, then
# report its shape and the remaining (team, category, type) combinations.
output_dir = comp_data["config"]["processed_logs_outdir"]
dataset.to_pickle(output_dir + '/text_query_dataset.pkl')
print(dataset.shape)
remaining_combinations = dataset[["team","category","type"]].groupby(["team","category","type"]).groups
print('\n'.join(str(combination) for combination in remaining_combinations))

In [None]:
# Count the textual queries that are not temporal, per team / category / type.
non_temporal_text = dataset[~dataset["is_temporal_query"] & dataset["is_joint_embedding_text_query"]]
non_temporal_text.groupby(["team","category","type"]).size().reset_index(name='count')

In [None]:
# Debug helper, disabled: peek at the non-temporal textual queries of vitrivr-VR.
#dataset.loc[(dataset["is_temporal_query"]==False)&(dataset["is_joint_embedding_text_query"]==True) & (dataset["team"]=="vitrivr-VR")].head()

# Count the textual queries that are temporal, per team / category / type.
temporal_text = dataset[dataset["is_temporal_query"] & dataset["is_joint_embedding_text_query"]]
temporal_text.groupby(["team","category","type"]).size().reset_index(name='count')

In [None]:
# Textual non-temporal queries, counted per team / category / type.
is_plain_text_query = dataset["is_joint_embedding_text_query"] & ~dataset["is_temporal_query"]
dataset[is_plain_text_query].groupby(["team","category","type"]).size().reset_index(name='count')