# Legal Intelligence Rank Monitor


<br>



This jupyter notebook provides the prototype of the rank monitor:

It should:

- Request queries, clicks and their corresponding position on the query list
- Pre-process the data into readable format
- Calculate the NDCG
- Be able to re-run queries, with appropriate access rights and re-calculate the nDCG

<br> 


In [1]:
# Imports

import numpy as np
from sklearn.metrics import f1_score, average_precision_score
import math
import requests
import os

import datetime
from datetimerange import DateTimeRange
from datetime import timedelta  
from dateutil import parser
from urllib import *
import json

from pathlib import Path
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')


from collections import Counter
from tqdm import tqdm_notebook 
tqdm_notebook().pandas()
from tqdm import tqdm_notebook as tqdm

import requests
import urllib

import re
import glob
import time 

import itertools
import pathlib

# Greedy IDE completion 
%config IPCompleter.greedy=True

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; } </style>"))
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 56)

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"





### Javascript for a duplicate line on Ctrl-P
Run this to set a keyboard shortcut do duplicate a line on: Control+P

In [None]:
%%js

Jupyter.keyboard_manager.edit_shortcuts.add_shortcut('Ctrl-p', {
    help : 'Duplicate current line',
    help_index : 'zz',
    handler: function(env) {
        var cm=env.notebook.get_selected_cell().code_mirror;
        
        // get a position of a current cursor in a current cell
        var current_cursor = cm.doc.getCursor();
        
        // read a content from a line where is the current cursor
        
        var line_content = cm.doc.getLine(current_cursor.line);
        cm.execCommand('goLineEnd');
        cm.execCommand('newlineAndIndent');
        cm.execCommand('indentLess');
        cm.doc.replaceSelection(line_content);
        cm.doc.setCursor(current_cursor.line + 1, current_cursor.ch);
        return false;
    }}
);

In [2]:
def request_into_dataframe(rows= 100, query= '*:*', sort='',start=0) -> pd.DataFrame :
    """
    :param rows: amount of rows to request
    :param query: string to query
    :param sort: string to sort the request eg. sort='ShortTimeStamp desc'
    """

    url = 'http://ec2-18-184-94-154.eu-central-1.compute.amazonaws.com:8080/solr' \
    '/ACC_Logging_Slave/select?indent=on&q={q}&rows={r}&sort={s}&start={st}&wt=json'.format(q=query,r=rows,s=sort,st=start)

    response = requests.get(url)
    
    # Transform the request into a json
    try:
        response = response.json()
        response = response['response']['docs'][:]
    except:
        display(response)
        
    df =  pd.DataFrame(response)
    
    return df
    


In [None]:
def get_dataframe_232(rows=100000000, from_disk=False, save_name='df_232') -> pd.DataFrame:
    """
    Get all the logging with a clicked documentPosition. 
    DocumentPosition clicked is logged since 01-07-2019 -> 20190701
    :param rows: query 
    :param from_disk: query string
    :param save_name: query string
    :return:
    """
    
    directory = './data/{}'.format(save_name)

    
    if from_disk:
        return pd.read_hdf(directory)
    else:
        df_232 = request_into_dataframe(rows=rows, query='EventID:232 & sort=ShortTimeStamp desc')
        
        # Get query of reponsehader into a seperate column. 
        df_232['SearchText'] = df_232['ResponseHeader'].progress_apply(lambda x: json.loads(x)['params']['userQuery'][:])
    
        #   Transform documentIDs from string to list of strings. 
        df['DocumentIDs'] = df['DocumentIDs'].progress_apply(lambda x: str(x).split())    

        if not os.path.exists('./data'):
            os.makedirs(directory)
    
        df_232.to_hdf(directory, key='test', mode='w')
        
    return df_232 


def get_dataframe_27(rows=1000000000, from_disk=False, save_name='df_27') -> pd.DataFrame:
    """
    Get all the logging with a clicked documentPosition. 
    DocumentPosition clicked is logged since 01-07-2019 -> 20190701
    :param rows: query 
    :param from_disk: query string
    :param save_name: query string
    :return:
    """
      
    directory = './data/{}'.format(save_name)
        
    if from_disk:
        return pd.read_hdf(directory)
    
    else:
        print("Requesting dataframe 27")
        df_27 = request_into_dataframe(rows=rows, query='EventID:27 AND ShortTimeStamp:[20190701 TO 20201201] DocumentPosition:* AND SearchText:*')
        df_27.to_hdf(directory, key='test', mode='w')
        print("Retrieved dataframe 27")

    return df_27 


def check_id_list(clicked_pos: int, document_id: int, retrieved_ids: []):
    """
    Check if the clicked pos matches a the corresponding document ID
    """
    
    retrieved_ids = [int(y) for y in retrieved_ids[0].split()]
    clicked_pos = int(clicked_pos) - 1 
    document_id = int(document_id)

    try:
        return retrieved_ids[clicked_pos] == document_id
    except:
        print("Document ID, Clicked pos, retrieved_id ", document_id, clicked_pos, retrieved_ids)

def check_in_timerange(df_232_group = pd.DataFrame, df_27 = pd.DataFrame) -> list:
    """
    Check if the there are clickedobjects within + and - 5 minutes from the query
    timestamp: object from the dateutil parser library
    :param timestamp:
    :param query:
    :param df_27:
    """
    
    timestamps_27 = [parser.parse(time, fuzzy_with_tokens=False) for time in df_27['TimeStamp'].tolist()]    
    stamp_232 = parser.parse(df_232_group['TimeStamp'].iloc[0], fuzzy_with_tokens=False)
    
    retrieved_ids = df_232_group['DocumentIDs'].tolist()
    
    doc_pos = []
    doc_ids = []
    
    for idx, stamp_27 in enumerate(timestamps_27):        
        if (stamp_27 in DateTimeRange(stamp_232, stamp_232  + timedelta(minutes=20))):
            
            doc_id = df_27['DocumentID'].iloc[idx]
            pos    = df_27['DocumentPosition'].iloc[idx]
           
            if (np.isnan(pos)):
                return
        
            if pos <= 19:
                if check_id_list(clicked_pos=pos, document_id=doc_id, retrieved_ids=retrieved_ids):
                    doc_pos.append(df_27['DocumentPosition'].iloc[idx])
                    doc_ids.append(df_27['DocumentID'].iloc[idx])
            else:
                doc_pos.append(df_27['DocumentPosition'].iloc[idx])
                doc_ids.append(df_27['DocumentID'].iloc[idx])

    return doc_pos, doc_ids


def check_and_concat(df_232: pd.DataFrame, df_27: pd.DataFrame) -> pd.DataFrame:
    """
    Check both dataframes get clicks from 27 and add these to responseheader taken from 232
    :param df_232: dataframe containing the 232 log
    :param df_27: dataframe containing the 27 log
    """
    
    final_pos = [] 
    final_ids = []
        
    for idx, (df_shape, df_232_groupby) in enumerate(tqdm(df_232.groupby(by=['UserID','ShortTimeStamp','ID']) , desc='Looping through queries')):
        
        # Only check within the part of dataframe 27 that has similar UserID and SearchText
        temp_27 = df_27[(df_27['UserID'] == df_232_groupby['UserID'].iloc[0]) & (df_27['SearchText'] == df_232_groupby['SearchText'].iloc[0])]
    
        doc_pos = []
        doc_ids = []
        
        if not temp_27.empty:
            doc_pos, doc_ids = check_in_timerange(df_232_groupby, temp_27)
            final_pos.append(doc_pos)
            final_ids.append(doc_ids)
        else:
            # No clicks found
            final_pos.append([])
            final_ids.append([])

            
    # Get the ordering we found from the GroupBy ordering from the for loop above
    df_232 = df_232.sort_values(by=['UserID','ShortTimeStamp','ID'])

    df_232['ClickedPos'] = final_pos
    df_232['ClickedIDs'] = final_ids
        
    return df_232



## Help functions to read in all of EventID:232
<br>

In [10]:
def read_in_232():
    """
    Request 200 000 rows from EventID:232 and saves it in ./data/
    """
    for i in range(0,200000,50000):
        print(i)
        df = request_into_dataframe(rows=50000,query='EventID:232',start=i)
        df.to_hdf('./data/EventID:232_start={}'.format(i), key='test', mode='w')

def read_in_dataframe()
    """
    Concats all the frames into big dataframe 200 000 rows from EventID:232 and saves it in ./data/
    """

    all_frames = ['./data/EventID:232_0_to_50000','./data/EventID:232_all_50000_to_80000', 
             './data/EventID:232_all_80000_to_130000','./data/EventID:232_all_130000_to_170000']    

    df = pd.concat(map(pd.read_hdf, all_frames))
    df.to_hdf('./data/concatenated_frames' ,  key='test', mode='w')

0
50000
100000
150000


### DCG function

Implementation for the discounted cumulative gain (DCG) function.


$$ \mathrm{DCG_{p}} = \sum_{i=1}^{N} \frac{rel_{i}}{\log_{b}(i+1)} = rel_1 + \sum_{i=2}^{N} \frac{rel_{i}}{\log_{b}(i+1)}  $$

Mean average precision (MAP) 

$$ F_{1}=\frac{\text{Precision} \times \text{Recall}}{\text{Precision} + \text{Recall}} $$  

F1 score (only defined for binary relevance scores).

$$ \operatorname{MAP} = \frac{\sum_{q=1}^N \operatorname{AveP(q)}}{N} $$ 


<br> 




## Read in Query-UserID

<br>


In [4]:
gineke_query = pd.read_excel('./data/Queries Rank Monitor Reproduceer file.xlsx', header=0, encoding = 'unicode_escape')
gineke_query.shape


(56, 5)

In [5]:
def find_responses_gineke(gineke_query):
    response_list = [] 

    for i in range(len(gineke_query)):

        df_locked = gineke_query.iloc[i]

        userid = df_locked['UserID'] 
        timestamp = df_locked['TimeStamp'] 
        total_results = df_locked['TotalResults']
        stamp  = timestamp.split('T')
        stamp =  stamp[0].replace('-','')

        df_232 = request_into_dataframe(rows=5000, query='EventID:232 AND ShortTimeStamp:{} AND UserID:{}'.format(stamp,userid))
        df_232 =  df_232[(df_232['TimeStamp'] == timestamp) & (df_232['TotalResult'] == total_results)]

        response_list.append(df_232)

#         df['SearchText'] = df['ResponseHeader'].progress_apply(lambda x: json.loads(x)['params']['userQuery'][:])

    df = pd.concat(response_list)
    return df

df = find_responses_gineke(gineke_query)
# df.to_hdf('./data/gineke_response_added',key='df')
df


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




Unnamed: 0,AppID,CompID,Department,DocumentIDs,EventID,FunctionArea,ID,Month,Quarter,ResponseHeader,ShortTimeStamp,SiteID,TimeStamp,TotalResult,UserID,Week,Year,_version_
10,1,406,reinier.voskamp@loyensloeff.com,1745899 31981834 32119524 31560955 32235440 31...,232,,987825fc-9598-4ea1-87e5-6a6a8fdcee46,7,3,"{""li.ext"":[[""li.queryterms"",[[""DOCREF"",[[[""ter...",20190716,1,2019-07-16T12:39:56.047Z,5683,235192,29,2019,1639218723980050432
5,1,1573,,27971647 5184142 32088258 32257035 32256996 32...,232,,b3946182-3a27-42bd-a95f-b72a56f48f3e,7,3,"{""li.ext"":[[""li.queryterms"",[[""concept"",[[[""ID...",20190726,1,2019-07-26T12:40:30.602Z,6307,138614,30,2019,1640124734448336896
2,1,1013,Arbeidsrecht,1749056 7124359 3583160 2337341 31591496 22103...,232,,da2994fc-bd6e-46b2-9616-a20cf036c2c1,7,3,"{""li.ext"":[[""li.queryterms"",[[""concept"",[[[""ID...",20190716,1,2019-07-16T06:55:27.992Z,10510,233684,29,2019,1639197109263532032
8,1,123,Gezondheidszorg,1745791 1745899 30846893 30583970 31401730 316...,232,,0df743eb-d4bd-4659-a2f8-987d49147147,8,3,"{""li.ext"":[[""li.queryterms"",[[""concept"",[[[""ID...",20190808,1,2019-08-08T10:00:26.673Z,21823,228806,32,2019,1641292419700883456
15,1,435,,32236435 32204628 32322220 32257697 32254058 3...,232,,f11d4d1e-2196-4b7f-aa2d-1dc9e73a92ae,8,3,"{""li.ext"":[[""li.queryterms"",[[""concept"",[[[""ID...",20190805,1,2019-08-05T09:46:37.440Z,853,190308,32,2019,1641019759242772480
6,1,635,,32255514 12750602 32227698 31478705 32108213 3...,232,,6bedf993-f099-40ca-b96c-3d274c5d6c10,7,3,"{""li.ext"":[[""li.queryterms"",[[""word"",[[[""term""...",20190718,1,2019-07-18T09:20:23.437Z,1805,178729,29,2019,1639387363579789312
7,1,1234,Belastingdienst,32297597 32293698 32244896 32023897 32012447 3...,232,,8559d130-e3da-452b-94a3-653882bf99ef,7,3,"{""li.ext"":[[""li.queryterms"",[[""word"",[[[""term""...",20190730,2,2019-07-30T08:31:30.724Z,546902,208903,31,2019,1640471451788115968
46,1,822,Civilisten,32317649 15475531 32150904 23695638 22008615 1...,232,,b953a320-c7b4-4a40-8326-79fb10a77705,8,3,"{""li.ext"":[[""li.queryterms"",[[""concept"",[[[""ID...",20190801,1,2019-08-01T11:16:01.435Z,2846,40860,31,2019,1640662996082491393
0,1,341,,18208689 32123282 32108063 31557772 32108064 2...,232,,7c11f3d9-d0a6-41b1-9e78-802805a586a2,7,3,"{""li.ext"":[[""li.queryterms"",[[""LABEL_ART"",[[[""...",20190722,1,2019-07-22T07:06:36.932Z,1999,158579,30,2019,1639741391177252864
0,1,1231,,18208689 32308234 32213801 32333323 32217574 3...,232,,93dd3ef1-846f-4709-932d-166f9c99cb57,8,3,"{""li.ext"":[[""li.queryterms"",[[""DOCREF"",[[[""ID""...",20190808,1,2019-08-08T06:44:15.431Z,12416,93745,32,2019,1641280076979896320


In [6]:
def make_params(x):
    
    # Disable these boosting factors
    x['legislationBoostQ'] = "ID:0" 
    x['lowRankQ'] = "ID:0" 
    x['newsQ'] = "ID:0" 

    new_query = []
    for key,value in x.items():
        if key == 'fq':
            for fq_value in value:
                fq_value = urllib.parse.quote(string=str(fq_value))
                new_query.append("&{}={}".format('fq',fq_value))
        elif key == 'shards':
            value = value.replace("PRD_","ACC_")
            value = urllib.parse.quote(string=str(value))
            new_query.append("&{}={}".format(key,value))
        elif key == 'json.facet':
            pass
        else:
            value = urllib.parse.quote(string=str(value))
            new_query.append("&{}={}".format(key,value))


    params = ''.join(new_query)
    
    
    return params

In [8]:
import urllib.request
import time 

def internet_on():
    try:
        urllib.request.urlopen('http://216.58.192.142', timeout=1)
        return True
    except urllib.request.URLError as err: 
        return False

def loop_through_queries():
    """
    Loop through the queries
    """
    
    print(os.getcwd())

#     df = pd.read_excel('./data/appended_with_clicks.xlsx')

    df['replicated_ids'] = pd.Series()
    df['replicated_scores'] = pd.Series()

    all_doc_ids = []
    all_doc_scores = []
    
    print(len(df))
    for i in tqdm(range(len(df)), desc='Looping through queries'):
        
        if not internet_on():
            print("Internet not on:   ", internet_on())
        
        df_locked = df.iloc[i]
#         os.popen('rm /home/winston/LegalIntelligence/all_curl/params/*')
#         os.popen('rm /home/winston/LegalIntelligence/all_curl/responses/*')
        
        try:
            x = json.loads(df_locked['ResponseHeader'])
            params = make_params(x['params'])


            directory = "./all_curl/params/"
            pathlib.Path(directory  + 'param_{}.txt'.format(i)).write_text(params)
            os.popen('curl --data-binary @./all_curl/params/param_{}.txt http://ec2-18-184-94-154.eu-central-1.compute.amazonaws.com:8080/solr/ACC_Legal_Slave/select \
                                                                          > ./all_curl/responses/response_{}.txt'.format(i,i))

    #         print(os.getcwd()  + "/all_curl/responses/response_{}.txt".format(i))
            time.sleep(6)
            with open(os.getcwd()  + "/all_curl/responses/response_{}.txt".format(i)) as jsonfile:
                data = json.load(jsonfile)

            docids = []
            scores = []

            for idx,document in enumerate(data['response']['docs']):
                docids.append(document['ID'])
                scores.append(document['score'])
            
            all_doc_ids.append(docids)
            all_doc_scores.append(scores)

            df.iloc[i]['replicated_ids'] = docids
            df.iloc[i]['replicated_scores'] = scores
        except:
            print("exception for doc : ", i)
            docids= ['in exception']
            scores= ['in exception']
            all_doc_ids.append(docids)
            all_doc_scores.append(scores)
            df.iloc[i]['replicated_ids'] = docids
            df.iloc[i]['replicated_scores'] = scores
    return df,all_doc_ids, all_doc_scores
    
    

# df, all_doc_ids, all_doc_scores = loop_through_queries()
# all_doc_ids
df['replicated_ids'] = all_doc_ids
df['replicated_scores'] = all_doc_scores
df

Unnamed: 0,AppID,CompID,Department,DocumentIDs,EventID,FunctionArea,ID,Month,Quarter,ResponseHeader,ShortTimeStamp,SiteID,TimeStamp,TotalResult,UserID,Week,Year,_version_,replicated_ids,replicated_scores
10,1,406,reinier.voskamp@loyensloeff.com,1745899 31981834 32119524 31560955 32235440 31...,232,,987825fc-9598-4ea1-87e5-6a6a8fdcee46,7,3,"{""li.ext"":[[""li.queryterms"",[[""DOCREF"",[[[""ter...",20190716,1,2019-07-16T12:39:56.047Z,5683,235192,29,2019,1639218723980050432,"[1745899, 32380854, 32364166, 32319443, 319818...","[60.46819, 58.379963, 57.686615, 57.453606, 56..."
5,1,1573,,27971647 5184142 32088258 32257035 32256996 32...,232,,b3946182-3a27-42bd-a95f-b72a56f48f3e,7,3,"{""li.ext"":[[""li.queryterms"",[[""concept"",[[[""ID...",20190726,1,2019-07-26T12:40:30.602Z,6307,138614,30,2019,1640124734448336896,"[27971647, 5184142, 32371499, 32244533, 320882...","[62.16269, 57.26532, 55.820637, 55.673817, 55...."
2,1,1013,Arbeidsrecht,1749056 7124359 3583160 2337341 31591496 22103...,232,,da2994fc-bd6e-46b2-9616-a20cf036c2c1,7,3,"{""li.ext"":[[""li.queryterms"",[[""concept"",[[[""ID...",20190716,1,2019-07-16T06:55:27.992Z,10510,233684,29,2019,1639197109263532032,"[1749056, 7124359, 3583160, 2337341, 32363135,...","[44.876125, 60.868553, 60.65896, 60.54249, 60...."
8,1,123,Gezondheidszorg,1745791 1745899 30846893 30583970 31401730 316...,232,,0df743eb-d4bd-4659-a2f8-987d49147147,8,3,"{""li.ext"":[[""li.queryterms"",[[""concept"",[[[""ID...",20190808,1,2019-08-08T10:00:26.673Z,21823,228806,32,2019,1641292419700883456,"[1745791, 1745899, 30846893, 30583970, 3140173...","[44.958763, 44.95513, 59.677925, 59.42467, 59...."
15,1,435,,32236435 32204628 32322220 32257697 32254058 3...,232,,f11d4d1e-2196-4b7f-aa2d-1dc9e73a92ae,8,3,"{""li.ext"":[[""li.queryterms"",[[""concept"",[[[""ID...",20190805,1,2019-08-05T09:46:37.440Z,853,190308,32,2019,1641019759242772480,"[32373398, 32380006, 32236435, 32204628, 32322...","[67.739334, 66.37401, 66.294266, 66.18563, 65...."
6,1,635,,32255514 12750602 32227698 31478705 32108213 3...,232,,6bedf993-f099-40ca-b96c-3d274c5d6c10,7,3,"{""li.ext"":[[""li.queryterms"",[[""word"",[[[""term""...",20190718,1,2019-07-18T09:20:23.437Z,1805,178729,29,2019,1639387363579789312,"[32255514, 12750602, 32227698, 31478705, 31471...","[61.933014, 61.387466, 58.57944, 58.38827, 57...."
7,1,1234,Belastingdienst,32297597 32293698 32244896 32023897 32012447 3...,232,,8559d130-e3da-452b-94a3-653882bf99ef,7,3,"{""li.ext"":[[""li.queryterms"",[[""word"",[[[""term""...",20190730,2,2019-07-30T08:31:30.724Z,546902,208903,31,2019,1640471451788115968,"[32297597, 32293698, 32244896, 32023897, 32012...","[69.970695, 69.889015, 69.17725, 67.928116, 67..."
46,1,822,Civilisten,32317649 15475531 32150904 23695638 22008615 1...,232,,b953a320-c7b4-4a40-8326-79fb10a77705,8,3,"{""li.ext"":[[""li.queryterms"",[[""concept"",[[[""ID...",20190801,1,2019-08-01T11:16:01.435Z,2846,40860,31,2019,1640662996082491393,"[32317649, 15475531, 32150904, 23695638, 22008...","[65.47797, 60.821407, 60.42166, 59.027233, 58...."
0,1,341,,18208689 32123282 32108063 31557772 32108064 2...,232,,7c11f3d9-d0a6-41b1-9e78-802805a586a2,7,3,"{""li.ext"":[[""li.queryterms"",[[""LABEL_ART"",[[[""...",20190722,1,2019-07-22T07:06:36.932Z,1999,158579,30,2019,1639741391177252864,"[18208689, 32123282, 32108063, 32333322, 31557...","[52.719208, 68.30559, 64.985756, 64.20583, 63...."
0,1,1231,,18208689 32308234 32213801 32333323 32217574 3...,232,,93dd3ef1-846f-4709-932d-166f9c99cb57,8,3,"{""li.ext"":[[""li.queryterms"",[[""DOCREF"",[[[""ID""...",20190808,1,2019-08-08T06:44:15.431Z,12416,93745,32,2019,1641280076979896320,"[18208689, 32308234, 32213801, 32133938, 32123...","[50.234093, 64.29372, 63.515953, 63.13899, 63...."


In [31]:
print(os.getcwd())

# df = pd.read_excel('./data/appended_with_clicks.xlsx')

# df['replicated_ids'] = pd.Series()
# df['replicated_scores'] = pd.Series()

# all_doc_ids = []
# all_doc_scores = []

# df[20:30]
i = 29

df_locked = df.iloc[i]
df_locked
#         os.popen('rm /home/winston/LegalIntelligence/all_curl/params/*')
#         os.popen('rm /home/winston/LegalIntelligence/all_curl/responses/*')
header = df_locked['ResponseHeader']

x = json.loads(df_locked['ResponseHeader'])
params = make_params(x['params'])


directory = "./all_curl/params/"
pathlib.Path(directory  + 'param_{}.txt'.format(i)).write_text(params)
os.popen('curl --data-binary @./all_curl/params/param_{}.txt http://ec2-18-184-94-154.eu-central-1.compute.amazonaws.com:8080/solr/ACC_Legal_Slave/select \
                                                              > ./all_curl/responses/response_{}.txt'.format(i,i))

#         print(os.getcwd()  + "/all_curl/responses/response_{}.txt".format(i))
time.sleep(5)
with open(os.getcwd()  + "/all_curl/responses/response_{}.txt".format(i)) as jsonfile:
    data = json.load(jsonfile)

docids = []
scores = []

for idx,document in enumerate(data['response']['docs']):
    docids.append(document['ID'])
    scores.append(document['score'])
docids
scores

/home/winston/LegalIntelligence


AppID                                                                1
CompID                                                            1961
Department                                            Bedrijfsjuristen
DocumentIDs          1748920 32236936 32290562 20942533 32310802 32...
EventID                                                            232
FunctionArea                         Senior Bedrijfsjuridisch Adviseur
ID                                7ca4b9df-fdfe-46ab-9e0d-6c578f521bd7
Month                                                                8
Quarter                                                              3
ResponseHeader       {"li.ext":[["li.queryterms",[["concept",[[["ID...
ShortTimeStamp                                                20190803
SiteID                                                               1
TimeStamp                                     2019-08-03T14:27:27.730Z
TotalResult                                                      21140
UserID

9827

<os._wrap_close at 0x7f4ba9dc5e48>

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [9]:
df.to_excel("./data/appended_ids+scores.xlsx")

In [3]:
df = pd.read_excel("./data/appended_ids+scores.xlsx")
df

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,DocumentIDs,EventID,ResponseHeader,ShortTimeStamp,TimeStamp,TotalResult,UserID,SearchText,ClickedPos,ClickedIDs,replicated_ids,replicated_scores
0,0,5,31999035 31991880 32245563 31516798 31422938 3...,232,"{""li.ext"":[[""li.queryterms"",[[""word"",[[[""term""...",20190712,2019-07-12T18:01:51.156Z,797,16217,airbnb,[1],['31999035'],"['31999035', '31991880', '32245563', '31516798...","[67.852325, 61.876366, 59.85674, 58.19846, 56...."
1,1,15,32313948 32317567 32317260 32317279 32317331 3...,232,"{""li.ext"":[[""li.queryterms"",[[""word"",[[[""term""...",20190725,2019-07-25T07:40:12.163Z,1030,36905,fintech,"[2, 3]","['32317567', '32317260']","['32388249', '32365983', '32333197', '32333155...","[56.00708, 55.160873, 55.068134, 54.85907, 54...."
2,2,46,32317649 15475531 32150904 23695638 22008615 1...,232,"{""li.ext"":[[""li.queryterms"",[[""concept"",[[[""ID...",20190801,2019-08-01T11:16:01.435Z,2846,40860,letter of intent,"[3, 24, 5]","['32150904', '12488532', '22008615']","['32317649', '15475531', '32150904', '23695638...","[65.47797, 60.821407, 60.42166, 59.027233, 58...."
3,3,13,32249281 32233288 32240581 32235147 32236480 3...,232,"{""li.ext"":[[""li.queryterms"",[[""word"",[[[""term""...",20190715,2019-07-15T06:48:44.647Z,3684,61617,wettelijke bedenktijd,"[71, 75, 53, 24, 18, 7, 19, 48, 50, 49, 50, 58...","['32004684', '31873842', '32038254', '32133735...","['32388301', '32376183', '32371363', '32363402...","[55.681873, 46.73511, 62.00183, 55.523857, 54...."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51,51,17,32252732 32252956 32254842 32254856 32250282 3...,232,"{""li.ext"":[[""li.queryterms"",[[""concept"",[[[""ID...",20190715,2019-07-15T12:16:57.597Z,29877,237551,witwassen,"[78, 92, 96, 119, 120, 130, 3, 17, 37, 40, 38,...","['32232740', '32227294', '32228622', '32227870...","['32388244', '32388278', '32381724', '32382752...","[57.23603, 56.20309, 53.709152, 55.49561, 53.0..."
52,52,17,32054960 31985185 31985172 31943995 31384129 3...,232,"{""li.ext"":[[""li.queryterms"",[[""DOCREF"",[[[""ID""...",20190715,2019-07-15T08:21:17.209Z,421,237724,wet forensische zorg,"[281, 334, 342, 326, 61, 239, 181, 287, 214, 2...","['30576243', '29416686', '28148739', '27844884...","['31964318', '32020599', '32016655', '32016484...","[52.326096, 52.31185, 52.266598, 52.238434, 52..."
53,53,27,3773923 31894725 11786559 32336280 23350993 32...,232,"{""li.ext"":[[""li.queryterms"",[[""DOCREF"",[[[""ID""...",20190807,2019-08-07T11:50:17.858Z,504,239810,Portacabin-arrest,"[1, 3]","['3773923', '11786559']","['3773923', '31894725', '11786559', '23350993'...","[45.87572, 56.08824, 55.278988, 54.967503, 54...."
54,54,1,7959048 32332011 32310990 32142315 32243833 32...,232,"{""li.ext"":[[""li.queryterms"",[[""DOCREF"",[[[""ID""...",20190808,2019-08-08T14:22:47.585Z,5329,239814,wet normering topinkomens,"[2, 31]","['32332011', '27982425']","['7959048', '32332011', '32310990', '32243833'...","[60.876095, 57.910812, 55.644527, 55.297977, 5..."
