Run the below cell once to install the needed libraries

In [None]:
#!rm -rf AIKoGAM#
#!rmdir /s /q AIKoGAM#
!git clone https://github.com/riccardogvn/AIKoGAM.git
from os import mkdir
mkdir('AIKoGAM/datasets')
!pip install -r AIKoGAM/requirements.txt



<b>THE ABOVE CELL MUST BE RUN ONLY ONCE</b>


Import the libraries that will be used

In [2]:
import spacy
from datetime import datetime
import os
import json
from AIKoGAM.src.utils.utils import *
import logging
import requests
from tqdm.notebook import tqdm
from bs4 import BeautifulSoup
from html import unescape
from typing import Dict, Any
import hashlib
now = datetime.now()
date_and_hour = datetime.now().strftime("%d%m%Y_%H%M")
nlp = spacy.load('en_core_web_sm') #load the spacy library for Named Entity Recognition

<b>Christie's data retrieval</b>
<p>
In the input, insert the first year from wich to start the collection, and the last year. Before 1998 there are no data.</br>
Chose if you want to store images.</br>
The estimated time for storing data of a sale with around 80 objects and no images is 59 sec on a CPU.</br></p

In [None]:
start_year = int(input("Please, input the year from which to start scraping"))
end_year = int(input("Please, input the last year from which you want to scrape")) + 1
storingImage_christies = input("Do you want to store images? Y or N").lower()
if storingImage_christies == 'y':
    antiquities = collect_sales(start_year, end_year, log_file='data_collection.log', storeImage=True)
else:
    antiquities = collect_sales(start_year, end_year, log_file='data_collection.log', storeImage=False)

In [None]:
antiquities[0]

<b>Sotheby's data retrieval</b>

In [None]:
with open('auctionIds_sotheby.json','r',encoding='utf-8') as file:
    auctionIds_sotheby = json.load(file)
auctionIds_sotheby = auctionIds_sotheby[57:59]

storingImage_sothebys = input("Do you want to store images? Y or N").lower()
if storingImage_sothebys == 'y':
    antiquities_sothebys = collect_sales_sothebys(auctionIds_sotheby, storeImage=True)
else:
    antiquities_sothebys = collect_sales_sothebys(auctionIds_sotheby, storeImage=False)

In [None]:
antiquities_sothebys[0] #print the first item of the stored json

<b>Phoenix Ancient Art data retrieval</b>

In [None]:
storingImage_paa = input("Do you want to store images? Y or N").lower()
if storingImage_paa == 'y':
    antiquities_paa = collectPAA(storeImage=True)
else:
    antiquities_paa = collectPAA(storeImage=False)

In [None]:
antiquities_paa[0]

<b>Datasets remapping to final keys</b>

In [28]:
import json
with open(r'christies_raw.json','r',encoding='utf-8') as f:
    christies_data = json.load(f)
with open(r'sothebys_raw.json','r',encoding='utf-8') as f2:
    sothebys_data = json.load(f2)
with open(r'paa_raw.json','r',encoding='utf-8') as f3:
    paa_data = json.load(f3)

# Set up logging
logging.basicConfig(filename='error_log.log', level=logging.ERROR,format='%(asctime)s - %(levelname)s - %(message)s')
# Map Christie's data to the final keys
final_christies_data = remap_christies_data(christies_data)
# Map Sotheby's data to the final keys
final_sothebys_data = map_sothebys_data(sothebys_data)
# Map PAA's data to the final keys
final_paa_data = remap_paa_data(paa_data)
# Combine datasets into a single list
final_output = final_sothebys_data + final_christies_data + final_paa_data
# Hash and Reorder json file
db = hashAndClean(final_output)

Remapping Christie's data: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 1475.48it/s]
Mapping Sotheby's data: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  5.56it/s]
Remapping PAA data: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 225/225 [00:00<00:00, 22521.50it/s]


In [29]:
def hashAndClean(final_output):
    lots = dict()
    events = dict()
    
    for fin in final_output:
        for j in fin['saleLots']:
            j['lotHash'] = dict_hash(j)
        fin['saleHash'] = dict_hash(fin)
        for j in fin['saleLots']:
            j['saleHash'] = fin['saleHash']
        for j in fin['saleLots']:
            lots[j['lotHash']] = j
        events[fin['saleHash']] = fin
    for k,v in events.items():
        v.pop('saleLots')
    
    for k,v in lots.items():
        for x,j in v.items():
            if j == None:
                v[x] = ""
                print(v[x])
            if type(j) == dict:
                for a,b in j.items():
                    if b == None:
                        j[a] = ""
                        print(j)
                        
    db = {'events':events,'lots':lots}
    
    return db



{'provenance_1': '', 'provenance_2': "Sotheby's London 07 January 2022 lot 1"}


{'provenance_1': '', 'provenance_2': "Sotheby's London 07 January 2022 lot 2"}


{'provenance_1': '', 'provenance_2': "Sotheby's London 07 January 2022 lot 3"}


{'provenance_1': '', 'provenance_2': "Sotheby's London 07 January 2022 lot 4"}


{'provenance_0': 'Sotheby’s London, 13 October 1989, lot 79.', 'provenance_2': '', 'provenance_3': "Sotheby's London 07 January 2022 lot 5"}


{'provenance_1': '', 'provenance_2': "Sotheby's London 07 January 2022 lot 6"}


{'provenance_1': '', 'provenance_2': "Sotheby's London 07 January 2022 lot 7"}


{'provenance_1': '', 'provenance_2': "Sotheby's London 07 January 2022 lot 8"}


{'provenance_1': '', 'provenance_2': "Sotheby's London 07 January 2022 lot 9"}


{'provenance_1': '', 'provenance_2': "Sotheby's London 07 January 2022 lot 10"}


{'provenance_1': '', 'provenance_2': "Sotheby's London 07 January 2022 lot 11"}


{'provenance_1': '', 'provenance_2': "Sothe

In [30]:
with open(r'AIKoGAM/datasets/db.json','w') as file:
  json.dump(db,file)

In [34]:
!python AIKoGAM/event_extraction.py

{"0": {"lotId": "3ccf4833-1062-4fb0-8fc7-6f1b7d19485e", "lotNumber": "1", "lotUrl": "www.sothebys.com/en/buy/auction/2022/arts-of-the-islamic-world-india-including-fine-rugs-and-carpetsa-large-calligraphic-samanid-pottery-dish-nishapur", "lotTitle": "A large calligraphic Samanid pottery dish, Nishapur or Samarqand, 10th century", "lotSubtitle": "", "lotOther": "", "lotLastOwner": "", "lotDescription": "<p>of rounded form on short foot, with flaring rim, the earthenware&nbsp;body decorated in white slip with a single line of dark brown Kufic calligraphy around the rim, the exterior plain</p><p><br></p><p>38.7cm. diam.</p>", "lotImage": "https://sothebys-md.brightspotcdn.com/24/26/06d36be74926ab9480853dfdb4b8/l22220-c6s94-03-t2.jpg", "lotImageLocalPath": "\\\\images_\\sothe_02082023_0818/06d36be74926ab9480853dfdb4b8_l22220-c6s94-03-t2.jpg", "lotEstimateLow": "25000", "lotEstimateHigh": "35000", "lotCurrency": "25000", "lotWithdrawn": "NotAffected", "lotPrice": {}, "lotSale": "Sotheby's L

In [35]:
!python AIKoGAM/kg_construction.py

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

In [1]:
from neo4j import GraphDatabase

driver = GraphDatabase.driver("bolt://localhost:7687/prova", auth=("neo4j", "administrator"), encrypted=False) 

In [2]:

tx = session.begin_transaction()
query =    '''
            MATCH (p:event)<-[r:PARTECIPATED_TO]-(a:artwork)
            with [a,p] as ap, collect(r) as rels
            CALL apoc.refactor.mergeRelationships(rels)
            yield rel
            return count(rel) as result
            '''
tx.run(query)
tx.commit()

'FB:kcwQgiLBlFy4SX+BfLOstmIZgclpj5A='

In [None]:

tx = session.begin_transaction()
query =  MATCH (n:event), MATCH (m:event)  
         
tx.run(query)
tx.commit()

In [18]:
import os
import json
directory = "AIKoGAM/events"
filename = "events.txt"
file = os.path.join(directory, filename)

def take_hashes():
    with driver.session() as session:
        query = f"MATCH (a:artwork) RETURN a.artwork_id"
                
        tx = session.begin_transaction()
        c = tx.run(query)
        c = c.data()
        tx.commit() 
                 
    return c

In [43]:
hash_ids = []
def prepare_artwork_data(artwork):
    """
    Propare the list of properties and values to be stored in a KG node of type 'artwork'.

    Parameters
    ----------
    event : DICT
        EVENT DICTIONARY.

    Returns
    -------
    proccessed_data : LIST
        LIST OF PROPERTIES AND VALUES.

    """    
    properties = []
    values = []
    
    for property_name in artwork.keys():
        properties.append(property_name)
        values.append(str(artwork[property_name]))
        
    proccessed_data = [properties, values]
    return  proccessed_data


def add_node(node_type, node_id, data):
        """
        Add node to the KG.
        
        Parameters
        ----------
        node_type : STR
            NODE TYPE.
        node_id : STR
            NODE ID.              
        node_label : STR
            NODE LABEL.
        data : LIST
            PROPERTIES AND VALUES.

        Returns
        -------
        INT
            STATUS.

        """

        
                



        with driver.session() as session:
            props = data[0]
            vals = data[1]
            
            
            tx = session.begin_transaction()
            query = f"CREATE (a:{node_type} "
            query += "{"
            query += f"{node_type}_id: '{node_id}'"
            
            for idx, property_name in enumerate(props):
                if property_name == "DATE":
                    if type(vals[idx]) is list:
                        query += ", " + property_name + ": " + str(vals[idx]) + ""  
                    elif type(vals[idx]) is int:
                        query += ", " + property_name + ": " + str(vals[idx]) + "" 
                    else:
                        query += ", " + property_name + ": \"" + str(vals[idx]).replace('\"', '\'') + "\""     
                else:    
                    query += ", " + property_name + ": \"" + str(vals[idx]).replace('\"', '\'') + "\""
                    
            query += "})"
            
            try:
                tx.run(query)
                tx.commit()
                
            except Exception as e:
                pass
        
    
        return tx      

from AIKoGAM.src.utils import utils
def prepare_event_data(event):
    """
    Propare the list of properties and values to be stored in a KG node of type 'event'.

    Parameters
    ----------
    event : DICT
        EVENT DICTIONARY.

    Returns
    -------
    proccessed_data : LIST
        LIST OF PROPERTIES AND VALUES.

    """
    properties = []
    values = []

    # Extract event dates (using RegEx)
    label_duration = utils.extract_duration(event['label'])
    label_years = utils.extract_year(event['label'])
    label_century = utils.extract_century(event['label'])
    
    if label_duration:
        year_range = utils.extract_year(label_duration[0])
        properties.append('START_DATE')
        values.append(int(year_range[0]))
        properties.append('END_DATE')
        values.append(int(year_range[1]))
        
    elif label_years:
        if len(label_years) == 2:
            properties.append('START_DATE')
            values.append(int(label_years[0]))
            properties.append('END_DATE')
            values.append(int(label_years[1]))  
        else:
            properties.append('DATE')
            values.append(int(label_years[0]))
            
    elif label_century:
        year_range = utils.year_from_century(label_century[0])
        properties.append('START_DATE')
        values.append(year_range[0])
        properties.append('END_DATE')
        values.append(year_range[1])
            
    # Handle other properties
    for property_name in event.keys():
        if property_name != 'DATE':
            properties.append(property_name)
            values.append(str(event[property_name]))
     
    proccessed_data = [properties, values]
    return  proccessed_data


In [None]:
 def get_similar_event(data):
    """
    Get similar event.

    Parameters
    ----------
    data : LIST
        PROPERTIES AND VALUES.

    Returns
    -------
    INT
        STATUS.

    """
    with driver.session() as session:
        output = []
        
        props = data[0]
        vals = data[1]
        
        ev_label = ""  
        ev_date = []
        
        for idx, property_name in enumerate(props):
            if property_name == "DATE":
                ev_date = vals[idx] 
            if property_name == "label":
                ev_label = ''.join(c for c in vals[idx] if c not in '"')
                
        tx = session.begin_transaction()
        
        query = "MATCH (a:event) "
        query += f"WHERE a.label = \"{ev_label}\" OR ("
        
        # Match event subject
        query += "(a." + self.event_subject[0] + " in " + str(vals)
        for attr in self.event_subject[1:]:
            query += " OR a." + str(attr) + " in " + str(vals)
        
        # Match event location    
        query += ") AND (a." + self.event_location[0] + " in " + str(vals)   
        for attr in self.event_location[1:]:
            query += " OR a." + str(attr) + " in " + str(vals) 
        
        # Match event date
        query += ") AND "
        if type (ev_date) is list and len(ev_date) > 1:
            query += "((a.DATE >= " + str(ev_date[0]) + " AND a.DATE =< " + str(ev_date[1]) + ") OR (a.START_DATE = " + str(ev_date[0]) + " AND a.END_DATE = " + str(ev_date[1]) + ")) "
        elif type (ev_date) is int:
            query += "((a.START_DATE <= " + str(ev_date) + " AND a.END_DATE >= " + str(ev_date) + ") OR a.DATE = " + str(ev_date) + ") "
        elif type (ev_date) is str:
            query += "a.DATE in " + str(vals)
        else:
            query += "a.DATE is null "
            
        query += ") RETURN a.event_id as id"

        try:
            result = tx.run(query)
            for record in result:
                output.append(record['id'])
            return output   
        
        except Exception as e:
            print("Exception: {0}".format(e))
            return self._error_status 
        
    return self._success_status

In [44]:
with open(file, newline='', encoding='utf8') as jsonfile:
    lines = jsonfile.readlines()
    hashes = take_hashes()
    hash_ids = []
    #for hash in hashes:
     #   hash_ids.append(hash['a.artwork_id'])
    for line in lines:
        json_line = json.loads(line)
        for json_key in json_line.keys():
            print(json_key)
            json_object = json_line[str(json_key)]
            if json_object['lotHash'] in hash_ids:
                print('already there')
                pass
            else:
                json_object['_events'] = json_object['events']
                json_object.pop('events')
                json_object['events'] = []
                for event in json_object['_events']:
                    for e in event:
                        json_object['events'].append(e)
                json_object.pop('_events')
                print(json_object['events'])
                artwork_data = prepare_artwork_data(json_object)
                artwork_id = json_object['lotHash']
                add_node('artwork', artwork_id, artwork_data)
                for event in json_object['events']:
                    ev_data = prepare_event_data(event)
                    print(ev_data)
                    
                

0
[]
1
[]
2
[]
3
[]
4
[{'label': 'Sotheby’s London, 13 October 1989, lot 79', 'ORG': 'Sotheby’s London', 'DATE': '13 October 1989', 'CARDINAL': '79'}, {'label': 'Sotheby’s London, 13 October 1989, lot 79', 'ORG': 'Sotheby’s London', 'DATE': '13 October 1989', 'CARDINAL': '79'}, {'label': "Sotheby's London 07 January 2022 lot 5", 'ORG': "Sotheby's", 'GPE': 'London', 'DATE': '07 January 2022', 'CARDINAL': '5'}]
[['DATE', 'label', 'ORG', 'CARDINAL'], [1989, 'Sotheby’s London, 13 October 1989, lot 79', 'Sotheby’s London', '79']]
[['DATE', 'label', 'ORG', 'CARDINAL'], [1989, 'Sotheby’s London, 13 October 1989, lot 79', 'Sotheby’s London', '79']]
[['DATE', 'label', 'ORG', 'GPE', 'CARDINAL'], [2022, "Sotheby's London 07 January 2022 lot 5", "Sotheby's", 'London', '5']]
5
[]
6
[]
7
[]
8
[]
9
[]
10
[]
11
[]
12
[{'label': 'Dr and Mrs W Böhning, Mannheim, Germany, 1970s', 'PERSON': 'Mrs W Böhning', 'GPE': 'Mannheim', 'DATE': '1970s'}, {'label': 'Dr and Mrs W Böhning, Mannheim, Germany, 1970s', 'P

In [56]:
event_subject = ["ORG", "PERSON", "WORK_OF_ART", "FAC", "EVENT", "NORP", "LANGUAGE", "PRODUCT", "LAW"]
event_location = ["LOC", "GPE"]
event_date = ["DATE", "START_DATE", "END_DATE"]
   
output = []
props = ev_data[0]
vals = ev_data[1]
print(props)
print(vals)
ev_label = ""
ev_date = []
for idx, property_name in enumerate(props):
    if property_name == "DATE":
        ev_date = vals[idx]
        print(ev_date)
    if property_name == "label":
        ev_label = ''.join(c for c in vals[idx] if c not in '"')
        print(ev_label)
session = driver.session()
tx = session.begin_transaction()
query = "MATCH (a:event) "
query += f"WHERE a.label = \"{ev_label}\" "    
query += " RETURN a.event_id as id"

print(query)


['DATE', 'label', 'ORG', 'CARDINAL']
[2023, "Christie's 05 July 2023", "Christie's", '84']
2023
Christie's 05 July 2023
MATCH (a:event) WHERE a.label = "Christie's 05 July 2023"  RETURN a.event_id as id


In [45]:
ev_data

[['DATE', 'label', 'ORG', 'CARDINAL'],
 [2023, "Christie's 05 July 2023", "Christie's", '84']]

In [21]:
json_line

{'398': {'lotId': '6436574',
  'lotNumber': '84',
  'lotUrl': 'https://www.christies.com/lot/lot-6436574?ldp_breadcrumb=back&intObjectID=6436574&from=salessummary&lid=1',
  'lotTitle': 'AN EGYPTIAN BRONZE CAT',
  'lotSubtitle': 'LATE PERIOD-PTOLEMAIC PERIOD, CIRCA 664-30 B.C.',
  'lotOther': '',
  'lotLastOwner': 'PROPERTY FROM AN IMPORTANT EUROPEAN COLLECTION',
  'lotDescription': 'AN EGYPTIAN BRONZE CAT<br>\nLATE PERIOD-PTOLEMAIC PERIOD, CIRCA 664-30 B.C.<br>\n11 in. (28 cm.) high excl. tenons.<br>\n',
  'lotImage': 'https://www.christies.com/img/lotimages/2023/CKS/2023_CKS_21909_0084_000(an_egyptian_bronze_cat_late_period-ptolemaic_period_circa_664-30_bc022434).jpg?mode=max',
  'lotImageLocalPath': None,
  'lotEstimateLow': '40000.0',
  'lotEstimateHigh': '60000.0',
  'lotWithdrawn': False,
  'lotPrice': '60480.0',
  'lotPriceCurrency': 'GBP',
  'lotSale': "Christie's 05 July 2023",
  'lotReference': "Christie's 05 July 2023 lot 84",
  'lotProvenance': {'provenance_0': 'with Amina S