## Install Packages

In [1]:
!pip install -U selenium
!apt update
!apt install chromium-chromedriver
!pip install python-terrier

import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
import pyterrier as pt
from pyterrier.measures import *

import warnings

# SettingWithCopyWarning lives in the public ``pandas.errors`` module since
# pandas 1.5; the private ``pandas.core.common`` path only works on older
# releases. Try the public location first, fall back to the legacy one, and
# skip the filter entirely if this pandas version no longer defines the class.
try:
    from pandas.errors import SettingWithCopyWarning
except ImportError:
    try:
        from pandas.core.common import SettingWithCopyWarning
    except ImportError:
        SettingWithCopyWarning = None

if SettingWithCopyWarning is not None:
    warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting urllib3[socks]~=1.26
  Using cached urllib3-1.26.13-py2.py3-none-any.whl (140 kB)
Installing collected packages: urllib3
  Attempting uninstall: urllib3
    Found existing installation: urllib3 1.25.11
    Uninstalling urllib3-1.25.11:
      Successfully uninstalled urllib3-1.25.11
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
requests 2.23.0 requires urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1, but you have urllib3 1.26.13 which is incompatible.[0m
Successfully installed urllib3-1.26.13
Hit:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease
Hit:2 http://archive.ubuntu.com/ubuntu bionic InRelease
Hit:3 http://archive.ubuntu.com/ubuntu bionic-updates InRelease
Hit:4 http://security.ubuntu.com/ubuntu bionic-security InRelease
Hit:

## Data Preprocessing

### Load The Metropolitan Museum of Art Open Access CSV

In [2]:
# The Metropolitan Museum of Art Open Access data: https://metmuseum.github.io/
# Reads the full MetObjects.csv straight from the URL below each time this cell runs.
df = pd.read_csv('https://media.githubusercontent.com/media/metmuseum/openaccess/master/MetObjects.csv')
# Alternative: load a local copy of the same file instead of downloading it.
# df = pd.read_csv('MetObjects.csv', index_col=0)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
df.head()  # 477804 rows × 54 columns

Unnamed: 0,Object Number,Is Highlight,Is Timeline Work,Is Public Domain,Object ID,Gallery Number,Department,AccessionYear,Object Name,Title,...,River,Classification,Rights and Reproduction,Link Resource,Object Wikidata URL,Metadata Date,Repository,Tags,Tags AAT URL,Tags Wikidata URL
0,1979.486.1,False,False,False,1,,The American Wing,1979.0,Coin,One-dollar Liberty Head Coin,...,,,,http://www.metmuseum.org/art/collection/search/1,,,"Metropolitan Museum of Art, New York, NY",,,
1,1980.264.5,False,False,False,2,,The American Wing,1980.0,Coin,Ten-dollar Liberty Head Coin,...,,,,http://www.metmuseum.org/art/collection/search/2,,,"Metropolitan Museum of Art, New York, NY",,,
2,67.265.9,False,False,False,3,,The American Wing,1967.0,Coin,Two-and-a-Half Dollar Coin,...,,,,http://www.metmuseum.org/art/collection/search/3,,,"Metropolitan Museum of Art, New York, NY",,,
3,67.265.10,False,False,False,4,,The American Wing,1967.0,Coin,Two-and-a-Half Dollar Coin,...,,,,http://www.metmuseum.org/art/collection/search/4,,,"Metropolitan Museum of Art, New York, NY",,,
4,67.265.11,False,False,False,5,,The American Wing,1967.0,Coin,Two-and-a-Half Dollar Coin,...,,,,http://www.metmuseum.org/art/collection/search/5,,,"Metropolitan Museum of Art, New York, NY",,,


In [4]:
df.columns

Index(['Object Number', 'Is Highlight', 'Is Timeline Work', 'Is Public Domain',
       'Object ID', 'Gallery Number', 'Department', 'AccessionYear',
       'Object Name', 'Title', 'Culture', 'Period', 'Dynasty', 'Reign',
       'Portfolio', 'Constituent ID', 'Artist Role', 'Artist Prefix',
       'Artist Display Name', 'Artist Display Bio', 'Artist Suffix',
       'Artist Alpha Sort', 'Artist Nationality', 'Artist Begin Date',
       'Artist End Date', 'Artist Gender', 'Artist ULAN URL',
       'Artist Wikidata URL', 'Object Date', 'Object Begin Date',
       'Object End Date', 'Medium', 'Dimensions', 'Credit Line',
       'Geography Type', 'City', 'State', 'County', 'Country', 'Region',
       'Subregion', 'Locale', 'Locus', 'Excavation', 'River', 'Classification',
       'Rights and Reproduction', 'Link Resource', 'Object Wikidata URL',
       'Metadata Date', 'Repository', 'Tags', 'Tags AAT URL',
       'Tags Wikidata URL'],
      dtype='object')

### Web Scraping

In [5]:
# Reference: 
# Pfalzgraf, Bryan. “How to Use Selenium to Web-Scrape with Example.” Medium, Towards Data Science, 29 Apr. 2020, https://towardsdatascience.com/how-to-use-selenium-to-web-scrape-with-example-80f9b23a843a. 

def driversetup():
    """Create and return a headless Chrome WebDriver.

    Chrome is launched with the switches listed in ``chrome_flags`` (added in
    that order). Right after start-up a script is executed that redefines
    ``navigator.webdriver`` so that reading it yields ``undefined``.
    """
    chrome_flags = (
        '--headless',                 # run Selenium without a visible window
        '--no-sandbox',
        '--disable-dev-shm-usage',    # overcome limited resource problems
        "lang=en",
        "start-maximized",            # open the browser in maximized mode
        "disable-infobars",           # disable infobars
        "--disable-extensions",       # disable extensions
        "--incognito",
        "--disable-blink-features=AutomationControlled",
    )

    browser_options = webdriver.ChromeOptions()
    for flag in chrome_flags:
        browser_options.add_argument(flag)

    browser = webdriver.Chrome(options=browser_options)
    browser.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined});")
    return browser

In [6]:
# Single browser instance shared by every get_desc() call.
driver = driversetup()


def get_desc(url):
    """Return the introductory description text found on an artwork page.

    Loads ``url`` in the module-level ``driver`` and reads the text of the
    element whose class is ``artwork__intro__desc``.

    Parameters
    ----------
    url : str
        Address of the page to open.

    Returns
    -------
    str or float
        The element's text, or ``np.nan`` when the page has no such element.
    """
    driver.get(url)
    try:
        element = driver.find_element(By.CLASS_NAME, "artwork__intro__desc")
        desc = element.text
    except NoSuchElementException:
        # The ``np.NaN`` alias was removed in NumPy 2.0; ``np.nan`` is the
        # spelling that exists in every NumPy version.
        desc = np.nan
    return desc

In [7]:
df['Object Name'].value_counts().head(20)

Print                                   99439
Photograph                              28458
Drawing                                 25788
Book                                    13393
Fragment                                 9566
Kylix fragment                           8927
Piece                                    8630
Painting                                 5932
Negative                                 5928
Bowl                                     3617
Vase                                     3162
Figure                                   3030
Dress                                    2566
Baseball card                            2505
Baseball card, print                     2463
Plate                                    2230
Books Prints Ornament & Architecture     2127
Baseball card, photograph                2099
Ensemble                                 2090
Medal                                    1943
Name: Object Name, dtype: int64

In [8]:
# One subset of the catalogue per object type that is scraped below
# (row counts are the ones observed when the notebook was run).
photograph = df.loc[df['Object Name'].eq('Photograph')]  # 28458 rows × 54 columns
drawing = df.loc[df['Object Name'].eq('Drawing')]  # 25788 rows × 54 columns
painting = df.loc[df['Object Name'].eq('Painting')]  # 5932 rows × 54 columns
figure = df.loc[df['Object Name'].eq('Figure')]  # 3030 rows × 54 columns

In [9]:
# # Repeat the same for other object types
# from time import sleep
# from tqdm import tqdm
# drawing_link = list(drawing['Link Resource'])
# drawing_desc = []
# for i in tqdm(range(25788)):
#     drawing_desc.append(get_desc(drawing_link[i]))

# drawing_desc =np.array([drawing_link, drawing_desc])
# # transpose the Dataframe				 
# drawing_desc = pd.DataFrame(data=[drawing_desc[0],drawing_desc[1]]).T 
# drawing_desc.columns=['Link Resource','artwork__intro__desc']
# drawing_desc.to_csv('drawing_desc.csv') 

Load saved scraped content

In [10]:
# Each *_desc.csv is a saved scraping result for one object type: the page
# link plus the description text read from it (null where none was found).
photograph_desc = pd.read_csv('data/photograph_desc.csv',  index_col=0) # 28458 rows × 2 columns
# photograph_desc[photograph_desc['artwork__intro__desc'].notnull()] # 10916 rows × 2 columns
# photograph_desc.duplicated().any()

drawing_desc = pd.read_csv('data/drawing_desc.csv',  index_col=0) # 25788 rows × 2 columns
# drawing_desc[drawing_desc['artwork__intro__desc'].notnull()] # 6704 rows × 2 columns

painting_desc = pd.read_csv('data/painting_desc.csv',  index_col=0) # 5932 rows × 2 columns
# painting_desc[painting_desc['artwork__intro__desc'].notnull()] # 3195 rows × 2 columns

figure_desc = pd.read_csv('data/figure_desc.csv',  index_col=0) # 3030 rows × 2 columns
# figure_desc[figure_desc['artwork__intro__desc'].notnull()] # 856 rows × 2 columns

# Stack the four link/description tables and attach the catalogue metadata
# of the matching 'Link Resource' to every scraped row.
link_desc = pd.concat([photograph_desc, drawing_desc, painting_desc, figure_desc]) # 63209 rows × 2 columns
data = link_desc.merge(df, on='Link Resource', how='left') # 63209 rows × 55 columns

# Keep only artworks that actually have a description. The explicit copy makes
# data_w_desc an independent frame, so assigning columns to it does not write
# into a slice of `data` (the cause of SettingWithCopyWarning).
data_w_desc = data[data['artwork__intro__desc'].notnull()].copy() # 21672 rows × 55 columns

In [11]:
data_w_desc.duplicated().any()

False

### Preprocessing

In [12]:
data_w_desc.columns

Index(['Link Resource', 'artwork__intro__desc', 'Object Number',
       'Is Highlight', 'Is Timeline Work', 'Is Public Domain', 'Object ID',
       'Gallery Number', 'Department', 'AccessionYear', 'Object Name', 'Title',
       'Culture', 'Period', 'Dynasty', 'Reign', 'Portfolio', 'Constituent ID',
       'Artist Role', 'Artist Prefix', 'Artist Display Name',
       'Artist Display Bio', 'Artist Suffix', 'Artist Alpha Sort',
       'Artist Nationality', 'Artist Begin Date', 'Artist End Date',
       'Artist Gender', 'Artist ULAN URL', 'Artist Wikidata URL',
       'Object Date', 'Object Begin Date', 'Object End Date', 'Medium',
       'Dimensions', 'Credit Line', 'Geography Type', 'City', 'State',
       'County', 'Country', 'Region', 'Subregion', 'Locale', 'Locus',
       'Excavation', 'River', 'Classification', 'Rights and Reproduction',
       'Object Wikidata URL', 'Metadata Date', 'Repository', 'Tags',
       'Tags AAT URL', 'Tags Wikidata URL'],
      dtype='object')

In [13]:
# Derive the document id and normalise two metadata columns.
# `assign` returns a new frame, so nothing is written into a slice of `data`.
data_w_desc = data_w_desc.assign(
    **{
        # NOTE(review): the BM25 search output in this notebook shows docnos
        # such as 'd438551.0', i.e. 'Object ID' is a float at this point.
        # Casting to an integer type first would give 'd438551', but existing
        # annotation files may already use the '.0' form — confirm before
        # changing the format.
        'docno': 'd' + data_w_desc['Object ID'].astype(str),
        # Render the year as text without a float suffix. Only a trailing '.0'
        # is removed, and missing years stay missing instead of becoming the
        # literal string 'nan' (which concat() would otherwise treat as text).
        'AccessionYear': (
            data_w_desc['AccessionYear']
            .astype(str)
            .str.replace(r'\.0$', '', regex=True)
            .where(data_w_desc['AccessionYear'].notna())
        ),
        # Placeholder-only nationality values carry no information.
        'Artist Nationality': data_w_desc['Artist Nationality'].replace([' ', ' | '], np.nan),
    }
)

In [14]:
# Save the preprocessed frame (all columns, without the index) to df.csv.
data_w_desc.to_csv("df.csv",index=False)

#### Docs for models with 'artwork__intro__desc'

In [15]:
# Two-column document table (docno, text) whose text is the scraped
# description only. Built as one chain so no frame is mutated in place.
docs_df_desc = (
    data_w_desc.loc[:, ['docno', 'artwork__intro__desc']]
    .rename(columns={'artwork__intro__desc': 'text'})
    .reset_index(drop=True)
)
docs_df_desc.to_csv("docs_df_desc.csv", index=False)

# Preview goes last: only a cell's final expression is displayed, so a
# head() call in the middle of the cell is silently discarded.
docs_df_desc.head(5)

#### Docs for multivariable models

In [16]:
def concat(row):
    """Join the non-empty metadata fields of one artwork row into one string.

    The fields in ``field_names`` are visited in order. A value is skipped
    when it is null or equal to a single space; every other value is converted
    with ``str`` and followed by a '.'. No separator is added between fields.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by the field names (e.g. a DataFrame row).

    Returns
    -------
    str
        The concatenated text, or '' when every field is skipped.
    """
    field_names = (
        'artwork__intro__desc', 'Department', 'AccessionYear', 'Object Name',
        'Title', 'Culture', 'Period', 'Portfolio', 'Artist Display Name',
        'Artist Display Bio', 'Artist Nationality', 'Object Date', 'Medium',
        'Credit Line', 'City', 'County', 'Country', 'Region', 'River',
        'Classification', 'Tags',
    )
    pieces = []
    for name in field_names:
        value = row[name]
        if pd.isnull(value) or value == ' ':
            continue
        pieces.append(str(value) + '.')
    return ''.join(pieces)

In [17]:
# Add the concatenated multi-field text as a 'text' column. `assign` returns
# a new frame, so nothing is written into a slice of `data`.
data_w_desc = data_w_desc.assign(text=data_w_desc.apply(concat, axis=1))

# Two-column document table (docno, text) for the multivariable models.
docs_df = data_w_desc.loc[:, ['docno', 'text']].reset_index(drop=True)
docs_df.to_csv("docs_df.csv", index = False)

# Preview goes last: only a cell's final expression is displayed, so a
# head() call in the middle of the cell is silently discarded.
docs_df.head(5)

## Index creation

In [18]:
 if not pt.started():
    pt.init()


PyTerrier 0.9.1 has loaded Terrier 5.7 (built by craigm on 2022-11-10 18:30) and terrier-helper 0.0.7



In [19]:
# Directory for the index built from the description-only documents.
index_dir_desc = './metdocs_index_desc'
# NOTE(review): overwrite=True presumably replaces an index already present at
# that path — confirm against the PyTerrier DFIndexer documentation.
indexer_desc = pt.DFIndexer(index_dir_desc, overwrite=True)
# Index the 'text' column; the 'docno' column is passed alongside it.
index_ref_desc = indexer_desc.index(docs_df_desc["text"], docs_df_desc["docno"])
# index_ref.toString()
# Open the index from the reference returned by the indexer.
index_desc = pt.IndexFactory.of(index_ref_desc)

In [20]:
print(index_desc.getCollectionStatistics().toString())

Number of documents: 21672
Number of terms: 32126
Number of postings: 1350367
Number of fields: 0
Number of tokens: 1651899
Field names: []
Positions:   false



## Data annotation with BM25

In [21]:
bm25_desc = pt.BatchRetrieve(index_desc, wmodel="BM25")   

In [22]:
# Ad-hoc query against the description-only BM25 retriever; keep the first
# 100 result rows and show only the columns needed for annotation.
search = bm25_desc.search("Who was influenced by Claude Monet").head(100)
search[['rank','docno','score','query']]

Unnamed: 0,rank,docno,score,query
0,0,d438551.0,30.732890,Who was influenced by Claude Monet
1,1,d11936.0,29.147936,Who was influenced by Claude Monet
2,2,d19523.0,25.450963,Who was influenced by Claude Monet
3,3,d335611.0,24.665922,Who was influenced by Claude Monet
4,4,d11480.0,24.365300,Who was influenced by Claude Monet
...,...,...,...,...
95,95,d437941.0,10.266216,Who was influenced by Claude Monet
96,96,d437481.0,10.264567,Who was influenced by Claude Monet
97,97,d436329.0,10.253823,Who was influenced by Claude Monet
98,98,d437426.0,10.104696,Who was influenced by Claude Monet


## Baseline models

In [23]:
# Annotated queries (topics) and relevance judgements (qrels) that are passed
# to pt.Experiment below.
topics = pd.read_csv("data/Annotation - Query.csv")
qrels = pd.read_csv("data/Annotation - Evaluation.csv")

In [24]:
# One retriever per weighting model, all over the description-only index.
tf = pt.BatchRetrieve(index_desc, wmodel="Tf")
bm25 = pt.BatchRetrieve(index_desc, wmodel="BM25")
tfidf = pt.BatchRetrieve(index_desc, wmodel="TF_IDF")
pl2 = pt.BatchRetrieve(index_desc, wmodel="PL2")
# BM25 composed (>>) with a feature union (**) of Tf and PL2.
# NOTE(review): in the experiment results this pipeline has exactly the same
# map/ndcg/ndcg_cut_10 as plain BM25 while its mrt is several times higher —
# presumably because nothing downstream uses the added features to re-rank;
# confirm whether a re-ranking stage was intended.
pipeline = bm25 >> (tf ** pl2)

In [25]:
# Evaluate every retriever on the annotated topics/qrels and show one row per
# system. NOTE(review): "mrt" is presumably PyTerrier's mean response time
# measure rather than an effectiveness metric — confirm in the PyTerrier docs.
pt.Experiment(
    [tf, bm25, tfidf, pl2, pipeline],
    topics,
    qrels,
    eval_metrics=["map", "ndcg", "ndcg_cut_10", "mrt"],
)

Unnamed: 0,name,map,ndcg,ndcg_cut_10,mrt
0,BR(Tf),0.513812,0.681429,0.502862,15.315088
1,BR(BM25),0.844051,0.84396,0.791426,13.405924
2,BR(TF_IDF),0.842302,0.843766,0.791584,11.712242
3,BR(PL2),0.841602,0.843863,0.792883,12.009766
4,"Compose(BR(BM25), FUnion(BR(Tf), BR(PL2)))",0.844051,0.84396,0.791426,65.110237
