# GPI Datathon 2025
This notebook is meant to provide some common, Pythonic methods for approaching the GPI data. It may or may not be applicable to other languages or methods, but hopefully can provide a common basis for workflows, and point to common aspects of the data that need cleaning/transformation to be useful for visualization.

## 1. Setup Code

In [None]:
# Basic libraries
import pandas as pd
import numpy as np
import re
import ipywidgets as widgets

In [None]:
# pandas truncates long or wide frames when it displays them. Raise both
# limits far enough that, in practice, every row and column is shown.
pd.set_option('display.max_rows', 500000)
pd.set_option('display.max_columns', 500000)

## 2. Importing Data

### Load CSVs

In [None]:
# File names of the datathon exports, one variable per file. These are bare
# file names, not paths: the read_csv call further down prepends 'data/'.
# The leading YYYY_MM_DD in each name looks like an export date -- TODO confirm.
goupil = '2020_02_18_GoupilDatathon.csv'
# Per-language "INV" and "INFO" exports.
inv_french = '2024_04_29_DatathonFRENCHINV.csv'
inv_dutch = '2024_05_08_DatathonDUTCHINV.csv'
inv_german = '2024_05_08_DatathonGERMANINV.csv'
inv_spanish = '2024_05_09_DatathonSPANINV.csv'
info_french = '2024_05_27_DatathonFRENCHINFO.csv'
info_german = '2024_05_27_DatathonGERMANINFO.csv'
info_dutch = '2024_05_31_DatathonDUTCHINFO.csv'
info_spanish = '2024_05_31_DatathonSPANINFO.csv'
info_inven = '2024_06_05_DatathonINVENINFO.csv'
inventories = '2024_07_11_DatathonINVENTORIES.csv'
# Collectors, prices and the two "XWEB" exports.
collectors = '2025_02_06_DatathonCOLLECTORS.csv'
prices = '2025_02_06_DatathonPRICES.csv'
xwebcolls = '2025_02_10_DatathonXWEBCOLLS.csv'
xwebprof = '2025_02_10_DatathonXWEBPROF.csv'
# Sales exports; the larger ones are split into numbered parts.
# NOTE(review): the part numbers listed here have gaps (no FRENCHSALES-2, no
# GERMANSALES-4 to -6) -- confirm whether those parts exist and were left out
# on purpose.
sales_belgian = '2025_02_18_DatathonBELGSALES.csv'
sales_british1 = '2025_02_18_DatathonBRITISHSALES-1.csv'
sales_british2 = '2025_02_18_DatathonBRITISHSALES-2.csv'
sales_british3 = '2025_02_18_DatathonBRITISHSALES-3.csv'
sales_british4 = '2025_02_18_DatathonBRITISHSALES-4.csv'
sales_british5 = '2025_02_18_DatathonBRITISHSALES-5.csv'
sales_dutch = '2025_02_18_DatathonDUTCHSALES.csv'
sales_french1 = '2025_02_18_DatathonFRENCHSALES-1.csv'
sales_french3 = '2025_02_18_DatathonFRENCHSALES-3.csv'
sales_german1 = '2025_02_18_DatathonGERMANSALES-1.csv'
sales_german2 = '2025_02_18_DatathonGERMANSALES-2.csv'
sales_german3 = '2025_02_18_DatathonGERMANSALES-3.csv'
sales_german7 = '2025_02_18_DatathonGERMANSALES-7.csv'
sales_german8 = '2025_02_18_DatathonGERMANSALES-8.csv'
# This is only the Knoedler file NAME (a str); the loaded DataFrame is
# `knoedler_df`.
knoedler = '2025_02_18_Datathonknoedler.csv'
sales_scandinavian = '2025_02_18_DatathonSCANDISALES.csv'
# Excel (.xlsx) files. The variable suffix is spelled "xslx"; it is left as-is
# in case other cells already refer to these names.
collectors_xslx = '2025_02_06_DatathonCOLLECTORS.xlsx'
prices_xslx = '2025_02_06_DatathonPRICES.xlsx'
xwebcolls_headers_xslx = '2025_02_10_DatathonXWEBCOLLS-HEADERS.xlsx'
xwebcolls_xslx = '2025_02_10_DatathonXWEBCOLLS.xlsx'
xwebprof_xslx = '2025_02_10_DatathonXWEBPROF.xlsx'
sales_contents_by_nation_tracking_sheet_xslx = '2025_02_18_DatathonSales-contents-by-nation-tracking-sheet.xlsx'
sales_scandinavia_xslx = '2025_02_18_DatathonSCANDISALES.xlsx'

In [None]:
# Some of the datasets ship without a header row, so the column names are
# supplied by hand. read_csv assigns them to the file's columns by position, so
# the order here matters. The names are kept verbatim -- several of them carry
# a trailing space -- so that anything looking a column up by its exact name
# keeps working.
knoedler_header = [
    # Record identifiers
    'STAR Record No.', 'PI Record No.', 'Stock Book No.', 'Knoedler Number',
    'Page Number', 'Row Number',
    # Consignment
    'Consign. No.', 'Consign. Name', 'Consign. Auth', 'Consign. Loc.',
    # Artists 1 and 2
    'Artist Name 1', 'Art. Authority 1', 'Nationality 1', 'Attrib Mod 1',
    'Attrib Mod Auth1',
    'Artist Name 2', 'Art. Authority 2', 'Nationality 2', 'Attrib Mod 2',
    'Attrib Mod Auth2',
    # Object description
    'Title', 'Description', 'Subject', 'Genre', 'Object Type', 'Materials',
    'Dimensions',
    # Entry and sale dates
    'Entry Date-Year', 'Entry Date-Month', 'Entry Date-Day',
    'Sale Date-Year', 'Sale Date-Month', 'Sale Date-Day',
    # Purchase and sale amounts
    'Purch. Amount', 'Purch. Currency', 'Purch. Note',
    'KnoedPurch Amt.', 'KnoedPurch Curr.', 'KnoedPurch Note',
    'Price Amount', 'Price Currency', 'Price Note',
    'KnoedSale Amt.', 'KnoedSale Curr. ', 'KnoedSale Note',
    # Sellers 1-3
    'Seller Name 1', 'Seller Loc 1', 'Sell Auth Name 1', 'Sell Auth Loc 1',
    'Sell Auth Mod 1',
    'Seller Name 2', 'Seller Loc 2', 'Sell Auth Name 2', 'Sell Auth Loc 2',
    'Sell Auth Mod 2',
    'Seller Name 3 ', 'Seller Loc 3 ', 'Sell Auth Name 3 ', 'Sell Auth Loc 3 ',
    'Sell Auth Mod 3 ',
    # Joint owners 1-3
    'Joint Own 1', 'Joint Own Auth 1 ', 'Joint Own Sh 1',
    'Joint Own 2', 'Joint Own Auth 2 ', 'Joint Own Sh 2',
    'Joint Own 3', 'Joint Own Auth 3 ', 'Joint Own Sh 3',
    'Transaction',
    # Buyers 1 and 2
    'Buyer Name 1', 'Buyer Loc 1', 'Buy Auth Name 1', 'Buy Auth Addr 1',
    'Buy Auth Mod 1',
    'Buyer Name 2', 'Buyer Loc 2', 'Buy Auth Name 2', 'Buy Auth Addr 2',
    'Buy Auth Mod 2',
    'Folio',
    # Previous owners 1-9
    'prev own 1', 'prev own auth1 ', 'prev own loc1',
    'prev own 2', 'prev own auth2 ', 'prev own loc2',
    'prev own 3', 'prev own auth3 ', 'prev own loc3',
    'prev own 4', 'prev own auth4 ', 'prev own loc4',
    'prev own 5', 'prev own auth5 ', 'prev own loc5',
    'prev own 6', 'prev own auth6 ', 'prev own loc6',
    'prev own 7', 'prev own auth7 ', 'prev own loc7',
    'prev own 8', 'prev own auth8 ', 'prev own loc8',
    'prev own 9', 'prev own auth9 ', 'prev own loc9',
    # Later owner and present location
    'post owner', 'post own auth ',
    'Present Loc Geog', 'Present Loc Inst', 'Present Loc Acc',
    'Present Loc Note',
    # Notes and headings
    'Working Note', 'Verbatim Notes', 'Rosetta Handle', 'Main Heading',
    'Subheading', 'Flag',
]

# Read the headerless Knoedler CSV from the data/ folder, naming its columns
# from knoedler_header. low_memory=False makes pandas parse the file in one
# pass instead of in internal chunks, so every column gets a single,
# consistently inferred dtype (at the cost of more memory while parsing).
knoedler_df = pd.read_csv(f'data/{knoedler}', names=knoedler_header, low_memory=False)

In [None]:
# Preview the first five rows to eyeball whether the hand-written column names
# line up with the values underneath them.
knoedler_df.head(5)

In [None]:
# (number of rows, number of columns) of the loaded frame.
knoedler_df.shape

### Reload as one dataframe

In [None]:
# `knoedler` is only the CSV file name (a str); the loaded table is
# `knoedler_df`. Work on a copy so that the column edits made further down
# (filled NaNs, added country columns) leave `knoedler_df` untouched.
data = knoedler_df.copy()

## 3. Data Visualization

### Sankey Chart

This is a classic visualization, named after Matthew Sankey, who used it in 1898 to show the energy efficiency of a steam engine. The coolest example of the form remains Charles Minard's 1869 visualization of Napoleon's march into and out of Russia, which actually predates Sankey's own diagram. The major requirement of this kind of visualization is a "source" column and a "target" column, plus a numeric "value" that sets the width of each flow.
![Minard](https://upload.wikimedia.org/wikipedia/commons/2/29/Minard.png "Minard Diagram")

In [None]:
import holoviews as hv
import plotly.graph_objects as go

# NaN values create problems for almost any function that expects a string, so
# we replace them. A bonus is that all unknown buyers and all unknown sellers
# end up gathered into one category each.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on data[...]: that form is chained assignment, which
# pandas 2.x flags with a FutureWarning and which no longer updates `data`
# once Copy-on-Write is enabled.
data['Buy Auth Addr 1'] = data['Buy Auth Addr 1'].fillna('Unknown Buyer')
data['Sell Auth Loc 1'] = data['Sell Auth Loc 1'].fillna('Unknown Seller')

# We need to assign each buyer and seller a country.
# Another method would be to run the whole string through a geotagging API like Geopy.
# That becomes impossible when we are dealing with 40k+ rows.
# Instead, we regex from the "Buy Auth Addr 1" and "Sell Auth Loc 1" columns.
def actor_country(string):
    """Return the part of *string* that follows its last comma separator.

    The separator is a comma followed by one whitespace character, so
    "New York, NY, USA" yields "USA". A string that contains no such
    separator (for example "Unknown Buyer" or an empty string) is returned
    unchanged.

    Args:
        string: Address or location text. Must be a str; NaN values have to be
            filled in before calling this.

    Returns:
        The trailing segment of the text, or the input itself when there is
        nothing to split on.
    """
    # Raw strings: in a plain literal "\s" is an invalid escape sequence, which
    # Python 3.12+ reports as a SyntaxWarning. The greedy first group makes the
    # match split on the LAST separator in the text.
    output = re.sub(r"(.*)(,\s)(.*$)", r"\3", string)
    return output

# Derive a country for each side of the transaction by running every address
# string through actor_country.
data['buyer_country'] = data['Buy Auth Addr 1'].apply(actor_country)
data['seller_country'] = data['Sell Auth Loc 1'].apply(actor_country)

In [None]:
# Drawing the whole dataset is computing-intensive, so keep only the first 50
# rows for the chart below.
data = data.iloc[:50]

In [None]:
# Show the first ten rows of the edge table.
# NOTE: `edges` is only created in the cell that follows this one, so on a
# fresh kernel this cell raises NameError unless that cell has been run first.
edges.head(10)

In [None]:
# A Sankey diagram is driven by three parallel columns: source, target and a
# (usually numeric) value. Pull the two country columns out of `data`, zip them
# back together into a fresh dataframe, and give each row a 'value' of 1 so
# that every transaction contributes one unit of flow.
# NOTE(review): links are not aggregated here -- identical source/target pairs
# stay separate links of weight 1.
source = data['buyer_country']
target = data['seller_country']
edges = pd.DataFrame(zip(source, target), columns=['source', 'target'])
edges['value'] = 1

#--------------------------------------
# The Sankey trace refers to nodes by integer position rather than by name, so
# each distinct country string needs a numeric code.
# The de-duplicated names double as the node 'label' list passed to plotly.
# NOTE(review): ravel('K') reads the values in memory order; this appears to
# yield all sources followed by all targets -- confirm if node order matters.
unique_source_target = list(pd.unique(edges[['source', 'target']].values.ravel('K')))
# Lookup table: country name -> position in the label list.
mapping_dict = dict(zip(unique_source_target, range(len(unique_source_target))))
# Replace the names in both columns with their numeric codes.
edges = edges.assign(
    source=edges['source'].map(mapping_dict),
    target=edges['target'].map(mapping_dict),
)
# plotly wants plain lists, one per column.
links_dict = edges.to_dict(orient='list')

#---------------------------------------
# Sankey diagram: nodes are the country labels, links are the coded edges.
fig = go.Figure(
    data=[
        go.Sankey(
            node={
                "pad": 15,
                "thickness": 20,
                "line": {"color": "black", "width": 0.5},
                "label": unique_source_target,
            },
            link={
                "source": links_dict["source"],
                "target": links_dict["target"],
                "value": links_dict["value"],
            },
        )
    ]
)

sankey = fig.update_layout(
    title_text="Knoedler Sankey Diagram",
    font_size=10,
    width=1000,
    height=600,
)
sankey