# Data Merge & Cleaning
In this notebook, I will merge all data and check what's missing. 
I'll also use this section to figure out what kind of preprocessing will be necessary and how to extract text information.

In [1]:
import pandas as pd
import numpy as np

import pickle

In [2]:
harvard1 = pd.read_pickle('PKL/raw_data_Harvard_1.pkl')

In [3]:
harvard2 = pd.read_pickle('PKL/raw_data_Harvard_3.pkl')

In [4]:
risd1 = pd.read_pickle('PKL/raw_data_RISD_1.pkl')

In [5]:
risd2 = pd.read_pickle('PKL/raw_data_RISD_2.pkl')

In [6]:
moma = pd.read_csv('DATA/MoMA_data.csv')

## Harvard Museum
---
Let's look at the Harvard data first.

In [7]:
harvard = pd.concat([harvard1, harvard2], axis = 0)

In [8]:
# remove duplicates
harvard = harvard.drop_duplicates(subset = ['id'])

### missing image
Drop if it's missing image url.

In [9]:
harvard = harvard.dropna(subset = ['primaryimageurl'])

### Feature Consolidation
I'll consolidate scattered info into ...
1. period
2. culture
3. medium
4. title
5. description
6. palette
7. date
8. name of the artist

### Unnesting Subcategories
There are sub-categories, which I'll unnest first.

Images, worktypes, colors, people are dictionary nested in the list. We'll need to extract information from them.

In [10]:
def extract_info(x, name):
    ''' 
    INPUT: a list of dictionary, key name
    OUTPUT: key value, consolidated as one string if many
    '''
    if isinstance(x, list):
        if len(x) == 1:
            return x[0][name]
        else:
            inst = []
            for i in range(len(x)):
                val = x[i][name]
                if val not in inst and val != None:
                    inst.append(val)
            if len(inst) > 1:
                return ', '.join(inst)
            elif len(inst) == 1:
                return inst[0]
            else:
                return None
    else:
        return None

In [11]:
# for image info
imageinfo = ['description', 'alttext', 'publiccaption']
for item in imageinfo: 
    harvard[f'img_{item}'] = harvard['images'].apply(lambda x: extract_info(x, item))

In [12]:
# worktype
harvard['worktype'] = harvard['worktypes'].apply(lambda x: extract_info(x, 'worktype'))

In [13]:
# color
harvard['color'] = harvard['colors'].apply(lambda x: extract_info(x, 'hue'))

In [14]:
# for artist, do the same but only take the artist roles
def extract_info_artist(x, name):
    if isinstance(x, list):
        if len(x) == 1:
            return x[0][name]
        else:
            inst = []
            for i in range(len(x)):
                if x[i]['role'] == 'Artist':
                    val = x[i][name]
                    if val not in inst and val != None:
                        inst.append(val)
            if len(inst) > 1:
                return ', '.join(inst)
            elif len(inst) == 1:
                return inst[0]
            else:
                return None
    else:
        return None

In [15]:
harvard['artist'] = harvard['people'].apply(lambda x: extract_info(x, 'displayname'))

In [16]:
harvard = harvard.drop(['images', 'worktypes', 'colors', 'people'], axis = 1)

### Combining columns
I'll combine columns in this manner.
1. period: if datebegin is 0 use century
2. description: combine everything: 'style', 'commentary', 'description', 'labeltext', 'img_description', 'img_alttext', 'img_publiccaption'

#### Period

In [17]:
harvard['century'] = np.where(harvard['century'].isnull(), harvard['dated'], harvard['century'])
harvard['period'] = np.where(harvard['period'].isnull(), harvard['century'], harvard['period'])
harvard['period'] = np.where(harvard['datebegin'] == 0, harvard['period'], harvard['datebegin'])

#### Description

In [18]:
desc_list = ['style', 'commentary', 'description', 'labeltext', 'img_description', 'img_alttext', 'img_publiccaption']
for c in desc_list:
    harvard[c] = harvard[c].fillna('')

In [19]:
harvard['all_description'] = harvard[['style', 'commentary', 'description', 'labeltext', 
         'img_description', 'img_alttext', 'img_publiccaption']].agg(' '.join, axis=1)

In [20]:
harvard = harvard[['id', 'period', 'medium', 'title', 'culture', 'color', 'artist', 'all_description']]

In [21]:
harvard['source'] = 'harvard'

## MOMA
---
Now similar steps for MOMA data

### Missing image
drop missing images

In [22]:
moma = moma.dropna(subset = ['ThumbnailURL'])
moma = moma[moma.Classification == 'Painting']

In [23]:
moma['source'] = 'moma'

In [24]:
col_list = ['ObjectID', 'Title', 'Artist', 'Nationality', 'Date', 'Medium', 'source']
moma = moma[col_list]

## RISD
---

In [25]:
risd = pd.concat([risd1, risd2], axis = 0)

### Missing images
Remove if the image is missing.

In [26]:
risd = risd[[len(x) > 0 for x in risd['images']]]

### Location
Get the primary maker's location.

In [27]:
def get_nationality(x):
    '''
    given a dictionaries nested in a list, 
    return the first person's nationality
    '''
    if x:
        ind = list(x)[0]['nationality']
        if ind: 
            return ind[0]
        else: 
            return None


In [28]:
risd['nationality'] = risd.makers.apply(lambda x: get_nationality(x))

### Consolidating info
1. only paintings or other art-like work on paper 
2. culture: culture if empty, place, if empty nationality

In [29]:
# subsetting column
col_list = ['id', 'culture', 'dating', 'description', 'mediumTechnique', 'place', 'primaryMaker',  
            'title', 'type', 'nationality']
risd = risd[col_list]

In [30]:
# filter to only paintings
art_list = ['Paintings']
risd = risd[[any(item in x[0] for item in art_list) for x in risd['type']]]

#### Missing values
RISD datasets don't have missing values. They just have empty values. Let's change them.

In [31]:
risd = risd.replace('', np.nan)

#### Culture

In [32]:
# culture
risd['place'] = np.where(risd['place'].isnull(), risd['nationality'], risd['place'])
risd['culture'] = np.where(risd['culture'].isnull(), risd['place'], risd['culture'])
risd = risd.drop(['place', 'nationality'], axis = 1)

In [33]:
risd = risd.drop(['type'], axis = 1)

In [34]:
risd['source'] = 'risd'

# Purging
Now check if there is an actual matching image file, if not drop it.

In [35]:
import os

In [36]:
harvard_img_list = os.listdir('IMAGES/HARVARD')

In [37]:
def check_image(x, list_):
    '''
    return the filename if the file exists in list_
    otherwise np.nan
    '''
    fn = f'{x}.jpg' 
    if fn in list_:
        return fn
    else: return np.nan

In [38]:
harvard['image'] = harvard['id'].map(lambda x: check_image(x, harvard_img_list))

In [39]:
moma_img_list = os.listdir('IMAGES/MOMA')
moma['image'] = moma['ObjectID'].map(lambda x: check_image(x, moma_img_list))

In [40]:
def check_image_risd(x, list_):
    '''
    return the filename if the file exists in list_
    otherwise np.nan
    '''
    fn = f'risd_{x}.jpg' 
    if fn in list_:
        return fn
    else: return np.nan

In [41]:
risd_img_list = os.listdir('IMAGES/RISD')
risd['image'] = risd['id'].map(lambda x: check_image_risd(x, risd_img_list))

# Merging
now let's merge all dataframes

In [42]:
# first making consistent column names

In [43]:
harvard = harvard.rename(columns={'all_description': 'description'})

In [44]:
moma = moma.rename(columns = {'ObjectID': 'id', 'Title': 'title', 'Artist': 'artist', 'Nationality': 'culture', 
                             'Date': 'period', 'Medium': 'medium'})

In [45]:
risd = risd.rename(columns = {'dating': 'period', 'mediumTechnique': 'medium', 'primaryMaker': 'artist'})

In [46]:
full_df = pd.concat([harvard, risd, moma], ignore_index = True, sort = False)

In [47]:
full_df = full_df[~full_df.image.isnull()]

In [49]:
# reset index
full_df = full_df.reset_index()

In [50]:
full_df.to_pickle('PKL/merged_artworks_data.pkl')

## Subset
I'm going to filter the dataset to include only recent art where abstraction started to be incorporated in art. 

In [52]:
import re

In [53]:
# remove all non-digits
tmp = [re.sub('th', '00', str(x)) for x in full_df.period]

# for later, period names without numbers
# list of [x for x in tmp if re.match('^\D*$', x)]

# remove all non-digits
tmp = [re.sub('[^\d-]', '', str(x)) for x in tmp]
# remove all number after '-'
tmp = [re.sub('-.*', '', str(x)) for x in tmp]

# turn them into numbers
tmp = [int(x) if x else 0 for x in tmp]

In [54]:
tmp = [int(str(x)[0:4]) if x > 2030 else x for x in tmp ]

In [55]:
tmp = np.array(tmp)

In [56]:
# upto 1800, combine all century, after that, go by decades
cond = [(tmp < 1300) | (tmp > 2030), 
       tmp < 1900]
val = [0, (tmp//100)*100]
full_df['period'] = np.select(cond, val, (tmp//10)*10)

In [57]:
full_df = full_df[full_df['period'] >= 1900]

In [58]:
full_df.to_pickle('PKL/merged_artworks_recent.pkl')

# Captions
---
Now we need data with captions. The goal is to get a series of sentences that describe an image.

## Mediums

For mediums, I'll do following cleaning.
1. remove all texts in parentheses 
2. remove all texts following 'reading' (refers to specific signature)
3. if words are too long (e.g. 30), cut last words
4. change semi-colons to 'and'
5. remove \r\n

In [59]:
def clean_medium_text(text):
    if isinstance(text, str):
        # remove texts in parentheses
        text = re.sub('\(.*\)', '', text)
        # remove after 'reading'
        text = re.sub('reading.*', '', text)
        # change semi-colons
        text = re.sub(';', ' and ', text)
        # remove alphabet starts with \
        text = re.sub('\[a-z]{1}', '', text)
        # if longer than 30, remove the last part 
        if len(text.split()) > 30:
            text = ' '.join(text.split()[0:30])
            
        return ' '.join(text.split()) # clean up whitespae
    else: 
        return None

In [60]:
full_df['medium'] = full_df['medium'].apply(lambda x: clean_medium_text(x))

## Title
For titles, I'll run these cleaning steps. One of the biggest problem is that the title includes a lot of proper nouns that can meaninglessly increase the dimensions. So based on our data, I'll try to create a dicitonary that filters out the proper nouns and change to appropriate pronoun.

1. if the text format is person's name + years, don't include. e.g. texts(optional text 4 digits-4 digits), change it to a portrait of a person
2. remove texts in parentheses
3. change colons to 'about'
4. remove texts after 'replica'
5. removed texts after 'Identified'
6. remove digits and special characters

In [61]:
import string
punctuations = string.punctuation + string.digits
table_ = str.maketrans('', '', punctuations)

def clean_title_text(text):
    if isinstance(text, str):
        text = re.sub('.*\(\d{4}\-\d{4}\)', 'portrait of a person', text)
        text = re.sub('\(.*\)', '', text)
        text = re.sub(':', ' about ', text)
        text = re.sub('replica.*', '', text)
        text = re.sub('Identified.*', '', text)
        # remove alphabet starts with \
        text = re.sub('\[a-z]{1}', '', text)
        text = str.translate(text, table_)
        # if longer than 30, remove the last part 
        if len(text.split()) > 30:
            text = ' '.join(text.split()[0:30])
            
        return ' '.join(text.split()) # clean up whitespae
    else: 
        return None

In [212]:
tmp = full_df['title'].apply(lambda x: clean_title_text(x))

There are a bit too many unique words, especially because art title does not have any standard. This could be problematic as it can lead to too big of dimensions. I'll try to reduce the dimension using similarity measure.

In [64]:
import spacy
import en_core_web_lg
from collections import Counter

In [76]:
# loading the corpus 
nlp = en_core_web_lg.load()

For each word that only appears once, find the closest word in the corpus

In [213]:
tmp = [x.lower() for x in tmp]

In [214]:
def consolidate_words(textlist, thresh = .8):
    '''
    Takes a list of texts to run iteration
    Change word that occurs only once to similar word in the text
    Return consolidated list 
    '''
    list_of_text = textlist.copy()
    wordcounts = Counter(' '.join(list_of_text).split())    
    words_once = [k for k, v in wordcounts.items() if v == 1]
    other_words = [k for k, v in wordcounts.items() if v > 1]
    
    tokens = nlp(' '.join(other_words))
    replacement_dict = {}
    for word in words_once:

        word_token = nlp(word)
        max_similarity = thresh

        for tk in tokens:
            # find the maximum similarity above threshold
            sim_score = word_token.text, tk.text, word_token.similarity(tk)
            if 1 > sim_score[2] > max_similarity:
                replacement_dict[word] = sim_score[1]
                max_similarity = sim_score[2]
        try:
            print(word, 'to', replacement_dict[word])
        except KeyError:
            print('no matching word for', word)
        
    
    for i, text in enumerate(list_of_text):
        
        text = text.split()
        for j, te in enumerate(text):
            if te in replacement_dict: 
                text[j] = replacement_dict[te]
        list_of_text[i] = ' '.join(text)
            
    return list_of_text
    

In [None]:
new_tmp = consolidate_words(tmp)

In [218]:
len(Counter(' '.join(tmp).split()))

3239

In [219]:
len(Counter(' '.join(new_tmp).split()))

3132

In [222]:
full_df['caption'] = new_tmp

It reduced about 100 words. 

### Exporting
Now I'll save the files with more consistent and appropriate img_id. Then save a img_id and the description (medium and title combined) as a csv file.

In [223]:
# generate img ids
full_df['img_id'] = full_df['source'] + '_' + full_df['id'].astype('str')

In [227]:
select_df = full_df.copy()

In [235]:
img_caption1 = dict(zip(select_df.img_id, select_df.caption))
img_caption2 = dict(zip(select_df.img_id, select_df.medium))

In [238]:
import shutil

In [239]:
# risd set
fnames = select_df[select_df['source'] == 'risd']['img_id']
for fn in fnames:
    shutil.move(f'IMAGES/RISD/{fn}.jpg', f'IMAGES/paintings/{fn}.jpg')

In [248]:
# harvard set
sel = select_df[select_df['source'] == 'harvard'].reset_index()
old_fnames = sel['id']
new_fnames = sel['img_id']
for i in range(len(old_fnames)):
    shutil.move(f'IMAGES/HARVARD/{old_fnames[i]}.jpg', 
                f'IMAGES/paintings/{new_fnames[i]}.jpg')

In [249]:
# moma set
sel = select_df[select_df['source'] == 'moma'].reset_index()
old_fnames = sel['id']
new_fnames = sel['img_id']
for i in range(len(old_fnames)):
    shutil.move(f'IMAGES/MOMA/{old_fnames[i]}.jpg', 
                f'IMAGES/paintings/{new_fnames[i]}.jpg')

In [250]:
with open('PKL/img_captions1.pkl', 'wb') as fp:
    pickle.dump(img_caption1, fp, pickle.HIGHEST_PROTOCOL)
    
with open('PKL/img_captions2.pkl', 'wb') as fp:
    pickle.dump(img_caption2, fp, pickle.HIGHEST_PROTOCOL)

# Additional Cleaning
---
Below is not directly related to this project, but it's here in case any further analysis is needed.

## Medium
This would be multi-label case. 
I'll first consolidate all the unique mediums, categorize them and then turn them into a list of binary columns for each categories.

In [62]:
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# lemmatizer
wnl = WordNetLemmatizer()

# special chracters
special_chars = string.punctuation + string.digits

# stopwords
sw = stopwords.words('english')
sw += ['color', 'mounted', 'synthetic', 'hanging', 'painted', 'signature', 'reading', 'two', 'light', 'artist', 
      'one', 'opaque', 'colors', 'folding', 'three', 'one', 'frame', 'painting', 'parts', 'album', 'seal', 
       'nan', 'style', 'seals']

def text_preprocess(x):
    # remove punctuations and digits
    if isinstance(x, str):
        table_ = str.maketrans('', '', special_chars)
        text = str.translate(x, table_)
        text = text.split()
        clean_text = [wnl.lemmatize(w.lower()) for w in text]
        return [w for w in clean_text if w not in sw]
    else: 
        return np.nan

In [63]:
full_df['medium'] = full_df['medium'].apply(lambda x: text_preprocess(x))

In [64]:
# get frequencies
from collections import Counter
#top_30_medium = 
top_30_medium = list(dict(Counter(np.sum([x for x in full_df['medium'] if isinstance(x, list)])).most_common(30)).keys())

In [65]:
for med in top_30_medium: 
    full_df[med] = [str(med) in x if isinstance(x, list) else 'h' for x in full_df['medium']]

In [66]:
full_df['other_medium'] = np.where(np.sum(full_df.iloc[:, -30:], axis = 1) == 0, 1, 0)