# Data Merge & Cleaning
In this notebook, I will merge all data and check what's missing. 
I'll also use this section to figure out what kind of preprocessing will be necessary and how to extract text information.

In [85]:
import pandas as pd
import numpy as np

import pickle

In [86]:
# Load the first pickled batch of raw Harvard records (used below as a DataFrame).
# NOTE: unpickling can execute arbitrary code - only load pickle files from a trusted source.
harvard1 = pd.read_pickle('PKL/raw_data_Harvard_1.pkl')

In [87]:
# Load the second Harvard batch that gets stacked with harvard1 below.
# NOTE(review): the variable is named harvard2 but the file is raw_data_Harvard_3.pkl -
# confirm that skipping a "_2" file is intentional.
harvard2 = pd.read_pickle('PKL/raw_data_Harvard_3.pkl')

In [88]:
# Load the first pickled batch of raw RISD records (used below as a DataFrame).
risd1 = pd.read_pickle('PKL/raw_data_RISD_1.pkl')

In [89]:
# Load the second pickled batch of raw RISD records; stacked with risd1 below.
risd2 = pd.read_pickle('PKL/raw_data_RISD_2.pkl')

In [90]:
# Load the MoMA records from CSV.
moma = pd.read_csv('DATA/MoMA_data.csv')

## Harvard Museum
---
Let's look at the Harvard data first.

In [91]:
# Stack the two Harvard batches row-wise.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it each
# batch keeps its own row labels, so the same label can appear twice.
harvard = pd.concat([harvard1, harvard2], axis=0, ignore_index=True)

In [102]:
# keep only the first row seen for each object id
harvard = harvard.drop_duplicates(subset='id', keep='first')

### missing image
Drop rows that are missing an image URL.

In [103]:
# keep only the rows that have a primary image url
has_image_url = harvard['primaryimageurl'].notnull()
harvard = harvard[has_image_url]

### Feature Consolidation
I'll consolidate scattered info into ...
1. period
2. culture
3. medium
4. title
5. description
6. palette
7. date
8. name of the artist

In [104]:
# columns to keep; everything else is dropped in the next cell
cols = [
    'id', 'period', 'images', 'worktypes',
    'accessionyear', 'style', 'commentary', 'technique',
    'description', 'medium', 'title', 'colors',
    'provenance', 'dated', 'dateend', 'people',
    'century', 'labeltext', 'datebegin', 'culture',
]

In [105]:
# Keep only the columns listed in `cols`.
# .copy() makes the subset an independent frame, so the new columns assigned in the
# following cells do not trigger pandas' SettingWithCopyWarning.
harvard = harvard[cols].copy()

### Unnesting Subcategories
There are sub-categories, which I'll unnest first.

The `images`, `worktypes`, `colors`, and `people` columns each hold a list of dictionaries, so we need to extract the relevant values from them.

In [106]:
def extract_info(x, name):
    ''' 
    Collect the values stored under one key across a list of dictionaries.

    INPUT: x - a list of dictionaries (any other input returns None)
           name - the key to look up in each dictionary
    OUTPUT: None if no entry has a non-None value for the key,
            the value itself if there is exactly one distinct value,
            otherwise the distinct values joined into one ', '-separated string
            (in first-seen order)
    '''
    if not isinstance(x, list):
        return None

    inst = []
    for entry in x:
        # .get() returns None for records that lack the key instead of raising
        # KeyError; entries that are not dictionaries are skipped
        val = entry.get(name) if isinstance(entry, dict) else None
        if val is not None and val not in inst:
            inst.append(val)

    if not inst:
        return None
    if len(inst) == 1:
        return inst[0]
    return ', '.join(inst)

In [107]:
# unnest the text fields of `images` into one img_<field> column each
imageinfo = ['description', 'alttext', 'publiccaption']
for item in imageinfo:
    harvard[f'img_{item}'] = harvard['images'].apply(extract_info, args=(item,))

In [108]:
# unnest the 'worktype' values of `worktypes` into a single string column
harvard['worktype'] = harvard['worktypes'].apply(extract_info, args=('worktype',))

In [109]:
# unnest the 'hue' values of `colors` into a single string column
harvard['color'] = harvard['colors'].apply(extract_info, args=('hue',))

In [110]:
# for artist, do the same but only take the artist roles
def extract_info_artist(x, name):
    '''
    Collect the values stored under one key, using only entries whose
    'role' is 'Artist'.

    INPUT: x - a list of dictionaries (any other input returns None)
           name - the key to look up in each 'Artist' entry
    OUTPUT: None if no 'Artist' entry has a non-None value for the key,
            the value itself if there is exactly one distinct value,
            otherwise the distinct values joined into one ', '-separated string
            (in first-seen order)
    '''
    if not isinstance(x, list):
        return None

    inst = []
    for person in x:
        # the role filter applies to every entry, including a list with a single
        # person; entries without a 'role' key are treated as non-artists
        if not isinstance(person, dict) or person.get('role') != 'Artist':
            continue
        val = person.get(name)
        if val is not None and val not in inst:
            inst.append(val)

    if not inst:
        return None
    if len(inst) == 1:
        return inst[0]
    return ', '.join(inst)

In [111]:
# Use the artist-specific helper so that only people whose role is 'Artist' are named;
# the generic extract_info would also include every other role listed in `people`.
harvard['artist'] = harvard['people'].apply(lambda x: extract_info_artist(x, 'displayname'))

In [112]:
# the nested source columns are no longer needed once unnested
harvard = harvard.drop(columns=['images', 'worktypes', 'colors', 'people'])

### Combining columns
I'll combine columns in this manner.
1. period: use `period`; if it is empty, fall back to `century`; if that is also empty, fall back to `dated`
2. description: combine everything: 'style', 'commentary', 'description', 'labeltext', 'img_description', 'img_alttext', 'img_publiccaption'

In [113]:
# first let `century` fall back to `dated`, then let `period` fall back to the
# (already filled) `century`
century_known = harvard['century'].notnull()
harvard['century'] = np.where(century_known, harvard['century'], harvard['dated'])

period_known = harvard['period'].notnull()
harvard['period'] = np.where(period_known, harvard['period'], harvard['century'])

In [114]:
# free-text columns; missing values become empty strings so they can be joined later
desc_list = [
    'style', 'commentary', 'description', 'labeltext',
    'img_description', 'img_alttext', 'img_publiccaption',
]
for c in desc_list:
    filled = harvard[c].fillna(value='')
    harvard[c] = filled

In [115]:
# join all free-text columns of a row into one space-separated string
text_parts = harvard[['style', 'commentary', 'description', 'labeltext',
                      'img_description', 'img_alttext', 'img_publiccaption']]
harvard['all_description'] = text_parts.apply(lambda row: ' '.join(row), axis=1)

In [116]:
# Keep only the consolidated columns.
# .copy() makes the subset an independent frame, so adding the `source` column below
# does not trigger pandas' SettingWithCopyWarning.
harvard = harvard[['id', 'title', 'worktype', 'period', 'all_description', 'artist', 'color']].copy()

In [117]:
# preview the first two consolidated rows
harvard.head(n=2)

Unnamed: 0,id,title,worktype,period,all_description,artist,color
0,47769,Page from an album of Rice and Silk Culture,"album leaf, painting","Qing dynasty, 1644-1911",,Traditionally attributed to Qiu Ying,"Brown, Yellow"
1,47969,Page from an album of Rice and Silk Culture,"painting, album leaf","Qing dynasty, 1644-1911",,Traditionally attributed to Qiu Ying,"Brown, Yellow, Green"


In [118]:
# tag every row with the museum it came from
harvard = harvard.assign(source='harvard')

## MOMA
---
Now the same kind of steps for the MoMA data.

### Missing image
Drop rows that are missing a thumbnail image URL, and keep only the works classified as paintings.

In [119]:
# Keep rows that have a thumbnail url and are classified as 'Painting'.
has_thumbnail = moma['ThumbnailURL'].notnull()
is_painting = moma['Classification'] == 'Painting'
# .copy() makes the filtered result an independent frame, so adding the `source`
# column in the next cell does not trigger pandas' SettingWithCopyWarning.
moma = moma[has_thumbnail & is_painting].copy()

In [120]:
# tag every row with the museum it came from
moma = moma.assign(source='moma')

In [121]:
# columns kept for the MoMA data
col_list = [
    'ObjectID', 'Title', 'Artist', 'Nationality',
    'Date', 'Medium', 'source',
]
moma = moma.loc[:, col_list]

## RISD
---

In [163]:
# Stack the two RISD batches row-wise.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it each
# batch keeps its own row labels, so the same label can appear twice.
risd = pd.concat([risd1, risd2], axis=0, ignore_index=True)

### Missing images
Remove if the image is missing.

In [164]:
# Keep only the rows whose `images` entry is non-empty.
has_image = [len(imgs) > 0 for imgs in risd['images']]
# .copy() makes the filtered result an independent frame, so adding the `nationality`
# column below does not trigger pandas' SettingWithCopyWarning.
risd = risd[has_image].copy()

### Location
Get the nationality of the first listed maker.

In [165]:
def get_nationality(x):
    '''
    Given dictionaries nested in a list, return the first entry of the
    first person's 'nationality' value.
    Returns None when the input is empty or that 'nationality' value is empty.
    '''
    if not x:
        return None
    first_maker = list(x)[0]
    nationalities = first_maker['nationality']
    return nationalities[0] if nationalities else None


In [166]:
# nationality of the first listed maker of each work
risd['nationality'] = risd['makers'].apply(get_nationality)

### Consolidating info
1. only paintings or other art-like work on paper 
2. culture: use `culture`; if it is empty, fall back to `place`; if that is also empty, fall back to `nationality`

In [167]:
# keep only the columns needed downstream
col_list = [
    'id', 'culture', 'dating', 'description', 'mediumTechnique',
    'place', 'primaryMaker', 'title', 'type', 'nationality',
]
risd = risd.loc[:, col_list]

In [168]:
# filter to only paintings
art_list = ['Paintings']
# A row is kept when the first element of its `type` value contains one of the
# art_list entries. bool(x) guards against an empty `type` value, where x[0]
# would raise IndexError; such rows are simply dropped.
is_art = [bool(x) and any(item in x[0] for item in art_list) for x in risd['type']]
risd = risd[is_art]

#### Missing values
The RISD data marks missing values with empty strings rather than NaN. Let's convert them to NaN.

In [178]:
# turn empty strings into proper missing values
risd = risd.replace(to_replace='', value=np.nan)

#### Culture

In [180]:
# culture: let `place` fall back to `nationality`, then let `culture` fall back
# to the (already filled) `place`; the helper columns are dropped afterwards
place_known = risd['place'].notnull()
risd['place'] = np.where(place_known, risd['place'], risd['nationality'])

culture_known = risd['culture'].notnull()
risd['culture'] = np.where(culture_known, risd['culture'], risd['place'])

risd = risd.drop(columns=['place', 'nationality'])

In [182]:
# `type` was only needed for the painting filter
risd = risd.drop(columns=['type'])

In [183]:
# tag every row with the museum it came from
risd = risd.assign(source='risd')