# Data Collection
Here I will take the open source object data from Harvard Museum.  
(API Documentation and data source: https://www.harvardartmuseums.org/collections/api)

In [5]:
import pandas as pd
import numpy as np
import requests
import json
import nltk

%matplotlib inline

You must request the API key from Harvard Museum by using a link provided in their documentations.  
Usually you will receive the key right away.  
Then create a harvard_mus_api.json file to store the key as a dictionary.  
e.g. {"api_key": "your key here"}  
If you are not uploading this to public and it's for your personal use, you can ignore below step and just assign api_key to your api key. 

In [7]:
def get_keys(path):
    with open(path) as f:
        return json.load(f)

path = '/Users/stereopickles/.secret' # input the location of your tmdb_api.json

In [8]:
api_key = get_keys(f"{path}/harvard_mus_api.json")['api_key']

Let's test if it's working.

In [469]:
url = "https://api.harvardartmuseums.org/object"

url_params = {
    "apikey": api_key,
}

resp = requests.get(url, params = url_params)
print(resp.status_code)


200


In [470]:
print(resp.json().keys()) 

dict_keys(['info', 'records'])


In [471]:
resp.json()['info']

{'totalrecordsperquery': 10,
 'totalrecords': 234997,
 'pages': 23500,
 'page': 1,
 'next': 'https://api.harvardartmuseums.org/object?apikey=def72120-c45a-11ea-89a3-6722767e4145&page=2'}

## Full data

In [475]:
# We will keep it to paintings only for the first round. 

In [476]:
classes = ['Paintings'] 
full_db = []

for cls in classes: 
    url_params = {
        "apikey": api_key, 
        "classification": cls, 
    }
    
    res = requests.get(url, params = url_params)
    
    if res.status_code == 200: # if connection is successful
        # run the rest of the pages
        n = int(res.json()['info']['pages']) # getting the page number 
        
        for i in range(n):
            url_params = {
                "apikey": api_key,
                "classification": cls, 
                "page": i
            }

            resp = requests.get(url, params = url_params)
            
            try: 
                full_db.extend(resp.json()['records']) # add it to the list
            except:
                print(f"Error on page {i+1}") # let me know if there's an error

    else: 
        continue
        

In [477]:
# Converting data to a Pandas dataframe
full_df = pd.DataFrame(full_db)

In [478]:
# drop items without tags
full_df.dropna(subset = ["description"], inplace = True)
full_df.description.isnull().sum()

0

In [479]:
full_df.classification.value_counts()

Paintings    557
Name: classification, dtype: int64

# Data Cleaning and Sampling

First we will preprocess the description data.  
We will do  
1. Make everything lowercase 
2. Remove stopwords
3. Lemmatization


In [480]:
clean_df = full_df.copy()

First, we will remove all the stopwords using stopwords corpus from NLTK. 

# NLTK stemming/lemmatizing options
I'll quickly run through porter stemmer, lancaster stemmer and worldnetlemmatizer to choose the best option.


In [482]:
testlist = ['abstract', 'abstracts', 'abstracted', 
            'abstracting', 'abstraction', 'women']

In [483]:
porter = nltk.PorterStemmer()
lancaster = nltk.LancasterStemmer()
wnl = nltk.WordNetLemmatizer()

print("Lancaster Stemmer: ")
print([lancaster.stem(x) for x in testlist])
print("Porter Stemmer: ")
print([porter.stem(x) for x in testlist])
print("Worldnet Lemmatizer: ")
print([wnl.lemmatize(x) for x in testlist])

Lancaster Stemmer: 
['abstract', 'abstract', 'abstract', 'abstract', 'abstract', 'wom']
Porter Stemmer: 
['abstract', 'abstract', 'abstract', 'abstract', 'abstract', 'women']
Worldnet Lemmatizer: 
['abstract', 'abstract', 'abstracted', 'abstracting', 'abstraction', 'woman']


It seems like best way might be to run Porter Stemmer first and then running Worldnet Lemmatizer.

In [484]:
# Importing stopwords
from nltk.corpus import stopwords
#nltk.download('stopwords')

In [485]:
import re

def normalizing(string):
    """
    Input: string 
    Return: list of lower case keywords with special characters removed

    """
    # remove special character, lowercase, then remove individual words
    return re.sub('[^A-Za-z]+', ' ', string).lower().split() 


In [486]:
# Stop words corpus
# We'll take from NLTK package and add couple more
sw = stopwords.words('english')
sw += ['p', 'r', 'l', 'x', 'e']

In [487]:
def remove_stop(list_):
    """
    Input: list of words
    Return: list of words excluding stopwords
    """
    return [x for x in list_ if x not in sw]

In [488]:

def make_keywords(string):
    """
    Input: string of words
    Return: list of words excluding stopwords (after normalizing) and lemmatized
    """
    wordslist = remove_stop(normalizing(string))
    return list(map(lambda x: wnl.lemmatize(porter.stem(x)), wordslist))


In [489]:
clean_df.description = clean_df.description.apply(lambda x: make_keywords(x))

# Checking
Let's just randomly checks couple samples to ensure it worked.

In [490]:
np.random.seed(1)
clean_df.sample(1).description

3147    [vasakasajja, nayika, heroin, dress, lover, paint, shown, open, terrac, seat, surround, femal, attend, one, help, shoe, distanc, left, lover, seen, seat, dayb, smoke, hookah, pahari, style, kangra, school]
Name: description, dtype: object

In [491]:
print(full_df.loc[3147, 'description'])

The vasakasajja nayika, is a heroine who dresses up for her lover. Here, in this painting, she is shown on an open terrace, seated and surrounded by female attendants, one of whom helps her with her shoes. In the distance, on the left, her lover can be seen seated on a daybed and smoking a hookah. Pahari Style, Kangra School.


In [492]:
words = sum(clean_df.description, [])
unique = set(words)
counts = dict.fromkeys(unique, 0)
for w in words: 
    counts[w] += 1

In [493]:
counts = {k: v for k, v in sorted(counts.items(), reverse = True, key=lambda item: item[1])}

In [494]:
counts

{'paint': 608,
 'right': 266,
 'left': 241,
 'artist': 193,
 'white': 193,
 'depict': 178,
 'two': 169,
 'figur': 158,
 'larg': 154,
 'wear': 154,
 'red': 148,
 'one': 146,
 'ink': 142,
 'appear': 141,
 'composit': 141,
 'gold': 134,
 'landscap': 134,
 'style': 133,
 'hold': 127,
 'hand': 123,
 'chine': 122,
 'portrait': 120,
 'scroll': 119,
 'tree': 104,
 'top': 101,
 'work': 100,
 'flower': 98,
 'head': 97,
 'black': 94,
 'blue': 94,
 'krishna': 90,
 'paper': 90,
 'mountain': 88,
 'green': 88,
 'attend': 83,
 'small': 83,
 'color': 83,
 'school': 82,
 'long': 82,
 'god': 81,
 'origin': 80,
 'charact': 80,
 'femal': 78,
 'stand': 78,
 'also': 78,
 'background': 75,
 'hindu': 73,
 'set': 73,
 'screen': 73,
 'inscript': 73,
 'seal': 72,
 'center': 70,
 'face': 70,
 'side': 69,
 'mao': 69,
 'decor': 69,
 'panel': 69,
 'centuri': 67,
 'repres': 65,
 'lower': 64,
 'use': 63,
 'hang': 61,
 'scene': 60,
 'behind': 60,
 'leav': 59,
 'front': 58,
 'three': 58,
 'dark': 58,
 'rajput': 58,
 'aro

I think this looks pretty good. 

# Subset
Now we will subset to descriptions that include the word "abstract"  
Ideally, we would subset by existing keyword 'abstraction' but since this data doesn't contain such tagging system, we will just use inclusion of the word 'abstract' as a parameter to subset abstract art vs non-abstract art.

In [496]:
abstract_df = clean_df[(clean_df.description.apply(lambda x: 'abstract' in x))]

In [497]:
len(abstract_df)

14

Unfortunately, there are only 14 pieces that included the term 'abstract'. For now, we will go ahead with this data, 