# Data Collection
Here I collect the open-access object data published by the Harvard Art Museums.  
(API Documentation and data source: https://www.harvardartmuseums.org/collections/api)

In [1]:
import json
import os
import pickle

import nltk
import numpy as np
import pandas as pd
import requests

You must request an API key from the Harvard Art Museums using the link provided in their documentation.  
You will usually receive the key right away.  
Then create a harvard_mus_api.json file that stores the key as a dictionary,  
e.g. {"api_key": "your key here"}  
If you are not sharing this notebook publicly and it is only for your personal use, you can skip the step below and assign your key directly to api_key. 

In [2]:
def get_keys(path):
    """
    Input: path to a JSON file
    Return: the parsed JSON content of that file
    """
    with open(path) as key_file:
        raw_text = key_file.read()
    return json.loads(raw_text)

path = '/Users/stereopickles/.secret' # directory that contains your harvard_mus_api.json

In [3]:
# Load the key file from the `path` directory and pull out its "api_key" entry.
key_file_contents = get_keys(f"{path}/harvard_mus_api.json")
api_key = key_file_contents['api_key']

Let's test if it's working.

In [4]:
# Call the object endpoint with nothing but the API key and report the HTTP status.
url = "https://api.harvardartmuseums.org/object"
url_params = {"apikey": api_key}

resp = requests.get(url, params=url_params)
print(resp.status_code)


200


In [5]:
# Top-level keys of the decoded JSON response.
payload_keys = resp.json().keys()
print(payload_keys)

dict_keys(['info', 'records'])


In [6]:
# Pagination metadata of the response (record counts, current page, next-page URL).
resp.json()["info"]

{'totalrecordsperquery': 10,
 'totalrecords': 234937,
 'pages': 23494,
 'page': 1,
 'next': 'https://api.harvardartmuseums.org/object?apikey=<REDACTED>&page=2'}

## Check classifications

In [38]:
# Point `url` at the classification endpoint and confirm it answers with the same key.
url = "https://api.harvardartmuseums.org/classification"
url_params = {"apikey": api_key}

resp = requests.get(url, params=url_params)
print(resp.status_code)

200


In [44]:
classifications = []

# `resp` is the classification response fetched in the previous cell; its
# `info.pages` field holds the total number of result pages.
n = int(resp.json()['info']['pages'])

# The API numbers pages from 1 (the first response reports 'page': 1 and a
# `next` link to page=2), so walk pages 1..n inclusive to reach the last page.
for page in range(1, n + 1):
    url_params["page"] = page
    print(f"page {page}")

    try:
        # A timeout keeps one stalled connection from hanging the whole notebook.
        resp = requests.get(url, params=url_params, timeout=30)
        resp.raise_for_status()
        classifications.extend(resp.json()['records'])  # add this page's records to the list
    except (requests.RequestException, ValueError, KeyError) as err:
        # Report the page that failed and why, then carry on with the next page.
        print(f"Error on page {page}: {err}")


In [47]:
# Turn the collected classification records into a table and display it.
classifications = pd.DataFrame(data=classifications)
classifications

Unnamed: 0,objectcount,name,id,lastupdate,classificationid
0,348,Material Specimens,1075,2020-07-27T05:02:40-0400,1075
1,3,Text,204,2020-07-27T05:02:40-0400,204
2,382,(not assigned),0,2020-07-27T05:02:39-0400,0
3,847,Jewelry,19,2020-07-27T05:02:39-0400,19
4,6959,Paintings,26,2020-07-27T05:02:40-0400,26
...,...,...,...,...,...
62,475,Furniture,76,2020-07-27T05:02:39-0400,76
63,1154,Architectural Elements,133,2020-07-27T05:02:39-0400,133
64,46,Cameos,1086,2020-07-27T05:02:39-0400,1086
65,16,Graphic Design,171,2020-07-27T05:02:39-0400,171


## Full data

In [None]:
# We will keep it to paintings only for the first round. 

In [11]:
classes = ['Paintings']
full_db = []

# Set the object endpoint explicitly: `url` was last assigned to the
# classification endpoint, so reusing it here would query the wrong resource
# when the notebook is run top to bottom.
url = "https://api.harvardartmuseums.org/object"

for cls in classes:
    url_params = {
        "apikey": api_key,
        "classification": cls,
    }

    # First request: only used to learn how many pages this classification has.
    res = requests.get(url, params=url_params, timeout=30)

    if res.status_code != 200:
        # Say which classification was skipped instead of dropping it silently.
        print(f"Skipping {cls}: first request returned status {res.status_code}")
        continue

    n = int(res.json()['info']['pages'])  # total number of pages for this classification

    # Pages are numbered from 1, so iterate 1..n inclusive; range(n) would
    # request a page 0 and never fetch the final page.
    for i in range(1, n + 1):
        url_params["page"] = i

        try:
            resp = requests.get(url, params=url_params, timeout=30)
            resp.raise_for_status()
            full_db.extend(resp.json()['records'])  # add this page's records to the list
        except (requests.RequestException, ValueError, KeyError) as err:
            # Report the page that failed and why, then carry on with the next page.
            print(f"Error on page {i}: {err}")
        

In [12]:
# Build a pandas DataFrame from the list of downloaded records
full_df = pd.DataFrame(data=full_db)

In [13]:
# Persist the raw download so later steps can reload it without calling the API again.
filename = 'pickles/raw_data.pkl'
# to_pickle does not create missing folders, so make sure the target directory exists.
os.makedirs(os.path.dirname(filename), exist_ok=True)
full_df.to_pickle(filename)

In [14]:
# drop items without a description, then confirm no missing descriptions remain
full_df = full_df.dropna(subset=["description"])
full_df["description"].isnull().sum()

0

In [15]:
# Number of remaining records per classification.
full_df["classification"].value_counts()

Paintings    474
Name: classification, dtype: int64

# Data Cleaning and Sampling

First we will preprocess the description data.  
We will do  
1. Make everything lowercase 
2. Remove stopwords
3. Lemmatization


In [32]:
# Work on an independent (deep) copy so the cleaning steps leave full_df untouched.
clean_df = full_df.copy(deep=True)

First, we will remove all the stopwords using stopwords corpus from NLTK. 

# NLTK stemming/lemmatizing options
I'll quickly run through the Porter stemmer, the Lancaster stemmer and the WordNet lemmatizer to choose the best option.


In [23]:
# Sample words used to compare the stemmers and the lemmatizer below.
testlist = [
    'abstracts',
    'abstracted',
    'abstracting',
    'abstraction',
    'woman',
    'women',
    'womb',
    'made',
    'making',
    'lying',
    'laying',
]

In [24]:
porter = nltk.PorterStemmer()
lancaster = nltk.LancasterStemmer()
wnl = nltk.WordNetLemmatizer()

# Each entry pairs a printed label with the word transformation to try on testlist.
candidates = [
    ("Lancaster Stemmer", lambda word: lancaster.stem(word)),
    ("Porter Stemmer", lambda word: porter.stem(word)),
    ("Worldnet Lemmatizer", lambda word: wnl.lemmatize(word)),
    ("Worldnet Lemmatizer than Porter Stemmer", lambda word: porter.stem(wnl.lemmatize(word))),
    ("Worldnet Lemmatizer than Lancaster Stemmer", lambda word: lancaster.stem(wnl.lemmatize(word))),
]

# Print one line per candidate so the results can be compared side by side.
for label, transform in candidates:
    print(f"{label}: {[transform(word) for word in testlist]}")


Lancaster Stemmer: ['abstract', 'abstract', 'abstract', 'abstract', 'wom', 'wom', 'womb', 'mad', 'mak', 'lying', 'lay']
Porter Stemmer: ['abstract', 'abstract', 'abstract', 'abstract', 'woman', 'women', 'womb', 'made', 'make', 'lie', 'lay']
Worldnet Lemmatizer: ['abstract', 'abstracted', 'abstracting', 'abstraction', 'woman', 'woman', 'womb', 'made', 'making', 'lying', 'laying']
Worldnet Lemmatizer than Porter Stemmer: ['abstract', 'abstract', 'abstract', 'abstract', 'woman', 'woman', 'womb', 'made', 'make', 'lie', 'lay']
Worldnet Lemmatizer than Lancaster Stemmer: ['abstract', 'abstract', 'abstract', 'abstract', 'wom', 'wom', 'womb', 'mad', 'mak', 'lying', 'lay']


It seems the best approach might be to run the lemmatizer first and then the Porter stemmer.

In [17]:
import re

def normalizing(string):
    """
    Input: string
    Return: list of lowercase words, where every run of characters other
            than A-Z / a-z is treated as a separator

    """
    # replace each run of non-letter characters with a single space
    letters_only = re.sub('[^A-Za-z]+', ' ', string)
    # lowercase, then split on whitespace into individual words
    lowered = letters_only.lower()
    return lowered.split()


In [19]:
# Importing stopwords
from nltk.corpus import stopwords
#nltk.download('stopwords')

# Start from NLTK's English stopwords and append a few stray single letters
extra_stopwords = ['p', 'r', 'l', 'x', 'e', 'h']
sw = stopwords.words('english') + extra_stopwords

In [20]:
def remove_stop(list_):
    """
    Input: list of words
    Return: new list with every word found in the stopword list `sw` left out
            (original order kept)
    """
    kept = []
    for word in list_:
        if word in sw:
            continue
        kept.append(word)
    return kept

In [25]:

def make_keywords(string):
    """
    Input: string of words
    Return: list of lemmatized words, obtained after normalizing the string
            and removing stopwords
    """
    wordslist = remove_stop(normalizing(string))
    # Alternative kept for reference (lemmatize, then Porter-stem each word):
    #   [porter.stem(wnl.lemmatize(word)) for word in wordslist]
    return [wnl.lemmatize(word) for word in wordslist]


In [33]:
# Replace each description string with its list of cleaned keywords.
clean_df["description"] = clean_df["description"].apply(make_keywords)

# Checking
Let's randomly check a couple of samples to ensure it worked.

In [35]:
# Seed NumPy's global random state so the sampled row is the same on every run.
np.random.seed(9)
clean_df.sample(n=1)["description"]

1937    [represented, panel, mother, god, bogoluibor, ...
Name: description, dtype: object

In [36]:
# 1937 is the index label of the row sampled above; print its original description for comparison.
print(full_df["description"].loc[1937])

Represented on the panel:  the Mother of God of Bogoluibor on the left; three-handed Virgin on the right; St. Stephan with Vlasius and the Archangel Michael on the left; St. Maria of Egypt with Natalia and an unknown saint on the right.  The metal cross has numerous inscriptions which are typical for this type.


I think this looks pretty good. 

# Exporting

Because the analysis continues in separate notebooks, I'll export the cleaned data here; when exporting to CSV, the keyword lists are joined into strings and re-split in the EDA notebook.  
This part can be skipped if everything is in the same notebook.

In [37]:
# Persist the cleaned frame so the next notebook can load it directly.
filename = 'pickles/cleaned_df.pkl'
# to_pickle does not create missing folders, so make sure the target directory exists.
os.makedirs(os.path.dirname(filename), exist_ok=True)
clean_df.to_pickle(filename)

In [None]:
#clean_df.description = clean_df.description.apply(lambda x: ','.join(x))
#clean_df.to_csv('data/clean_df.csv')