# Data Collection
Here I will take the open source object data from Harvard Museum.  
(API Documentation and data source: https://www.harvardartmuseums.org/collections/api)

In [2]:
import pandas as pd
import numpy as np
import requests
import json
import nltk
import pickle

You must request the API key from Harvard Museum by using a link provided in their documentations.  
Usually you will receive the key right away.  
Then create a harvard_mus_api.json file to store the key as a dictionary.  
e.g. {"api_key": "your key here"}  
If you are not uploading this to public and it's for your personal use, you can ignore below step and just assign api_key to your api key. 

In [2]:
def get_keys(path):
    with open(path) as f:
        return json.load(f)

path = '/Users/stereopickles/.secret' # input the location of your tmdb_api.json

In [3]:
api_key = get_keys(f"{path}/harvard_mus_api.json")['api_key']

Let's test if it's working.

In [4]:
url = "https://api.harvardartmuseums.org/object"

url_params = {
    "apikey": api_key,
}

resp = requests.get(url, params = url_params)
print(resp.status_code)


200


In [5]:
print(resp.json().keys()) 

dict_keys(['info', 'records'])


In [6]:
resp.json()['info']

{'totalrecordsperquery': 10,
 'totalrecords': 234937,
 'pages': 23494,
 'page': 1,
 'next': 'https://api.harvardartmuseums.org/object?apikey=def72120-c45a-11ea-89a3-6722767e4145&page=2'}

## Check classifications

In [38]:
url = "https://api.harvardartmuseums.org/classification"

url_params = {
    "apikey": api_key,
}

resp = requests.get(url, params = url_params)
print(resp.status_code)

200


In [44]:
classifications = []

n = int(res.json()['info']['pages']) # getting the page number 
        
for i in range(n):
    url_params["page"] = i
    print(f"page {i}")
    
    resp = requests.get(url, params = url_params)

    try: 
        classifications.extend(resp.json()['records']) # add it to the list
    except:
        print(f"Error on page {i+1}") # let me know if there's an error


In [66]:
classifications = pd.DataFrame(classifications)
pd.options.display.max_rows = 100
classes_id = [26, 30, 62, 80, 23]

In [64]:
classifications[classifications.classificationid.isin(classes_id)]

Unnamed: 0,objectcount,name,id,lastupdate,classificationid
4,6959,Paintings,26,2020-07-27T05:02:40-0400,26
14,6959,Paintings,26,2020-07-27T05:02:40-0400,26
20,6216,Sculpture,30,2020-07-27T05:02:40-0400,30
22,3452,Textile Arts,62,2020-07-27T05:02:40-0400,62
35,158,Paintings with Calligraphy,80,2020-07-27T05:02:40-0400,80
56,16,Graphic Design,171,2020-07-27T05:02:39-0400,171
58,84282,Photographs,17,2020-07-27T05:02:40-0400,17
59,70872,Prints,23,2020-07-27T05:02:40-0400,23
65,16,Graphic Design,171,2020-07-27T05:02:39-0400,171


## Full data

In [None]:
# We will keep it to paintings only for the first round. 

In [71]:
classes = ['Prints'] 

url = url = "https://api.harvardartmuseums.org/object"
full_db = []

for cls in classes: # or classes_id
    url_params = {
        "apikey": api_key, 
        "classification": cls, 
    }
    
    res = requests.get(url, params = url_params)
    
    if res.status_code == 200: # if connection is successful
        # run the rest of the pages
        n = int(res.json()['info']['pages']) # getting the page number 
        
        for i in range(n):
            url_params["page"] = i
            print(f"page: {i}/{n}")
            resp = requests.get(url, params = url_params)
            
            try: 
                full_db.extend(resp.json()['records']) # add it to the list
            except:
                print(f"Error on page {i+1}") # let me know if there's an error

    else: 
        continue
        

page: 0/6822
page: 1/6822
page: 2/6822
page: 3/6822
page: 4/6822
page: 5/6822
page: 6/6822
page: 7/6822
page: 8/6822
page: 9/6822
page: 10/6822
page: 11/6822
page: 12/6822
page: 13/6822
page: 14/6822
page: 15/6822
page: 16/6822
page: 17/6822
page: 18/6822
page: 19/6822
page: 20/6822
page: 21/6822
page: 22/6822
page: 23/6822
page: 24/6822
page: 25/6822
page: 26/6822
page: 27/6822
page: 28/6822
page: 29/6822
page: 30/6822
page: 31/6822
page: 32/6822
page: 33/6822
page: 34/6822
page: 35/6822
page: 36/6822
page: 37/6822
page: 38/6822
page: 39/6822
page: 40/6822
page: 41/6822
page: 42/6822
page: 43/6822
page: 44/6822
page: 45/6822
page: 46/6822
page: 47/6822
page: 48/6822
page: 49/6822
page: 50/6822
page: 51/6822
page: 52/6822
page: 53/6822
page: 54/6822
page: 55/6822
page: 56/6822
page: 57/6822
page: 58/6822
page: 59/6822
page: 60/6822
page: 61/6822
page: 62/6822
page: 63/6822
page: 64/6822
page: 65/6822
page: 66/6822
page: 67/6822
page: 68/6822
page: 69/6822
page: 70/6822
page: 71/6822
pa

page: 555/6822
page: 556/6822
page: 557/6822
page: 558/6822
page: 559/6822
page: 560/6822
page: 561/6822
page: 562/6822
page: 563/6822
page: 564/6822
page: 565/6822
page: 566/6822
page: 567/6822
page: 568/6822
page: 569/6822
page: 570/6822
page: 571/6822
page: 572/6822
page: 573/6822
page: 574/6822
page: 575/6822
page: 576/6822
page: 577/6822
page: 578/6822
page: 579/6822
page: 580/6822
page: 581/6822
page: 582/6822
page: 583/6822
page: 584/6822
page: 585/6822
page: 586/6822
page: 587/6822
page: 588/6822
page: 589/6822
page: 590/6822
page: 591/6822
page: 592/6822
page: 593/6822
page: 594/6822
page: 595/6822
page: 596/6822
page: 597/6822
page: 598/6822
page: 599/6822
page: 600/6822
page: 601/6822
page: 602/6822
page: 603/6822
page: 604/6822
page: 605/6822
page: 606/6822
page: 607/6822
page: 608/6822
page: 609/6822
page: 610/6822
page: 611/6822
page: 612/6822
page: 613/6822
page: 614/6822
page: 615/6822
page: 616/6822
page: 617/6822
page: 618/6822
page: 619/6822
page: 620/6822
page: 621/

page: 1096/6822
page: 1097/6822
page: 1098/6822
page: 1099/6822
page: 1100/6822
page: 1101/6822
page: 1102/6822
page: 1103/6822
page: 1104/6822
page: 1105/6822
page: 1106/6822
page: 1107/6822
page: 1108/6822
page: 1109/6822
page: 1110/6822
page: 1111/6822
page: 1112/6822
page: 1113/6822
page: 1114/6822
page: 1115/6822
page: 1116/6822
page: 1117/6822
page: 1118/6822
page: 1119/6822
page: 1120/6822
page: 1121/6822
page: 1122/6822
page: 1123/6822
page: 1124/6822
page: 1125/6822
page: 1126/6822
page: 1127/6822
page: 1128/6822
page: 1129/6822
page: 1130/6822
page: 1131/6822
page: 1132/6822
page: 1133/6822
page: 1134/6822
page: 1135/6822
page: 1136/6822
page: 1137/6822
page: 1138/6822
page: 1139/6822
page: 1140/6822
page: 1141/6822
page: 1142/6822
page: 1143/6822
page: 1144/6822
page: 1145/6822
page: 1146/6822
page: 1147/6822
page: 1148/6822
page: 1149/6822
page: 1150/6822
page: 1151/6822
page: 1152/6822
page: 1153/6822
page: 1154/6822
page: 1155/6822
page: 1156/6822
page: 1157/6822
page: 11

page: 1610/6822
page: 1611/6822
page: 1612/6822
page: 1613/6822
page: 1614/6822
page: 1615/6822
page: 1616/6822
page: 1617/6822
page: 1618/6822
page: 1619/6822
page: 1620/6822
page: 1621/6822
page: 1622/6822
page: 1623/6822
page: 1624/6822
page: 1625/6822
page: 1626/6822
page: 1627/6822
page: 1628/6822
page: 1629/6822
page: 1630/6822
page: 1631/6822
page: 1632/6822
page: 1633/6822
page: 1634/6822
page: 1635/6822
page: 1636/6822
page: 1637/6822
page: 1638/6822
page: 1639/6822
page: 1640/6822
page: 1641/6822
page: 1642/6822
page: 1643/6822
page: 1644/6822
page: 1645/6822
page: 1646/6822
page: 1647/6822
page: 1648/6822
page: 1649/6822
page: 1650/6822
page: 1651/6822
page: 1652/6822
page: 1653/6822
page: 1654/6822
page: 1655/6822
page: 1656/6822
page: 1657/6822
page: 1658/6822
page: 1659/6822
page: 1660/6822
page: 1661/6822
page: 1662/6822
page: 1663/6822
page: 1664/6822
page: 1665/6822
page: 1666/6822
page: 1667/6822
page: 1668/6822
page: 1669/6822
page: 1670/6822
page: 1671/6822
page: 16

page: 2123/6822
page: 2124/6822
page: 2125/6822
page: 2126/6822
page: 2127/6822
page: 2128/6822
page: 2129/6822
page: 2130/6822
page: 2131/6822
page: 2132/6822
page: 2133/6822
page: 2134/6822
page: 2135/6822
page: 2136/6822
page: 2137/6822
page: 2138/6822
page: 2139/6822
page: 2140/6822
page: 2141/6822
page: 2142/6822
page: 2143/6822
page: 2144/6822
page: 2145/6822
page: 2146/6822
page: 2147/6822
page: 2148/6822
page: 2149/6822
page: 2150/6822
page: 2151/6822
page: 2152/6822
page: 2153/6822
page: 2154/6822
page: 2155/6822
page: 2156/6822
page: 2157/6822
page: 2158/6822
page: 2159/6822
page: 2160/6822
page: 2161/6822
page: 2162/6822
page: 2163/6822
page: 2164/6822
page: 2165/6822
page: 2166/6822
page: 2167/6822
page: 2168/6822
page: 2169/6822
page: 2170/6822
page: 2171/6822
page: 2172/6822
page: 2173/6822
page: 2174/6822
page: 2175/6822
page: 2176/6822
page: 2177/6822
page: 2178/6822
page: 2179/6822
page: 2180/6822
page: 2181/6822
page: 2182/6822
page: 2183/6822
page: 2184/6822
page: 21

page: 2637/6822
page: 2638/6822
page: 2639/6822
page: 2640/6822
page: 2641/6822
page: 2642/6822
page: 2643/6822
page: 2644/6822
page: 2645/6822
page: 2646/6822
page: 2647/6822
page: 2648/6822
page: 2649/6822
page: 2650/6822
page: 2651/6822
page: 2652/6822
page: 2653/6822
page: 2654/6822
page: 2655/6822
page: 2656/6822
page: 2657/6822
page: 2658/6822
page: 2659/6822
page: 2660/6822
page: 2661/6822
page: 2662/6822
page: 2663/6822
page: 2664/6822
page: 2665/6822
page: 2666/6822
page: 2667/6822
page: 2668/6822
page: 2669/6822
page: 2670/6822
page: 2671/6822
page: 2672/6822
page: 2673/6822
page: 2674/6822
page: 2675/6822
page: 2676/6822
page: 2677/6822
page: 2678/6822
page: 2679/6822
page: 2680/6822
page: 2681/6822
page: 2682/6822
page: 2683/6822
page: 2684/6822
page: 2685/6822
page: 2686/6822
page: 2687/6822
page: 2688/6822
page: 2689/6822
page: 2690/6822
page: 2691/6822
page: 2692/6822
page: 2693/6822
page: 2694/6822
page: 2695/6822
page: 2696/6822
page: 2697/6822
page: 2698/6822
page: 26

page: 3150/6822
page: 3151/6822
page: 3152/6822
page: 3153/6822
page: 3154/6822
page: 3155/6822
page: 3156/6822
page: 3157/6822
page: 3158/6822
page: 3159/6822
page: 3160/6822
page: 3161/6822
page: 3162/6822
page: 3163/6822
page: 3164/6822
page: 3165/6822
page: 3166/6822
page: 3167/6822
page: 3168/6822
page: 3169/6822
page: 3170/6822
page: 3171/6822
page: 3172/6822
page: 3173/6822
page: 3174/6822
page: 3175/6822
page: 3176/6822
page: 3177/6822
page: 3178/6822
page: 3179/6822
page: 3180/6822
page: 3181/6822
page: 3182/6822
page: 3183/6822
page: 3184/6822
page: 3185/6822
page: 3186/6822
page: 3187/6822
page: 3188/6822
page: 3189/6822
page: 3190/6822
page: 3191/6822
page: 3192/6822
page: 3193/6822
page: 3194/6822
page: 3195/6822
page: 3196/6822
page: 3197/6822
page: 3198/6822
page: 3199/6822
page: 3200/6822
page: 3201/6822
page: 3202/6822
page: 3203/6822
page: 3204/6822
page: 3205/6822
page: 3206/6822
page: 3207/6822
page: 3208/6822
page: 3209/6822
page: 3210/6822
page: 3211/6822
page: 32

page: 3664/6822
page: 3665/6822
page: 3666/6822
page: 3667/6822
page: 3668/6822
page: 3669/6822
page: 3670/6822
page: 3671/6822
page: 3672/6822
page: 3673/6822
page: 3674/6822
page: 3675/6822
page: 3676/6822
page: 3677/6822
page: 3678/6822
page: 3679/6822
page: 3680/6822
page: 3681/6822
page: 3682/6822
page: 3683/6822
page: 3684/6822
page: 3685/6822
page: 3686/6822
page: 3687/6822
page: 3688/6822
page: 3689/6822
page: 3690/6822
page: 3691/6822
page: 3692/6822
page: 3693/6822
page: 3694/6822
page: 3695/6822
page: 3696/6822
page: 3697/6822
page: 3698/6822
page: 3699/6822
page: 3700/6822
page: 3701/6822
page: 3702/6822
page: 3703/6822
page: 3704/6822
page: 3705/6822
page: 3706/6822
page: 3707/6822
page: 3708/6822
page: 3709/6822
page: 3710/6822
page: 3711/6822
page: 3712/6822
page: 3713/6822
page: 3714/6822
page: 3715/6822
page: 3716/6822
page: 3717/6822
page: 3718/6822
page: 3719/6822
page: 3720/6822
page: 3721/6822
page: 3722/6822
page: 3723/6822
page: 3724/6822
page: 3725/6822
page: 37

page: 4177/6822
page: 4178/6822
page: 4179/6822
page: 4180/6822
page: 4181/6822
page: 4182/6822
page: 4183/6822
page: 4184/6822
page: 4185/6822
page: 4186/6822
page: 4187/6822
page: 4188/6822
page: 4189/6822
page: 4190/6822
page: 4191/6822
page: 4192/6822
page: 4193/6822
page: 4194/6822
page: 4195/6822
page: 4196/6822
page: 4197/6822
page: 4198/6822
page: 4199/6822
page: 4200/6822
page: 4201/6822
page: 4202/6822
page: 4203/6822
page: 4204/6822
page: 4205/6822
page: 4206/6822
page: 4207/6822
page: 4208/6822
page: 4209/6822
page: 4210/6822
page: 4211/6822
page: 4212/6822
page: 4213/6822
page: 4214/6822
page: 4215/6822
page: 4216/6822
page: 4217/6822
page: 4218/6822
page: 4219/6822
page: 4220/6822
page: 4221/6822
page: 4222/6822
page: 4223/6822
page: 4224/6822
page: 4225/6822
page: 4226/6822
page: 4227/6822
page: 4228/6822
page: 4229/6822
page: 4230/6822
page: 4231/6822
page: 4232/6822
page: 4233/6822
page: 4234/6822
page: 4235/6822
page: 4236/6822
page: 4237/6822
page: 4238/6822
page: 42

page: 4691/6822
page: 4692/6822
page: 4693/6822
page: 4694/6822
page: 4695/6822
page: 4696/6822
page: 4697/6822
page: 4698/6822
page: 4699/6822
page: 4700/6822
page: 4701/6822
page: 4702/6822
page: 4703/6822
page: 4704/6822
page: 4705/6822
page: 4706/6822
page: 4707/6822
page: 4708/6822
page: 4709/6822
page: 4710/6822
page: 4711/6822
page: 4712/6822
page: 4713/6822
page: 4714/6822
page: 4715/6822
page: 4716/6822
page: 4717/6822
page: 4718/6822
page: 4719/6822
page: 4720/6822
page: 4721/6822
page: 4722/6822
page: 4723/6822
page: 4724/6822
page: 4725/6822
page: 4726/6822
page: 4727/6822
page: 4728/6822
page: 4729/6822
page: 4730/6822
page: 4731/6822
page: 4732/6822
page: 4733/6822
page: 4734/6822
page: 4735/6822
page: 4736/6822
page: 4737/6822
page: 4738/6822
page: 4739/6822
page: 4740/6822
page: 4741/6822
page: 4742/6822
page: 4743/6822
page: 4744/6822
page: 4745/6822
page: 4746/6822
page: 4747/6822
page: 4748/6822
page: 4749/6822
page: 4750/6822
page: 4751/6822
page: 4752/6822
page: 47

page: 5205/6822
page: 5206/6822
page: 5207/6822
page: 5208/6822
page: 5209/6822
page: 5210/6822
page: 5211/6822
page: 5212/6822
page: 5213/6822
page: 5214/6822
page: 5215/6822
page: 5216/6822
page: 5217/6822
page: 5218/6822
page: 5219/6822
page: 5220/6822
page: 5221/6822
page: 5222/6822
page: 5223/6822
page: 5224/6822
page: 5225/6822
page: 5226/6822
page: 5227/6822
page: 5228/6822
page: 5229/6822
page: 5230/6822
page: 5231/6822
page: 5232/6822
page: 5233/6822
page: 5234/6822
page: 5235/6822
page: 5236/6822
page: 5237/6822
page: 5238/6822
page: 5239/6822
page: 5240/6822
page: 5241/6822
page: 5242/6822
page: 5243/6822
page: 5244/6822
page: 5245/6822
page: 5246/6822
page: 5247/6822
page: 5248/6822
page: 5249/6822
page: 5250/6822
page: 5251/6822
page: 5252/6822
page: 5253/6822
page: 5254/6822
page: 5255/6822
page: 5256/6822
page: 5257/6822
page: 5258/6822
page: 5259/6822
page: 5260/6822
page: 5261/6822
page: 5262/6822
page: 5263/6822
page: 5264/6822
page: 5265/6822
page: 5266/6822
page: 52

page: 5719/6822
page: 5720/6822
page: 5721/6822
page: 5722/6822
page: 5723/6822
page: 5724/6822
page: 5725/6822
page: 5726/6822
page: 5727/6822
page: 5728/6822
page: 5729/6822
page: 5730/6822
page: 5731/6822
page: 5732/6822
page: 5733/6822
page: 5734/6822
page: 5735/6822
page: 5736/6822
page: 5737/6822
page: 5738/6822
page: 5739/6822
page: 5740/6822
page: 5741/6822
page: 5742/6822
page: 5743/6822
page: 5744/6822
page: 5745/6822
page: 5746/6822
page: 5747/6822
page: 5748/6822
page: 5749/6822
page: 5750/6822
page: 5751/6822
page: 5752/6822
page: 5753/6822
page: 5754/6822
page: 5755/6822
page: 5756/6822
page: 5757/6822
page: 5758/6822
page: 5759/6822
page: 5760/6822
page: 5761/6822
page: 5762/6822
page: 5763/6822
page: 5764/6822
page: 5765/6822
page: 5766/6822
page: 5767/6822
page: 5768/6822
page: 5769/6822
page: 5770/6822
page: 5771/6822
page: 5772/6822
page: 5773/6822
page: 5774/6822
page: 5775/6822
page: 5776/6822
page: 5777/6822
page: 5778/6822
page: 5779/6822
page: 5780/6822
page: 57

page: 6233/6822
page: 6234/6822
page: 6235/6822
page: 6236/6822
page: 6237/6822
page: 6238/6822
page: 6239/6822
page: 6240/6822
page: 6241/6822
page: 6242/6822
page: 6243/6822
page: 6244/6822
page: 6245/6822
page: 6246/6822
page: 6247/6822
page: 6248/6822
page: 6249/6822
page: 6250/6822
page: 6251/6822
page: 6252/6822
page: 6253/6822
page: 6254/6822
page: 6255/6822
page: 6256/6822
page: 6257/6822
page: 6258/6822
page: 6259/6822
page: 6260/6822
page: 6261/6822
page: 6262/6822
page: 6263/6822
page: 6264/6822
page: 6265/6822
page: 6266/6822
page: 6267/6822
page: 6268/6822
page: 6269/6822
page: 6270/6822
page: 6271/6822
page: 6272/6822
page: 6273/6822
page: 6274/6822
page: 6275/6822
page: 6276/6822
page: 6277/6822
page: 6278/6822
page: 6279/6822
page: 6280/6822
page: 6281/6822
page: 6282/6822
page: 6283/6822
page: 6284/6822
page: 6285/6822
page: 6286/6822
page: 6287/6822
page: 6288/6822
page: 6289/6822
page: 6290/6822
page: 6291/6822
page: 6292/6822
page: 6293/6822
page: 6294/6822
page: 62

page: 6746/6822
page: 6747/6822
page: 6748/6822
page: 6749/6822
page: 6750/6822
page: 6751/6822
page: 6752/6822
page: 6753/6822
page: 6754/6822
page: 6755/6822
page: 6756/6822
page: 6757/6822
page: 6758/6822
page: 6759/6822
page: 6760/6822
page: 6761/6822
page: 6762/6822
page: 6763/6822
page: 6764/6822
page: 6765/6822
page: 6766/6822
page: 6767/6822
page: 6768/6822
page: 6769/6822
page: 6770/6822
page: 6771/6822
page: 6772/6822
page: 6773/6822
page: 6774/6822
page: 6775/6822
page: 6776/6822
page: 6777/6822
page: 6778/6822
page: 6779/6822
page: 6780/6822
page: 6781/6822
page: 6782/6822
page: 6783/6822
page: 6784/6822
page: 6785/6822
page: 6786/6822
page: 6787/6822
page: 6788/6822
page: 6789/6822
page: 6790/6822
page: 6791/6822
page: 6792/6822
page: 6793/6822
page: 6794/6822
page: 6795/6822
page: 6796/6822
page: 6797/6822
page: 6798/6822
page: 6799/6822
page: 6800/6822
page: 6801/6822
page: 6802/6822
page: 6803/6822
page: 6804/6822
page: 6805/6822
page: 6806/6822
page: 6807/6822
page: 68

In [72]:
# Converting data to a Pandas dataframe
tmp2 = pd.DataFrame(full_db)

In [69]:
#tmp1 = full_df.copy()

In [78]:
full_df = pd.concat([tmp1, tmp2])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [87]:
full_df = full_df.drop_duplicates(subset = 'id')

In [88]:
filename = 'pickles/raw_data.pkl'
full_df.to_pickle(filename)

In [26]:
#full_df = pd.read_pickle("pickles/raw_data.pkl")

In [27]:
# drop items without tags
full_df.dropna(subset = ["description"], inplace = True)
full_df.description.isnull().sum()

0

In [28]:
full_df.classification.value_counts()

Photographs                   2537
Prints                        1463
Sculpture                      528
Textile Arts                   514
Paintings                      395
Graphic Design                  10
Paintings with Calligraphy       5
Name: classification, dtype: int64

# Data Cleaning and Sampling

First we will preprocess the description data.  
We will do  
1. Make everything lowercase 
2. Remove stopwords
3. Lemmatization


In [29]:
clean_df = full_df.copy()

First, we will remove all the stopwords using stopwords corpus from NLTK. 

# NLTK stemming/lemmatizing options
I'll quickly run through porter stemmer, lancaster stemmer and worldnetlemmatizer to choose the best option.


In [30]:
testlist = ['abstracts', 'abstracted', 'abstracting', 'abstraction', 
            'woman', 'women', 'womb', 'made', 'making', 'lying', 'laying']

In [31]:
porter = nltk.PorterStemmer()
lancaster = nltk.LancasterStemmer()
wnl = nltk.WordNetLemmatizer()

print(f"Lancaster Stemmer: {[lancaster.stem(x) for x in testlist]}")
print(f"Porter Stemmer: {[porter.stem(x) for x in testlist]}")
print(f"Worldnet Lemmatizer: {[wnl.lemmatize(x) for x in testlist]}")
print(f"Worldnet Lemmatizer than Porter Stemmer: {[porter.stem(wnl.lemmatize(x)) for x in testlist]}")
print(f"Worldnet Lemmatizer than Lancaster Stemmer: {[lancaster.stem(wnl.lemmatize(x)) for x in testlist]}")


Lancaster Stemmer: ['abstract', 'abstract', 'abstract', 'abstract', 'wom', 'wom', 'womb', 'mad', 'mak', 'lying', 'lay']
Porter Stemmer: ['abstract', 'abstract', 'abstract', 'abstract', 'woman', 'women', 'womb', 'made', 'make', 'lie', 'lay']
Worldnet Lemmatizer: ['abstract', 'abstracted', 'abstracting', 'abstraction', 'woman', 'woman', 'womb', 'made', 'making', 'lying', 'laying']
Worldnet Lemmatizer than Porter Stemmer: ['abstract', 'abstract', 'abstract', 'abstract', 'woman', 'woman', 'womb', 'made', 'make', 'lie', 'lay']
Worldnet Lemmatizer than Lancaster Stemmer: ['abstract', 'abstract', 'abstract', 'abstract', 'wom', 'wom', 'womb', 'mad', 'mak', 'lying', 'lay']


It seems like best way might be to run Lemmatizer first and run Porter might be our best bet.

In [32]:
import re

def normalizing(string):
    """
    Input: string 
    Return: list of lower case keywords with special characters removed

    """
    # remove special character, lowercase, then remove individual words
    return re.sub('[^A-Za-z]+', ' ', string).lower().split() 


In [33]:
# Importing stopwords
from nltk.corpus import stopwords
#nltk.download('stopwords')

# We'll take from NLTK package and add couple more
sw = stopwords.words('english')
sw += ['p', 'r', 'l', 'x', 'e', 'h', 'br', 'th', 'v']

In [34]:
def remove_stop(list_):
    """
    Input: list of words
    Return: list of words excluding stopwords
    """
    return [x for x in list_ if x not in sw]

In [35]:

def make_keywords(string):
    """
    Input: string of words
    Return: list of words excluding stopwords (after normalizing) and lemmatized
    """
    wordslist = remove_stop(normalizing(string))
    #return list(map(lambda x: porter.stem(wnl.lemmatize(x)), wordslist))
    return list(map(lambda x: wnl.lemmatize(x), wordslist))


In [36]:
clean_df.description = clean_df.description.apply(lambda x: make_keywords(x))

# Checking
Let's just randomly checks couple samples to ensure it worked.

In [37]:
np.random.seed(9)
clean_df.sample(1).description

27260    [illustrates, july, corresponding, zodiac, sig...
Name: description, dtype: object

In [38]:
print(full_df.loc[27260, 'description'])

illustrates July, corresponding zodiac sign (leo) at top center


I think this looks pretty good. 

# Exporting

Just for the sake of working in separate notebooks, I'll remove the list, and re-split in EDA notebook.  
This part can be skipped if it's all in the same notebook.

In [39]:
filename = 'pickles/cleaned_df.pkl'
clean_df.to_pickle(filename)

In [None]:
#clean_df.description = clean_df.description.apply(lambda x: ','.join(x))
#clean_df.to_csv('data/clean_df.csv')