In [94]:
import pandas as pd
import spacy
from collections import defaultdict, Counter
from tqdm.notebook import tqdm
import numpy as np
import pickle

In [2]:
df = pd.read_csv("../Final_Streamlit/data/Final_Dataset_w_colorgroup.csv")

In [78]:
artist_linking_df = pd.read_csv("../EntityLinking/data/artist_predictions.csv")

In [79]:
artist_linking_df.head()

Unnamed: 0,Artsy,Widewalls,artist_name
0,2,281,Roy Lichtenstein
1,12,2260,Josep Guinovart
2,13,103,Macha Poynder
3,36,1954,Eddie Martinez
4,37,1826,Louis Masai


In [80]:
target_artists = artist_linking_df['artist_name'].tolist()

In [77]:
full_bio_df = pd.read_csv("../Scrape/Widewalls/data/CleanedWidewallsData.csv")

In [82]:
target_full_bio_df = full_bio_df[full_bio_df['artist_names'].isin(target_artists)]

In [83]:
target_full_bio_df.head()

Unnamed: 0,artist_names,full_bios,id
1,David Shrigley,David Shrigley is a Glasgow-based British arti...,2
4,David Yarrow,An artist followed by an amazing reputation fo...,5
6,Dan Christensen,Dan Christensen was an American abstract paint...,7
10,KAWS,"In the late ’80s and early ’90s, the interesti...",11
18,Andy Warhol,It was the beginning of the 1950s when the art...,19


In [84]:
nlp = spacy.load('en_core_web_sm')

In [97]:
def build_artist_profile(artist_profiles, artist_name, full_bio):
    doc = nlp(full_bio)
    def tag_entity(entity):
        if entity.label_ == "ORG":
            if "university" in entity.text.lower() or "school" in entity.text.lower()\
            or "masters" in entity.text.lower() or "institute" in entity.text.lower():
                return "EDUCATION"
        if entity.label_ == "NORP":
            return "NATIONALITY"
        if entity.label_ == "GPE":
            return "LOCATION BASE"
        else:
            return None
    
    locations = []
    has_first_norp = False
    for entity in doc.ents:
        tag = tag_entity(entity)
        if tag:
    #         print(entity.text, entity.label_, tag)
            if tag == "NATIONALITY" and not has_first_norp:
                artist_profiles[artist_name]['nationality'] = entity.text
                has_first_norp = True
            if tag == "EDUCATION":
                artist_profiles[artist_name]['education'] = entity.text
            if tag == "LOCATION BASE":
                locations.append(entity.text)
    if len(locations)>0:
        c = Counter(locations)
        location_base = list(c.most_common())[0][0]
        artist_profiles[artist_name]['location'] = location_base
    if 'nationality' not in artist_profiles[artist_name]:
        artist_profiles[artist_name]['nationality'] = np.NaN
    if 'education' not in artist_profiles[artist_name]:
        artist_profiles[artist_name]['education'] = np.NaN
    if 'location' not in artist_profiles[artist_name]:
        artist_profiles[artist_name]['location'] = np.NaN
    return 
#         print(entity.text, entity.label_)

In [98]:
artist_profiles = defaultdict(dict)
i = 0
for _, name, bio, _ in tqdm(target_full_bio_df.itertuples(), total=len(full_bio_df)):
    
    if isinstance(name, str) and isinstance(bio, str) and len(bio) > 10:
        try:
            build_artist_profile(artist_profiles, name, bio)
        except e:
            print(name)
        i += 1

  0%|          | 0/2534 [00:00<?, ?it/s]

In [100]:
len(artist_profiles)

500

In [101]:
len(target_full_bio_df)

500

In [102]:
artist_profiles

defaultdict(dict,
            {'David Shrigley': {'nationality': 'British',
              'location': 'Glasgow',
              'education': nan},
             'David Yarrow': {'education': 'the Edinburgh University',
              'nationality': 'British',
              'location': 'Glasgow'},
             'Dan Christensen': {'nationality': 'American',
              'education': 'the\xa0Kansas City Art Institute',
              'location': 'Denver'},
             'KAWS': {'nationality': 'American',
              'education': 'the School of Visual Arts',
              'location': 'Tokyo'},
             'Andy Warhol': {'nationality': 'American',
              'education': 'Carnegie-Mellon University',
              'location': 'New York'},
             'Hunter & Gatti': {'nationality': nan,
              'education': nan,
              'location': nan},
             'Leslie Sheryll': {'nationality': 'American',
              'education': nan,
              'location': nan},
             

In [110]:
artist_profiles_list = []
for artist, profile in artist_profiles.items():
    temp = [profile['nationality'], profile['education'], profile['location']]
    row = [artist]+temp
    artist_profiles_list.append(row)

In [112]:
artist_profiles_df = pd.DataFrame(artist_profiles_list, columns = ['artist_name', 'nationality', 'education', 'location'])

In [113]:
artist_profiles_df.head()

Unnamed: 0,artist_name,nationality,education,location
0,David Shrigley,British,,Glasgow
1,David Yarrow,British,the Edinburgh University,Glasgow
2,Dan Christensen,American,the Kansas City Art Institute,Denver
3,KAWS,American,the School of Visual Arts,Tokyo
4,Andy Warhol,American,Carnegie-Mellon University,New York


# Merge with final dataset

In [114]:
final_df = pd.read_csv("../FinalDataSetCreation/FinalDataSet.csv")

In [115]:
final_df.head()

Unnamed: 0,id,media,url,artist_name,artist_url,work_name,work_year,media_long,dimension,gallery_name,gallery_url,bid,price,img_url,full_bio,artist_website,artist_ins
0,painting15,painting,https://www.artsy.net/artwork/hebru-brantley-d...,Hebru Brantley,https://www.artsy.net/artist/hebru-brantley,"Death Of The Black Fairy, The Great Debate Pt. 5",2020,Acrylic & Diamond Dust on Canvas,60 × 60 in | 152.4 × 152.4 cm,Corridor Contemporary,https://www.artsy.net/partner/corridor-contemp...,False,125000.0,https://d7hftxdivxxvm.cloudfront.net?resize_to...,"Hebru Brantley, a Bronzeville-native living an...",www.hebrubrantley.com,https://www.instagram.com/hebrubrantley/?hl=en
1,painting37,painting,https://www.artsy.net/artwork/miaz-brothers-ma...,Miaz Brothers,https://www.artsy.net/artist/miaz-brothers,Maestro,2022,Acrylic on canvas,40 1/5 × 29 9/10 in | 102 × 76 cm,Maddox Gallery,https://www.artsy.net/partner/maddox-gallery,False,17500.0,https://d7hftxdivxxvm.cloudfront.net?resize_to...,The Miaz Brothers present a radical new take o...,www.miazbrothers.com,https://www.instagram.com/miazbrothers/?hl=en
2,painting41,painting,https://www.artsy.net/artwork/sandra-chevrier-...,Sandra Chevrier,https://www.artsy.net/artist/sandra-chevrier,La cage où le brouillard traverse l’esprit,2017,Oil on canvas,12 × 12 in | 30.5 × 30.5 cm,West Chelsea Contemporary,https://www.artsy.net/partner/west-chelsea-con...,False,5000.0,https://d7hftxdivxxvm.cloudfront.net?resize_to...,Sandra Chevrier is a Canadian contemporary art...,www.sandrachevrier.com,https://www.instagram.com/sandrachevrier/?hl=en
3,painting45,painting,https://www.artsy.net/artwork/florian-eymann-b...,Florian Eymann,https://www.artsy.net/artist/florian-eymann,BAYC Clueless Sailor,2022,Oil on canvas,43 × 34 1/2 in | 109.2 × 87.6 cm,Avant Gallery,https://www.artsy.net/partner/avant-gallery,False,11000.0,https://d7hftxdivxxvm.cloudfront.net?resize_to...,Florian Eymann is a self-taught French artist ...,www.florianeymann.com,https://www.instagram.com/florianeymann/?hl=en
4,painting52,painting,https://www.artsy.net/artwork/miaz-brothers-a-...,Miaz Brothers,https://www.artsy.net/artist/miaz-brothers,A Promising Young Man,2022,Acrylic on canvas,48 × 35 4/5 in | 122 × 91 cm,Maddox Gallery,https://www.artsy.net/partner/maddox-gallery,False,17500.0,https://d7hftxdivxxvm.cloudfront.net?resize_to...,The Miaz Brothers present a radical new take o...,www.miazbrothers.com,https://www.instagram.com/miazbrothers/?hl=en


In [116]:
final_merged_df = final_df.merge(artist_profiles_df, on="artist_name")

In [123]:
final_merged_df.head()

Unnamed: 0,id,media,url,artist_name,artist_url,work_name,work_year,media_long,dimension,gallery_name,...,education,location,color1,color2,color3,color4,color_group1,color_group2,color_group3,color_group4
0,painting15,painting,https://www.artsy.net/artwork/hebru-brantley-d...,Hebru Brantley,https://www.artsy.net/artist/hebru-brantley,"Death Of The Black Fairy, The Great Debate Pt. 5",2020,Acrylic & Diamond Dust on Canvas,60 × 60 in | 152.4 × 152.4 cm,Corridor Contemporary,...,Clark Atlanta University,Chicago,#E7DFCC,#A69784,#403931,#877C76,69,24,192,246
1,prints369,prints,https://www.artsy.net/artwork/hebru-brantley-f...,Hebru Brantley,https://www.artsy.net/artist/hebru-brantley,Flower N Word,2022,"22-color Hand-Pulled Screen Print, Saunders Wa...",40 × 30 in | 101.6 × 76.2 cm,Pinto Gallery,...,Clark Atlanta University,Chicago,,,,,-1,-1,-1,-1
2,prints826,prints,https://www.artsy.net/artwork/hebru-brantley-m...,Hebru Brantley,https://www.artsy.net/artist/hebru-brantley,Monoprint,2017,Hand-Pulled Screen Print on Mohawk Superfine U...,24 × 18 in | 61 × 45.7 cm,MODCLAIR,...,Clark Atlanta University,Chicago,,,,,-1,-1,-1,-1
3,photography133,photography,https://www.artsy.net/artwork/hebru-brantley-n...,Hebru Brantley,https://www.artsy.net/artist/hebru-brantley,Night Flight,2021,Digital print with screen printed glow in the ...,29 × 36 in | 73.7 × 91.4 cm,Heritage: Urban Art,...,Clark Atlanta University,Chicago,,,,,-1,-1,-1,-1
4,painting37,painting,https://www.artsy.net/artwork/miaz-brothers-ma...,Miaz Brothers,https://www.artsy.net/artist/miaz-brothers,Maestro,2022,Acrylic on canvas,40 1/5 × 29 9/10 in | 102 × 76 cm,Maddox Gallery,...,European Institute of Design,Cyan,#181A15,#2F342E,#43492F,#B8A36C,121,280,147,12


In [118]:
color_group_df = pd.read_csv("../Final_Streamlit/data/Final_Dataset_w_colorgroup.csv")

In [120]:
color_group_df = color_group_df[['id', 'color1','color2', 'color3', 'color4', 'color_group1', 'color_group2', 'color_group3', 'color_group4']]



In [122]:
final_merged_df = final_merged_df.merge(color_group_df, on="id")

In [124]:
final_merged_df.to_csv("../FinalDataSetCreation/FinalDataset_with_all.csv")

# Manual check 

In [70]:
random_checks = []
for idx in random_idx:
    artist = artist_profiles_list[idx]
    random_checks.append((artist, artist_profiles[artist]))

In [71]:
random_checks

[('Stan Maksun', {'nationality': 'American'}),
 ('Julian Stanczak',
  {'nationality': 'American',
   'education': 'School of Art and Architecture',
   'location': 'Poland'}),
 ('Ernie Barnes', {'nationality': 'American'}),
 ('Jim Perry', {'nationality': 'American'}),
 ('Lois Gross Smiley', {'nationality': 'American'}),
 ('Bisser', {'nationality': 'Belgian', 'location': 'Ibiza'}),
 ('Jeffrey Maron', {'nationality': 'American', 'location': 'Watercolor'}),
 ('Jean Louis Liberte', {'nationality': 'French'}),
 ('Oyvind Fahlstrom', {'nationality': 'Swedish'}),
 ('Donald Lipski', {'nationality': 'American'}),
 ('Manfred Pernice', {'nationality': 'German', 'location': 'Berlin'}),
 ('Leonetto Cappiello', {'nationality': 'French', 'location': 'France'}),
 ('Mathew Hale',
  {'nationality': 'British',
   'education': 'Harvard University',
   'location': 'Berlin'}),
 ('Evan Hecox', {'location': 'Colorado'}),
 ('Natalya Zaloznaya',
  {'education': 'the Belorussian State University of Theatre and\nAr

In [76]:
print(full_bio_df[full_bio_df['artist_names'] == "Stan Maksun"])

      Unnamed: 0 artist_names  \
1265        1265  Stan Maksun   

                                              full_bios  
1265  Combining elements of horror,\nfantasy and sci...  
