## **MoMA Collection Data Processing**
## Table of Contents
### 1. [General Cleaning](#general)
### 2. [Cleaning Dates](#dates)
### 3. [Extracting from Biographies](#bio-extraction)

In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
df = pd.read_csv('./collection/Artworks.csv')
# met_ref = pd.read_csv('./openaccess-master/MetObjects.csv', low_memory=False)

## General Cleaning <a class='anchor' id='general'></a>
### Cleaning Punctuation <a class="anchor" id="punctuation"></a>

In [3]:
print('{:,} total rows \n{:,} rows have dates correctly formatted \n'.format(len(df), len(df[df.Date.str.len()==4])))

140,848 total rows 
91,364 rows have dates correctly formatted 



In [4]:
df['Nationality'].fillna('Nationality unknown',inplace=True)
df.fillna('-', inplace=True)
df['Cataloged'].replace({'Y':True,'N':False}, inplace=True)

df['cleanedDate'] = 0
df['Date2'] = 0

In [5]:
output = ''

[output.join(i) for i in list(set([i.replace(')','').replace('(','') for i in re.split('\)\s\(',"(American) (American) (French)") if len(i) > 0 ]))]
print(output)




In [6]:
def parse_multi_nat(input: str) -> str:
    return ' '.join(list(set([i.replace(')','').replace('(','').replace('\t','') for i in re.split('\)\s\(',input) if len(i) > 0 ]))).strip()

In [7]:
# # testing
# parse_multi_nat('(American) (American) (Brazilian) (French) () (American)')
# parse_multi_nat('(American) (American) () (American)')
parse_multi_nat('(German) (Swedish) (German)	')

'German Swedish'

In [8]:
multi_nationality = df[df.Nationality.str.contains('\) ')].index
df.loc[multi_nationality,'cleanedNationality'] = df.loc[multi_nationality,'Nationality'].apply(lambda x: parse_multi_nat(x))

In [9]:
def strip_punct(df: pd.DataFrame, cols: [str, list], punct: str) -> pd.DataFrame:
    ''' 
    Takes in dataframe, column name or list of columns, and regex string of characters to remove
    overwrites original column with corrected column
    '''
    if type(cols) == str:
        df.loc[:,cols] = df.loc[:,cols].str.replace(f'{punct}','',regex=True)
    if type(cols) == list:
        for i in cols:
            df.loc[:,i] = df.loc[:,i].str.replace(f'{punct}','',regex=True)

In [10]:
strip_punct(df,['BeginDate','EndDate','Date','Gender'],'[^0-9\-]')
strip_punct(df,'Nationality','[^A-Za-z0-9\,\-\s]')
strip_punct(df,'ArtistBio','[\(\)]')

In [11]:
clean_nats = df[df.cleanedNationality.isnull()].index
df.loc[clean_nats,'cleanedNationality'] = df.loc[clean_nats,'Nationality']
# df.loc[:,'cleanedNationality'] = df.loc[:,'cleanedNationality'].str.replace(' Nationality unkown ','')

In [12]:
df.cleanedNationality.unique()

array(['Austrian', 'French', '', 'American', 'German', 'Dutch', 'Italian',
       'Swedish', 'American French', 'British', 'Japanese',
       'Dutch British', 'Argentine', 'Brazilian', 'Swiss', 'Luxembourger',
       'Spanish', 'Polish Austrian', 'Russian', 'Iranian',
       'Swiss American', 'German American Spanish Dutch Belgian Canadian',
       'French German American Japanese Dutch Belgian',
       'Dutch American Japanese French', 'Dutch Norwegian',
       'Swiss French', 'Finnish', 'German American', 'German Swiss',
       'Dutch American Japanese', 'Japanese  Italian', 'Canadian',
       'Nationality unknown', 'Danish', 'Belgian', 'American Italian',
       'Czech', 'Moroccan', 'Coptic', 'Persian', 'American Canadian',
       'Colombian', 'Dutch Danish', 'German Italian', 'Australian',
       'Chinese', 'Mexican', 'Slovenian', 'Scottish American',
       'German Swedish', 'Hungarian', 'Japanese Italian',
       'British American', 'Italian Argentine', 'Swedish American',
      

## Extracting From Biographies <a class="anchor" id="bio-extraction"></a>
### [Extracting birthplace](#birthplace), if listed
### [Imputing nationalities](#impute-nationalities)

In [13]:
bio2ref = df[df.ArtistBio.str.contains(',')==True].index

df.loc[:,'NationalityBio'] = df.loc[:,'ArtistBio'].apply(lambda x: x.split(',')[0])
df.loc[bio2ref,'Bio2'] = df.loc[bio2ref,'ArtistBio'].apply(lambda x: x.split(',')[1])

In [14]:
df[df.Bio2.str.contains('born')==True].ArtistBio.value_counts()

American, born France. 1911–2010                                   3339
American, born Germany. 1886–1969                                  2657
American, born 1934                                                1534
French, born Belarus. 1887–1985                                    1166
American, born Lithuania. 1931–1978                                 831
                                                                   ... 
German, born Bohemia. 1875–1951                                       1
Swedish, born 1967                                                    1
Italian, born 1966 Italian, born 1972 Italian, established 1998       1
German, born 1897                                                     1
Malian, born 1953                                                     1
Name: ArtistBio, Length: 3230, dtype: int64

In [25]:
bp_ref = df[df.ArtistBio.str.contains('born')==True].index

for i in bp_ref:
    output = list(set(re.findall('born\s\d{0,}\s{0,1}([A-Za-z]+)', df.loc[i,'ArtistBio'])))
    
    if len(output) == 1:
        df.loc[i,'Birthplace'] = output[0]
    elif len(output) == 0:
        df.loc[i,'Birthplace'] = 'N/A'
    else:
        df.loc[i,'Birthplace'] = [''.join(i) for i in output][0]
        
df['Birthplace'].fillna('N/A', inplace=True)

In [28]:
[' '.join(i) for i in output][0]

'Germany'

In [31]:
df.Birthplace.value_counts()[20:40]

the                                     257
Canada                                  252
Cuba                                    244
India                                   148
Korea                                   141
Japan                                   140
Latvia                                  137
Bohemia                                 131
['Uruguay', 'American', 'Argentine']    130
Estonia                                 129
British                                 122
Chinese                                 120
Spain                                   119
in                                      117
Brass                                   112
Sweden                                  108
Argentina                               102
Ireland                                  87
Swiss                                    81
Italian                                  78
Name: Birthplace, dtype: int64

In [38]:
# Answer
primes = [] # Set a list to catch prime values

for i in range(3, 2000): 
    # All statement evaluates to true if all of the iterables satisfy the criteria
    # If i divided by the existing primes(x) never has a remainder of 0
    if all(i % x != 0 for x in primes):
        # Append this number to the primes list
        primes.append(i)

sum(primes)

277052

In [32]:
df[df.Birthplace=="['Uruguay', 'American', 'Argentine']"]

Unnamed: 0,Title,Artist,ConstituentID,ArtistBio,Nationality,BeginDate,EndDate,Gender,Date,Medium,...,Weight (kg),Width (cm),Seat Height (cm),Duration (sec.),cleanedDate,Date2,cleanedNationality,NationalityBio,Bio2,Birthplace
118066,"ATC (Argentina Televisora Color), Buenos Aires...","Flora Manteola, Javier Sánchez Gómez, Josefa S...","44160, 44161, 44162, 44163, 7229, 46149, 46151","Argentine, born 1936 Argentine, born 1936 Arge...",Argentine Argentine Argentine Argentine Americ...,1936193619311931194400,0000000,,1977,Ink on vellum,...,-,100.0,-,-,0,0,American Argentine,Argentine,born 1936 Argentine,"['Uruguay', 'American', 'Argentine']"
118067,"ATC (Argentina Televisora Color), Buenos Aires...","Flora Manteola, Javier Sánchez Gómez, Josefa S...","44160, 44161, 44162, 44163, 7229, 46149, 46151","Argentine, born 1936 Argentine, born 1936 Arge...",Argentine Argentine Argentine Argentine Americ...,1936193619311931194400,0000000,,1977,Ink on vellum,...,-,100.0,-,-,0,0,American Argentine,Argentine,born 1936 Argentine,"['Uruguay', 'American', 'Argentine']"
118068,"ATC (Argentina Televisora Color), Buenos Aires...","Flora Manteola, Javier Sánchez Gómez, Josefa S...","44160, 44161, 44162, 44163, 7229, 46149, 46151","Argentine, born 1936 Argentine, born 1936 Arge...",Argentine Argentine Argentine Argentine Americ...,1936193619311931194400,0000000,,1977,Ink on vellum,...,-,100.0,-,-,0,0,American Argentine,Argentine,born 1936 Argentine,"['Uruguay', 'American', 'Argentine']"
118069,"ATC (Argentina Televisora Color), Buenos Aires...","Flora Manteola, Javier Sánchez Gómez, Josefa S...","44160, 44161, 44162, 44163, 7229, 46149, 46151","Argentine, born 1936 Argentine, born 1936 Arge...",Argentine Argentine Argentine Argentine Americ...,1936193619311931194400,0000000,,1977,Ink on vellum,...,-,90.0,-,-,0,0,American Argentine,Argentine,born 1936 Argentine,"['Uruguay', 'American', 'Argentine']"
118070,"ATC (Argentina Televisora Color), Buenos Aires...","Flora Manteola, Javier Sánchez Gómez, Josefa S...","44160, 44161, 44162, 44163, 7229, 46149, 46151","Argentine, born 1936 Argentine, born 1936 Arge...",Argentine Argentine Argentine Argentine Americ...,1936193619311931194400,0000000,,1977,Ink on vellum,...,-,74.0,-,-,0,0,American Argentine,Argentine,born 1936 Argentine,"['Uruguay', 'American', 'Argentine']"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120425,"Conjunto Habitacional Rioja, Buenos Aires, Arg...","Flora Manteola, Ignacio Petchersky, Javier Sán...","44160, 46152, 44161, 44162, 44163, 7229","Argentine, born 1936 Argentine Argentine, born...",Argentine Argentine Argentine Argentine Argent...,193601936193119311944,000000,,1969,Ink on paper,...,-,-,-,-,0,0,American Argentine,Argentine,born 1936 Argentine Argentine,"['Uruguay', 'American', 'Argentine']"
120426,"Casa de verano Sanchez Gomez, La Barra, Punta ...","Flora Manteola, Ignacio Petchersky, Javier Sán...","44160, 46152, 44161, 44162, 44163, 7229","Argentine, born 1936 Argentine Argentine, born...",Argentine Argentine Argentine Argentine Argent...,193601936193119311944,000000,,1972,drawings,...,-,-,-,-,0,0,American Argentine,Argentine,born 1936 Argentine Argentine,"['Uruguay', 'American', 'Argentine']"
120427,"Barrio Piedrabuena, Buenos Aires, Argentina","Flora Manteola, Javier Sánchez Gómez, Josefa S...","44160, 44161, 44162, 44163, 7229, 46149, 46151","Argentine, born 1936 Argentine, born 1936 Arge...",Argentine Argentine Argentine Argentine Americ...,1936193619311931194400,0000000,,1974,photographs,...,-,-,-,-,0,0,American Argentine,Argentine,born 1936 Argentine,"['Uruguay', 'American', 'Argentine']"
120428,"Torre UIA (Union Industrial Argentina), Buenos...","Flora Manteola, Javier Sánchez Gómez, Josefa S...","44160, 44161, 44162, 44163, 7229, 46149, 46151","Argentine, born 1936 Argentine, born 1936 Arge...",Argentine Argentine Argentine Argentine Americ...,1936193619311931194400,0000000,,1972-1974,Drawings,...,-,-,-,-,0,0,American Argentine,Argentine,born 1936 Argentine,"['Uruguay', 'American', 'Argentine']"


In [63]:
df[(df.Nationality==None)]

Unnamed: 0,Title,Artist,ConstituentID,ArtistBio,Nationality,BeginDate,EndDate,Gender,Date,Medium,...,Length (cm),Weight (kg),Width (cm),Seat Height (cm),Duration (sec.),Birthplace,cleanedDate,Date2,NationalityBio,Bio2


In [64]:
df['Nationality'].unique()

array(['Austrian', 'French', '', 'American', 'German', 'DutchDutch',
       'ItalianItalianItalian', 'Swedish', 'SwedishSwedish',
       'AmericanFrenchAmerican', 'AmericanAmericanFrench', 'British',
       'AmericanAmericanAmerican', 'Japanese', 'DutchBritishBritishDutch',
       'Italian', 'AmericanAmericanAmericanAmerican', 'Argentine',
       'BrazilianBrazilian', 'Swiss', 'Brazilian', 'Luxembourger',
       'AmericanAmerican', 'ItalianItalianItalianItalianItalian',
       'Spanish', 'DutchBritishDutchBritish', 'AustrianPolish', 'Dutch',
       'Russian', 'Iranian', 'AmericanSwiss',
       'DutchGermanDutchGermanCanadianBelgianAmericanDutchSpanish',
       'SwissSwissSwiss', 'FrenchFrench', 'JapaneseJapanese',
       'DutchBritish',
       'ItalianItalianItalianItalianItalianItalianItalian',
       'ItalianItalian', 'ItalianItalianItalianItalianItalianItalian',
       'BritishBritish',
       'AmericanAmericanAmericanAmericanAmericanAmerican',
       'DutchDutchBelgianFrenchGermanA

In [58]:
df['Bio2'].unique()[0:100]

array([' 1841–1918', ' born 1944', ' 1876–1957', ' born Switzerland 1944',
       ' born Estonia. 1901–1974', ' 1878–1969', ' 1878–1961',
       ' 1869–1936', ' born Austria. 1933–2010', ' born 1932',
       ' born 1944 Dutch', ' 1949–1991', ' 1934–2015',
       ' 1931–1997 Italian', ' born Germany. 1886–1969', ' 1885–1940',
       ' 1885–1940 Swedish', ' 1895–1979 French', ' 1895–1979 American',
       ' born 1947', ' born Germany now Poland. 1887–1953',
       ' born Italy. 1919–2013', ' 1918–1997', ' born 1936',
       ' born Poland 1954 American', ' born Iraq. 1950–2016',
       ' born 1928', ' born 1944 British', ' 1867–1959', ' born 1935',
       ' est. 1964–1979 American', ' established 1980 American',
       ' born 1943', ' born 1941', ' born 1931', ' 1883–1931 Dutch',
       ' 1909–1994 Brazilian', ' born 1957', ' 1895–1983', ' 1909–1994',
       ' born Austria. 1900–1985', ' born 1946', ' born Poland 1946',
       ' born 1944 American', ' 1931–1997', ' born 1956', ' 1927–2016

In [48]:
df[df.Birthplace.str.contains('\[')==True].Birthplace.unique()

array(["['British', 'Greece']", "['American', 'Zambia']",
       "['Estonia', 'China']",
       "['Belgian', 'Canadian', 'American', 'Dutch', 'Spanish', 'German']",
       "['Belgian', 'Japanese', 'American', 'Dutch', 'French', 'German']",
       "['American', 'Dutch', 'Japanese', 'French']",
       "['Dutch', 'Norwegian']", "['American', 'German']",
       "['German', 'Swiss']", "['American', 'Dutch', 'Japanese']",
       "['American', 'Canadian']", "['German', 'Indonesia']",
       "['Germany', 'Austria']", "['German', 'Swedish']",
       "['Finland', 'Germany']", "['American', 'England']",
       "['Belarus', 'Germany']", "['American', 'Argentine']",
       "['Hungary', 'Austria']", "['Spain', 'Germany']",
       "['American', 'Bohemia']", "['Switzerland', 'Germany']",
       "['Russia', 'Greece', 'Switzerland', 'Lithuania', 'Germany']",
       "['Bulgaria', 'Germany']",
       "['American', 'Australian', 'French', 'German', 'British']",
       "['American', 'Latvia', 'British', 'Ge

### Impute Nationalities <a class="anchor" id="impute-nationalities"></a>

In [24]:
# checking mismatches
len(df[df.Nationality!=df.NationalityBio])

AttributeError: 'DataFrame' object has no attribute 'NationalityBio'

In [405]:
# pulling top 10 donors, by volume
[print('{:,} items from {}'.format(df.CreditLine.value_counts()[i], i)) for i in df.CreditLine.value_counts().index[0:10]];

11,258 items from The Louis E. Stern Collection
10,612 items from Gift of the artist
8,392 items from Purchase
5,438 items from The Gilbert and Lila Silverman Fluxus Collection Gift
4,929 items from Abbott-Levy Collection. Partial gift of Shirley C. Burden
2,472 items from The Judith Rothschild Foundation Contemporary Drawings Collection Gift
2,383 items from Gift of Kleiner, Bell & Co.
1,889 items from Gift of Abby Aldrich Rockefeller
1,863 items from -
1,686 items from Gift of The Judith Rothschild Foundation


2001

In [273]:
# pulling remaining, most common formatting issues
df[df.cleanedDate==0].Date.value_counts()[0:10]

0             2105
              1043
193860         512
1947491949     164
1945481948     124
1947531955     122
196465          89
1947581958      83
1976922007      83
1957611961      80
Name: Date, dtype: int64

In [274]:
df[(df.Date==0)&(df.BeginDate.str.len()==4)]

Unnamed: 0,Title,Artist,ConstituentID,ArtistBio,Nationality,BeginDate,EndDate,Gender,Date,Medium,...,Depth (cm),Diameter (cm),Height (cm),Length (cm),Weight (kg),Width (cm),Seat Height (cm),Duration (sec.),cleanedDate,Date2
5686,Artistsas Britanicos - Buenos Aires,E. McKnight Kauffer,3020,"(American, 1890–1954)",(American),1890,1954,(Male),0,poster,...,,,76.2,,,50.800102,,,0,0
7270,Harpers May,Edward Penfield,4547,"(American, 1866–1925)",(American),1866,1925,(Male),0,Lithograph,...,,,45.0,,,30.500000,,,0,0
37004,Luxembourg,Eugène Atget,229,"(French, 1857–1927)",(French),1857,1927,(Male),0,Matte albumen silver print,...,,,,,,,,,0,0
37825,"UNTITLED. (Delahaye graines, plantes)",Eugène Atget,229,"(French, 1857–1927)",(French),1857,1927,(Male),0,Albumen silver print,...,,,,,,,,,0,0
38819,VERSAILLES -- BACCHUS,Eugène Atget,229,"(French, 1857–1927)",(French),1857,1927,(Male),0,Albumen silver print,...,,,,,,,,,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135315,The Black Factory Archive,Pope.L,37145,"(American, born 1955)",(American),1955,0,(Male),0,Business card,...,,,,,,,,,0,0
138493,Colour Piece,Henning Christiansen,36942,"(Danish, 1932–2008)",(Danish),1932,2008,(Male),0,,...,,,29.7,,,21.100000,,,0,0
138497,Opus 19 and 20,Henning Christiansen,36942,"(Danish, 1932–2008)",(Danish),1932,2008,(Male),0,Mimeograph?,...,,,29.9,,,21.100000,,,0,0
138664,Here I Come Sweatshirt,Robert Watts,6269,"(American, 1923–1988)",(American),1923,1988,(Male),0,Permanent marker on fabric,...,0.0,,65.6,,,148.900000,,,0,0


In [257]:
df[(df.Date==0)|(df.Date==None)]

Unnamed: 0,Title,Artist,ConstituentID,ArtistBio,Nationality,BeginDate,EndDate,Gender,Date,Medium,...,Depth (cm),Diameter (cm),Height (cm),Length (cm),Weight (kg),Width (cm),Seat Height (cm),Duration (sec.),cleanedDate,Date2
5686,Artistsas Britanicos - Buenos Aires,E. McKnight Kauffer,3020,"(American, 1890–1954)",(American),(1890),(1954),(Male),0,poster,...,,,76.2,,,50.800102,,,0,0
5693,London Transport poster,Unidentified Artist,36597,(British),(British),(0),(0),(),0,Poster,...,,,,,,,,,0,0
7270,Harpers May,Edward Penfield,4547,"(American, 1866–1925)",(American),(1866),(1925),(Male),0,Lithograph,...,,,45.0,,,30.500000,,,0,0
37004,Luxembourg,Eugène Atget,229,"(French, 1857–1927)",(French),(1857),(1927),(Male),0,Matte albumen silver print,...,,,,,,,,,0,0
37825,"UNTITLED. (Delahaye graines, plantes)",Eugène Atget,229,"(French, 1857–1927)",(French),(1857),(1927),(Male),0,Albumen silver print,...,,,,,,,,,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135315,The Black Factory Archive,Pope.L,37145,"(American, born 1955)",(American),(1955),(0),(Male),0,Business card,...,,,,,,,,,0,0
138493,Colour Piece,Henning Christiansen,36942,"(Danish, 1932–2008)",(Danish),(1932),(2008),(Male),0,,...,,,29.7,,,21.100000,,,0,0
138497,Opus 19 and 20,Henning Christiansen,36942,"(Danish, 1932–2008)",(Danish),(1932),(2008),(Male),0,Mimeograph?,...,,,29.9,,,21.100000,,,0,0
138664,Here I Come Sweatshirt,Robert Watts,6269,"(American, 1923–1988)",(American),(1923),(1988),(Male),0,Permanent marker on fabric,...,0.0,,65.6,,,148.900000,,,0,0


In [142]:
# for i in df.loc[ref, 'Date'].index:
#     f = df.loc[i,'Date'][:4]
#     s = df.loc[i,'Date'][4:]
    
#     df.loc[i,'Date'] = f
#     df.loc[i,'Date2'] = s