In [1]:
# import zip_longest from itertools
from itertools import zip_longest
import pandas as pd
import io

In [2]:
# Print the number of lines in the file from the shell script
no_of_lines = ! wc -l "MetObjects.csv"
no_of_lines

["'wc' is not recognized as an internal or external command,",
 'operable program or batch file.']

In [3]:
def return_dict_from_csv_line(header, line):
    # zip them
    zipped_line = zip_longest(header, line, fillvalue='NA')
    # Use dict comprehension to generate the final dict
    ret_dict = {kv[0]: kv[1] for kv in zipped_line}
    return ret_dict

In [5]:
# Reading the encoding characterset for the dataset.
# Although I tried to read using the Windows-1252 encoding but was not able to read, so used latin1 instead 
import chardet
with open("MetObjects.csv", 'rb') as rawdata:
    result = chardet.detect(rawdata.read(100000))
result

{'encoding': 'Windows-1252', 'confidence': 0.728850927952508, 'language': ''}

In [6]:
# The above charset did not work so using latin1
lines=[]
with io.open("MetObjects.csv", 'r', encoding='latin1') as fd:
    first_line = fd.readline()
    header = first_line.replace("\n","").split(",")
    for i, line in enumerate(fd):
        line = line.replace("\n", "").split(",")
        d = return_dict_from_csv_line(header, line)
        lines.append(d)

In [9]:
# Reading the first line into a dataframe
# Using the same encoding here also
with io.open("MetObjects.csv", 'r', encoding='latin1') as fd:
    first_line = fd.readline()
    header = first_line.replace("\n", "").split(",")
col_header = header

In [10]:
col_header

['"Object Number',
 'Is Highlight',
 'Is Public Domain',
 'Object ID',
 'Department',
 'Object Name',
 'Title',
 'Culture',
 'Period',
 'Dynasty',
 'Reign',
 'Portfolio',
 'Artist Role',
 'Artist Prefix',
 'Artist Display Name',
 'Artist Display Bio',
 'Artist Suffix',
 'Artist Alpha Sort',
 'Artist Nationality',
 'Artist Begin Date',
 'Artist End Date',
 'Object Date',
 'Object Begin Date',
 'Object End Date',
 'Medium',
 'Dimensions',
 'Credit Line',
 'Geography Type',
 'City',
 'State',
 'County',
 'Country',
 'Region',
 'Subregion',
 'Locale',
 'Locus',
 'Excavation',
 'River',
 'Classification',
 'Rights and Reproduction',
 'Link Resource',
 'Metadata Date',
 'Repository',
 'Tags"']

In [11]:
metobj_df = pd.DataFrame(data = lines, columns=col_header)
print(metobj_df.shape)

(12108, 44)


In [12]:
# Removing spaces from the dataframe columns
metobj_df = metobj_df.rename(columns=lambda x: x.strip())
metobj_df.shape

(12108, 44)

In [13]:
metobj_df.describe

<bound method NDFrame.describe of           "Object Number Is Highlight Is Public Domain Object ID  \
0            "1979.486.1        False            False         1   
1            "1980.264.5        False            False         2   
2              "67.265.9        False            False         3   
3             "67.265.10        False            False         4   
4             "67.265.11        False            False         5   
...                  ...          ...              ...       ...   
12103  "1974.356.1 recto        False            False     11814   
12104          "54.143.8        False            False     11815   
12105        "1976.201.4        False            False     11816   
12106            "64.118        False            False     11817   
12107                  4           NA               NA        NA   

              Department Object Name                                Title  \
0      The American Wing        Coin         One-dollar Liberty Head Coi

In [14]:
# Filter out missing data
# Using how = all will drop rows that are all NA
# So after using the parameter it did not drop a single row
metobj_df_1 = metobj_df.dropna(how='all')
metobj_df_1.shape

(12108, 44)

In [15]:
# Removing duplicates
# After dropping the duplicates the new row count
metobj_df_1 = metobj_df.drop_duplicates()

In [16]:
bef4 = metobj_df.shape[0]
aftr = metobj_df_1.shape[0]
print("The row count before dropping duplicates is: {}.""\n""The row count after dropping the duplicates is: {}.".format(bef4,aftr))

The row count before dropping duplicates is: 12108.
The row count after dropping the duplicates is: 11923.


In [17]:
# So all rows that were duplicate are dropped from the actual dataframe 
# The new data frame with no duplicates is metobj_df_1
metobj_dedup = metobj_df_1

In [18]:
metobj_dedup.shape

(11923, 44)

In [19]:
# Verifying some sample data from the data frame
metobj_dedup['Object Name'].unique()

array(['Coin', 'Peso', 'Centavos', 'Pesos', 'Bust', 'Clock', 'Vase', 'NA',
       '', 'Side Chair', 'Figure', 'Advertisement', 'Ale glass',
       'Andiron', 'Apothecary jar', 'Statuette', 'Frame',
       'Architectural elements', 'Loggia elements', 'Bedroom',
       'The American Wing', 'Argand lamp', 'Lamp shade', 'Armchair',
       'Banister-back armchair', 'Drawing', 'Slat-back armchair',
       'Spindle-back armchair', 'Teapot', 'Rocking chair',
       'Reclining armchair', 'Folding armchair', 'Side chair',
       'Revolving armchair', 'Furniture hardware', 'Astral lamp',
       'Window', 'Aviary', 'Baking dish', 'Balcony', 'Ball', 'Wall panel',
       'Baluster', 'Banjo clock', 'Bank', 'Baptismal basin',
       'Baptismal bowl', 'Baptismal certificate', 'Bottle', 'Mug', 'Keg',
       'Basin', 'Wash basin', 'Basin Stand', 'Basin stand', 'Basket',
       'Door', 'Oil cruet', 'Beaker', 'Pulque beaker', 'Made in',
       'Tumbler', 'Paris', 'Barnstable', 'Bean pot', 'Folding bedstead

In [88]:
metobj_dedup['Object ID'].nunique()

10072

In [89]:
# Showing the various values in a coloum
metobj_dedup['Object Name']

0              Coin
1              Coin
2              Coin
3              Coin
4              Coin
            ...    
12103    Watercolor
12104    Watercolor
12105    Watercolor
12106    Watercolor
12107            NA
Name: Object Name, Length: 11923, dtype: object

In [90]:
# After looking at the column values it looks like the data is not good here for Object name
metobj_dedup.groupby('Object Name').groups.keys()

dict_keys(['', ' 1953""', ' 1967""', ' 1982""', ' 1985""', ' Bequest of Edward W. C. Arnold', ' Drue Heinz', ' Elinor Irwin Holden', ' Friends of the American Wing Fund', ' in memory of Berry B. Tracy', '""Birth', '""Birth and baptismal certificate', '""Birth certificate', '""Bolt', '""Box', '""Bust', '""Cameo medallion', '""Certificate', '""Corner cupboard from a House in Lancaster County', '""Drawing', '""Exterior Doorway from One West Broad Street', '""Fire Tongs"', '""Flyleaf', '""Latch', '""Latch lock', '""Love token', '""Manuscript sampler', '""Medal', '""Minature', '""Painting', '""Plan', '""Plaque', '""Plate"', '""Rattle', '""Rogers Fund', '""Snuffbox', '""Stained glass window from St. Paul\'s United Methodist Church', '""Teakettle', '""Watercolor', '32 3/4 x 79 3/8 x 30 1/2 in. (83.2 x 201.6 x 77.5 cm)', '3757', 'Advertisement', 'Albany', 'Ale glass', 'Ale pitcher', 'American', 'Andiron', 'Apothecary jar', 'Appliqu\x8e', 'Architectural elements', 'Argand lamp', 'Armchair', 'As

In [124]:
# After looking at the column values it looks like the data is not good here for Department name
metobj_dedup.groupby('Department').groups.keys()

dict_keys(['', ' 1915""', ' 1918""', ' 1928""', ' 1931""', ' 1932""', ' 1933""', ' 1946""', ' 1950""', ' 1958""', ' 1960""', ' 1961""', ' 1969""', ' 1972""', ' 1979""', ' 1980""', ' 1984""', ' 2014""', ' Alexander Orr Vietor', ' Friends of the American Wing and"', ' Georgia""', ' Joseph Pulitzer Bequest', ' Mr. and Mrs. Robert G. Goelet', ' Mrs. Russell"', ' The Horace W. Goldsmith Foundation Gift', ' a founder of The Metropolitan Museum of Art', ' and Pictures', ' by exchange', ' glass with"', ' in devoted memory of their grandmother', ' white pine""', '""Purchase', '10073', '1017', '10170', '1018', '1019', '1020', '1021', '1022', '1029', '1048', '1049', '10752', '1080', '1081', '1082', '1088', '1089', '1120', '11390', '1149', '11492', '1150', '1151', '1152', '1205', '1223', '1224', '1251', '1252', '1253', '1254', '1295', '1296', '1297', '1298', '1314', '1321', '1339', '1340', '1350', '1351', '1352', '1353', '1358', '1359', '1360', '1361', '137', '1370', '1371', '1372', '1373', '1374'

In [21]:
objects = metobj_dedup.groupby('Department')

In [22]:
# Showing the row for one department
objects.get_group("2110")

Unnamed: 0,"""Object Number",Is Highlight,Is Public Domain,Object ID,Department,Object Name,Title,Culture,Period,Dynasty,...,Locale,Locus,Excavation,River,Classification,Rights and Reproduction,Link Resource,Metadata Date,Repository,"Tags"""
2192,"""""""60.111.168",".184""""",False,True,2110,The American Wing,Coffee cup and saucer,Coffee Cup and Saucer,"""""Chinese","for American market""""",...,,,,,,,,,,


In [128]:
# List of object Date
# As the dataset is not properly cleaned so not getting the dates.
metobj_dedup.groupby('Object Date').groups.keys()

dict_keys(['', ' 1818Ð1888""', ' 1837Ð1958""', ' 1844Ð1900""', ' A. and H.""', ' Agnes F.""', ' Albert Pinkham""', ' Alexander Stirling""', ' Alfred Cornelius""', ' Alvan""', ' Ambrose""', ' Andrea""', ' Andrew""', ' Anna""', ' Anne""', ' Asher Brown""', ' Augustin|Tassie', ' Ball and Poor""', ' Benjamin Henry""', ' Benjamin""', ' Boston', ' Bryson""', ' Cann & Dunn""', ' Cecilia""', ' Charles', ' Charles Caryl""', ' Charles H""', ' Charles Henry""', ' Charles W.""', ' Charles Willson""', ' Charles""', ' Charles|Volkmar Ceramic Company""', ' Chester (Charles)""', ' Childe""', ' Christian""', ' Christian|Alexander', ' Colin Cambell""', ' D.""', ' Daniel Chester""', ' Daniel|Inman', ' Dard|Roycroft ""', ' David Claypoole""', ' David""', ' Deming|Boston and Sandwich Glass Company ""', ' Dewitt""', ' E. & W.', ' Eastman""', ' Edith Woodman""', ' Edward Augustus""', ' Edward""', ' Eli""', ' Elizabeth""', ' Emil""', ' Emile|Peale', ' Emile|Savage', ' Emile|Stuart', ' Erastus Dow""', ' Ester"

In [126]:
# List of object names
metobj_dedup.groupby('Classification').groups.keys()

dict_keys(['', ' Allison V. Armour', ' Bequest of Edward W. C. Arnold', ' The Manney Collection Gift', ' The Sylmaris Collection', '""John Stewart Kennedy Fund', '2019-07-31T03:00:40.447Z', 'Biloxi', 'Boston', 'Chicago', 'England', 'Frames', 'France', 'Glass', 'Haute-Vienne', 'Made in', 'Metal', 'Midwest ', 'Mississippi', 'NA', 'Natural Substances', 'New England ', 'New York', 'United States', 'United States|Germany', 'United States|United States'])