# Exploring pandas data input 

* Text Files
  * csv - a popular format. Pandas can accept various separators.
  * json - widely used on the web; supports nested objects.
  * html - tables embedded in web pages; `pd.read_html` parses them into DataFrames.

* Binary Files
  * Useful for exchanging data between different software (e.g. pandas and Excel).
  * Useful for optimizing IO performance. 

* Relational databases
  * Pandas can read data from many relational databases.
  
Original data available here: https://github.com/tategallery/collection

In [3]:
# load csv file 
# inspect the DataFrame

import pandas as pd
import os

In [4]:
# Path to the artwork CSV inside the local ./data directory
CSV_PATH = os.path.join('./data', 'artwork_data.csv')

In [5]:
# Read only the first 5 data rows (nrows=5): enough to inspect the columns
# without loading the whole file
df = pd.read_csv(CSV_PATH, nrows=5)

In [6]:
# No index column was specified, so pandas adds a default integer index (0-4 here)
df

Unnamed: 0,id,accession_number,artist,artistRole,artistId,title,dateText,medium,creditLine,year,acquisitionYear,dimensions,width,height,depth,units,inscription,thumbnailCopyright,thumbnailUrl,url
0,1035,A00001,"Blake, Robert",artist,38,A Figure Bowing before a Seated Old Man with h...,date not known,"Watercolour, ink, chalk and graphite on paper....",Presented by Mrs John Richmond 1922,,1922,support: 394 x 419 mm,394,419,,mm,,,http://www.tate.org.uk/art/images/work/A/A00/A...,http://www.tate.org.uk/art/artworks/blake-a-fi...
1,1036,A00002,"Blake, Robert",artist,38,"Two Drawings of Frightened Figures, Probably f...",date not known,Graphite on paper,Presented by Mrs John Richmond 1922,,1922,support: 311 x 213 mm,311,213,,mm,,,http://www.tate.org.uk/art/images/work/A/A00/A...,http://www.tate.org.uk/art/artworks/blake-two-...
2,1037,A00003,"Blake, Robert",artist,38,The Preaching of Warning. Verso: An Old Man En...,?c.1785,Graphite on paper. Verso: graphite on paper,Presented by Mrs John Richmond 1922,1785.0,1922,support: 343 x 467 mm,343,467,,mm,,,http://www.tate.org.uk/art/images/work/A/A00/A...,http://www.tate.org.uk/art/artworks/blake-the-...
3,1038,A00004,"Blake, Robert",artist,38,Six Drawings of Figures with Outstretched Arms,date not known,Graphite on paper,Presented by Mrs John Richmond 1922,,1922,support: 318 x 394 mm,318,394,,mm,,,http://www.tate.org.uk/art/images/work/A/A00/A...,http://www.tate.org.uk/art/artworks/blake-six-...
4,1039,A00005,"Blake, William",artist,39,The Circle of the Lustful: Francesca da Rimini...,"1826–7, reprinted 1892",Line engraving on paper,Purchased with the assistance of a special gra...,1826.0,1919,image: 243 x 335 mm,243,335,,mm,,,http://www.tate.org.uk/art/images/work/A/A00/A...,http://www.tate.org.uk/art/artworks/blake-the-...


In [7]:
# Use the 'id' column as the row index instead of the default integer index
df = pd.read_csv(CSV_PATH, nrows=5, index_col='id')

In [8]:
# 'id' is now the index rather than an ordinary column
df

Unnamed: 0_level_0,accession_number,artist,artistRole,artistId,title,dateText,medium,creditLine,year,acquisitionYear,dimensions,width,height,depth,units,inscription,thumbnailCopyright,thumbnailUrl,url
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1035,A00001,"Blake, Robert",artist,38,A Figure Bowing before a Seated Old Man with h...,date not known,"Watercolour, ink, chalk and graphite on paper....",Presented by Mrs John Richmond 1922,,1922,support: 394 x 419 mm,394,419,,mm,,,http://www.tate.org.uk/art/images/work/A/A00/A...,http://www.tate.org.uk/art/artworks/blake-a-fi...
1036,A00002,"Blake, Robert",artist,38,"Two Drawings of Frightened Figures, Probably f...",date not known,Graphite on paper,Presented by Mrs John Richmond 1922,,1922,support: 311 x 213 mm,311,213,,mm,,,http://www.tate.org.uk/art/images/work/A/A00/A...,http://www.tate.org.uk/art/artworks/blake-two-...
1037,A00003,"Blake, Robert",artist,38,The Preaching of Warning. Verso: An Old Man En...,?c.1785,Graphite on paper. Verso: graphite on paper,Presented by Mrs John Richmond 1922,1785.0,1922,support: 343 x 467 mm,343,467,,mm,,,http://www.tate.org.uk/art/images/work/A/A00/A...,http://www.tate.org.uk/art/artworks/blake-the-...
1038,A00004,"Blake, Robert",artist,38,Six Drawings of Figures with Outstretched Arms,date not known,Graphite on paper,Presented by Mrs John Richmond 1922,,1922,support: 318 x 394 mm,318,394,,mm,,,http://www.tate.org.uk/art/images/work/A/A00/A...,http://www.tate.org.uk/art/artworks/blake-six-...
1039,A00005,"Blake, William",artist,39,The Circle of the Lustful: Francesca da Rimini...,"1826–7, reprinted 1892",Line engraving on paper,Purchased with the assistance of a special gra...,1826.0,1919,image: 243 x 335 mm,243,335,,mm,,,http://www.tate.org.uk/art/images/work/A/A00/A...,http://www.tate.org.uk/art/artworks/blake-the-...


In [9]:
# Limit which columns are read: usecols makes read_csv load only the listed columns.
# 'id' must be in the list because it is used as the index below.
COLUMNS_TO_USE = [
    'id',
    'artist',
    'title',
    'medium',
    'year',
    'acquisitionYear',
    'height',
    'width',
    'units'
]
# The recorded run of this cell emitted a warning (presumably pandas' DtypeWarning
# about mixed-type columns; confirm on re-run). low_memory=False makes pandas parse
# the file in one pass, so each column gets a single inferred dtype instead of
# per-chunk guesses.
df = pd.read_csv(
    CSV_PATH,
    index_col='id',
    usecols=COLUMNS_TO_USE,
    low_memory=False,
)

  interactivity=interactivity, compiler=compiler, result=result)


In [10]:
# The whole file was read this time, so the display elides the middle rows ('...')
df

Unnamed: 0_level_0,artist,title,medium,year,acquisitionYear,width,height,units
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1035,"Blake, Robert",A Figure Bowing before a Seated Old Man with h...,"Watercolour, ink, chalk and graphite on paper....",,1922.0,394,419,mm
1036,"Blake, Robert","Two Drawings of Frightened Figures, Probably f...",Graphite on paper,,1922.0,311,213,mm
1037,"Blake, Robert",The Preaching of Warning. Verso: An Old Man En...,Graphite on paper. Verso: graphite on paper,1785,1922.0,343,467,mm
1038,"Blake, Robert",Six Drawings of Figures with Outstretched Arms,Graphite on paper,,1922.0,318,394,mm
1039,"Blake, William",The Circle of the Lustful: Francesca da Rimini...,Line engraving on paper,1826,1919.0,243,335,mm
...,...,...,...,...,...,...,...,...
122960,"P-Orridge, Genesis",Larvae (from Tampax Romana),"Perspex, Wood, hairpiece, tampon and human blood",1975,2013.0,305,305,mm
122961,"P-Orridge, Genesis",Living Womb (from Tampax Romana),"Wood, Perspex, plastic, photograph on paper, t...",1976,2013.0,305,305,mm
121181,"Hatoum, Mona",Present Tense,Soap and glass beads,1996,2013.0,45,2410,mm
112306,"Creed, Martin",Work No. 227: The lights going on and off,Gallery lighting,2000,2013.0,,,


In [11]:
# Save the DataFrame to ./data/data_frame.pickle in pickle (binary) format.
# It can be loaded back with pd.read_pickle; only unpickle files you trust.
df.to_pickle(os.path.join('./data', 'data_frame.pickle'))

Creating DataFrames from a list of tuples.

In [12]:
# Each tuple is one row; each position in the tuple is one column
records = [("Espresso", "5$"), ("Flat White", "10$")]

In [13]:
# Without column names, pandas labels the columns by position (0 and 1)
pd.DataFrame.from_records(records)

Unnamed: 0,0,1
0,Espresso,5$
1,Flat White,10$


In [14]:
# Pass columns= to give each tuple position a name
pd.DataFrame.from_records(records, columns=["Coffee", "Price"])

Unnamed: 0,Coffee,Price
0,Espresso,5$
1,Flat White,10$


Load data from json

In [15]:
# JSON fields to extract from each artwork file, in column order.
# 'id' must be present: read_artworks_from_json uses it as the DataFrame index.
KEYS_TO_USE = ['id', 'all_artists', 'title', 'medium', 'acquisitionYear', 'height', 'width', 'units']

In [16]:
import json

def get_record_from_file(file_path, keys_to_use):
    """
    Read a single JSON file and return selected top-level fields as a tuple.

    file_path: path to a JSON file whose top-level value is an object.
    keys_to_use: iterable of field names to extract.

    Returns a tuple with one value per entry of keys_to_use, in the same order.
    Raises KeyError if a requested field is missing from the file.
    """
    # JSON is UTF-8 by specification; without an explicit encoding, open() uses
    # the platform default, which can fail or garble non-ASCII text (e.g. artist
    # names with accents) on some systems.
    with open(file_path, encoding='utf-8') as artwork_file:
        content = json.load(artwork_file)

    return tuple(content[field] for field in keys_to_use)

In [17]:
# A single artwork JSON file used to try out get_record_from_file
SAMPLE_JSON = os.path.join('./data', 'artworks', 'a', '000', 'a00001-1035.json')

In [18]:
# Extract the KEYS_TO_USE fields from the sample file as a tuple
sample_record = get_record_from_file(SAMPLE_JSON, KEYS_TO_USE)

In [19]:
# Values come back in KEYS_TO_USE order; note that height and width are strings here
sample_record

(1035,
 'Robert Blake',
 'A Figure Bowing before a Seated Old Man with his Arm Outstretched in Benediction. Verso: Indecipherable Sketch',
 'Watercolour, ink, chalk and graphite on paper. Verso: graphite on paper',
 1922,
 '419',
 '394',
 'mm')

In [20]:
def read_artworks_from_json(keys_to_use):
    """
    Build a DataFrame from a sample of the artwork JSON files.

    Walks every directory under ./data/artworks and reads the first JSON
    file found in each one, then moves on to the next directory.

    keys_to_use: JSON field names to extract, in column order. Must include
        'id', which becomes the index of the result.

    Returns a DataFrame with one row per file that was read.
    """
    JSON_ROOT = os.path.join('./data', 'artworks')
    artworks = []
    # NOTE: os.walk lists files in arbitrary order, so which file counts as
    # "first" in a directory may differ between machines.
    for root, _, files in os.walk(JSON_ROOT):
        for f in files:
            if f.endswith('json'):
                record = get_record_from_file(os.path.join(root, f), keys_to_use)
                artworks.append(record)
                # Stop only after a JSON file has actually been read, so a
                # non-JSON first entry does not make the directory yield nothing.
                break
    df = pd.DataFrame.from_records(artworks, columns=keys_to_use, index='id')
    return df


In [21]:
# Build the DataFrame from the JSON files; this rebinds df, replacing the
# CSV-based frame loaded earlier
df = read_artworks_from_json(KEYS_TO_USE)

In [22]:
# One row per JSON file that was read, indexed by 'id'
df

Unnamed: 0_level_0,all_artists,title,medium,acquisitionYear,height,width,units
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
6620,William Hogarth,A Scene from ‘The Beggar’s Opera’ VI,Oil paint on canvas,1909,762,572,mm
10413,William James Müller,"Study of Pines, Rheinwald",Watercolour on paper,1908,410,269,mm
496,Sir Lawrence Alma-Tadema,A Silent Greeting,Oil paint on wood,1894,229,305,mm
15803,Frederick Walker,The Vagrants,Oil paint on canvas,1886,1264,832,mm
1810,"Sir Edward Coley Burne-Jones, Bt",Head and Hand,Graphite on paper,1932,165,225,mm
...,...,...,...,...,...,...,...
80497,James Rosenquist,Sun Sets on the Time Zone,"Paper, acrylic paint, dye and lithograph on paper",2004,1473,2019,mm
19078,Hamish Fulton,Seven Winds. Scotland 1985,Lithograph on paper,1990,829,1070,mm
83640,Thomas Schütte,[no title],Screenprint on paper,2003,320,447,mm
21713,Philip Guston,Untitled,Lithograph on paper,1996,765,570,mm


In [30]:
# Indexing with a single column name returns that column as a Series
df['all_artists']

id
6620                      William Hogarth
10413                William James Müller
496              Sir Lawrence Alma-Tadema
15803                    Frederick Walker
1810     Sir Edward Coley Burne-Jones, Bt
                       ...               
80497                    James Rosenquist
19078                       Hamish Fulton
83640                      Thomas Schütte
21713                       Philip Guston
21706                       Philip Guston
Name: all_artists, Length: 738, dtype: object

In [31]:
# Indexing with a list of column names returns a DataFrame with just those columns
df[['all_artists', 'title']]

Unnamed: 0_level_0,all_artists,title
id,Unnamed: 1_level_1,Unnamed: 2_level_1
6620,William Hogarth,A Scene from ‘The Beggar’s Opera’ VI
10413,William James Müller,"Study of Pines, Rheinwald"
496,Sir Lawrence Alma-Tadema,A Silent Greeting
15803,Frederick Walker,The Vagrants
1810,"Sir Edward Coley Burne-Jones, Bt",Head and Hand
...,...,...
80497,James Rosenquist,Sun Sets on the Time Zone
19078,Hamish Fulton,Seven Winds. Scotland 1985
83640,Thomas Schütte,[no title]
21713,Philip Guston,Untitled
