#JSON
We will be going over JSON fields to help with future projects. <br>
JSON stands for JavaScript Object Notation. It is way of storing and transporting data.


## JSON Syntax
- JSON has six data types:
  - a string
  - a number
  - an object (JSON object)
  - an array
  - a boolean
  - null
- Objects are in key/value pairs and separated by commas
- Curly braces are used for objects
- Square brackets are used for arrays

When intepreted by Python, strings are strings and numbers become either integers or floats.  Objects become dictionaries and arrays become lists.  Booleans are booleans and null is null.

## JSON Example

Name/Value Pair: `'First Name':'Nevin'` <br>
Object: `{'First Name':'Nevin', 'Last Name': 'Martin'}` <br>
Array: `'employees':[{'First Name':'Nevin', 'Last Name': 'Martin'},
{'First Name':'Joe', 'Last Name': 'Olonia'}]`



## Extracting Data from JSON Fields

In [1]:
import pandas as pd
import json


In [4]:
%%capture
%%bash
apt-get update
apt-get install -y jq


In [None]:
# This function will extract an element within a JSON entry
def extract_json_fields(json_list, field_name):
  '''
  This function can pull an individual item within a JSON field. It takes the following arguments:
  - JSON list in which the item is located
  - The name of the item
  '''
  entry_list = json.loads(json_list)
  return '|'.join([ str(element[field_name]) for element in entry_list ])
#  return str(entry_list[0][field_name])


In [None]:
dat_file = 'https://ddc-datascience.s3.amazonaws.com/animals.xlsx'
animal_dat = pd.read_excel(dat_file, sheet_name = "Sheet3")
animal_dat


Unnamed: 0,Animal,Info
0,Dog,"[{""id"": 54, ""name"":""Precious""}]"
1,Cat,"[{""id"": 24, ""name"":""Midnight""}]"
2,Cow,"[{""id"": 32, ""name"":""Spots""}]"
3,Mouse,"[{""id"": 58, ""name"":""Fuzzy""}]"


In [None]:
animal_dat.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Animal  4 non-null      object
 1   Info    4 non-null      object
dtypes: object(2)
memory usage: 192.0+ bytes


In [None]:
info_ds = animal_dat['Info'][0]
info_ds

'[{"id": 54, "name":"Precious"}]'

In [None]:
# Notice that the item is a string
type(info_ds)


str

In [None]:
info_ds[0]


'['

In [None]:
# Interpret the string as a JSON object and convert to a Python object
json_ds = json.loads(info_ds)
json_ds


[{'id': 54, 'name': 'Precious'}]

In [None]:
type(json_ds)


list

In [None]:
json_ds[0]


{'id': 54, 'name': 'Precious'}

In [None]:
type(json_ds[0])


dict

In [None]:
json_ds[0]['id']


54

In [None]:
animal_dat_updated = animal_dat.copy()
animal_dat_updated['Name_Extract'] = animal_dat_updated['Info'].apply(extract_json_fields, field_name = 'name') # extract_json_fields(x, "name")
animal_dat_updated['ID_Extract']   = animal_dat_updated['Info'].apply(extract_json_fields, field_name = 'id')   # extract_json_fields(x, "id")
animal_dat_updated


Unnamed: 0,Animal,Info,Name_Extract,ID_Extract
0,Dog,"[{""id"": 54, ""name"":""Precious""}]",Precious,54
1,Cat,"[{""id"": 24, ""name"":""Midnight""}]",Midnight,24
2,Cow,"[{""id"": 32, ""name"":""Spots""}]",Spots,32
3,Mouse,"[{""id"": 58, ""name"":""Fuzzy""}]",Fuzzy,58


In [None]:
animal_dat_updated.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Animal        4 non-null      object
 1   Info          4 non-null      object
 2   Name_Extract  4 non-null      object
 3   ID_Extract    4 non-null      object
dtypes: object(4)
memory usage: 256.0+ bytes


In [None]:
animal_dat_updated['Info'][0]


'[{"id": 54, "name":"Precious"}]'

In [None]:
foo = '''[
  {
      "id": 54,
      "name":"Precious"
    }
]'''
x = json.loads(foo)
x

[{'id': 54, 'name': 'Precious'}]

In [None]:
type(x[0]['id'])

int

In [None]:
animal_dat_updated.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Animal        4 non-null      object
 1   Info          4 non-null      object
 2   Name_Extract  4 non-null      object
 3   ID_Extract    4 non-null      object
dtypes: object(4)
memory usage: 256.0+ bytes


In [None]:
animal_dat_updated["ID_Extract"][0]

'54'

In [None]:
animal_json = animal_dat_updated.to_json()
animal_json


'{"Animal":{"0":"Dog","1":"Cat","2":"Cow","3":"Mouse"},"Info":{"0":"[{\\"id\\": 54, \\"name\\":\\"Precious\\"}]","1":"[{\\"id\\": 24, \\"name\\":\\"Midnight\\"}]","2":"[{\\"id\\": 32, \\"name\\":\\"Spots\\"}]","3":"[{\\"id\\": 58, \\"name\\":\\"Fuzzy\\"}]"},"Name_Extract":{"0":"Precious","1":"Midnight","2":"Spots","3":"Fuzzy"},"ID_Extract":{"0":"54","1":"24","2":"32","3":"58"}}'

In [None]:
type(animal_json)

str

In [None]:
json_animal = json.loads(animal_json)
type(json_animal)


dict

In [None]:
json_animal


{'Animal': {'0': 'Dog', '1': 'Cat', '2': 'Cow', '3': 'Mouse'},
 'Info': {'0': '[{"id": 54, "name":"Precious"}]',
  '1': '[{"id": 24, "name":"Midnight"}]',
  '2': '[{"id": 32, "name":"Spots"}]',
  '3': '[{"id": 58, "name":"Fuzzy"}]'},
 'Name_Extract': {'0': 'Precious',
  '1': 'Midnight',
  '2': 'Spots',
  '3': 'Fuzzy'},
 'ID_Extract': {'0': '54', '1': '24', '2': '32', '3': '58'}}

In [None]:
animals_v02 = pd.DataFrame.from_dict(json_animal)
animals_v02



Unnamed: 0,Animal,Info,Name_Extract,ID_Extract
0,Dog,"[{""id"": 54, ""name"":""Precious""}]",Precious,54
1,Cat,"[{""id"": 24, ""name"":""Midnight""}]",Midnight,24
2,Cow,"[{""id"": 32, ""name"":""Spots""}]",Spots,32
3,Mouse,"[{""id"": 58, ""name"":""Fuzzy""}]",Fuzzy,58


In [None]:
animals_v02.info()


<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, 0 to 3
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Animal        4 non-null      object
 1   Info          4 non-null      object
 2   Name_Extract  4 non-null      object
 3   ID_Extract    4 non-null      object
dtypes: object(4)
memory usage: 160.0+ bytes


## Reading JSON into a data frame

In [7]:
!jq . sample_data/anscombe.json | head -30

[
  {
    "Series": "I",
    "X": 10,
    "Y": 8.04
  },
  {
    "Series": "I",
    "X": 8,
    "Y": 6.95
  },
  {
    "Series": "I",
    "X": 13,
    "Y": 7.58
  },
  {
    "Series": "I",
    "X": 9,
    "Y": 8.81
  },
  {
    "Series": "I",
    "X": 11,
    "Y": 8.33
  },
  {
    "Series": "I",
    "X": 14,
    "Y": 9.96


In [2]:
df = pd.DataFrame( json.load( open( 'sample_data/anscombe.json', 'r') ))
df

Unnamed: 0,Series,X,Y
0,I,10.0,8.04
1,I,8.0,6.95
2,I,13.0,7.58
3,I,9.0,8.81
4,I,11.0,8.33
5,I,14.0,9.96
6,I,6.0,7.24
7,I,4.0,4.26
8,I,12.0,10.84
9,I,7.0,4.81


## Writing a data frame to JSON

In [21]:
df.to_json(path_or_buf="anscombe.json", orient = "table")

In [22]:
!jq . anscombe.json

[1;39m{
  [0m[34;1m"schema"[0m[1;39m: [0m[1;39m{
    [0m[34;1m"fields"[0m[1;39m: [0m[1;39m[
      [1;39m{
        [0m[34;1m"name"[0m[1;39m: [0m[0;32m"index"[0m[1;39m,
        [0m[34;1m"type"[0m[1;39m: [0m[0;32m"integer"[0m[1;39m
      [1;39m}[0m[1;39m,
      [1;39m{
        [0m[34;1m"name"[0m[1;39m: [0m[0;32m"Series"[0m[1;39m,
        [0m[34;1m"type"[0m[1;39m: [0m[0;32m"string"[0m[1;39m
      [1;39m}[0m[1;39m,
      [1;39m{
        [0m[34;1m"name"[0m[1;39m: [0m[0;32m"X"[0m[1;39m,
        [0m[34;1m"type"[0m[1;39m: [0m[0;32m"integer"[0m[1;39m
      [1;39m}[0m[1;39m,
      [1;39m{
        [0m[34;1m"name"[0m[1;39m: [0m[0;32m"Y"[0m[1;39m,
        [0m[34;1m"type"[0m[1;39m: [0m[0;32m"number"[0m[1;39m
      [1;39m}[0m[1;39m
    [1;39m][0m[1;39m,
    [0m[34;1m"primaryKey"[0m[1;39m: [0m[1;39m[
      [0;32m"index"[0m[1;39m
    [1;39m][0m[1;39m,
    [0m[34;1m"pandas_version"[0m[1;39m: [

In [25]:
df = pd.read_json( 'anscombe.json', orient = "table")
df

Unnamed: 0,Series,X,Y
0,I,10,8.04
1,I,8,6.95
2,I,13,7.58
3,I,9,8.81
4,I,11,8.33
5,I,14,9.96
6,I,6,7.24
7,I,4,4.26
8,I,12,10.84
9,I,7,4.81


In [31]:
df_dict= json.load( open("anscombe.json", 'r') )
df_small = { "schema": df_dict["schema"],
  "data": df_dict["data"][0]
}

In [36]:
df_small

{'schema': {'fields': [{'name': 'index', 'type': 'integer'},
   {'name': 'Series', 'type': 'string'},
   {'name': 'X', 'type': 'integer'},
   {'name': 'Y', 'type': 'number'}],
  'primaryKey': ['index'],
  'pandas_version': '1.4.0'},
 'data': {'index': 0, 'Series': 'I', 'X': 10, 'Y': 8.04}}

In [35]:
pd.read_json( json.dumps(df_small), orient="table" )

ValueError: If using all scalar values, you must pass an index