# Reading JSON from file

### Loading JSON to Python dictionary

In [1]:
import json

# use the function open to read the JSON file and 
# then the method json.load() to parse the JSON string into a Python dictionary called superHeroSquad.

with open('superheroes.json') as f:  # Open the file in read mode with the Open() function and pass the file path as argument
    superHeroSquad = json.load(f)   # Parse the json data from file using the json.load(), it reads the file and converts json data to python obj
    
# Note: the json library has both load() and loads() .
# loads() is to create a Python object from a JSON string. the extra ‘s’ in loads() as “load for strings”. 
# load() is to create a Python object from a JSON file.

In [2]:
for i in superHeroSquad:
    print(i)

squadName
homeTown
formed
secretBase
active
members


In [3]:
print(superHeroSquad)

{'squadName': 'Super Hero Squad', 'homeTown': 'Metro City', 'formed': 2016, 'secretBase': 'Super tower', 'active': True, 'members': [{'name': 'Molecule Man', 'age': 29, 'secretIdentity': 'Dan Jukes', 'powers': ['Radiation resistance', 'Turning tiny', 'Radiation blast']}, {'name': 'Madame Uppercut', 'age': 39, 'secretIdentity': 'Jane Wilson', 'powers': ['Million tonne punch', 'Damage resistance', 'Superhuman reflexes']}, {'name': 'Eternal Flame', 'age': 1000000, 'secretIdentity': 'Unknown', 'powers': ['Immortality', 'Heat Immunity', 'Inferno', 'Teleportation', 'Interdimensional travel']}]}


### Loading JSON to Pandas Dataframe

In [4]:
# Use the method read_json() if you would like to transform the JSON file to a Pandas Dataframe.

import pandas as pd
df = pd.read_json("superheroes.json")
df

Unnamed: 0,squadName,homeTown,formed,secretBase,active,members
0,Super Hero Squad,Metro City,2016,Super tower,True,"{'name': 'Molecule Man', 'age': 29, 'secretIde..."
1,Super Hero Squad,Metro City,2016,Super tower,True,"{'name': 'Madame Uppercut', 'age': 39, 'secret..."
2,Super Hero Squad,Metro City,2016,Super tower,True,"{'name': 'Eternal Flame', 'age': 1000000, 'sec..."


### Loading Nested JSON to Pandas

- Nested JSON is similar to the idea of nested dictionaries in python, that is, a dictionary within a dictionary.
- much easier for performing data manipulation and transformation.

##### Use the apply method on the ‘members’ column like this.

In [5]:
df["members"].apply(pd.Series)

Unnamed: 0,name,age,secretIdentity,powers
0,Molecule Man,29,Dan Jukes,"[Radiation resistance, Turning tiny, Radiation..."
1,Madame Uppercut,39,Jane Wilson,"[Million tonne punch, Damage resistance, Super..."
2,Eternal Flame,1000000,Unknown,"[Immortality, Heat Immunity, Inferno, Teleport..."


- To combine the new columns with the original dataframe, we can use pd.concat

In [6]:
df = pd.concat([df["members"].apply(pd.Series), df.drop("members", axis = 1)], axis = 1)

In [7]:
df

Unnamed: 0,name,age,secretIdentity,powers,squadName,homeTown,formed,secretBase,active
0,Molecule Man,29,Dan Jukes,"[Radiation resistance, Turning tiny, Radiation...",Super Hero Squad,Metro City,2016,Super tower,True
1,Madame Uppercut,39,Jane Wilson,"[Million tonne punch, Damage resistance, Super...",Super Hero Squad,Metro City,2016,Super tower,True
2,Eternal Flame,1000000,Unknown,"[Immortality, Heat Immunity, Inferno, Teleport...",Super Hero Squad,Metro City,2016,Super tower,True


##### Use built-in function json_normalize() that will allow you to flatten nested JSONs.

- pass in our superHeroSquad dictionary.
- record_path contains the column that we want parsed out.
- meta is a list of columns we want to keep for the dataframe.
- add in the parameter meta_prefix if you would like to add in a particular naming convention to the parsed data.

In [8]:
pd.json_normalize(superHeroSquad, record_path = ["members"], meta = ["squadName", "homeTown", "formed", "secretBase", "active"])

Unnamed: 0,name,age,secretIdentity,powers,squadName,homeTown,formed,secretBase,active
0,Molecule Man,29,Dan Jukes,"[Radiation resistance, Turning tiny, Radiation...",Super Hero Squad,Metro City,2016,Super tower,True
1,Madame Uppercut,39,Jane Wilson,"[Million tonne punch, Damage resistance, Super...",Super Hero Squad,Metro City,2016,Super tower,True
2,Eternal Flame,1000000,Unknown,"[Immortality, Heat Immunity, Inferno, Teleport...",Super Hero Squad,Metro City,2016,Super tower,True


In [9]:
pd.json_normalize(superHeroSquad, record_path = ['members'], meta = ['squadName', 'homeTown', 'formed', 'secretBase', 'active'], meta_prefix = 'members_')

Unnamed: 0,name,age,secretIdentity,powers,members_squadName,members_homeTown,members_formed,members_secretBase,members_active
0,Molecule Man,29,Dan Jukes,"[Radiation resistance, Turning tiny, Radiation...",Super Hero Squad,Metro City,2016,Super tower,True
1,Madame Uppercut,39,Jane Wilson,"[Million tonne punch, Damage resistance, Super...",Super Hero Squad,Metro City,2016,Super tower,True
2,Eternal Flame,1000000,Unknown,"[Immortality, Heat Immunity, Inferno, Teleport...",Super Hero Squad,Metro City,2016,Super tower,True


##### Make sure its the correct type

In [10]:
type(superHeroSquad)

dict

##### Get the keys 

In [11]:
superHeroSquad.keys()

dict_keys(['squadName', 'homeTown', 'formed', 'secretBase', 'active', 'members'])

# Parsing json from a string

In [12]:
json_str = '{"name": "John Doe", "age": 40}'

In [13]:
data = json.loads(json_str)
data

{'name': 'John Doe', 'age': 40}

# Writing JSON Data to a file
- Parse the json data by creating a python object that you want to write.
- Object can be a dictionary or list

In [14]:
data = {
    "name": "Siphamandla",
    "age": "28",
    "isStudent": False,
    "hobbies": ['reading', 'Golf', 'coding']
}

In [15]:
with open('demographics.json', 'w') as f:  # Open the file in write mode with the Open() function and pass the file path as argument
    json.dump(data, f)   # Parse the json data to file using the json.dump(), it writes the json data to the file

### Pretty-Printing
- Output file shows up as a single string
- use the indent parameter in your json.dump method.

In [None]:
with open('superheroes.json', 'w') as file:
    json.dump(superHeroSquad, file, indent = 4)

### Sorting
- pass in a sort_key parameter, set to True, in order to sort your keys. 
- Notice that all keys including the nested ones are all sorted.

In [None]:
with open('superheroes.json', 'w') as file:
    json.dump(superHeroSquad, file, indent = 4, sort_keys = True)

# Generate JSON from a Python Object
- prepare python object by creating obj to be converted to JSON
- This function will return a JSON-formatted string representation of the object 

In [16]:
json_str = json.dumps(data)
print(json_str)

{"name": "Siphamandla", "age": "28", "isStudent": false, "hobbies": ["reading", "Golf", "coding"]}


# Generating JSON file from Pandas DF
- if you are working with the Pandas Dataframe and would like to export to JSON, you can use the to_json() method.

In [17]:
df

Unnamed: 0,name,age,secretIdentity,powers,squadName,homeTown,formed,secretBase,active
0,Molecule Man,29,Dan Jukes,"[Radiation resistance, Turning tiny, Radiation...",Super Hero Squad,Metro City,2016,Super tower,True
1,Madame Uppercut,39,Jane Wilson,"[Million tonne punch, Damage resistance, Super...",Super Hero Squad,Metro City,2016,Super tower,True
2,Eternal Flame,1000000,Unknown,"[Immortality, Heat Immunity, Inferno, Teleport...",Super Hero Squad,Metro City,2016,Super tower,True


In [18]:
df.secretIdentity[2] = "Will Smith"
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.secretIdentity[2] = "Will Smith"


Unnamed: 0,name,age,secretIdentity,powers,squadName,homeTown,formed,secretBase,active
0,Molecule Man,29,Dan Jukes,"[Radiation resistance, Turning tiny, Radiation...",Super Hero Squad,Metro City,2016,Super tower,True
1,Madame Uppercut,39,Jane Wilson,"[Million tonne punch, Damage resistance, Super...",Super Hero Squad,Metro City,2016,Super tower,True
2,Eternal Flame,1000000,Will Smith,"[Immortality, Heat Immunity, Inferno, Teleport...",Super Hero Squad,Metro City,2016,Super tower,True


In [19]:
df.to_json('updated superhero.json')

# Manipulating JSON Data

### Accessing JSON Data/Array
- use brackets ([]) to access the keys and lists/Array
- Use dot notation (.)
- use data.get()

In [20]:
data["name"]

'Siphamandla'

In [21]:
data["hobbies"][1]

'Golf'

### Modifying JSON Data
- directly assign new values to the desired keys

In [22]:
data['hobbies'].append("swimming")

In [23]:
data

{'name': 'Siphamandla',
 'age': '28',
 'isStudent': False,
 'hobbies': ['reading', 'Golf', 'coding', 'swimming']}

### Adding JSON Data
- use dictionary-like behaviour

In [24]:
data['isMarried'] = False
data['address'] = '40 Warren street'

In [25]:
data

{'name': 'Siphamandla',
 'age': '28',
 'isStudent': False,
 'hobbies': ['reading', 'Golf', 'coding', 'swimming'],
 'isMarried': False,
 'address': '40 Warren street'}

### Removing JSON Data

In [26]:
del data['isMarried']

In [27]:
data

{'name': 'Siphamandla',
 'age': '28',
 'isStudent': False,
 'hobbies': ['reading', 'Golf', 'coding', 'swimming'],
 'address': '40 Warren street'}

# Cleaning Json/ Dictionary

In [None]:
### Remove HTML Tags ##########################################################################################################
def cleanhtml(raw_html):  
    cleanr = re.compile('<.*?>')  
    cleantext = re.sub(cleanr, '', raw_html)  
    return cleantext

In [None]:
### Clean Json Data by removing unwanted text #################################################################################
def CleanData(data):
    jsondata = json.dumps(data)        
    # Cleaning Data ------------------------------------------------------------------------------------------------------------------    
    #replace all occurrences of the required string    
    #remove next line    
    "\n".join(jsondata.splitlines())    
    "\r\n".join(jsondata.splitlines())    
    # Remove the following text.    
    scrubdatarm = jsondata.replace(r'\r\n', '').replace(r'\r', '').replace(r'\n', '').replace(r'\n\n', '').replace(r"\t"," ") \
                .replace("&nbsp;", ' ').replace("&quot", ' ').replace("&ldquo", ' ').replace(";", ' ').replace("&rsquo ", "'") \
                .replace("  ", " ").replace("&rdquo"," ").replace("\u00a0"," ").replace("&gt"," ").replace("&bull"," ").replace("&amp"," ") \
                .replace("&ndash"," ").replace("&#39"," ")
                
    re.sub(' +', ' ', jsondata)        
    # Remove html tags    
    scrubdatahtml = cleanhtml(scrubdatarm)    
    # Load JSON into Dataframe.    
    try:        
        cleandata = json.loads(scrubdatahtml)    
    except:        
        cleandata = json.loads(jsondata)
        
    return cleandata