# Data Loading and Storage: A Tutorial

In [None]:
# import pandas
import pandas as pd
from pandas import Series, DataFrame

## Reading and Writing Data

- pandas features a number of functions for reading tabular data as a DataFrame object.
    - read_csv
    - read_excel
    - read_json
    - read_sql

- DataFrame has methods to write data to a file.
    - to_csv
    - to_json

### Reading Data in Text Format

In [None]:
# Show the raw contents of students2.csv, a CSV file without a header row.
# The leading `!` hands the command to the system shell, so `cat` must be available.
!cat students2.csv

In [None]:
# header=None tells pandas that the first line is data, not column labels,
# so the columns receive default integer names (0, 1, 2, ...).
df3 = pd.read_csv('students2.csv', header=None)
df3

In [None]:
# Supply the column labels explicitly; when `names` is given on its own,
# pandas also treats the file as having no header row.
student_columns = ['Fname', 'Gender', 'Major', 'GPA']
df4 = pd.read_csv('students2.csv', names=student_columns)
df4

In [None]:
# Show the raw contents of students3.txt, a text file whose fields are
# separated by a variable amount of whitespace (shell command via `!`).
!cat students3.txt

In [None]:
# Read the text file using the regular expression \s+ (one or more whitespace
# characters) as the delimiter. The pattern is written as a raw string because
# '\s' in a normal string literal is an invalid escape sequence, which newer
# Python versions warn about.
df5 = pd.read_csv('students3.txt', sep=r'\s+')
df5

In [None]:
# Show the raw contents of students4.csv, a CSV file with missing values
# (shell command via `!`).
!cat students4.csv

In [None]:
# read a file with missing values; by default pandas marks them as NaN
df6 = pd.read_csv('students4.csv')
df6

### Writing Data to Text Format

In [None]:
# write a subset of the columns
df7.to_csv('out2.csv', index = False, columns = ['Fname', 'GPA'])
!cat out2.csv

### JSON Data

- JSON has become one of the standard formats for sending data by HTTP request between web browsers and other applications.
- JSON is close to valid Python code. 
- Basic types: objects (dicts), arrays (lists), strings, numbers, booleans, and nulls.
- All of the keys in a JSON object must be strings. 

In [None]:
# A JSON document held in a Python string: an array of three objects, each with
# the keys Name, Gender, Major and GPA. Note the JSON `null` for Lily's Major.
obj = """
[
    {
        "Name": "Mike",
        "Gender": "M", 
        "Major": "FIN",
        "GPA": 3.4
    },
    {
        "Name": "Mary",
        "Gender": "F", 
        "Major": "MGT",
        "GPA": 3.7
    },
    {
        "Name": "Lily",
        "Gender": "F", 
        "Major": null,
        "GPA": 3.2
    }
]
"""

In [None]:
import json

In [None]:
# parse the JSON string into Python objects (here, a list of dicts)
lst = json.loads(obj)
lst

In [None]:
# Build a DataFrame from the parsed records, keeping only the Name and Major
# fields of each record.
df8 = pd.DataFrame(data=lst, columns=['Name', 'Major'])
df8

In [None]:
# Show the raw contents of the JSON file students5.json (shell command via `!`).
!cat students5.json

In [None]:
# load the JSON file into a DataFrame and display it
df9 = pd.read_json('students5.json')
df9

In [None]:
# write data to a JSON file
df9.to_json('students6.json', orient = 'records')
!cat students6.json

In [None]:
# Turn the frame into a list of row dicts, then serialize that list back to a
# JSON string with the standard-library json module.
lst2 = df9.to_dict('records')
asjson = json.dumps(lst2)
asjson

## Web Scraping

- Web scraping is the practice of automatically gathering data from the internet.
- This is accomplished by writing an automated program that queries a web server, requests data, and then parses the data to extract the needed information.
- Web scraping process flow
    - Retrieving HTML data -- **requests** library
    - Parsing the data -- **BeautifulSoup** library and **re** module

### Connecting to Retrieve Data

In [None]:
import requests

In [None]:
# Download the sample page. requests waits indefinitely by default, so an
# explicit timeout (in seconds) keeps the cell from hanging if the server
# never responds.
html = requests.get('http://dataquestio.github.io/web-scraping-pages/simple.html', timeout=10)
# a status code of 200 indicates that the page was downloaded successfully
print(html.status_code)

In [None]:
# print out the raw content of the page (html.content is the response body as bytes)
print(html.content)

### Parsing HTML Data

In [None]:
from bs4 import BeautifulSoup

In [None]:
# Parse the downloaded page into a BeautifulSoup object using Python's
# built-in 'html.parser'.
bs = BeautifulSoup(html.content, features='html.parser')

In [None]:
# print out the parsed page as indented, human-readable markup
print(bs.prettify())

In [None]:
# print out the title tag; bs.title is the whole <title> element, so use
# bs.title.get_text() to get only the text inside it
print(bs.title)

In [None]:
# Collect every <p> tag in the page, print the resulting list, and then
# display the text of the first paragraph.
ptag = bs.find_all(name='p')
print(ptag)
ptag[0].get_text()

In [None]:
# Materialize the top-level nodes of the document into a list, print them,
# and display the third node.
lst1 = [child for child in bs.children]
print(lst1)
lst1[2]

In [None]:
# the Python type of each top-level node
list(map(type, lst1))

In [None]:
# Direct children of the third top-level node (lst1[2] is expected to be the
# <html> tag for this page).
lst2 = [child for child in lst1[2].children]
lst2

In [None]:
# Download the weather forecast page. requests waits indefinitely by default,
# so an explicit timeout (in seconds) keeps the cell from hanging if the
# server never responds.
page = requests.get('https://forecast.weather.gov/MapClick.php?lat=37.7772&lon=-122.4168#.XcroLFdKjb0', timeout=10)
page.status_code

In [None]:
# Parse the forecast page with the built-in HTML parser and print its <title> tag.
soup = BeautifulSoup(page.content, features='html.parser')
print(soup.title)

In [None]:
# Locate the element with id 'seven-day-forecast', then gather every element
# inside it that has the 'tombstone-container' class (one per forecast period).
seven_day = soup.find(id='seven-day-forecast')
daily_item = seven_day.find_all(class_='tombstone-container')

# Print the first forecast item and display the second one.
today = daily_item[0]
print(today)
daily_item[1]

In [None]:
def _text_or_none(tag):
    """Return the text of a BeautifulSoup tag, or None when the lookup found no tag."""
    return None if tag is None else tag.get_text()


def _img_title(item):
    """Return the title attribute of the first <img> inside item, or None if absent."""
    img = item.find('img')
    return None if img is None else img.get('title')


# find() returns None when an item has no matching element, which would make
# .get_text() or ['title'] raise. Missing pieces become None instead, so the
# three lists always have the same length as daily_item.
periods = [_text_or_none(item.find(class_='period-name')) for item in daily_item]
descs = [_img_title(item) for item in daily_item]
temps = [_text_or_none(item.find(class_='temp')) for item in daily_item]

In [None]:
# Assemble the three parallel lists into one table, one row per forecast period.
weather_df = pd.DataFrame(dict(period=periods, description=descs, temperature=temps))
weather_df