In [2]:
import pandas as pd
import json
import pprint

# 导入json

In [None]:
from collections import Counter
pd.set_option('display.width', 85)
pd.set_option('display.max_columns', 8)

# load tabular structure JSON data
with open('data/allcandidatenewssample.json') as f:
  candidatenews = json.load(f)


len(candidatenews)
pprint.pprint(candidatenews[0:2])
pprint.pprint(candidatenews[0]['source'])

Counter([len(item) for item in candidatenews])
pprint.pprint(next(item for item in candidatenews if len(item)<9))
pprint.pprint(next(item for item in candidatenews if len(item)>9))
pprint.pprint([item for item in candidatenews if len(item)==2][0:2])

candidatenews = [item for item in candidatenews if len(item)>2]
len(candidatenews)

# generate counts from JSON data
politico = [item for item in candidatenews if item["source"] == "Politico"]
len(politico)
pprint.pprint(politico[0:2])
sources = [item.get('source') for item in candidatenews]
type(sources)
len(sources)
sources[0:5]
pprint.pprint(Counter(sources).most_common(10))

# fix errors in values in dictionary
for newsdict in candidatenews:
    newsdict.update((k, "The Hill") for k, v in newsdict.items()
     if k == "source" and v == "TheHill")

sources = [item.get('source') for item in candidatenews]
pprint.pprint(Counter(sources).most_common(10))

# create a pandas data frame
candidatenewsdf = pd.DataFrame(candidatenews)
candidatenewsdf.dtypes
candidatenewsdf.rename(columns={'date':'storydate'}, inplace=True)
candidatenewsdf.storydate = candidatenewsdf.storydate.astype('datetime64[ns]')
candidatenewsdf.shape
candidatenewsdf.source.value_counts(sort=True).head(10)


# 导入 json api

In [None]:
import requests

pd.set_option('display.width', 200)
pd.set_option('display.max_columns', 8)

# load more complicated data
response = requests.get("https://openaccess-api.clevelandart.org/api/artworks/?african_american_artists")
camcollections = json.loads(response.text)

len(camcollections['data'])
pprint.pprint(camcollections['data'][0])

# flatten the data
camcollectionsdf = \
  pd.json_normalize(camcollections['data'], 
  'citations', 
  ['accession_number','title','creation_date',
  'collection','creators','type'])
camcollectionsdf.head(2).T
creator = camcollectionsdf[:1].creators[0]
type(creator[0])
pprint.pprint(creator)

camcollectionsdf['birthyear'] = camcollectionsdf.\
 creators.apply(lambda x: x[0]['birth_year'])

camcollectionsdf.birthyear.value_counts().\
 sort_index().head()

# 导入Web

In [None]:
import requests
from bs4 import BeautifulSoup

pd.set_option('display.width', 85)
pd.set_option('display.max_columns',8)

# parse the web page and get the header row of the table

webpage = requests.get("http://www.alrb.org/datacleaning/highlowcases.html")

bs = BeautifulSoup(webpage.text, 'html.parser')
theadrows = bs.find('table', {'id':'tblLowCases'}).thead.find_all('th')
type(theadrows)
labelcols = [j.get_text() for j in theadrows]
labelcols[0] = "rowheadings"
labelcols

# get the data from the table cells
rows = bs.find('table', {'id':'tblLowCases'}).tbody.find_all('tr')
datarows = []
labelrows = []
for row in rows:
  rowlabels = row.find('th').get_text()
  cells = row.find_all('td', {'class':'data'})
  if (len(rowlabels)>3):
    labelrows.append(rowlabels)
  if (len(cells)>0):
    cellvalues = [j.get_text() for j in cells]
    datarows.append(cellvalues)

pprint.pprint(datarows[0:2])
pprint.pprint(labelrows[0:2])

for i in range(len(datarows)):
  datarows[i].insert(0, labelrows[i])

pprint.pprint(datarows[0:2])

labelcols

# load the data into pandas
lowcases = pd.DataFrame(datarows, columns=labelcols)
lowcases.iloc[:,1:5].head()
lowcases.dtypes
lowcases.columns = lowcases.columns.str.replace(" ", "_").str.lower()

#lowcases.columns
#fixcols = ['total_cases','total_deaths','total_cases_pm','total_deaths_pm','population','gdp_per_capita']

for col in lowcases.columns[2:-1]:
  lowcases[col] = lowcases[col].\
    str.replace("[^0-9]","",regex=True).astype('int64')

lowcases['last_date'] = pd.to_datetime(lowcases.last_date)
lowcases['median_age'] = lowcases['median_age'].astype('float')

lowcases.dtypes

# importing spark

In [None]:
from pyspark.sql import SparkSession

pd.set_option('display.width', 78)
pd.set_option('display.max_columns',6)

# initiate a Spark session and import CSV data
spark = SparkSession.builder \
   .getOrCreate()
   
landtemps = spark.read.option("header",True) \
     .csv("data/landtemps.tar.gz")

type(landtemps)

# look at the structure of the Spark DataFrame
landtemps.count()

landtemps.printSchema()

landtemps.select("station",'country','month','year','temp') \
    .show(5, False)

# chagne a data type
landtemps = landtemps \
  .withColumn("temp",landtemps.temp.cast('float'))

landtemps.select("temp").dtypes

landtemps.describe('temp').show()

# load JSON data
allcandidatenews = spark.read \
     .json("data/allcandidatenewssample.json")

allcandidatenews \
  .select("source","title","story_position") \
  .show(5)

# display structure of JSON data
allcandidatenews.count()

allcandidatenews.printSchema()

allcandidatenews \
   .describe('story_position') \
   .show()
    
allcandidatenewsdf = allcandidatenews.toPandas()

allcandidatenewsdf.info()

# 持久化json

In [None]:
import requests
import msgpack

pd.set_option('display.width', 85)
pd.set_option('display.max_columns', 8)

# load complicated JSON data from an API
response = requests.get("https://openaccess-api.clevelandart.org/api/artworks/?african_american_artists")

camcollections = json.loads(response.text)
len(camcollections['data'])
pprint.pprint(camcollections['data'][0])

# save to a json file
with open("data/camcollections.json","w") as f:
  json.dump(camcollections, f)

# read the json file
with open("data/camcollections.json","r") as f:
  camcollections = json.load(f)

pprint.pprint(camcollections['data'][0]['creators'])

# Write msgpack file
with open("data/camcollections.msgpack", "wb") as outfile:
    packed = msgpack.packb(camcollections)
    outfile.write(packed)

# Read msgpack file
with open("data/camcollections.msgpack", "rb") as data_file:
    msgbytes = data_file.read()

camcollections = msgpack.unpackb(msgbytes)

pprint.pprint(camcollections['data'][0]['creators'])

# 数据版本控制

In [None]:
from deltalake.writer import write_deltalake
from deltalake import DeltaTable
import os

pd.set_option('display.width', 78)
pd.set_option('display.max_columns',6)

os.makedirs("data/temps_lake", exist_ok=True)

landtemps = pd.read_csv('data/landtempssample.csv',
    names=['stationid','year','month','avgtemp','latitude',
      'longitude','elevation','station','countryid','country'],
    skiprows=1,
    parse_dates=[['month','year']])

landtemps.shape

write_deltalake("data/temps_lake", landtemps)

tempsdelta = DeltaTable("data/temps_lake", version=0)
type(tempsdelta)
tempsdfv1 = tempsdelta.to_pandas()
tempsdfv1.shape

write_deltalake("data/temps_lake", landtemps.head(1000), mode="overwrite")

tempsdfv2 = DeltaTable("data/temps_lake", version=1).to_pandas()
tempsdfv2.shape

write_deltalake("data/temps_lake", landtemps.head(1000), mode="append")

tempsdfv3 = DeltaTable("data/temps_lake", version=2).to_pandas()
tempsdfv3.shape

DeltaTable("data/temps_lake", version=0).to_pandas().shape

CommitFailedError: Writer features must be specified for writerversion >= 7, please specify: TimestampWithoutTimezone