# Retrieving, Processing, and Storing Data

## Comparing Numpy .npy binary and pandas pickling

In [1]:
import numpy as np
import pandas as pd
from tempfile import NamedTemporaryFile
from os.path import getsize

np.random.seed(40)
a = np.random.randn(365, 4)

tmpf = NamedTemporaryFile()
np.savetxt(tmpf, a, delimiter=',')
print("Size CSV file:", getsize(tmpf.name))

Size CSV file: 36684


In [3]:
tmpf = NamedTemporaryFile()
np.save(tmpf, a)
tmpf.seek(0)
loaded = np.load(tmpf)
print("Shape:", loaded.shape)
print("Size .npy file", getsize(tmpf.name))

Shape: (365, 4)
Size .npy file 11808


In [4]:
tmpf = NamedTemporaryFile()
df = pd.DataFrame(a)
df.to_pickle(tmpf.name)
print("Size pickled dataframe:", getsize(tmpf.name))
print("DF from pickle\n", pd.read_pickle(tmpf.name))

Size pickled dataframe: 12254
DF from pickle
             0         1         2         3
0   -0.607548 -0.126136 -0.684606  0.928715
1   -1.844401 -0.467002  2.292490  0.488810
2    0.710267  1.055534  0.054073  0.257953
3    0.588282  0.885244 -1.017007 -0.133693
4   -0.438186  0.493443 -0.199009 -1.274984
5    0.293494  0.108950  0.031727  1.272640
6    1.071448  0.415818  1.550679 -0.311379
7   -1.379240  1.371409  0.027712 -0.320400
8   -0.846170 -0.433429 -1.337035  0.209172
9   -1.424321 -0.553477  0.074799 -0.505620
10   1.052408  0.971400  0.076832 -0.435001
11   0.552994  0.266716  0.008989  0.641103
12  -0.177707  0.696278 -1.188725 -0.331697
13   0.030076 -1.107915 -0.549925 -2.032910
14   1.407918  0.633108  2.212747 -0.526602
15   0.542882 -0.084480  1.292015 -0.176711
16   1.687787 -1.046614  0.642120 -0.172962
17  -1.112065 -0.020703 -1.813522  0.203522
18   0.531874 -1.017023 -1.848051  0.172549
19   0.786202 -0.071470  0.836349 -0.198229
20  -0.092273  0.870726 -0.839

## Storing data with PyTables(HDF format)

In [14]:
import numpy as np
import tables
from tempfile import NamedTemporaryFile
from os.path import getsize

np.random.seed(42)
a = np.random.randn(365, 4)

tmpf = NamedTemporaryFile()
h5file = tables.open_file(tmpf.name, mode='w', title='Numpy Array')
root = h5file.root
h5file.create_array(root, "array", a)
print(type(root))
h5file.close()

h5file = tables.open_file(tmpf.name, 'r')
print(getsize(tmpf.name))

for node in h5file.iter_nodes(h5file.root):
    b = node.read()
    print(type(b), b.shape)

h5file.close()

<class 'tables.group.RootGroup'>
13728
<class 'numpy.ndarray'> (365, 4)


## Reading and writing pandas DataFrame to HDF5 stores

In [23]:
tmpf = NamedTemporaryFile()
store = pd.HDFStore(tmpf.name)
print(store)

<class 'pandas.io.pytables.HDFStore'>
File path: /var/folders/rd/fxns6vhj1m12_gpmxw405gq40000gn/T/tmpn6zfvs4o
Empty


In [17]:
store['df'] = df
print(store)

<class 'pandas.io.pytables.HDFStore'>
File path: /var/folders/rd/fxns6vhj1m12_gpmxw405gq40000gn/T/tmpp5lne_88
/df            frame        (shape->[365,4])


### Read value from store

In [18]:
print(store.get('df').shape)
print(store['df'].shape)
print(store.df.shape)

(365, 4)
(365, 4)
(365, 4)


In [20]:
del store['df']
print(store)

<class 'pandas.io.pytables.HDFStore'>
File path: /var/folders/rd/fxns6vhj1m12_gpmxw405gq40000gn/T/tmpp5lne_88
Empty


- DataFrame支持直接保存为HDF格式文件 DataFrame.to_hdf()
- Pandas顶层函数也支持从HDF文件读取 pd.read_hdf()

In [24]:
tmpf = NamedTemporaryFile()
df.to_hdf(tmpf.name, 'mydata', format='table', mode='w')
print(pd.read_hdf(tmpf.name, 'mydata', where=['index>360']))

            0         1         2         3
361 -0.845906  1.987754 -0.971906  1.900442
362 -0.195155 -0.587276 -0.516338  0.164434
363 -1.191338  1.043126 -0.125925 -0.702099
364  0.885382  0.684500  1.409991  1.278488


## Reading and writing to Excel with Pandas

In [25]:
import numpy as np
import pandas as pd
from tempfile import NamedTemporaryFile

tmpf = NamedTemporaryFile(suffix='.xlsx')
np.random.seed(53)
a = np.random.randn(365, 4)
df = pd.DataFrame(a)
df.to_excel(tmpf.name, sheet_name='Random Data')
print("Mean\n", pd.read_excel(tmpf.name, 'Random Data').mean())

Mean
 0   -0.008083
1    0.008668
2    0.015442
3   -0.061955
dtype: float64


## Using JSON

In [27]:
import json

json_str="""{"country":"Netherlands", 
        "dma_code":"0",
        "timezone":"Europe\/Amsterdam",
        "area_code":"0",
        "ip":"46.19.37.108",
        "asn":"AS196752",
        "continent_code":"EU",
        "isp":"Tilaa V.O.F.",
        "longitude":5.75,
        "latitude":52.5,
        "country_code":"NL",
        "country_code3":"NLD"}"""
data = json.loads(json_str)
print(data['country'])
data['country'] = 'Brazil'
print(json.dumps(data))

Netherlands
{"country": "Brazil", "dma_code": "0", "timezone": "Europe/Amsterdam", "area_code": "0", "ip": "46.19.37.108", "asn": "AS196752", "continent_code": "EU", "isp": "Tilaa V.O.F.", "longitude": 5.75, "latitude": 52.5, "country_code": "NL", "country_code3": "NLD"}


In [29]:
data = pd.read_json(json_str, typ='series')
print(data)
data['country'] = 'China'
print(data.to_json())

area_code                        0
asn                       AS196752
continent_code                  EU
country                Netherlands
country_code                    NL
country_code3                  NLD
dma_code                         0
ip                    46.19.37.108
isp                   Tilaa V.O.F.
latitude                      52.5
longitude                     5.75
timezone          Europe/Amsterdam
dtype: object
{"area_code":"0","asn":"AS196752","continent_code":"EU","country":"China","country_code":"NL","country_code3":"NLD","dma_code":"0","ip":"46.19.37.108","isp":"Tilaa V.O.F.","latitude":52.5,"longitude":5.75,"timezone":"Europe\/Amsterdam"}


## Parsing HTML with BeautifulSoup

In [32]:
from bs4 import BeautifulSoup
import requests
import re

html = requests.get('http://www.kaggle.com').content
soup = BeautifulSoup(html, 'html.parser', from_encoding='utf-8')
print("First link:\n", soup.a)

First link:
 <a class="home-signup__button" href="/?login=true">
                    Create an account
                </a>


In [33]:
print(soup.a['href'])

/?login=true


In [36]:
# soup('a') is same with soup.find_all('a')
for link in soup('a'):
    print(link.string, ":", link.get('href'))


                    Create an account
                 : /?login=true

                    Host a competition
                 : https://www.kaggle.com/host
None : /jobs
None : /competitions
Want to host a competition? : /host
None : /datasets
None : /kernels
None : /c/titanic
None : /kernels
None : http://blog.kaggle.com/
None : https://www.kaggle.com/inclass

                Create a profile
             : /?login=true
kernels : /kernels
user ranking : /users
forums : /forums
Follow our jobs board : /jobs
None : /competitions
None : /datasets
None : /kernels

            Create an account
         : /?login=true

            Host a competition
         : /host
Our Team : /team
Terms : /terms
Privacy : /privacy
Contact/Support : /contact
