<h1>Data Loading, Storage, and File Formats</h1>
<p>Chapter 6 of <em>Python for Data Analysis</em>, 3rd Edition</p>
<br>
<h3>Reading and Writing Data in Text Format</h3>

In [12]:
!cat pydata-book/examples/ex1.csv
print()

# How to import a CSV file
import pandas as pd
df1 = pd.read_csv('pydata-book/examples/ex1.csv')
print(df1) # In this case the contents of the file are already perfect

# What if the file had no column names
!cat pydata-book/examples/ex2.csv
print()
df2 = pd.read_csv('pydata-book/examples/ex2.csv', header=None, names=['a', 'b', 'c', 'd', 'message'], index_col='message')
print(df2)

# What if the file has no delimeter and used something like a space to seperate values
!cat pydata-book/examples/ex3.txt
print()
df3 = pd.read_csv('pydata-book/examples/ex3.txt', index_col=0, sep='\s+')
print(df3)

# What if the first, third and fourth rows were not useful
!cat pydata-book/examples/ex4.csv
df4 = pd.read_csv('pydata-book/examples/ex4.csv', index_col='message', skiprows=[0, 2, 3])
print(df4)

# If data has missing values represented with a special symbol or word use the arguement na_values so that
# np.nan can be put in place of those special symbols in the dataframe
df5 = pd.read_csv('pydata-book/examples/ex5.csv', na_values=['NULL', 'NA'])
print(df5)

a,b,c,d,message
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo
   a   b   c   d message
0  1   2   3   4   hello
1  5   6   7   8   world
2  9  10  11  12     foo
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo
         a   b   c   d
message               
hello    1   2   3   4
world    5   6   7   8
foo      9  10  11  12
            A         B         C
aaa -0.264438 -1.026059 -0.619500
bbb  0.927272  0.302904 -0.032399
ccc -0.264273 -0.386314 -0.217601
ddd -0.871858 -0.348382  1.100491

            A         B         C
aaa -0.264438 -1.026059 -0.619500
bbb  0.927272  0.302904 -0.032399
ccc -0.264273 -0.386314 -0.217601
ddd -0.871858 -0.348382  1.100491
# hey!
a,b,c,d,message
# just wanted to make things more difficult for you
# who reads CSV files with computers, anyway?
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo
         a   b   c   d
message               
hello    1   2   3   4
world    5   6   7   8
foo      9  10  11  12
  something  a   b     c   d message
0       one  1   2   3.0  

In [13]:
# Reading text files in pieces

# Read only the first n data rows. The header line is still parsed for the
# column names and does not count toward nrows.
df6 = pd.read_csv('pydata-book/examples/ex6.csv', nrows=5)
print(df6)

# Read the file in chunks of 1000 rows. With chunksize set, read_csv returns an
# iterable reader instead of a DataFrame; using it as a context manager makes
# sure the underlying file is closed even if the loop body raises.
totals = pd.Series([], dtype='int64')
with pd.read_csv('pydata-book/examples/ex6.csv', chunksize=1000) as df6_iter:
    for chunk in df6_iter:
        # fill_value=0 treats a key that is missing on either side as zero
        # instead of producing NaN for it.
        totals = totals.add(chunk['key'].value_counts(), fill_value=0)

# Most frequent keys first
totals = totals.sort_values(ascending=False)
print(totals)

        one       two     three      four key
0  0.467976 -0.038649 -0.295344 -1.824726   L
1 -0.358893  1.404453  0.704965 -0.200638   B
2 -0.501840  0.659254 -0.421691 -0.057688   G
3  0.204886  1.074134  1.388361 -0.982404   R
4  0.354628 -0.133116  0.283763 -0.837063   Q
E    368.0
X    364.0
L    346.0
O    343.0
Q    340.0
M    338.0
J    337.0
F    335.0
K    334.0
H    330.0
V    328.0
I    327.0
U    326.0
P    324.0
D    320.0
A    320.0
R    318.0
Y    314.0
G    308.0
S    308.0
N    306.0
W    305.0
T    304.0
B    302.0
Z    288.0
C    286.0
4    171.0
6    166.0
7    164.0
8    162.0
3    162.0
5    157.0
2    152.0
0    151.0
9    150.0
1    146.0
dtype: float64


In [16]:
import sys

# Writing DataFrames to an output target: to_csv accepts a file path or an
# open text stream such as sys.stdout (used here so the result shows inline).
df5 = pd.read_csv(
    'pydata-book/examples/ex5.csv',
    na_values=['NULL', 'NA'],
)
print(df5)

# Export with to_csv: sep chooses the field separator and na_rep the text
# written in place of missing values.
df5.to_csv(sys.stdout, sep=' ', na_rep='NULL')

  something  a   b     c   d message
0       one  1   2   3.0   4     NaN
1       two  5   6   NaN   8   world
2     three  9  10  11.0  12     foo
 something a b c d message
0 one 1 2 3.0 4 NULL
1 two 5 6 NULL 8 world
2 three 9 10 11.0 12 foo


In [23]:
# Working with JSON in Python
import json

# A JSON document held as a Python string
obj = """
{"name": "Wes",
 "cities_lived": ["Akron", "Nashville", "New York", "San Francisco"],
 "pet": null,
 "siblings": [{"name": "Scott", "age": 34, "hobbies": ["guitars", "soccer"]},
              {"name": "Katie", "age": 42, "hobbies": ["diving", "art"]}]
}
"""

# Decode the JSON text into plain Python objects (JSON null becomes None)
data = json.loads(obj)
print(data)

# Encode the Python object back into a JSON string
data_json = json.dumps(data)
print(data_json)

# Build a DataFrame from the list of sibling dicts; columns= keeps only the
# listed fields, so 'hobbies' is left out.
sibling_records = data['siblings']
wanted_fields = ['name', 'age']
df = pd.DataFrame(sibling_records, columns=wanted_fields)
print(df)

# pd.read_json loads JSON directly into a DataFrame (or Series), skipping the
# manual json.loads step
example_json_path = 'pydata-book/examples/example.json'
df2 = pd.read_json(example_json_path)
print(df2)

# DataFrame.to_json goes the other way and serializes the frame to a JSON string
df_json = df2.to_json()
print(df_json)

{'name': 'Wes', 'cities_lived': ['Akron', 'Nashville', 'New York', 'San Francisco'], 'pet': None, 'siblings': [{'name': 'Scott', 'age': 34, 'hobbies': ['guitars', 'soccer']}, {'name': 'Katie', 'age': 42, 'hobbies': ['diving', 'art']}]}
{"name": "Wes", "cities_lived": ["Akron", "Nashville", "New York", "San Francisco"], "pet": null, "siblings": [{"name": "Scott", "age": 34, "hobbies": ["guitars", "soccer"]}, {"name": "Katie", "age": 42, "hobbies": ["diving", "art"]}]}
    name  age
0  Scott   34
1  Katie   42
   a  b  c
0  1  2  3
1  4  5  6
2  7  8  9
{"a":{"0":1,"1":4,"2":7},"b":{"0":2,"1":5,"2":8},"c":{"0":3,"1":6,"2":9}}


In [27]:
# Working with HTML
# read_html returns a list of DataFrames, one for each table it parses from the document
html_path = 'pydata-book/examples/fdic_failed_bank_list.html'
tables = pd.read_html(html_path)

# How many tables were found
table_count = len(tables)
print(table_count)

# Preview the first (here: only) table
first_table = tables[0]
print(first_table.head())

1
                      Bank Name             City  ST   CERT  \
0                   Allied Bank         Mulberry  AR     91   
1  The Woodbury Banking Company         Woodbury  GA  11297   
2        First CornerStone Bank  King of Prussia  PA  35312   
3            Trust Company Bank          Memphis  TN   9956   
4    North Milwaukee State Bank        Milwaukee  WI  20364   

                 Acquiring Institution        Closing Date       Updated Date  
0                         Today's Bank  September 23, 2016  November 17, 2016  
1                          United Bank     August 19, 2016  November 17, 2016  
2  First-Citizens Bank & Trust Company         May 6, 2016  September 6, 2016  
3           The Bank of Fayette County      April 29, 2016  September 6, 2016  
4  First-Citizens Bank & Trust Company      March 11, 2016      June 16, 2016  


In [35]:
# Working with Excel Files

# Open the whole workbook with all of its sheets. ExcelFile keeps the file
# open, so it is used as a context manager to close the handle when done
# instead of leaving it open for the life of the kernel.
with pd.ExcelFile('pydata-book/examples/ex1.xlsx') as excel_file:
    # Attributes include the names of the sheets in the file
    print(excel_file.sheet_names)

    # Parse one sheet of the workbook into a DataFrame, using the first
    # column as the index
    excel_df = excel_file.parse(sheet_name='Sheet1', index_col=0)
print(excel_df)

# If you only need a single sheet as a DataFrame, read_excel loads it
# directly from the path
excel_df = pd.read_excel('pydata-book/examples/ex1.xlsx', sheet_name='Sheet1', index_col=0)
print(excel_df)

# To write a DataFrame to Excel use the to_excel method
# (creates new.xlsx in the current working directory)
excel_df.to_excel('new.xlsx')

['Sheet1']
   a   b   c   d message
0  1   2   3   4   hello
1  5   6   7   8   world
2  9  10  11  12     foo
   a   b   c   d message
0  1   2   3   4   hello
1  5   6   7   8   world
2  9  10  11  12     foo


In [44]:
# Working with web APIs
import requests

# GitHub REST endpoint that lists issues of the pandas repository
url = 'https://api.github.com/repos/pandas-dev/pandas/issues'

# Make the GET request. requests applies no timeout by default, so without one
# a stalled connection would block this cell indefinitely; 10 seconds is a
# generous bound for a single API call.
resp = requests.get(url, timeout=10)
resp.raise_for_status()  # Raises an HTTPError if the response status is 4xx/5xx
print(resp)

# Decode the JSON response body; each element describes one issue
body = resp.json()
print(body[0]['title'])

# Create a DataFrame from body and keep only the columns of interest
github_df = pd.DataFrame(body, columns=['number', 'title', 'labels', 'state'])
print(github_df)

<Response [200]>
TST/CLN: Avoid subprocess shell=True
    number                                              title  \
0    49033               TST/CLN: Avoid subprocess shell=True   
1    49032  REG: fix regression in df.corrwith on tied dat...   
2    49031  Suppress false positive pylint findings for 'n...   
3    49029  CI: Investigate slow macOS build time for Pyth...   
4    49025  pylint: disable invalid-repr-returned in Serie...   
5    49024                           PDEP0004: implementation   
6    49023  QST: Python pandas 1.3.5 to 1.4.0 breaking cha...   
7    49021  BUG: comparing pd.Timedelta with timedelta.max...   
8    49018                   pylint: fix misplaced-bare-raise   
9    49017                    CLN: tseries/offsets base tests   
10   49016  BUG: `MultiIndex.sortlevel` not working correc...   
11   49014  API: retain non-nano timedelta64 dtype in Data...   
12   49011  differences in Series.map with defaultdict wit...   
13   49010             PERF: MultiIn