# READ AND WRITE TEXTFILES

In [21]:
import pandas as pd
import numpy as np
from pandas import Series, DataFrame

## FILE WITH USEFUL HEADER

In [22]:
!cat ../examples/ex1.csv

a,b,c,d,message
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo

In [23]:
path1 = '../examples/ex1.csv'
df1 = pd.read_csv(path1, sep=',')
df1

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


## FILE WITHOUT USEFUL HEADER

In [24]:
!cat ../examples/ex2.csv

path2 = '../examples/ex2.csv'
df2 = pd.read_csv(path2, sep=',', header=None) # header just numbers
df2

1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo

Unnamed: 0,0,1,2,3,4
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [25]:
cols_ex2 = ['a', 'b', 'c', 'd', 'message']
df2 = pd.read_csv(path2, sep=',', names=cols_ex2, index_col='message') 
df2

Unnamed: 0_level_0,a,b,c,d
message,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
hello,1,2,3,4
world,5,6,7,8
foo,9,10,11,12


## FILE WITHOUT DELIMITER (TXT)

In [26]:
path = '../examples/ex3.txt'
list(open(path))
df3 = pd.read_table(path)
df3

Unnamed: 0,A B C
0,aaa -0.264438 -1.026059 -0.619500
1,bbb 0.927272 0.302904 -0.032399
2,ccc -0.264273 -0.386314 -0.217601
3,ddd -0.871858 -0.348382 1.100491


In [27]:
df3 = pd.read_table(path, sep='\s+')
df3

Unnamed: 0,A,B,C
aaa,-0.264438,-1.026059,-0.6195
bbb,0.927272,0.302904,-0.032399
ccc,-0.264273,-0.386314,-0.217601
ddd,-0.871858,-0.348382,1.100491


## SKIP ROWS

In [28]:
!cat ../examples/ex4.csv
path = '../examples/ex4.csv'

# hey!
a,b,c,d,message
# just wanted to make things more difficult for you
# who reads CSV files with computers, anyway?
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo

In [29]:
df4 = pd.read_csv(path, skiprows=[0,2,3])
df4

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


## MISSING DATA

In [30]:
!cat ../examples/ex5.csv
path5 = '../examples/ex5.csv'

something,a,b,c,d,message
one,1,2,3,4,NA
two,5,6,,8,world
three,9,10,11,12,foo

In [31]:
df5 = pd.read_csv(path5)
df5

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


## PIECEMEAL

In [32]:
path6 = '../examples/ex6.csv'
subset = pd.read_csv(path, nrows=5)
subset

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,# hey!
a,b,c,d,message
# just wanted to make things more difficult for you,,,,
# who reads CSV files with computers,anyway?,,,
1,2,3,4,hello
5,6,7,8,world


In [51]:
chunk = pd.read_csv(path6, chunksize=10)
chunk

<pandas.io.parsers.TextFileReader at 0x11bf04850>

In [52]:
tot_keys = Series([])
for piece in chunk:
    tot_keys = tot_keys.add(piece['key'].value_counts(), fill_value=0)

#dir(tot_keys)

tot_keys = tot_keys.sort_index(ascending=False)
tot_keys[:10]

Z    288.0
Y    314.0
X    364.0
W    305.0
V    328.0
U    326.0
T    304.0
S    308.0
R    318.0
Q    340.0
dtype: float64

## WRITE DATA

In [35]:
path_in = '../examples/ex5.csv'
path_out = '../examples/out5.csv'
data = pd.read_csv(path_in)
data.to_csv(path_out, sep='|')

data = pd.read_csv(path_out)
data


Unnamed: 0,|something|a|b|c|d|message
0,0|one|1|2|3.0|4|
1,1|two|5|6||8|world
2,2|three|9|10|11.0|12|foo


## MANUALLY WORKING WITH DELIMITED FORMATS

In [36]:
!cat ../examples/ex7.csv

"a","b","c"
"1","2","3"
"1","2","3","4"


In [37]:
import csv
path = '../examples/ex7.csv'
f = open(path)
reader = csv.reader(f)

for line in reader:
    print(line)

['a', 'b', 'c']
['1', '2', '3']
['1', '2', '3', '4']


In [38]:
lines = pd.DataFrame(list(csv.reader(open(path))))
lines

Unnamed: 0,0,1,2,3
0,a,b,c,
1,1,2,3,
2,1,2,3,4.0


## JSON DATA
####  JAVA SCRIPT OBJECT NOTATION

In [45]:
import json

# JSON is nearly perfect Python code

obj = """
{"name"        : "Andre", 
 "places_lived": ["GERMANY", "USA", "IRELAND"],
 "pet"         : "Erwin",
 "siblings"    : [{"name": "Uta", "age": 25, "pet": "Zimba"}]
 }
 """

obj

'\n{"name"        : "Andre", \n "places_lived": ["GERMANY", "USA", "IRELAND"],\n "pet"         : "Erwin",\n "siblings"    : [{"name": "Uta", "age": 25, "pet": "Zimba"}]\n }\n '

In [53]:
data = json.loads(obj)
data

{'name': 'Andre',
 'places_lived': ['GERMANY', 'USA', 'IRELAND'],
 'pet': 'Erwin',
 'siblings': [{'name': 'Uta', 'age': 25, 'pet': 'Zimba'}]}

In [57]:
siblings = DataFrame(data['siblings'], columns=['name', 'age'])
siblings

Unnamed: 0,name,age
0,Uta,25


In [60]:
data_to_json = json.dumps(data)
data_to_json

'{"name": "Andre", "places_lived": ["GERMANY", "USA", "IRELAND"], "pet": "Erwin", "siblings": [{"name": "Uta", "age": 25, "pet": "Zimba"}]}'

## HTML

In [65]:
# lxml.de parses very large files
from lxml.html import parse
from urllib.request import urlopen


In [75]:
parsed = parse(urlopen('http://finance.yahoo.com/q/op?s=AAPL+Options'))
doc = parsed.getroot()
doc

<Element html at 0x11d4f3350>

In [76]:
 links = doc.findall('.//a')
 links[:5]
 lnk = links[28]
 lnk.get('href')

'/quote/AAPL200327C00170000?p=AAPL200327C00170000'

In [77]:
lnk.text_content()

'AAPL200327C00170000'

In [85]:
urls = [lnk.get('href') for lnk in doc.findall('.//a')]
urls[-10:]

['/watchlists',
 '/portfolios',
 '/screener',
 '/premium?ncid=navbarprem_fqbo1nu0ks0',
 '/calendar',
 '/news/',
 'https://money.yahoo.com',
 '/videos/',
 '/industries',
 '/tech']

## XML

In [86]:
from lxml import objectify

In [99]:
path = '../datasets/mta_perf/Performance_MNR.xml'
parsed = objectify.parse(path)
root = parsed.getroot()

In [100]:
data = []
skip_fields = ['PARENT_SEQ', 'INDICATOR_SEQ', 'DESIRED_CHANGE', 'DECIMAL_PLACES']

In [107]:
for elt in root.INDICATOR:
    elt_data = {}
    for child in elt.getchildren():
        if child.tag in skip_fields:
            continue
        elt_data[child.tag] = child.pyval
    data.append(elt_data)

#dir(data)

perf = DataFrame(data)
perf.head()

Unnamed: 0,AGENCY_NAME,INDICATOR_NAME,DESCRIPTION,PERIOD_YEAR,PERIOD_MONTH,CATEGORY,FREQUENCY,INDICATOR_UNIT,YTD_TARGET,YTD_ACTUAL,MONTHLY_TARGET,MONTHLY_ACTUAL
0,Metro-North Railroad,On-Time Performance (West of Hudson),Percent of commuter trains that arrive at thei...,2008,1,Service Indicators,M,%,95,96.9,95,96.9
1,Metro-North Railroad,On-Time Performance (West of Hudson),Percent of commuter trains that arrive at thei...,2008,2,Service Indicators,M,%,95,96.0,95,95.0
2,Metro-North Railroad,On-Time Performance (West of Hudson),Percent of commuter trains that arrive at thei...,2008,3,Service Indicators,M,%,95,96.3,95,96.9
3,Metro-North Railroad,On-Time Performance (West of Hudson),Percent of commuter trains that arrive at thei...,2008,4,Service Indicators,M,%,95,96.8,95,98.3
4,Metro-North Railroad,On-Time Performance (West of Hudson),Percent of commuter trains that arrive at thei...,2008,5,Service Indicators,M,%,95,96.6,95,95.8


In [None]:
# TBD: modify the dataframe systematically and save back as XML.