In [1]:
from typing import Union
from pathlib import Path
import numpy as np
import pandas as pd
pd.options.display.max_rows=10

In [2]:
#when we process large chunks of a file, we may want to only read in a small piece, or iterate through
#smaller chunks
result = pd.read_csv('examples/ex6.csv')
result

Unnamed: 0,one,two,three,four,key
0,0.467976,-0.038649,-0.295344,-1.824726,L
1,-0.358893,1.404453,0.704965,-0.200638,B
2,-0.501840,0.659254,-0.421691,-0.057688,G
3,0.204886,1.074134,1.388361,-0.982404,R
4,0.354628,-0.133116,0.283763,-0.837063,Q
...,...,...,...,...,...
9995,2.311896,-0.417070,-1.409599,-0.515821,L
9996,-0.479893,-0.650419,0.745152,-0.646038,E
9997,0.523331,0.787112,0.486066,1.093156,K
9998,-0.362559,0.598894,-1.843201,0.887292,G


In [3]:
#if we only want to read a small number of rows
result = pd.read_csv('examples/ex6.csv', nrows=5)
result

Unnamed: 0,one,two,three,four,key
0,0.467976,-0.038649,-0.295344,-1.824726,L
1,-0.358893,1.404453,0.704965,-0.200638,B
2,-0.50184,0.659254,-0.421691,-0.057688,G
3,0.204886,1.074134,1.388361,-0.982404,R
4,0.354628,-0.133116,0.283763,-0.837063,Q


In [4]:
#reading a file in pieces: specify chunksize as a number of rows
chunker = pd.read_csv('examples/ex6.csv', chunksize=1000)
tot = pd.Series([], dtype=object)
for piece in chunker:
    tot = tot.add(piece['key'].value_counts(), fill_value=0)

tot = tot.sort_values(ascending=False)
tot[:10]

E    368
X    364
L    346
O    343
Q    340
M    338
J    337
F    335
K    334
H    330
dtype: object

In [5]:
#textparser is also equipped with a get-chunk attribute enabling reading of pieces of arbitrary size

In [6]:
#reading a file in pieces: iterator=True also works
chunker = pd.read_csv('examples/ex6.csv', iterator=True)
print(chunker)
tot = pd.Series([], dtype=object)
for piece in chunker:
    tot = tot.add(piece['key'].value_counts(), fill_value=0)

tot = tot.sort_values(ascending=False)
tot

<pandas.io.parsers.readers.TextFileReader object at 0x7f8539e3c100>


E    368
X    364
L    346
O    343
Q    340
    ... 
5    157
2    152
0    151
9    150
1    146
Length: 36, dtype: object

In [7]:
#let's use a generator
def read_file_in_parts(path: Union[Path, str], chunksize: int):
    '''Takes a Path or str object and yields each part of the file'''
    chunker = pd.read_csv(path, chunksize=chunksize)
    for piece in chunker:
        yield piece
        
def file_value_counts(path: Union[Path, str], chunksize: int, fill_value: int=0):
    '''Takes a filepath and a chunksize and returns the value counts of 
    the specified column. Alternative implementation of pg. 176.'''
    file_chunks = read_file_in_parts(path, chunksize)
    tot = pd.Series([], dtype=float)
    for piece in file_chunks:
        tot = tot.add(piece['key'].value_counts(), fill_value=fill_value)
    return tot.sort_values(ascending=False)
    
        
DATA_PATH = Path('examples/ex6.csv')

file_value_counts('examples/ex6.csv', chunksize=1000)

E    368.0
X    364.0
L    346.0
O    343.0
Q    340.0
     ...  
5    157.0
2    152.0
0    151.0
9    150.0
1    146.0
Length: 36, dtype: float64

In [8]:
#we can also export data to a delimited format
data = pd.read_csv('examples/ex5.csv')

data

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [9]:
#using the df.to_csv method we can export it to a csv
DESTINATION_PATH = Path('saved_data_practice/examples.csv')
data.to_csv(DESTINATION_PATH)

In [10]:
#we can use other delimiters, such as the pipe
import sys

data.to_csv(sys.stdout, sep='|')

|something|a|b|c|d|message
0|one|1|2|3.0|4|
1|two|5|6||8|world
2|three|9|10|11.0|12|foo


In [11]:
#missing values are empty srings in the output, we might want to assign them to an NA value
data.to_csv(sys.stdout, na_rep='NULL')

,something,a,b,c,d,message
0,one,1,2,3.0,4,NULL
1,two,5,6,NULL,8,world
2,three,9,10,11.0,12,foo


In [12]:
#we can disable the displaying of the row or columns or both
#display both rows and columns
data.to_csv(sys.stdout, index=True, header=True)

,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [13]:
#we can disable the displaying of the row or columns or both
#display only columns
data.to_csv(sys.stdout, index=False, header=True)

something,a,b,c,d,message
one,1,2,3.0,4,
two,5,6,,8,world
three,9,10,11.0,12,foo


In [14]:
#display only rows:
data.to_csv(sys.stdout, index=True, header=False)

0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [15]:
#display neither rows nor columns
data.to_csv(sys.stdout, index=False, header=False)

one,1,2,3.0,4,
two,5,6,,8,world
three,9,10,11.0,12,foo


In [16]:
#we can also write a subset of the columns in the order of our choosing
data.to_csv(sys.stdout, index=False, columns=['a', 'b', 'c'])

a,b,c
1,2,3.0
5,6,
9,10,11.0


In [17]:
#series to csv
dates = pd.date_range('1/1/2000', periods=7)
dates

DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03', '2000-01-04',
               '2000-01-05', '2000-01-06', '2000-01-07'],
              dtype='datetime64[ns]', freq='D')

In [18]:
ts = pd.Series(np.arange(7), index=dates)
ts

2000-01-01    0
2000-01-02    1
2000-01-03    2
2000-01-04    3
2000-01-05    4
2000-01-06    5
2000-01-07    6
Freq: D, dtype: int64

In [19]:
ts.to_csv('saved_data_practice/tseries.csv')

In [20]:
#display the series on the screen
!cat examples/tseries.csv

,0
2000-01-01,0
2000-01-02,1
2000-01-03,2
2000-01-04,3
2000-01-05,4
2000-01-06,5
2000-01-07,6


In [21]:
#working with delimited formats
#errors: sometimes there are files that are poorly formatted
!cat examples/ex7.csv

"a","b","c"
"1","2","3"
"1","2","3"


In [22]:
#we can use the csv module for any file that uses a character as its delimiter
import csv

f = open('examples/ex7.csv')
reader = csv.reader(f)

#iterating through the reader like a file yields tuples with any quote characters removed
for line in reader:
    print(line)

['a', 'b', 'c']
['1', '2', '3']
['1', '2', '3']


In [23]:
#Wrangling the data:
#1) read the file into a list of lines
with open('examples/ex7.csv') as f:
    lines = list(csv.reader(f))

In [24]:
header, values = lines[0], lines[1:]

In [25]:
#afterwards, we can make a dictionary of data columns with dict comprehension, and unpack
#values (e.g. zip(*values)) to transpose rows to columns
data_dict = {'a': ('1', '1'), 'b': ('2', '2'), 'c': ('3', '3')}

In [30]:
#csv files have many different dialects. We create a subclass of csv.Dialect to
#define new formats with different line termination characters, attributes, etc
class MyDialect(csv.Dialect):
    lineterminator = '\n'
    delimiter = ';'
    quotechar = '"'
    quoting = csv.QUOTE_MINIMAL
    
with open('examples/ex7.csv') as f:
    reader = csv.reader(f, dialect=MyDialect)
    for row in reader:
        print(row)

['a,"b","c"']
['1,"2","3"']
['1,"2","3"']


In [43]:
#we can also give individual parameters to csv.Reader without defining a subclass
with open('examples/ex7.csv') as in_file:
    reader = csv.reader(in_file, delimiter='|')
    for row in reader:
        print(row)

['a,"b","c"']
['1,"2","3"']
['1,"2","3"']


In [42]:
#csv files have many different dialects. We create a subclass of csv.Dialect to
#define new formats with different line termination characters, attributes, etc

#NOTE: We cannot use the csv module for more complicated files

class NewMyDialect(csv.Dialect):
    lineterminator = '\n'
    delimiter = ','
    quotechar = '"'
    quoting = csv.QUOTE_MINIMAL
    
with open('examples/ex7.csv') as f:
    reader = csv.reader(f, dialect=NewMyDialect)
    for row in reader:
        print(row)

['a', 'b', 'c']
['1', '2', '3']
['1', '2', '3']


In [None]:
#writing to delimited files
#csv.writer accepts an open 