In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from io import StringIO

In [2]:
# Three-column CSV sample: one header row followed by three records.
data = '\n'.join([
    'col1,col2,col3',
    'a,b,1',
    'a,b,2',
    'c,d,3',
])

In [3]:
pd.read_csv(StringIO(data))

Unnamed: 0,col1,col2,col3
0,a,b,1
1,a,b,2
2,c,d,3


In [4]:
pd.read_csv(StringIO(data), usecols=lambda x: x.upper() in ['COL1', 'COL3'])

Unnamed: 0,col1,col3
0,a,1
1,a,2
2,c,3


In [5]:
data = ('col1,col2,col3\n'
        'a,b,1\n'
        'a,b,2\n'
        'c,d,3')

In [6]:
pd.read_csv(StringIO(data))

Unnamed: 0,col1,col2,col3
0,a,b,1
1,a,b,2
2,c,d,3


In [7]:
pd.read_csv(StringIO(data), skiprows=lambda x: x % 2 != 0)

Unnamed: 0,col1,col2,col3
0,a,b,2


In [8]:
# CSV sample whose last record has only three fields for four header columns.
data = '\n'.join([
    'a,b,c,d',
    '1,2,3,4',
    '5,6,7,8',
    '9,10,11',
])

In [9]:
print(data)

a,b,c,d
1,2,3,4
5,6,7,8
9,10,11


In [10]:
df = pd.read_csv(StringIO(data), dtype=object)

In [11]:
df

Unnamed: 0,a,b,c,d
0,1,2,3,4.0
1,5,6,7,8.0
2,9,10,11,


In [12]:
df['a'][0]

'1'

In [13]:
df = pd.read_csv(StringIO(data),
                 dtype={'b': object, 'c': np.float64, 'd': 'Int64'})

In [14]:
df.dtypes

a      int64
b     object
c    float64
d      Int64
dtype: object

In [15]:
# Single-column CSV sample mixing integers, a quoted letter and a float.
data = "\n".join([
    "col_1",
    "1",
    "2",
    "'A'",
    "4.22",
])

In [16]:
df = pd.read_csv(StringIO(data), converters={'col_1': str})

In [17]:
df

Unnamed: 0,col_1
0,1
1,2
2,'A'
3,4.22


In [18]:
df['col_1'].apply(type).value_counts()

<class 'str'>    4
Name: col_1, dtype: int64

In [19]:
df2 = pd.read_csv(StringIO(data))

In [20]:
df2['col_1'] = pd.to_numeric(df2['col_1'], errors='coerce')

In [21]:
df2

Unnamed: 0,col_1
0,1.0
1,2.0
2,
3,4.22


In [22]:
df2['col_1'].apply(type).value_counts()

<class 'float'>    4
Name: col_1, dtype: int64

In [23]:
# Two runs of 0..499999 with the strings 'a' and 'b' in between.
col_1 = [*range(500000), 'a', 'b', *range(500000)]

In [24]:
df = pd.DataFrame({'col_1': col_1})

In [25]:
df.to_csv('foo.csv')

In [26]:
mixed_df = pd.read_csv('foo.csv')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [27]:
mixed_df['col_1'].apply(type).value_counts()

<class 'int'>    737858
<class 'str'>    262144
Name: col_1, dtype: int64

In [28]:
mixed_df['col_1'].dtype

dtype('O')

In [29]:
data = ('col1,col2,col3\n'
        'a,b,1\n'
        'a,b,2\n'
        'c,d,3')

In [30]:
pd.read_csv(StringIO(data))

Unnamed: 0,col1,col2,col3
0,a,b,1
1,a,b,2
2,c,d,3


In [31]:
pd.read_csv(StringIO(data)).dtypes

col1    object
col2    object
col3     int64
dtype: object

In [32]:
pd.read_csv(StringIO(data), dtype='category').dtypes

col1    category
col2    category
col3    category
dtype: object

In [33]:
pd.read_csv(StringIO(data), dtype={'col1': 'category'}).dtypes

col1    category
col2      object
col3       int64
dtype: object

In [34]:
from pandas.api.types import CategoricalDtype

In [35]:
# Ordered categorical dtype with the categories listed as d, c, b, a.
dtype = CategoricalDtype(categories=list('dcba'), ordered=True)

In [36]:
pd.read_csv(StringIO(data), dtype={'col1': dtype}).dtypes

col1    category
col2      object
col3       int64
dtype: object

In [37]:
dtype = CategoricalDtype(['a', 'b', 'd']) # No 'c'

In [38]:
pd.read_csv(StringIO(data), dtype={'col1': dtype}).col1

0      a
1      a
2    NaN
Name: col1, dtype: category
Categories (3, object): [a, b, d]

In [39]:
df = pd.read_csv(StringIO(data), dtype='category')

In [40]:
df.dtypes

col1    category
col2    category
col3    category
dtype: object

In [41]:
df['col3']

0    1
1    2
2    3
Name: col3, dtype: category
Categories (3, object): [1, 2, 3]

In [42]:
# Convert the string categories of col3 to numbers.
# The ``.cat.categories`` setter is deprecated (and removed in pandas 2.0), so
# build the renamed categorical with ``rename_categories`` and assign the
# column back explicitly instead of assigning through the accessor.
numeric_categories = pd.to_numeric(df['col3'].cat.categories)
df['col3'] = df['col3'].cat.rename_categories(numeric_categories)

In [43]:
df['col3']

0    1
1    2
2    3
Name: col3, dtype: category
Categories (3, int64): [1, 2, 3]

In [44]:
data = ('a,b,c\n'
        '1,2,3\n'
        '4,5,6\n'
        '7,8,9')

In [45]:
print(data)

a,b,c
1,2,3
4,5,6
7,8,9


In [46]:
pd.read_csv(StringIO(data))

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9


In [47]:
print(data)

a,b,c
1,2,3
4,5,6
7,8,9


In [48]:
pd.read_csv(StringIO(data), names=['foo', 'bar', 'baz'], header=0)

Unnamed: 0,foo,bar,baz
0,1,2,3
1,4,5,6
2,7,8,9


In [49]:
pd.read_csv(StringIO(data), names=['foo', 'bar', 'baz'], header=None)

Unnamed: 0,foo,bar,baz
0,a,b,c
1,1,2,3
2,4,5,6
3,7,8,9


In [50]:
data = ('skip this skip it\n'
        'a,b,c\n'
        '1,2,3\n'
        '4,5,6\n'
        '7,8,9')

In [51]:
pd.read_csv(StringIO(data), header=1)

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9


In [52]:
data = ('a,b,a\n'
        '0,1,2\n'
        '3,4,5')

In [53]:
pd.read_csv(StringIO(data))

Unnamed: 0,a,b,a.1
0,0,1,2
1,3,4,5


In [54]:
# To protect users from problems caused by duplicate column names, pandas raises
# a ValueError when mangle_dupe_cols is anything other than True.

data = 'a,b,a\n0,1,2\n3,4,5'
# pd.read_csv(StringIO(data), mangle_dupe_cols=False)  # left commented out: raises ValueError

In [55]:
data = 'a,b,c,d\n1,2,3,foo\n4,5,6,bar\n7,8,9,baz'

In [56]:
pd.read_csv(StringIO(data))

Unnamed: 0,a,b,c,d
0,1,2,3,foo
1,4,5,6,bar
2,7,8,9,baz


In [57]:
pd.read_csv(StringIO(data), usecols=['b', 'd'])

Unnamed: 0,b,d
0,2,foo
1,5,bar
2,8,baz


In [58]:
pd.read_csv(StringIO(data), usecols=[0, 2, 3])

Unnamed: 0,a,c,d
0,1,3,foo
1,4,6,bar
2,7,9,baz


In [59]:
pd.read_csv(StringIO(data), usecols=lambda x: x.upper() in ['A', 'C'])

Unnamed: 0,a,c
0,1,3
1,4,6
2,7,9


In [60]:
pd.read_csv(StringIO(data), usecols=lambda x: x not in ['a', 'c'])

Unnamed: 0,b,d
0,2,foo
1,5,bar
2,8,baz


In [61]:
# CSV sample with a leading blank line, a whitespace-only line, a comment line
# and a blank line between the two records.
data = '\n'.join([
    '',
    'a,b,c',
    ' ',
    '# commented line',
    '1,2,3',
    '',
    '4,5,6',
])

In [62]:
print(data)


a,b,c
 
# commented line
1,2,3

4,5,6


In [63]:
pd.read_csv(StringIO(data), comment='#')

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6


In [64]:
data = ('a,b,c\n'
        '\n'
        '1,2,3\n'
        '\n'
        '\n'
        '4,5,6')

In [65]:
pd.read_csv(StringIO(data), skip_blank_lines=False)

Unnamed: 0,a,b,c
0,,,
1,1.0,2.0,3.0
2,,,
3,,,
4,4.0,5.0,6.0


In [66]:
data = ('#comment\n'
        'a,b,c\n'
        'A,B,C\n'
        '1,2,3')

In [67]:
pd.read_csv(StringIO(data), comment='#', header=1)

Unnamed: 0,A,B,C
0,1,2,3


In [68]:
data = ('A,B,C\n'
        '#comment\n'
        'a,b,c\n'
        '1,2,3')

In [69]:
pd.read_csv(StringIO(data), comment='#', skiprows=2)

Unnamed: 0,a,b,c
0,1,2,3


In [70]:
data = ('# empty\n'
        '# second empty line\n'
        '# third emptyline\n'
        'X,Y,Z\n'
        '1,2,3\n'
        'A,B,C\n'
        '1,2.,4.\n'
        '5.,NaN,10.0\n')

In [71]:
print(data)

# empty
# second empty line
# third emptyline
X,Y,Z
1,2,3
A,B,C
1,2.,4.
5.,NaN,10.0



In [72]:
pd.read_csv(StringIO(data), comment='#', skiprows=4, header=1)

Unnamed: 0,A,B,C
0,1.0,2.0,4.0
1,5.0,,10.0


In [73]:
# Show the raw file; the context manager closes the handle deterministically.
# NOTE(review): tmp.csv is not written by any cell visible here — confirm it
# ships alongside the notebook.
with open('tmp.csv') as fh:
    print(fh.read())

ID,level,category
Patient1,123000,x # really unpleasant
Patient2,23000,y # wouldn't take his medicine
Patient3,1234018,z # awesome


In [74]:
df = pd.read_csv('tmp.csv')
df

Unnamed: 0,ID,level,category
0,Patient1,123000,x # really unpleasant
1,Patient2,23000,y # wouldn't take his medicine
2,Patient3,1234018,z # awesome


In [75]:
df = pd.read_csv('tmp.csv', comment='#')
df

Unnamed: 0,ID,level,category
0,Patient1,123000,x
1,Patient2,23000,y
2,Patient3,1234018,z


In [76]:
from io import BytesIO

In [77]:
# UTF-8 encoded CSV bytes: a header row plus two words containing non-ASCII characters.
data = b'\n'.join([
    b'word,length',
    b'Tr\xc3\xa4umen,7',
    b'Gr\xc3\xbc\xc3\x9fe,5',
])

In [78]:
data = data.decode('utf8').encode('latin-1')

In [79]:
df = pd.read_csv(BytesIO(data), encoding='latin-1')
df

Unnamed: 0,word,length
0,Träumen,7
1,Grüße,5


In [80]:
df['word'][1]

'Grüße'

In [81]:
data = ('a,b,c\n'
        '4,apple,bat,5.7\n'
        '8,orange,cow,10')

In [82]:
pd.read_csv(StringIO(data))

Unnamed: 0,a,b,c
4,apple,bat,5.7
8,orange,cow,10.0


In [83]:
data = ('index,a,b,c\n'
        '4,apple,bat,5.7\n'
        '8,orange,cow,10')

In [84]:
pd.read_csv(StringIO(data), index_col=0)

Unnamed: 0_level_0,a,b,c
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4,apple,bat,5.7
8,orange,cow,10.0


In [85]:
data = ('a,b,c\n'
        '4,apple,bat,\n'
        '8,orange,cow,')

In [86]:
print(data)

a,b,c
4,apple,bat,
8,orange,cow,


In [87]:
pd.read_csv(StringIO(data))

Unnamed: 0,a,b,c
4,apple,bat,
8,orange,cow,


In [88]:
pd.read_csv(StringIO(data), index_col=False)

Unnamed: 0,a,b,c
0,4,apple,bat
1,8,orange,cow


In [89]:
data = ('a,b,c\n'
        '4,apple,bat,\n'
        '8,orange,cow,')

In [90]:
print(data)

a,b,c
4,apple,bat,
8,orange,cow,


In [91]:
pd.read_csv(StringIO(data), usecols=['b', 'c'])

Unnamed: 0,b,c
4,bat,
8,cow,


In [92]:
pd.read_csv(StringIO(data), usecols=['b', 'c'], index_col=0)

Unnamed: 0,b,c
4,bat,
8,cow,


In [93]:
df = pd.read_csv('foo2.csv', index_col=0, parse_dates=True)

In [94]:
df

Unnamed: 0_level_0,A,B,C
data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2009-01-01,a,1,2
2009-02-01,b,3,4
2009-03-01,c,4,5


In [95]:
df.index

DatetimeIndex(['2009-01-01', '2009-02-01', '2009-03-01'], dtype='datetime64[ns]', name='data', freq=None)

In [96]:
# Show the raw file; the context manager closes the handle deterministically.
# NOTE(review): tmp2.csv is not written by any cell visible here — confirm it
# ships alongside the notebook.
with open('tmp2.csv') as fh:
    print(fh.read())

KORD,19990127, 19:00:00, 18:56:00, 0.8100
KORD,19990127, 20:00:00, 19:56:00, 0.0100
KORD,19990127, 21:00:00, 20:56:00, -0.5900
KORD,19990127, 21:00:00, 21:18:00, -0.9900
KORD,19990127, 22:00:00, 21:56:00, -0.5900
KORD,19990127, 23:00:00, 22:56:00, -0.5900


In [97]:
df = pd.read_csv('tmp2.csv', header=None, parse_dates=[[1, 2], [1, 3]])
df

Unnamed: 0,1_2,1_3,0,4
0,1999-01-27 19:00:00,1999-01-27 18:56:00,KORD,0.81
1,1999-01-27 20:00:00,1999-01-27 19:56:00,KORD,0.01
2,1999-01-27 21:00:00,1999-01-27 20:56:00,KORD,-0.59
3,1999-01-27 21:00:00,1999-01-27 21:18:00,KORD,-0.99
4,1999-01-27 22:00:00,1999-01-27 21:56:00,KORD,-0.59
5,1999-01-27 23:00:00,1999-01-27 22:56:00,KORD,-0.59


In [98]:
df = pd.read_csv('tmp2.csv', header=None, parse_dates=[[1, 2], [1, 3]],
                 keep_date_col=True)
df

Unnamed: 0,1_2,1_3,0,1,2,3,4
0,1999-01-27 19:00:00,1999-01-27 18:56:00,KORD,19990127,19:00:00,18:56:00,0.81
1,1999-01-27 20:00:00,1999-01-27 19:56:00,KORD,19990127,20:00:00,19:56:00,0.01
2,1999-01-27 21:00:00,1999-01-27 20:56:00,KORD,19990127,21:00:00,20:56:00,-0.59
3,1999-01-27 21:00:00,1999-01-27 21:18:00,KORD,19990127,21:00:00,21:18:00,-0.99
4,1999-01-27 22:00:00,1999-01-27 21:56:00,KORD,19990127,22:00:00,21:56:00,-0.59
5,1999-01-27 23:00:00,1999-01-27 22:56:00,KORD,19990127,23:00:00,22:56:00,-0.59


In [99]:
# Map each new datetime column name to the source column positions it combines.
date_spec = dict(nominal=[1, 2], actual=[1, 3])

In [100]:
df = pd.read_csv('tmp2.csv', header=None, parse_dates=date_spec)
df

Unnamed: 0,nominal,actual,0,4
0,1999-01-27 19:00:00,1999-01-27 18:56:00,KORD,0.81
1,1999-01-27 20:00:00,1999-01-27 19:56:00,KORD,0.01
2,1999-01-27 21:00:00,1999-01-27 20:56:00,KORD,-0.59
3,1999-01-27 21:00:00,1999-01-27 21:18:00,KORD,-0.99
4,1999-01-27 22:00:00,1999-01-27 21:56:00,KORD,-0.59
5,1999-01-27 23:00:00,1999-01-27 22:56:00,KORD,-0.59


In [101]:
date_spec = {'nominal': [1, 2], 'actual': [1, 3]}
df = pd.read_csv('tmp2.csv', header=None, parse_dates=date_spec,
                 index_col=0) # index is the nominal column
df

Unnamed: 0_level_0,actual,0,4
nominal,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1999-01-27 19:00:00,1999-01-27 18:56:00,KORD,0.81
1999-01-27 20:00:00,1999-01-27 19:56:00,KORD,0.01
1999-01-27 21:00:00,1999-01-27 20:56:00,KORD,-0.59
1999-01-27 21:00:00,1999-01-27 21:18:00,KORD,-0.99
1999-01-27 22:00:00,1999-01-27 21:56:00,KORD,-0.59
1999-01-27 23:00:00,1999-01-27 22:56:00,KORD,-0.59


In [102]:
# Combine the date/time columns listed in date_spec using an explicit parser
# function rather than the default date parsing.
# NOTE(review): pd.io.date_converters was deprecated in pandas 1.2 and removed
# in 2.0 — confirm the target pandas version before upgrading this cell.
df = pd.read_csv('tmp2.csv', header=None, parse_dates=date_spec,
                 date_parser=pd.io.date_converters.parse_date_time)
df

Unnamed: 0,nominal,actual,0,4
0,1999-01-27 19:00:00,1999-01-27 18:56:00,KORD,0.81
1,1999-01-27 20:00:00,1999-01-27 19:56:00,KORD,0.01
2,1999-01-27 21:00:00,1999-01-27 20:56:00,KORD,-0.59
3,1999-01-27 21:00:00,1999-01-27 21:18:00,KORD,-0.99
4,1999-01-27 22:00:00,1999-01-27 21:56:00,KORD,-0.59
5,1999-01-27 23:00:00,1999-01-27 22:56:00,KORD,-0.59


In [103]:
# One-column CSV sample holding two timestamps with different UTC offsets.
content = (
    'a\n'
    '2000-01-01T00:00:00+05:00\n'
    '2000-01-01T00:00:00+06:00'
)

In [104]:
df = pd.read_csv(StringIO(content), parse_dates=['a'])

In [105]:
df['a']

0    2000-01-01 00:00:00+05:00
1    2000-01-01 00:00:00+06:00
Name: a, dtype: object

In [106]:
df = pd.read_csv(StringIO(content), parse_dates=['a'],
                 date_parser=lambda col: pd.to_datetime(col, utc=True))
df['a']

0   1999-12-31 19:00:00+00:00
1   1999-12-31 18:00:00+00:00
Name: a, dtype: datetime64[ns, UTC]

In [107]:
df = pd.read_csv('foo2.csv', index_col=0, parse_dates=True,
                 infer_datetime_format=True)
df

Unnamed: 0_level_0,A,B,C
data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2009-01-01,a,1,2
2009-02-01,b,3,4
2009-03-01,c,4,5


In [108]:
df = pd.read_csv('foo2.csv', index_col=0, parse_dates=True,
                 infer_datetime_format=True, dayfirst=True)
df

Unnamed: 0_level_0,A,B,C
data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2009-01-01,a,1,2
2009-01-02,b,3,4
2009-01-03,c,4,5


In [109]:
# Show the raw file; the context manager closes the handle deterministically.
# NOTE(review): tmp3.csv is not written by any cell visible here — confirm it
# ships alongside the notebook.
with open('tmp3.csv') as fh:
    print(fh.read())

date,value,cat
1/6/2000,5,a
2/6/2000,10,b
3/6/2000,15,c


In [110]:
pd.read_csv('tmp3.csv', parse_dates=[0])

Unnamed: 0,date,value,cat
0,2000-01-06,5,a
1,2000-02-06,10,b
2,2000-03-06,15,c


In [111]:
pd.read_csv('tmp3.csv', dayfirst=True, parse_dates=[0])

Unnamed: 0,date,value,cat
0,2000-06-01,5,a
1,2000-06-02,10,b
2,2000-06-03,15,c


In [112]:
val = '0.3066101993807095471566981359501369297504425048828125'

In [113]:
data = 'a,b,c\n1,2,{0}'.format(val)

In [114]:
abs(pd.read_csv(StringIO(data), engine='c',
                float_precision=None)['c'][0] - float(val))

1.1102230246251565e-16

In [115]:
abs(pd.read_csv(StringIO(data), engine='c',
                float_precision='high')['c'][0] - float(val))

5.551115123125783e-17

In [116]:
abs(pd.read_csv(StringIO(data), engine='c',
                float_precision='round_trip')['c'][0] - float(val))

0.0

In [117]:
# Show the raw file; the context manager closes the handle deterministically.
# NOTE(review): tmp4.csv is not written by any cell visible here — confirm it
# ships alongside the notebook.
with open('tmp4.csv') as fh:
    print(fh.read())

ID|level|category
Patient1|123,000|x
Patient2|23,000|y
Patient3|1,234,018|z


In [118]:
df = pd.read_csv('tmp4.csv', sep='|')

In [119]:
df

Unnamed: 0,ID,level,category
0,Patient1,123000,x
1,Patient2,23000,y
2,Patient3,1234018,z


In [120]:
df.level.dtype

dtype('O')

In [121]:
# Show the raw file again; the context manager closes the handle deterministically.
with open('tmp4.csv') as fh:
    print(fh.read())

ID|level|category
Patient1|123,000|x
Patient2|23,000|y
Patient3|1,234,018|z


In [122]:
df = pd.read_csv('tmp4.csv', sep='|', thousands=',')
df

Unnamed: 0,ID,level,category
0,Patient1,123000,x
1,Patient2,23000,y
2,Patient3,1234018,z


In [123]:
df.level.dtype

dtype('int64')

In [124]:
# Show the raw file; the context manager closes the handle deterministically.
# NOTE(review): tmp5.csv is not written by any cell visible here — confirm it
# ships alongside the notebook.
with open('tmp5.csv') as fh:
    print(fh.read())

level
Patient1,123000
Patient2,23000
Patient3,1234018


In [125]:
# With squeeze=True the parsed data comes back as a Series rather than a
# DataFrame (the next cell checks type(output)).
# NOTE(review): the `squeeze` keyword was deprecated in pandas 1.4 in favour of
# calling .squeeze('columns') on the result — confirm the target pandas version.
output = pd.read_csv('tmp5.csv', squeeze=True)
output

Patient1     123000
Patient2      23000
Patient3    1234018
Name: level, dtype: int64

In [126]:
type(output)

pandas.core.series.Series

In [127]:
data = ('a,b,c\n'
        '1,Yes,2\n'
        '3,No,4')

In [128]:
print(data)

a,b,c
1,Yes,2
3,No,4


In [129]:
pd.read_csv(StringIO(data))

Unnamed: 0,a,b,c
0,1,Yes,2
1,3,No,4


In [130]:
pd.read_csv(StringIO(data), true_values=['Yes'], false_values=['No'])

Unnamed: 0,a,b,c
0,1,True,2
1,3,False,4


In [131]:
data = ('a,b,c\n'
        '1,2,3\n'
        '4,5,6,7\n'
        '8,9,10')

In [132]:
# pd.read_csv(StringIO(data))  # error: the second data row has 4 fields, but the header declares 3

In [133]:
# Skip malformed rows instead of raising; each skipped line is reported in the output.
# NOTE(review): error_bad_lines was deprecated in pandas 1.3 (replaced by
# on_bad_lines='skip') — confirm the target pandas version before upgrading.
pd.read_csv(StringIO(data), error_bad_lines=False)

b'Skipping line 3: expected 3 fields, saw 4\n'


Unnamed: 0,a,b,c
0,1,2,3
1,8,9,10


In [134]:
pd.read_csv(StringIO(data), usecols=[0, 1, 2])

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,8,9,10


In [135]:
data = ('label1,label2,label3\n'
        'index1,"a,c,e\n'
        'index2,b,d,f')

In [136]:
print(data)

label1,label2,label3
index1,"a,c,e
index2,b,d,f


In [137]:
import csv

In [138]:
dia = csv.excel()

In [139]:
dia.quoting = csv.QUOTE_NONE

In [140]:
pd.read_csv(StringIO(data), dialect=dia)

Unnamed: 0,label1,label2,label3
index1,"""a",c,e
index2,b,d,f


In [141]:
data = 'a,b,c~1,2,3~4,5,6'

In [142]:
pd.read_csv(StringIO(data), lineterminator='~')

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6


In [143]:
data = 'a, b, c\n1, 2, 3\n4, 5, 6'

In [144]:
print(data)

a, b, c
1, 2, 3
4, 5, 6


In [145]:
pd.read_csv(StringIO(data), skipinitialspace=True)

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6


In [146]:
data = 'a,b\n"hello, \\"Bob\\", nice to see you",5'

In [147]:
print(data)

a,b
"hello, \"Bob\", nice to see you",5


In [148]:
pd.read_csv(StringIO(data), escapechar='\\')

Unnamed: 0,a,b
0,"hello, ""Bob"", nice to see you",5


In [149]:
# Show the raw file; the context manager closes the handle deterministically.
# NOTE(review): bar.csv is not written by any cell visible here — confirm it
# ships alongside the notebook.
with open('bar.csv') as fh:
    print(fh.read())

id8141   360.242940   149.910199   11950.7
id1594   444.953632   166.985655   11788.4
id1849   364.136849   183.628767   11806.2
id1230   413.836124   184.375703   11916.8
id1948   502.953953   173.237159   12468.3


In [150]:
# Column specifications are a list of half-open intervals [start, end):
# each tuple gives the character offsets of one fixed-width field.
colspecs = [(0, 6), (8, 20), (21, 33), (34, 43)]

In [151]:
df = pd.read_fwf('bar.csv', colspecs=colspecs, header=None, index_col=0)
df

Unnamed: 0_level_0,1,2,3
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
id8141,360.24294,149.910199,11950.7
id1594,444.953632,166.985655,11788.4
id1849,364.136849,183.628767,11806.2
id1230,413.836124,184.375703,11916.8
id1948,502.953953,173.237159,12468.3


In [152]:
# Widths are a list of integers
widths = [6, 14, 13, 10]

In [153]:
df = pd.read_fwf('bar.csv', widths=widths, header=None)
df

Unnamed: 0,0,1,2,3
0,id8141,360.24294,149.910199,11950.7
1,id1594,444.953632,166.985655,11788.4
2,id1849,364.136849,183.628767,11806.2
3,id1230,413.836124,184.375703,11916.8
4,id1948,502.953953,173.237159,12468.3


In [154]:
df = pd.read_fwf('bar.csv', header=None, index_col=0)
df

Unnamed: 0_level_0,1,2,3
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
id8141,360.24294,149.910199,11950.7
id1594,444.953632,166.985655,11788.4
id1849,364.136849,183.628767,11806.2
id1230,413.836124,184.375703,11916.8
id1948,502.953953,173.237159,12468.3


In [155]:
pd.read_fwf('bar.csv', header=None, index_col=0).dtypes

1    float64
2    float64
3    float64
dtype: object

In [156]:
pd.read_fwf('bar.csv', header=None, dtype={2: 'object'}).dtypes

0     object
1    float64
2     object
3    float64
dtype: object

In [157]:
# Show the raw file; the context manager closes the handle deterministically.
# NOTE(review): foo3.csv is not written by any cell visible here — confirm it
# ships alongside the notebook.
with open('foo3.csv') as fh:
    print(fh.read())

A,B,C
20090101,a,1,2
20090102,b,3,4
20090103,c,4,5


In [158]:
pd.read_csv('foo3.csv')

Unnamed: 0,A,B,C
20090101,a,1,2
20090102,b,3,4
20090103,c,4,5


In [159]:
df = pd.read_csv('foo3.csv', parse_dates=True)

In [160]:
df.index

DatetimeIndex(['2009-01-01', '2009-01-02', '2009-01-03'], dtype='datetime64[ns]', freq=None)

In [161]:
# Show the raw file; the context manager closes the handle deterministically.
# NOTE(review): mindex_ex.csv is not written by any cell visible here — confirm
# it ships alongside the notebook.
with open('mindex_ex.csv') as fh:
    print(fh.read())

year,indiv,zit,xit
1977,"A",1.2,.6
1977,"B",1.5,.5
1977,"C",1.7,.8
1978,"A",.2,.06
1978,"B",.7,.2
1978,"C",.8,.3
1978,"D",.9,.5
1978,"E",1.4,.9
1979,"C",.2,.15
1979,"D",.14,.05
1979,"E",.5,.15
1979,"F",1.2,.5
1979,"G",3.4,1.9
1979,"H",5.4,2.7
1979,"I",6.4,1.2



In [162]:
df = pd.read_csv("mindex_ex.csv", index_col=[0, 1])
df

Unnamed: 0_level_0,Unnamed: 1_level_0,zit,xit
year,indiv,Unnamed: 2_level_1,Unnamed: 3_level_1
1977,A,1.2,0.6
1977,B,1.5,0.5
1977,C,1.7,0.8
1978,A,0.2,0.06
1978,B,0.7,0.2
1978,C,0.8,0.3
1978,D,0.9,0.5
1978,E,1.4,0.9
1979,C,0.2,0.15
1979,D,0.14,0.05


In [163]:
df.loc[1978]

Unnamed: 0_level_0,zit,xit
indiv,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0.2,0.06
B,0.7,0.2
C,0.8,0.3
D,0.9,0.5
E,1.4,0.9


In [164]:
# NOTE(review): pandas._testing is a private module (leading underscore), so
# makeCustomDataframe may be unavailable in other pandas versions — confirm the
# target version before relying on this import.
from pandas._testing import makeCustomDataframe as mkdf

In [165]:
df = mkdf(5, 3, r_idx_nlevels=2, c_idx_nlevels=4)

In [166]:
df.to_csv('mi.csv')

In [167]:
# Show the file written by df.to_csv('mi.csv'); the context manager closes the
# handle deterministically.
with open('mi.csv') as fh:
    print(fh.read())

C0,,C_l0_g0,C_l0_g1,C_l0_g2
C1,,C_l1_g0,C_l1_g1,C_l1_g2
C2,,C_l2_g0,C_l2_g1,C_l2_g2
C3,,C_l3_g0,C_l3_g1,C_l3_g2
R0,R1,,,
R_l0_g0,R_l1_g0,R0C0,R0C1,R0C2
R_l0_g1,R_l1_g1,R1C0,R1C1,R1C2
R_l0_g2,R_l1_g2,R2C0,R2C1,R2C2
R_l0_g3,R_l1_g3,R3C0,R3C1,R3C2
R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2



In [168]:
pd.read_csv('mi.csv', header=[0, 1, 2, 3], index_col=[0, 1])

Unnamed: 0_level_0,C0,C_l0_g0,C_l0_g1,C_l0_g2
Unnamed: 0_level_1,C1,C_l1_g0,C_l1_g1,C_l1_g2
Unnamed: 0_level_2,C2,C_l2_g0,C_l2_g1,C_l2_g2
Unnamed: 0_level_3,C3,C_l3_g0,C_l3_g1,C_l3_g2
R0,R1,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4
R_l0_g0,R_l1_g0,R0C0,R0C1,R0C2
R_l0_g1,R_l1_g1,R1C0,R1C1,R1C2
R_l0_g2,R_l1_g2,R2C0,R2C1,R2C2
R_l0_g3,R_l1_g3,R3C0,R3C1,R3C2
R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2


In [169]:
# Show the raw file; the context manager closes the handle deterministically.
# NOTE(review): mi2.csv is not written by any cell visible here — confirm it
# ships alongside the notebook.
with open('mi2.csv') as fh:
    print(fh.read())

,a,a,a,b,c,c
,q,r,s,t,u,v
one,1,2,3,4,5,6
two,7,8,9,10,11,12



In [170]:
pd.read_csv('mi2.csv', header=[0, 1], index_col=0)

Unnamed: 0_level_0,a,a,a,b,c,c
Unnamed: 0_level_1,q,r,s,t,u,v
one,1,2,3,4,5,6
two,7,8,9,10,11,12


In [171]:
# Show the raw file; the context manager closes the handle deterministically.
# NOTE(review): tmp6.sv is not written by any cell visible here — confirm it
# ships alongside the notebook.
with open('tmp6.sv') as fh:
    print(fh.read())

:0:1:2:3
0:0.4691122999071863:-0.2828633443286633:-1.5090585031735124:-1.1356323710171934
1:1.2121120250208506:-0.17321464905330858:0.11920871129693428:-1.0442359662799567
2:-0.8618489633477999:-2.1045692188948086:-0.4949292740687813:1.071803807037338
3:0.7215551622443669:-0.7067711336300845:-1.0395749851146963:0.27185988554282986
4:-0.42497232978883753:0.567020349793672:0.27623201927771873:-1.0874006912859915
5:-0.6736897080883706:0.1136484096888855:-1.4784265524372235:0.5249876671147047
6:0.4047052186802365:0.5770459859204836:-1.7150020161146375:-1.0392684835147725
7:-0.3706468582364464:-1.1578922506419993:-1.344311812731667:0.8448851414248841
8:1.0757697837155533:-0.10904997528022223:1.6435630703622064:-1.4693879595399115
9:0.35702056413309086:-0.6746001037299882:-1.776903716971867:-0.9689138124473498


In [172]:
# Show the raw file again; the context manager closes the handle deterministically.
with open('tmp6.sv') as fh:
    print(fh.read())

:0:1:2:3
0:0.4691122999071863:-0.2828633443286633:-1.5090585031735124:-1.1356323710171934
1:1.2121120250208506:-0.17321464905330858:0.11920871129693428:-1.0442359662799567
2:-0.8618489633477999:-2.1045692188948086:-0.4949292740687813:1.071803807037338
3:0.7215551622443669:-0.7067711336300845:-1.0395749851146963:0.27185988554282986
4:-0.42497232978883753:0.567020349793672:0.27623201927771873:-1.0874006912859915
5:-0.6736897080883706:0.1136484096888855:-1.4784265524372235:0.5249876671147047
6:0.4047052186802365:0.5770459859204836:-1.7150020161146375:-1.0392684835147725
7:-0.3706468582364464:-1.1578922506419993:-1.344311812731667:0.8448851414248841
8:1.0757697837155533:-0.10904997528022223:1.6435630703622064:-1.4693879595399115
9:0.35702056413309086:-0.6746001037299882:-1.776903716971867:-0.9689138124473498


In [173]:
pd.read_csv('tmp6.sv', sep=None, engine='python')

Unnamed: 0.1,Unnamed: 0,0,1,2,3
0,0,0.469112,-0.282863,-1.509059,-1.135632
1,1,1.212112,-0.173215,0.119209,-1.044236
2,2,-0.861849,-2.104569,-0.494929,1.071804
3,3,0.721555,-0.706771,-1.039575,0.27186
4,4,-0.424972,0.56702,0.276232,-1.087401
5,5,-0.67369,0.113648,-1.478427,0.524988
6,6,0.404705,0.577046,-1.715002,-1.039268
7,7,-0.370647,-1.157892,-1.344312,0.844885
8,8,1.07577,-0.10905,1.643563,-1.469388
9,9,0.357021,-0.6746,-1.776904,-0.968914


In [174]:
# Show the raw file; the context manager closes the handle deterministically.
# NOTE(review): tmp.sv is not written by any cell visible here — confirm it
# ships alongside the notebook.
with open('tmp.sv') as fh:
    print(fh.read())

|0|1|2|3
0|0.4691122999071863|-0.2828633443286633|-1.5090585031735124|-1.1356323710171934
1|1.2121120250208506|-0.17321464905330858|0.11920871129693428|-1.0442359662799567
2|-0.8618489633477999|-2.1045692188948086|-0.4949292740687813|1.071803807037338
3|0.7215551622443669|-0.7067711336300845|-1.0395749851146963|0.27185988554282986
4|-0.42497232978883753|0.567020349793672|0.27623201927771873|-1.0874006912859915
5|-0.6736897080883706|0.1136484096888855|-1.4784265524372235|0.5249876671147047
6|0.4047052186802365|0.5770459859204836|-1.7150020161146375|-1.0392684835147725
7|-0.3706468582364464|-1.1578922506419993|-1.344311812731667|0.8448851414248841
8|1.0757697837155533|-0.10904997528022223|1.6435630703622064|-1.4693879595399115
9|0.35702056413309086|-0.6746001037299882|-1.776903716971867|-0.9689138124473498


In [175]:
table = pd.read_csv('tmp.sv', sep='|')
table

Unnamed: 0.1,Unnamed: 0,0,1,2,3
0,0,0.469112,-0.282863,-1.509059,-1.135632
1,1,1.212112,-0.173215,0.119209,-1.044236
2,2,-0.861849,-2.104569,-0.494929,1.071804
3,3,0.721555,-0.706771,-1.039575,0.27186
4,4,-0.424972,0.56702,0.276232,-1.087401
5,5,-0.67369,0.113648,-1.478427,0.524988
6,6,0.404705,0.577046,-1.715002,-1.039268
7,7,-0.370647,-1.157892,-1.344312,0.844885
8,8,1.07577,-0.10905,1.643563,-1.469388
9,9,0.357021,-0.6746,-1.776904,-0.968914


In [176]:
reader = pd.read_csv('tmp.sv', sep='|', chunksize=4)
reader

<pandas.io.parsers.TextFileReader at 0x9685d18>

In [177]:
for chunk in reader:
    print(chunk)

   Unnamed: 0         0         1         2         3
0           0  0.469112 -0.282863 -1.509059 -1.135632
1           1  1.212112 -0.173215  0.119209 -1.044236
2           2 -0.861849 -2.104569 -0.494929  1.071804
3           3  0.721555 -0.706771 -1.039575  0.271860
   Unnamed: 0         0         1         2         3
4           4 -0.424972  0.567020  0.276232 -1.087401
5           5 -0.673690  0.113648 -1.478427  0.524988
6           6  0.404705  0.577046 -1.715002 -1.039268
7           7 -0.370647 -1.157892 -1.344312  0.844885
   Unnamed: 0         0        1         2         3
8           8  1.075770 -0.10905  1.643563 -1.469388
9           9  0.357021 -0.67460 -1.776904 -0.968914


In [178]:
# iterator=True returns a reader object instead of a DataFrame; pull only the
# first five rows from it.
reader = pd.read_csv('tmp.sv', sep='|', iterator=True)
try:
    first_rows = reader.get_chunk(5)
finally:
    # The reader keeps the underlying file open until it is exhausted or closed;
    # close it explicitly because only part of the file is consumed here.
    reader.close()
first_rows

Unnamed: 0.1,Unnamed: 0,0,1,2,3
0,0,0.469112,-0.282863,-1.509059,-1.135632
1,1,1.212112,-0.173215,0.119209,-1.044236
2,2,-0.861849,-2.104569,-0.494929,1.071804
3,3,0.721555,-0.706771,-1.039575,0.27186
4,4,-0.424972,0.56702,0.276232,-1.087401
