# 数据的导入与导出

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load ex1.csv with read_csv defaults; the displayed frame shows the file's
# first row (a,b,c,d,message) being used as the column names.
pd.read_csv('data/ex1.csv')

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [3]:
# IPython line magic: view the raw text of the file (opens in the pager, so
# nothing is rendered inline). NOTE(review): pager behaviour is front-end dependent.
%more data/ex1.csv

In [4]:
# read_table can parse the same file once the delimiter is stated explicitly
# as a comma; the result matches the read_csv call above.
pd.read_table(
    'data/ex1.csv',
    sep=',',
)

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [5]:
# With no header option given, the first line of ex2.csv (1,2,3,4,hello) is
# taken as the column names, as the displayed frame shows.
pd.read_csv(
    'data/ex2.csv',
    sep=',',
)

Unnamed: 0,1,2,3,4,hello
0,5,6,7,8,world
1,9,10,11,12,foo


In [7]:
# header=None keeps the first line as data; names= supplies the column labels.
pd.read_csv(
    'data/ex2.csv',
    header=None,
    names=['a', 'b', 'c', 'd', 'msg'],
)

Unnamed: 0,a,b,c,d,msg
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [9]:
# Use chosen columns as the row index; listing more than one column
# ('msg', then 'b') produces a multi-level row index.
pd.read_csv(
    'data/ex2.csv',
    header=None,
    names=['a', 'b', 'c', 'd', 'msg'],
    index_col=['msg', 'b'],
)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,c,d
msg,b,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
hello,2,1,3,4
world,6,5,7,8
foo,10,9,11,12


In [11]:
# The delimiter here is irregular, so `sep` is given as a regular expression
# (same syntax as the `re` module): \s matches one whitespace character and
# + means "one or more of them".
# The pattern is a raw string so the backslash is passed through literally;
# a plain '\s' is an invalid string escape (DeprecationWarning since
# Python 3.6, SyntaxWarning since 3.12).
pd.read_table('data/ex3.csv', sep=r'\s+')

Unnamed: 0,A,B,C
aaa,-0.264438,-1.026059,-0.6195
bbb,0.927272,0.302904,-0.032399
ccc,-0.264273,-0.386314,-0.217601
ddd,-0.871858,-0.348382,1.100491


In [15]:
# Missing-value handling: na_values accepts a dict so that each column can
# declare its own missing-value markers. Here only the 'message' column
# treats 'NA', 'NULL' and 'foo' as missing.
pd.read_csv(
    'data/ex5.csv',
    na_values={
        'message': ['NA', 'NULL', 'foo'],
    },
)

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,


In [16]:
# Working with a large file piece by piece.
# Step 1: read only the first 10 rows of the file.
pd.read_csv(
    'data/ex6.csv',
    nrows=10,
)

Unnamed: 0,one,two,three,four,key
0,0.467976,-0.038649,-0.295344,-1.824726,L
1,-0.358893,1.404453,0.704965,-0.200638,B
2,-0.50184,0.659254,-0.421691,-0.057688,G
3,0.204886,1.074134,1.388361,-0.982404,R
4,0.354628,-0.133116,0.283763,-0.837063,Q
5,1.81748,0.742273,0.419395,-2.251035,Q
6,-0.776764,0.935518,-0.332872,-1.875641,U
7,-0.913135,1.530624,-0.572657,0.477252,K
8,0.35848,-0.497572,-0.367016,0.507702,S
9,-1.740877,-1.160417,-1.63783,2.172201,G


In [17]:
# With chunksize set, read_csv hands back a reader object (see the repr
# below) instead of a DataFrame; iterating it yields the file in pieces of
# up to 1000 rows each.
tr = pd.read_csv(
    'data/ex6.csv',
    chunksize=1000,
)
tr

<pandas.io.parsers.TextFileReader at 0x7920748>

In [19]:
# Count how often each value of the 'key' column occurs in the whole file,
# accumulating the tally one 1000-row chunk at a time.
#
# The accumulator gets an explicit float64 dtype: a bare pd.Series([]) has no
# well-defined dtype (it warns on pandas 1.x and is object dtype on 2.x).
result = pd.Series(dtype='float64')
# A fresh chunked reader is opened here rather than reusing `tr`: the reader
# is a one-shot iterator, so re-running this cell against an already consumed
# `tr` would silently produce an empty result.
for chunk in pd.read_csv('data/ex6.csv', chunksize=1000):
    # fill_value=0 makes a key that is absent on one side count as 0 instead
    # of turning the sum into NaN.
    result = result.add(chunk['key'].value_counts(), fill_value=0)
result

0    142.0
1    133.0
2    141.0
3    147.0
4    154.0
5    148.0
6    149.0
7    147.0
8    149.0
9    139.0
A    290.0
B    277.0
C    259.0
D    288.0
E    336.0
F    295.0
G    270.0
H    291.0
I    290.0
J    298.0
K    303.0
L    315.0
M    309.0
N    279.0
O    299.0
P    299.0
Q    301.0
R    280.0
S    260.0
T    277.0
U    293.0
V    293.0
W    274.0
X    327.0
Y    286.0
Z    262.0
dtype: float64

In [20]:
# Reorder the per-key counts from most frequent to least frequent.
result = result.sort_values(ascending=False)

In [21]:
# Show the first 10 entries of the tally.
result.head(10)

E    336.0
X    327.0
L    315.0
M    309.0
K    303.0
Q    301.0
O    299.0
P    299.0
J    298.0
F    295.0
dtype: float64

In [22]:
# Re-read ex5.csv with default settings (no custom na_values) and keep it
# in `df` for the export step.
df = pd.read_csv('data/ex5.csv')

In [23]:
# Display the frame; it is only three rows, so showing it whole is fine.
df

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [25]:
# Save the frame to disk as CSV; index=False leaves the row index out of
# the written file.
df.to_csv(
    'data/ex5_out1.csv',
    index=False,
)