## 文件操作

### 读取文件内容：pd.read_csv()/pd.read_table()

In [3]:
import pandas as pd

In [4]:
# Load ex1.csv with default settings; the first line of the file supplies the
# column labels (a, b, c, d, message).
df = pd.read_csv('examples/ex1.csv')

In [5]:
df

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


指定分隔符

In [7]:
# Same file read through read_table with the comma delimiter given explicitly;
# the displayed frame is identical to the read_csv result.
pd.read_table('examples/ex1.csv', sep=',')

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


读取没有标题的文件

In [8]:
# ex2.csv has no header row, so default parsing consumes the first data row
# (1, 2, 3, 4, hello) as the column labels -- one row of data is lost.
pd.read_csv('examples/ex2.csv')

Unnamed: 0,1,2,3,4,hello
0,5,6,7,8,world
1,9,10,11,12,foo


### 设置默认列名：pd.read_csv('', header=None)

In [9]:
# header=None declares that the file has no header row: every line is kept as
# data and the columns receive default integer labels 0..4.
pd.read_csv('examples/ex2.csv', header=None)

Unnamed: 0,0,1,2,3,4
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


### 自定义列名：pd.read_csv('', names=[])

In [10]:
# names= supplies the column labels explicitly for the headerless file, so no
# data row is consumed as a header.
pd.read_csv('examples/ex2.csv', names=['a', 'b', 'c', 'd', 'message'])

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


### 设置表格的列数据变为行索引：pd.read_csv('', index_col=)

将某一列数据设置为行索引

In [24]:
# Column labels for the headerless ex2.csv, reused by the following read.
names = ['a', 'b', 'c', 'd', 'message']

In [25]:
# index_col='message' takes the 'message' column out of the data columns and
# uses its values (hello, world, foo) as the row index.
df = pd.read_csv('examples/ex2.csv', names=names, index_col='message')

In [26]:
df

Unnamed: 0_level_0,a,b,c,d
message,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
hello,1,2,3,4
world,5,6,7,8
foo,9,10,11,12


In [27]:
# The row labels now come from the former 'message' column.
df.index

Index(['hello', 'world', 'foo'], dtype='object', name='message')

In [28]:
# Underlying data as a NumPy array; the index column is not part of it.
df.values

array([[ 1,  2,  3,  4],
       [ 5,  6,  7,  8],
       [ 9, 10, 11, 12]], dtype=int64)

In [30]:
# The index keeps the name of the column it was built from.
df.index.name

'message'

将多列数据设置为多级（层次化）行索引

In [4]:
# Passing a list to index_col builds a two-level (hierarchical) row index from
# the 'key1' and 'key2' columns; only value1 and value2 remain as data columns.
df = pd.read_csv('examples/csv_mindex.csv', index_col=['key1', 'key2'])

In [5]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,value1,value2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
one,a,1,2
one,b,3,4
one,c,5,6
one,d,7,8
two,a,9,10
two,b,11,12
two,c,13,14
two,d,15,16


### 以列表形式读取txt文件内容：list(open('txt文件'))

In [1]:
# Read the raw lines of ex3.txt as a list of strings (newlines preserved).
# The context manager closes the file handle deterministically; a bare
# list(open(...)) leaves the handle open until the garbage collector runs.
with open('examples/ex3.txt') as f:
    ex3_lines = list(f)
ex3_lines

['            A         B         C\n',
 'aaa -0.264438 -1.026059 -0.619500\n',
 'bbb  0.927272  0.302904 -0.032399\n',
 'ccc -0.264273 -0.386314 -0.217601\n',
 'ddd -0.871858 -0.348382  1.100491\n']

### 用 pandas 读取txt文件内容：pd.read_table('txt文件', sep=r'\s+')

In [4]:
# Fields in ex3.txt are separated by a variable amount of whitespace, so the
# separator is given as a regular expression. The r-prefix keeps the backslash
# literal: a plain '\s+' is an invalid string escape that newer Python versions
# warn about. The header line has one fewer field than the data rows, so the
# first column (aaa, bbb, ...) becomes the row index.
df = pd.read_table('examples/ex3.txt', sep=r'\s+')

In [5]:
df

Unnamed: 0,A,B,C
aaa,-0.264438,-1.026059,-0.6195
bbb,0.927272,0.302904,-0.032399
ccc,-0.264273,-0.386314,-0.217601
ddd,-0.871858,-0.348382,1.100491


In [7]:
# Label-based row selection: returns the 'aaa' row as a Series.
df.loc['aaa']

A   -0.264438
B   -1.026059
C   -0.619500
Name: aaa, dtype: float64

In [8]:
# Column selection: returns column 'A' as a Series indexed by the row labels.
df['A']

aaa   -0.264438
bbb    0.927272
ccc   -0.264273
ddd   -0.871858
Name: A, dtype: float64

### 读取csv文件时跳过某些行：skiprows=[]

In [9]:
# Read ex4.csv as-is to show the problem: the '#' commentary lines are mixed
# into the parsed result together with the real header and data.
pd.read_csv('examples/ex4.csv')

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,# hey!
a,b,c,d,message
# just wanted to make things more difficult for you,,,,
# who reads CSV files with computers,anyway?,,,
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo


In [10]:
# skiprows takes 0-based line numbers of the file: drop line 0 ('# hey!') and
# lines 2 and 3 (the other two '#' commentary lines), leaving the
# a,b,c,d,message header followed by the data rows.
df = pd.read_csv('examples/ex4.csv', skiprows=[0, 2, 3])

In [11]:
df

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


### 指定表格中某些文本为 NaN：na_values=

直接指定某些内容转换成空值

In [33]:
# Load ex5.csv with default settings; its empty fields come back as missing
# values (the blank cells in the displayed frame).
df = pd.read_csv('examples/ex5.csv')

In [34]:
df

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


In [35]:
# Boolean mask with the same shape as df: True wherever a value is missing.
pd.isnull(df)

Unnamed: 0,something,a,b,c,d,message
0,False,False,False,False,False,True
1,False,False,False,True,False,False
2,False,False,False,False,False,False


In [36]:
# Treat the strings 'foo' and 'world' as missing values (NaN) wherever they
# appear in the table.
df = pd.read_csv('examples/ex5.csv', na_values=['foo', 'world'])

In [37]:
df

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,
2,three,9,10,11.0,12,


为指定的列分别设置要转换成空值的内容（字符串或数值均可）

In [38]:
# Per-column sentinels: each key is a column name and its list holds the values
# that should count as missing in that column only.
na_values = {'something': ['one', 'three'], 'd': [4, 12]}

In [None]:
# Apply the per-column mapping defined in the previous cell.
# NOTE(review): this cell is marked In [None], i.e. it had not been executed
# when the notebook was saved, so no output is recorded for it.
df = pd.read_csv('examples/ex5.csv', na_values=na_values)