In [11]:
import pandas as pd
import numpy as np

In [3]:
#pandas有两种主要的数据结构，第一种是Series，是一种类似于一维数组的数据结构，它由一组数据以及一组与之相关的数据标签组成。
obj = pd.Series([4,7,-5,3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [4]:
#可以通过index和values属性来获得obj的索引和值
obj.index

RangeIndex(start=0, stop=4, step=1)

In [5]:
obj.values

array([ 4,  7, -5,  3])

In [6]:
#索引在默认情况下是0-n-1，不过我们可以指定索引的值
obj2 = pd.Series([4,7,-5,3],index=['d','b','a','c'])
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [7]:
#可以通过相对位置的下标或者index来访问Series中的元素
obj2[2]

-5

In [8]:
obj2['a']

-5

In [9]:
obj2[['a','b','d']]

a   -5
b    7
d    4
dtype: int64

In [12]:
#numpy数组运算（如根据布尔型数据进行过滤，标量乘法、应用数学函数等）都会保留索引和值之间的链接
np.exp(obj2)

d      54.598150
b    1096.633158
a       0.006738
c      20.085537
dtype: float64

In [13]:
#可以把Series看成是一个定长的有序字典，因为他是索引值到数据值的一个映射，它可以用在许多原本需要字典参数的函数中
'b' in obj2

True

In [14]:
#可以直接通过字典dict来创建Series
sdata = {'Ohio':35000,'Texas':71000,'Oregon':16000,'Utah':5000}
obj3 = pd.Series(sdata)
obj3 

Ohio      35000
Oregon    16000
Texas     71000
Utah       5000
dtype: int64

In [17]:
#如果既使用了字典，又指定了索引，那么字典中跟索引相匹配的值将会被找出来放到相应的位置上
states = ['California','Ohio','Texas','Oregon']
obj4 = pd.Series(sdata,index = states)
obj4

California        NaN
Ohio          35000.0
Texas         71000.0
Oregon        16000.0
dtype: float64

In [18]:
#Series最重要的一个功能是：它在算术运算中会自动对齐不同索引的数据
obj3 + obj4

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

In [19]:
"""DataFrame"""

'DataFrame'

In [20]:
#DataFrame是一种表格型数据结构，它含有一组有序的列，每列可以是不同的值。
#DataFrame既有行索引，也有列索引，它可以看作是由Series组成的字典，不过这些Series公用一个索引
data = {
    'state':['Ohio','Ohio','Ohio','Nevada','Nevada'],
    'year':[2000,2001,2002,2001,2002],
    'pop':[1.5,1.7,3.6,2.4,2.9]
}
frame = pd.DataFrame(data)
frame

Unnamed: 0,pop,state,year
0,1.5,Ohio,2000
1,1.7,Ohio,2001
2,3.6,Ohio,2002
3,2.4,Nevada,2001
4,2.9,Nevada,2002


In [21]:
frame2 = pd.DataFrame(data,index=['one','two','three','four','five'],columns=['year','state','pop','debt'])
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,


In [23]:
frame2.columns


Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [24]:
#返回一个Series 
frame2['year']

one      2000
two      2001
three    2002
four     2001
five     2002
Name: year, dtype: int64

In [26]:
#可以使用一个标量修改DataFrame中的某一列，此时这个标量会广播到DataFrame的每一行上
frame2['debt']=16.5
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,16.5
two,2001,Ohio,1.7,16.5
three,2002,Ohio,3.6,16.5
four,2001,Nevada,2.4,16.5
five,2002,Nevada,2.9,16.5


In [30]:
#也可以使用一个列表来修改，不过要保证列表的长度与DataFrame长度相同
frame2.debt = np.arange(5)
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,0
two,2001,Ohio,1.7,1
three,2002,Ohio,3.6,2
four,2001,Nevada,2.4,3
five,2002,Nevada,2.9,4


In [28]:
#可以使用一个Series，此时会根据索引进行精确匹配
val = pd.Series([-1.2,-1.5,-1.7],index=['two','four','five'])
frame2['debt'] = val
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7


In [31]:
#使用嵌套字典也可以创建DataFrame，此时外层字典的键作为列，内层键则作为索引
pop = {'Nevada':{2001:2.4,2002:2.9},'Ohio':{2000:1.5,2001:1.7,2002:3.6}}
frame3 = pd.DataFrame(pop)
frame3

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [33]:
pd.DataFrame(pop,index=[2001,2002,2003])

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2003,,


In [34]:
#values属性返回的是一个二维的ndarray
frame3.values

array([[ nan,  1.5],
       [ 2.4,  1.7],
       [ 2.9,  3.6]])

In [35]:
frame2.values

array([[2000, 'Ohio', 1.5, 0],
       [2001, 'Ohio', 1.7, 1],
       [2002, 'Ohio', 3.6, 2],
       [2001, 'Nevada', 2.4, 3],
       [2002, 'Nevada', 2.9, 4]], dtype=object)

In [36]:
"""基本功能：重新索引"""
obj2 = pd.Series([4,7,-5,3],index=['d','b','a','c'])
obj3 = obj2.reindex(['a','b','c','d','e'])
obj3

a   -5.0
b    7.0
c    3.0
d    4.0
e    NaN
dtype: float64

In [38]:
obj4 = obj2.reindex(['a','b','c','d','e'],fill_value=0)
obj4

a   -5
b    7
c    3
d    4
e    0
dtype: int64

In [40]:
#method进行差值填充，但是索引必须是单调递增或者单调递减的，所以一般用在时间序列这样的有序数据中

# obj5 = obj2.reindex(['a','b','c','d','e'],method='ffill') 会报错
obj3 = pd.Series(['blue','Purple','yellow'],index = [0,2,4])
obj3.reindex(range(6),method='ffill')

0      blue
1      blue
2    Purple
3    Purple
4    yellow
5    yellow
dtype: object

In [59]:
#对DataFrame进行重新索引，可以重新索引行，列或者两个都修改，如果只传入一个参数，则会从新索引行
frame = pd.DataFrame(np.arange(9).reshape((3,3)),index=[1,4,5],columns=['Ohio','Texas','California'])
frame2 = frame.reindex([1,2,4,5])
frame2

Unnamed: 0,Ohio,Texas,California
1,0.0,1.0,2.0
2,,,
4,3.0,4.0,5.0
5,6.0,7.0,8.0


In [60]:
states = ['Texas','Utah','California']
frame.reindex(columns=states)

Unnamed: 0,Texas,Utah,California
1,1,,2
4,4,,5
5,7,,8


In [173]:
#填充数据只能按行填充
frame = pd.DataFrame(np.arange(9).reshape((3,3)),index = ['a','c','d'],columns = ['Ohio','Texas','California'])
frame.reindex(['a','b','c','d'],method = 'bfill')
#frame.reindex(['a','b','c','d'],method = 'bfill',columns=states) 报错

SyntaxError: invalid syntax (<ipython-input-173-9f2a9b5a141f>, line 4)

In [70]:
#丢弃指定轴上的项,不会对原DataFrame产生影响
frame = pd.DataFrame(np.arange(9).reshape((3,3)),index = ['a','c','d'],columns = ['Ohio','Texas','California'])
frame.drop('a') 


Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [71]:
frame.drop(['Ohio'],axis=1)

Unnamed: 0,Texas,California
a,1,2
c,4,5
d,7,8


In [72]:
"""索引、选取和过滤"""

'索引、选取和过滤'

In [73]:
#对Series索引的工作方式类似于Numpy，不过可以使用index的值进行索引
obj = pd.Series(np.arange(4.),index=['a','b','c','d'])
obj['b']

1.0

In [74]:
obj[['b','c']]

b    1.0
c    2.0
dtype: float64

In [75]:
#与利用下标进行切片不同，使用标签进行切片时，末端是包含的
obj['b':'c']

b    1.0
c    2.0
dtype: float64

In [76]:
#对DataFrame进行索引：
data = pd.DataFrame(np.arange(16).reshape((4,4)),index = ['Ohio','Colorado','Utah','New York'],columns=['one','two','three','four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [77]:
data[['two','three']]

Unnamed: 0,two,three
Ohio,1,2
Colorado,5,6
Utah,9,10
New York,13,14


In [78]:
#取行
data[:2]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [79]:
data[data['three']>5]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [81]:
#使用ix方法进行索引,不过ix在最新的版本中已经被废弃了，如果要是用标签，最好使用loc方法，如果使用下标，最好使用iloc方法
#data.ix['Colorado',['two','three']]
data.loc['Colorado',['two','three']]

two      5
three    6
Name: Colorado, dtype: int64

In [83]:
data.iloc[0:3,2]

Ohio         2
Colorado     6
Utah        10
Name: three, dtype: int64

In [84]:
"""算术运算和数据对齐"""


'算术运算和数据对齐'

In [85]:
#根据索引自动相加，在不重叠的部分补足NA
s1 = pd.Series([7.3,-2.5,3.4,1.5],index=['a','c','d','e'])
s2 = pd.Series([-2.1,3.6,-1.5,4,3.1],index=['a','c','e','f','g'])
s1 + s2

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

In [86]:
#DataFrame也会同样的补齐，在不重叠的部分补足NA
df1 = pd.DataFrame(np.arange(9).reshape((3,3)),columns=list('bcd'),index=['Ohio','Texas','Colorado'])
df2 = pd.DataFrame(np.arange(12).reshape((4,3)),columns = list('bde'),index=['Utah','Ohio','Texas','Oregon'])
df1 + df2

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


In [87]:
#两个都是NA的，结果还是NA
df1.add(df2,fill_value=0)

Unnamed: 0,b,c,d,e
Colorado,6.0,7.0,8.0,
Ohio,3.0,1.0,6.0,5.0
Oregon,9.0,,10.0,11.0
Texas,9.0,4.0,12.0,8.0
Utah,0.0,,1.0,2.0


In [88]:
"""函数应用和映射"""

'函数应用和映射'

In [90]:
#numpy的元素级数组方法，也可以用于操作Pandas对象
frame = pd.DataFrame(np.random.randn(3,3),columns=list('bcd'),index=['Ohio','Texas','Colorado'])
np.abs(frame)

Unnamed: 0,b,c,d
Ohio,0.367521,0.232387,0.64933
Texas,3.115632,1.415106,2.093794
Colorado,0.714983,1.420871,0.557722


In [91]:
#另一个常见的操作是，将函数应用到由各列或行所形成的一维数组上。DataFrame的apply方法即可实现此功能。
f = lambda x:x.max() - x.min()
frame.apply(f)

b    3.830616
c    2.835978
d    2.743124
dtype: float64

In [92]:
frame.apply(f,axis=1)

Ohio        1.016851
Texas       4.530739
Colorado    2.135855
dtype: float64

In [93]:
def f(x):
    return pd.Series([x.min(),x.max()],index=['min','max'])
frame.apply(f)

Unnamed: 0,b,c,d
min,-0.714983,-1.415106,-0.64933
max,3.115632,1.420871,2.093794


In [94]:
#元素级的Python函数也是可以用的
format = lambda x:'%.2f'%x
frame.applymap(format)

Unnamed: 0,b,c,d
Ohio,0.37,-0.23,-0.65
Texas,3.12,-1.42,2.09
Colorado,-0.71,1.42,-0.56


In [96]:
#Series有一个元素级的map方法
frame['d'].map(format)

Ohio        -0.65
Texas        2.09
Colorado    -0.56
Name: d, dtype: object

In [97]:
"""排序和排名"""

'排序和排名'

In [99]:
#·
obj = pd.Series(range(4),index=['d','a','b','c'])
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

In [100]:
#对于DataFrame,sort_index可以根据任意轴的索引进行排序
frame = pd.DataFrame(np.arange(8).reshape((2,4)),index=['three','one'],columns=['d','a','b','c'])
frame.sort_index()

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [102]:
frame.sort_index(1,ascending=False)

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


In [108]:
#按照任意一列进行排序
frame.sort_values(by=['a','b'])

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


In [105]:
#按值进行排序：
obj.sort_values()

d    0
a    1
b    2
c    3
dtype: int64

In [109]:
#rank函数会增加一个排名值，从1开始，一直到数组中有效数据的数量，对于平级关系，rank是通过为各组分配一个平均排名的方式破坏平级关系的
#使用method方法可以按照指定的方式进行rank排序
#例如使用first可以按值在原始数据中出现顺序分配排名
obj = pd.Series([7,-5,7,4,2,0,4])
obj.rank()

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [111]:
obj.rank(method='first')

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

In [112]:
"""汇总和计算描述统计"""

'汇总和计算描述统计'

In [114]:
df = pd.DataFrame([[1.4,np.nan],[7.1,-4.5],[np.nan,np.nan],[0.75,-1.3]],index=['a','b','c','d'],columns=['one','two'])
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [115]:
df.sum()

one    9.25
two   -5.80
dtype: float64

In [116]:
df.sum(axis=1)

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [117]:
#Na会被自动排除，可以使用skipna选项来禁用该功能
df.mean(axis=1,skipna=False)

a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

In [118]:
#idxmax返回间接统计，是达到最大值的索引
df.idxmax()

one    b
two    d
dtype: object

In [119]:
#describe返回的是DataFrame的汇总统计
#非数值型的与数值型的统计返回结果不同
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [175]:
#Series的corr方法用于计算两个Series中重叠的，非NA的，按索引对齐的值的相关系数，与此类似，cov用于计算协方差
# import pandas.io.data as web
# all_data = {}
# for ticker in ['APPL','IBM','MSFT','GOOG']:
#     all_data[ticker] = web.get_data_yahoo(ticker,'1/1/2000','1/1/2010')
# price = pd.DataFrame({tic:data['Adj Close']  for tic,data in all_data.iteritems()})
# volume = pd.DataFrame({tic:data['Volume'] for tic,data in all_data.iteritems()})
obj1 = pd.Series(np.arange(10),index = list('abcdefghij'))
obj2 = pd.Series(np.arange(12),index = list('cdefghijklmn'))
obj1.corr(obj2)


1.0

In [124]:
obj1.cov(obj2)

6.0

In [125]:
frame1 = pd.DataFrame(np.random.randn(3,3),index=list('abc'),columns=list('abc'))
frame1.corr

<bound method DataFrame.corr of           a         b         c
a  1.253773  0.429059  1.535575
b -0.113987 -2.837396 -0.894469
c -0.548208  0.834003  0.994863>

In [126]:
frame1.cov()

Unnamed: 0,a,b,c
a,0.884409,0.357304,0.579613
b,0.357304,4.052147,2.442527
c,0.579613,2.442527,1.627843


In [129]:
#corrwith用于计算每一列与Series的相关系数
frame1.corrwith(frame1['a'])

a    1.000000
b    0.188742
c    0.483065
dtype: float64

In [130]:
"""唯一数、值计数以及成员资格"""

'唯一数、值计数以及成员资格'

In [132]:
#unique返回没有排序的唯一值数组
obj = pd.Series(['c','a','d','a','a','b','b','c','c'])
uniques = obj.unique()
uniques

array(['c', 'a', 'd', 'b'], dtype=object)

In [133]:
#value_counts()返回各数的计数
obj.value_counts()

a    3
c    3
b    2
d    1
dtype: int64

In [134]:
obj[obj.isin(['b','c'])]

0    c
5    b
6    b
7    c
8    c
dtype: object

In [135]:
"""处理缺失数据"""

'处理缺失数据'

In [137]:
#fillna方法用于填补缺失数据
#isnull方法用于判断数据是否为空数据
#dropna方法用于舍弃缺失数据
#返回一个新的Series或者DataFrame，对原数据没有影响,如果想在原数据上进行直接修改，使用inplace参数
data = pd.Series([1,np.nan,3.5,np.nan,7])
data.fillna(0)


0    1.0
1    0.0
2    3.5
3    0.0
4    7.0
dtype: float64

In [138]:
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [140]:
#对DataFrame来说，dropna方法如果发现缺失值，就会进行整行删除，不过可以指定删除的方式，how=all，是当整行全是na的时候才进行删除
#还可以指定删除的轴
data = pd.DataFrame([[1,6.5,3],[1,np.nan,np.nan],[np.nan,np.nan,np.nan],[np.nan,6.5,3]])
data.dropna()

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [143]:
data.dropna(how='all',axis=1,inplace=True)
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [144]:
data.fillna({1:2,2:3})

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,2.0,3.0
2,,2.0,3.0
3,,6.5,3.0


In [145]:
data.fillna(method='ffill')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,6.5,3.0
2,1.0,6.5,3.0
3,1.0,6.5,3.0


In [146]:
"""层次化索引"""

'层次化索引'

In [149]:
"""层次化索引是pandas的一项重要功能，它能使你在一个轴上有多个索引级别，也就是说，它能使你以低维度形式处理高维度数据"""
data = pd.Series(np.random.randn(10),index=[['a','a','a','b','b','b','c','c','d','d'],[1,2,3,1,2,3,1,2,2,3]])
data


a  1    0.497970
   2   -0.042377
   3   -0.018719
b  1    0.061412
   2   -1.313888
   3    0.751478
c  1   -0.241329
   2   -1.945047
d  2    0.460786
   3   -0.411931
dtype: float64

In [154]:
data.index

MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 1, 2, 0, 1, 1, 2]])

In [150]:
data['b']

1    0.061412
2   -1.313888
3    0.751478
dtype: float64

In [151]:
data[:,2]

a   -0.042377
b   -1.313888
c   -1.945047
d    0.460786
dtype: float64

In [152]:
#层次化索引在数据重塑和基于分组的操作中扮演重要的角色。比如，上面的数据可以使用unstack方法重塑成为一个DataFrame
data.unstack()

Unnamed: 0,1,2,3
a,0.49797,-0.042377,-0.018719
b,0.061412,-1.313888,0.751478
c,-0.241329,-1.945047,
d,,0.460786,-0.411931


In [153]:
#unstack的逆运算是stack()
data.unstack().stack()

a  1    0.497970
   2   -0.042377
   3   -0.018719
b  1    0.061412
   2   -1.313888
   3    0.751478
c  1   -0.241329
   2   -1.945047
d  2    0.460786
   3   -0.411931
dtype: float64

In [160]:
frame = pd.DataFrame(np.arange(12).reshape((4,3)),index=[['a','a','b','b'],[1,2,1,2]],columns=[['Ohio','Ohio','Colorado'],['Green','Red','Green']])
frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [157]:
#可以创建层次化索引MulitIndex

index = pd.MultiIndex.from_arrays([['Ohio','Ohio','Colorado'],['Green','Red','Green']],names=['state','color'])
index

MultiIndex(levels=[['Colorado', 'Ohio'], ['Green', 'Red']],
           labels=[[1, 1, 0], [0, 1, 0]],
           names=['state', 'color'])

In [162]:
frame.columns=index
frame

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [164]:
frame.index.names=['key1','key2']
frame

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [165]:
#swaplevel接受两个级别编号或者名称，并返回一个互换了级别的新对象
frame.swaplevel('key1','key2')

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
2,a,3,4,5
1,b,6,7,8
2,b,9,10,11


In [168]:
#sort_index的level是指根据索引级别进行排序
frame.sort_index(level = 0)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [169]:
#根据级别进行汇总
frame.sum(level='key2')

state,Ohio,Ohio,Colorado
color,Green,Red,Green
key2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,6,8,10
2,12,14,16


In [170]:
frame.sum(level='color',axis=1)

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,2,1
a,2,8,4
b,1,14,7
b,2,20,10


In [171]:
"""数据加载"""
#最常用的是read_csv,read_table
#header 默认第一行为columns，如果指定header=None，则表明没有索引行，第一行就是数据
#index_col，默认作为索引的为第一列，可以设为index_col为-1，表明没有索引列
#nrows 表明读取的行数
#sep或delimiter 分隔符，read_csv默认是逗号，而read_table默认是制表符\t，
#encoding 编码格式
"""数据输出"""
#最常用的to_csv参数
#index=False 不输出索引
#header = False 不输出头部
#encoding 编码格式

'数据输出'

In [172]:
"""读取mysql数据库"""

'读取mysql数据库'