In [1]:
from pandas import Series, DataFrame
import numpy as np
import pandas as pd

# 1 pandas数据结构介绍

  1.1 序列（Series）
    - 索引
    - 分片
    - 字典操作
    - 索引自动对齐运算
    - name 属性
  1.2 数据框（DataFrame）
    - columns
    - index
    - 索引和切片
    - 弹出（pop）
    - .T 可以转置数据框
  1.3 Panel（已在 pandas 0.25 中移除，三维数据推荐改用带 MultiIndex 的 DataFrame）

# 2 索引对象 

In [2]:
# Three integers (0, 1, 2) labelled 'a', 'b', 'c'.
s = Series(data=range(3), index=list('abc'))

In [3]:
# Show the Index object that holds the labels of s.
s.index

Index(['a', 'b', 'c'], dtype='object')

In [4]:
# Slicing an Index yields another Index (here only the label at position 1).
s.index[1:2]

Index(['b'], dtype='object')

索引对象是不可变的（immutable），不允许被修改，因此可以在多个数据结构之间安全地共享。

In [5]:
# Build a second Series (values 3, 4, 5) that reuses the Index object of s.
ss = Series(range(3, 6), index=s.index)

In [6]:
# Identity check: True means both Series share the very same Index object.
ss.index is s.index

True

# 3 核心的基本函数

- .head()
- .tail()
- .shape
- .values
- .reindex() 改变索引，创建一个新的对象

# 4 索引和选择

object.ix[] 语法支持整数和标签混合使用进行索引，虽然语法很强大，但这种索引方法常常让使用者感到困惑；.ix 已在 pandas 0.20 中被弃用，并在 1.0 中移除。我们更喜欢使用更加严格的 []、iloc、loc 语法进行索引。

In [7]:
# Eight consecutive daily timestamps starting on 2017-05-15.
dates = pd.date_range(start='2017-5-15', periods=8)

In [8]:
# 8x4 frame of random normal draws: one row per date, columns 'a'..'d'.
# No random seed is set, so the values differ on every run.
df = pd.DataFrame(
    np.random.randn(8, 4),
    index=dates,
    columns=list('abcd'),
)

In [9]:
# Display the whole frame (8 rows, so no need to truncate).
df

Unnamed: 0,a,b,c,d
2017-05-15,0.422582,2.192212,-0.195834,1.043583
2017-05-16,0.058919,0.415707,0.387199,0.157099
2017-05-17,0.389779,-0.725836,-1.42132,0.872132
2017-05-18,1.899611,1.075165,0.36344,0.349861
2017-05-19,-1.620429,1.064473,-0.930995,-1.439978
2017-05-20,0.874462,0.673706,-0.11988,1.425492
2017-05-21,-1.403869,-0.182207,3.523592,-0.152446
2017-05-22,-1.388238,0.08674,-1.03597,-0.028374


In [10]:
s = df['a']  # [] indexing: select the single column 'a'

In [11]:
# Display the selected column; the result is a Series named 'a'.
s

2017-05-15    0.422582
2017-05-16    0.058919
2017-05-17    0.389779
2017-05-18    1.899611
2017-05-19   -1.620429
2017-05-20    0.874462
2017-05-21   -1.403869
2017-05-22   -1.388238
Freq: D, Name: a, dtype: float64

In [12]:
df[['a', 'b']]  # select several columns by passing a list of labels

Unnamed: 0,a,b
2017-05-15,0.422582,2.192212
2017-05-16,0.058919,0.415707
2017-05-17,0.389779,-0.725836
2017-05-18,1.899611,1.075165
2017-05-19,-1.620429,1.064473
2017-05-20,0.874462,0.673706
2017-05-21,-1.403869,-0.182207
2017-05-22,-1.388238,0.08674


loc 语法是严格基于标签的，其接收的标签对象包括：
  - 一个标签
  - 一个标签列表，如 ['a', 'b']
  - 标签的一个分片 'a':'f'，标签的开始和结束位置都会包括在内。
  - 单参的可调用对象

In [13]:
# Label-based selection: all rows, columns 'a' through 'c' (end label included).
df.loc[:, 'a':'c']

Unnamed: 0,a,b,c
2017-05-15,0.422582,2.192212,-0.195834
2017-05-16,0.058919,0.415707,0.387199
2017-05-17,0.389779,-0.725836,-1.42132
2017-05-18,1.899611,1.075165,0.36344
2017-05-19,-1.620429,1.064473,-0.930995
2017-05-20,0.874462,0.673706,-0.11988
2017-05-21,-1.403869,-0.182207,3.523592
2017-05-22,-1.388238,0.08674,-1.03597


iloc 语法是严格基于整数位置的，也可以使用布尔数组进行筛选。如果请求的索引超出数据框的边界，会抛出 IndexError 异常（分片除外：越界的分片会被自动截断）。其接收的参数包括：
  -  一个整数，如 5
  - 一个整数序列，[1, 2, 3]
  - 一个分片对象，如 1:7
  - 一个布尔数组
  - 单参数的可调用函数

In [14]:
df.iloc[:4, :3] # first 4 rows and first 3 columns

Unnamed: 0,a,b,c
2017-05-15,0.422582,2.192212,-0.195834
2017-05-16,0.058919,0.415707,0.387199
2017-05-17,0.389779,-0.725836,-1.42132
2017-05-18,1.899611,1.075165,0.36344


# 5 算术运算与对齐

In [15]:
# Two Series whose labels only partly overlap: 'b' and 'd' appear in both.
a = Series(data=[1, 2, 3, 4], index=list('dbca'))
b = Series(data=[5, 6, 7, 8], index=list('bdef'))

In [16]:
a+b # only labels present in both Series are added; every other label gives NaN

a    NaN
b    7.0
c    NaN
d    7.0
e    NaN
f    NaN
dtype: float64

 - DataFrame + DataFrame也同样使用上面的规则（行列相同的计算，不同的为NaN）。
 - DataFrame + Series 会使用广播（broadcasting）机制：默认将 Series 的索引与 DataFrame 的列对齐，然后逐行进行运算。

# 6 数据清洗

## 6.1 处理缺失值

np.nan 和 Python 中的 None 都被视为缺失值。

In [17]:
# Sample Series mixing both kinds of missing marker (np.nan, None) with strings.
data = Series(data=[np.nan, None, 'cat', 'dog'])

In [18]:
# Display the Series (its dtype is object).
data

0     NaN
1    None
2     cat
3     dog
dtype: object

In [19]:
# Boolean mask: True where the value is missing (both np.nan and None).
data.isnull()

0     True
1     True
2    False
3    False
dtype: bool

In [20]:
# NaN never compares equal to itself, so == cannot be used to detect it.
np.nan ==np.nan

False

使用 fillna() 函数填充缺失值。

In [21]:
# 5x3 frame of random normal draws, rows labelled a/c/e/f/h.
# No random seed is set, so the values differ on every run.
df = DataFrame(
    data=np.random.randn(5, 3),
    index=list('acefh'),
    columns=['one', 'two', 'three'],
)

In [22]:
# Re-label the rows as a..e: labels missing from the old index ('b', 'd')
# become all-NaN rows, and old labels not requested ('f', 'h') are dropped.
df = df.reindex(index=list('abcde'))

In [23]:
# Display the reindexed frame; rows 'b' and 'd' hold only missing values.
df

Unnamed: 0,one,two,three
a,1.524755,1.875874,-1.222462
b,,,
c,0.183745,-2.091938,-2.124474
d,,,
e,0.520913,-1.275132,-0.273627


In [24]:
# Return a copy with every missing value replaced by 1; df itself is unchanged.
df.fillna(1)

Unnamed: 0,one,two,three
a,1.524755,1.875874,-1.222462
b,1.0,1.0,1.0
c,0.183745,-2.091938,-2.124474
d,1.0,1.0,1.0
e,0.520913,-1.275132,-0.273627


In [25]:
# Blank out row 'e' in place: every column of that row becomes a missing value.
df.loc['e'] = None

In [26]:
# Display the frame again; rows 'b', 'd' and 'e' are now entirely missing.
df

Unnamed: 0,one,two,three
a,1.524755,1.875874,-1.222462
b,,,
c,0.183745,-2.091938,-2.124474
d,,,
e,,,


In [27]:
# Forward-fill: copy the previous row's values into each gap, filling at most
# one consecutive missing row (limit=1), so row 'e' stays empty.
# DataFrame.ffill replaces the deprecated fillna(method='pad') spelling.
df.ffill(limit=1)

Unnamed: 0,one,two,three
a,1.524755,1.875874,-1.222462
b,1.524755,1.875874,-1.222462
c,0.183745,-2.091938,-2.124474
d,0.183745,-2.091938,-2.124474
e,,,


使用 dropna() 丢弃含有缺失值的行或者列。

In [28]:
df.dropna(axis=0) # drop every row that contains a missing value

Unnamed: 0,one,two,three
a,1.524755,1.875874,-1.222462
c,0.183745,-2.091938,-2.124474


In [29]:
df.dropna(axis=1) # drop every column that contains a missing value (here: all of them)

a
b
c
d
e


`interpolate()` 提供了多种插值方法，如线性插值、二次（quadratic）插值、三次（cubic）插值等。

## 6.2 数据排序

`DataFrame.sort_values(by, ascending=True, inplace=False)`

- by: 根据某些列排序
- ascending： 是否升序（默认True）
- inplace : 是否直接修改原数据

eg:

`df.sort_values(by=['age', 'name'], ascending=[True, False])`

## 6.3 重复数据处理

查找重复数据

In [30]:
# Five sample records; rows 0 and 1 are identical in both columns.
df = pd.DataFrame({
    "id": [1, 1, 3, 4, 5],
    "name": ["刘一", "刘一", "张三", "李四", "王五"],
})

In [31]:
# By default a row is a duplicate only when all columns match an earlier row;
# keep="first" leaves the first occurrence unmarked.
df.duplicated(keep="first")

0    False
1     True
2    False
3    False
4    False
dtype: bool

In [32]:
# Restrict the duplicate comparison to the 'id' column only.
df.duplicated(['id'])

0    False
1     True
2    False
3    False
4    False
dtype: bool

**取出重复数据**

In [33]:
# Use the boolean mask to keep only the rows flagged as duplicates.
df[df.duplicated(['id'])]

Unnamed: 0,id,name
1,1,刘一


删除重复数据

In [34]:
# Return a copy without the duplicated rows; the first occurrence is kept.
df.drop_duplicates()

Unnamed: 0,id,name
0,1,刘一
2,3,张三
3,4,李四
4,5,王五
