In [1]:
import pandas as pd

# Print the pandas version this notebook was executed with.
print(pd.__version__)

2.2.1


# 读取 CSV 文件

## [`pandas.read_csv`](https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html)

- `dtype`：指定数据类型，比如指定股票代码这一列为 `str` 类型

```Python
data = pd.read_csv("./stocks.csv", dtype={"secCode":str})
```

- `parse_dates`：尝试将某一列解析为日期。设定为 `True`，则解析索引。也可以使用列表指定解析某一列

```Python
data = pd.read_csv("./stocks.csv", parse_dates=[1])        # 解析第二列
data = pd.read_csv("./stocks.csv", parse_dates=["date"])   # 解析 date 列
```

- `on_bad_lines`：{'error', 'warn', 'skip'}，遇到“坏行”（字段数过多的行）时的处理方式：`'error'` 抛出异常（默认），`'warn'` 发出警告并跳过该行，`'skip'` 直接跳过该行


# 修改列名/索引

## [`pandas.DataFrame.rename`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.rename.html)

- `mapper`：`mapper` 与 `axis` 搭配使用
- `columns`：修改列名
- `index`：修改索引
- `inplace`：是否原地修改
- `axis`：`mapper` 作用的轴，`0`/`"index"` 表示索引，`1`/`"columns"` 表示列名，默认为 `0`


In [2]:
# 3x3 demo frame: integer columns A/B/C, string row labels x/y/z.
data = pd.DataFrame(
    data={
        "A": [1, 2, 3],
        "B": [2, 3, 4],
        "C": [3, 4, 5],
    },
    index=list("xyz"),
)

data

Unnamed: 0,A,B,C
x,1,2,3
y,2,3,4
z,3,4,5


In [3]:
# Rename columns A -> a and B -> b. Column C is not in the mapping and keeps
# its name. The result is a new frame; `data` itself is not modified.
data.rename(columns={"A": "a", "B": "b"})

Unnamed: 0,a,b,C
x,1,2,3
y,2,3,4
z,3,4,5


In [4]:
# Relabel index entries x -> X and y -> Y. Labels missing from the mapping
# (here z) are left as they are.
data.rename(index={"x": "X", "y": "Y"})

Unnamed: 0,A,B,C
X,1,2,3
Y,2,3,4
z,3,4,5


In [5]:
# `mapper` is applied to every label on the chosen axis; here each column
# name is lower-cased.
data.rename(mapper=str.lower, axis="columns")

Unnamed: 0,a,b,c
x,1,2,3
y,2,3,4
z,3,4,5


# 插入/修改列

## [`pandas.DataFrame.insert`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.insert.html)

- `loc`：插入的索引位置
- `column`：列名
- `value`：Scalar/Series/array-like，插入列的内容


In [6]:
# Fresh 3x3 demo frame for the insert examples.
data = pd.DataFrame(
    data={
        "A": [1, 2, 3],
        "B": [2, 3, 4],
        "C": [3, 4, 5],
    },
    index=list("xyz"),
)

data

Unnamed: 0,A,B,C
x,1,2,3
y,2,3,4
z,3,4,5


In [7]:
# Insert a new first column "D"; the scalar 0 is broadcast to every row.
# insert() modifies `data` in place, so the frame is displayed separately.
data.insert(loc=0, column="D", value=0)
data

Unnamed: 0,D,A,B,C
x,0,1,2,3
y,0,2,3,4
z,0,3,4,5


In [8]:
# Insert column "E" at position 0 from a list with one value per row.
# insert() modifies `data` in place, so the frame is displayed separately.
data.insert(loc=0, column="E", value=[6, 6, 6])
data

Unnamed: 0,E,D,A,B,C
x,6,0,1,2,3
y,6,0,2,3,4
z,6,0,3,4,5


## [`pandas.DataFrame.assign`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.assign.html)


In [9]:
# Fresh 3x3 demo frame for the assign examples.
data = pd.DataFrame(
    data={
        "A": [1, 2, 3],
        "B": [2, 3, 4],
        "C": [3, 4, 5],
    },
    index=list("xyz"),
)

data

Unnamed: 0,A,B,C
x,1,2,3
y,2,3,4
z,3,4,5


In [10]:
# assign() returns a new frame with column D computed from column C;
# the callable receives the frame being assigned to.
data.assign(D=lambda frame: frame["C"] + 1)

Unnamed: 0,A,B,C,D
x,1,2,3,4
y,2,3,4,5
z,3,4,5,6


In [11]:
# Keyword arguments are evaluated in order, so E can read the D column
# created earlier in the same call.
data.assign(
    D=lambda frame: frame["C"] + 1,
    E=lambda frame: frame["D"] + 1,
)

Unnamed: 0,A,B,C,D,E
x,1,2,3,4,5
y,2,3,4,5,6
z,3,4,5,6,7


## `loc/iloc`

选择已有列名则修改原始数据；新定义列名则插入新的数据


In [12]:
# Fresh 3x3 demo frame for the column-wise .loc examples.
data = pd.DataFrame(
    data={
        "A": [1, 2, 3],
        "B": [2, 3, 4],
        "C": [3, 4, 5],
    },
    index=list("xyz"),
)

data

Unnamed: 0,A,B,C
x,1,2,3
y,2,3,4
z,3,4,5


In [13]:
# "D" is not an existing column, so this assignment adds it, with 0 in
# every row. `data` is modified in place.
data.loc[:, "D"] = 0
data

Unnamed: 0,A,B,C,D
x,1,2,3,0
y,2,3,4,0
z,3,4,5,0


In [14]:
# "D" already exists at this point, so the same syntax now overwrites every
# value of that column with 1 instead of adding a column.
data.loc[:, "D"] = 1
data

Unnamed: 0,A,B,C,D
x,1,2,3,1
y,2,3,4,1
z,3,4,5,1


# 插入/修改行

## `loc/iloc`


In [15]:
# Fresh 3x3 demo frame for the row-wise .loc examples.
data = pd.DataFrame(
    data={
        "A": [1, 2, 3],
        "B": [2, 3, 4],
        "C": [3, 4, 5],
    },
    index=list("xyz"),
)

data

Unnamed: 0,A,B,C
x,1,2,3
y,2,3,4
z,3,4,5


In [16]:
# "x" is an existing row label, so every column of that row is overwritten
# with 0. `data` is modified in place.
data.loc["x"] = 0
data

Unnamed: 0,A,B,C
x,0,0,0
y,2,3,4
z,3,4,5


In [17]:
# "m" is not in the index, so a new row labelled "m" is appended, with 1 in
# every column.
data.loc["m"] = 1
data

Unnamed: 0,A,B,C
x,0,0,0
y,2,3,4
z,3,4,5
m,1,1,1


# 删除行/列

## [`pandas.DataFrame.drop`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.drop.html)


In [18]:
# Fresh 3x3 demo frame for the drop examples.
data = pd.DataFrame(
    data={
        "A": [1, 2, 3],
        "B": [2, 3, 4],
        "C": [3, 4, 5],
    },
    index=list("xyz"),
)

data

Unnamed: 0,A,B,C
x,1,2,3
y,2,3,4
z,3,4,5


In [19]:
# Drop columns A and B: with axis=1 the `labels` are looked up among the
# column names. Returns a new frame; `data` is unchanged.
data.drop(labels=["A", "B"], axis=1)

Unnamed: 0,C
x,3
y,4
z,5


In [20]:
# Drop columns A and B via the `columns=` keyword, which is equivalent to
# passing labels=[...] together with axis=1.
data.drop(columns=["A", "B"])

Unnamed: 0,C
x,3
y,4
z,5


In [21]:
# Drop row x: with axis=0 the `labels` are looked up in the index.
# Returns a new frame; `data` is unchanged.
data.drop(labels=["x"], axis=0)

Unnamed: 0,A,B,C
y,2,3,4
z,3,4,5


In [22]:
# Drop row x via the `index=` keyword. inplace=True mutates `data` itself
# and the call returns None, so the frame is displayed on its own line.
# Note: re-running this cell raises KeyError because "x" is already gone.
data.drop(index=["x"], inplace=True)
data

Unnamed: 0,A,B,C
y,2,3,4
z,3,4,5


# 设置/重置索引

## [`pandas.DataFrame.set_index`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.set_index.html)

- `keys`：label or array-like or list of labels/arrays
- `drop`：是否删除作为索引的列，默认为 True
- `append`：Whether to append columns to existing index, default False.
- `inplace`：是否原地修改，默认为 False


In [23]:
# Sales demo frame with a default 0..3 integer index.
data = pd.DataFrame(
    data={
        "month": [1, 4, 7, 10],
        "year": [2012, 2014, 2013, 2014],
        "sale": [55, 40, 84, 31],
    }
)

data

Unnamed: 0,month,year,sale
0,1,2012,55
1,4,2014,40
2,7,2013,84
3,10,2014,31


In [24]:
# Use the month column as the index; drop=False also keeps month as a
# regular column. Returns a new frame.
data.set_index("month", drop=False)

Unnamed: 0_level_0,month,year,sale
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,2012,55
4,4,2014,40
7,7,2013,84
10,10,2014,31


In [25]:
# Passing a list of columns builds a two-level (month, year) MultiIndex;
# both columns are removed from the data columns (drop defaults to True).
data.set_index(["month", "year"])

Unnamed: 0_level_0,Unnamed: 1_level_0,sale
month,year,Unnamed: 2_level_1
1,2012,55
4,2014,40
7,2013,84
10,2014,31


In [26]:
# append=True keeps the existing index and adds month as an additional
# index level instead of replacing it.
data.set_index("month", append=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,year,sale
Unnamed: 0_level_1,month,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1,2012,55
1,4,2014,40
2,7,2013,84
3,10,2014,31


## [`pandas.DataFrame.reset_index`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.reset_index.html#pandas.DataFrame.reset_index)


In [27]:
# Animal demo frame built from (class, max_speed) records, labelled by
# animal name.
data = pd.DataFrame(
    data=[
        ("bird", 389.0),
        ("bird", 24.0),
        ("mammal", 80.5),
        ("mammal", 78),
    ],
    index=["falcon", "parrot", "lion", "monkey"],
    columns=("class", "max_speed"),
)

data

Unnamed: 0,class,max_speed
falcon,bird,389.0
parrot,bird,24.0
lion,mammal,80.5
monkey,mammal,78.0


In [28]:
# Move the index labels into a regular column named "index" and replace
# the index with the default 0..n-1 integers. Returns a new frame.
data.reset_index()

Unnamed: 0,index,class,max_speed
0,falcon,bird,389.0
1,parrot,bird,24.0
2,lion,mammal,80.5
3,monkey,mammal,78.0


In [29]:
# drop=True discards the old index labels instead of keeping them as a
# column.
data.reset_index(drop=True)

Unnamed: 0,class,max_speed
0,bird,389.0
1,bird,24.0
2,mammal,80.5
3,mammal,78.0


# 字符串方法

## [`pandas.Series.str.split`](https://pandas.pydata.org/docs/reference/api/pandas.Series.str.split.html)

这个方法我常用在需要从日期中提取年份时，比如：

In [30]:
# Ticker codes and dates are both kept as strings, so the leading zeros of
# the codes are preserved.
data = pd.DataFrame(
    data={
        "ticker": ["000001", "000018", "600201"],
        "date": ["2018-08-03", "2019-02-12", "2020-12-31"],
    },
)

data

Unnamed: 0,ticker,date
0,1,2018-08-03
1,18,2019-02-12
2,600201,2020-12-31


In [42]:
# Extract the year: split each date on the first "-" only (n=1). With
# expand=True the pieces come back as DataFrame columns, and column 0 holds
# the year (still as a string).
data.loc[:, "year"] = (
    data["date"]
    .str.split(pat="-", n=1, expand=True)
    .iloc[:, 0]
)
data

Unnamed: 0,ticker,date,year
0,1,2018-08-03,2018
1,18,2019-02-12,2019
2,600201,2020-12-31,2020


## [`pandas.Series.str.zfill`](https://pandas.pydata.org/docs/reference/api/pandas.Series.str.zfill.html)

在读取股价数据时，有时候股票代码前面的 0 缺失，可以使用这个方法来填充。

In [43]:
# Same frame as the split example, but the ticker strings have lost their
# leading zeros ("1" and "18").
data = pd.DataFrame(
    data={
        "ticker": ["1", "18", "600201"],
        "date": ["2018-08-03", "2019-02-12", "2020-12-31"],
    },
)

data

Unnamed: 0,ticker,date
0,1,2018-08-03
1,18,2019-02-12
2,600201,2020-12-31


In [44]:
# Left-pad every ticker with "0" up to 6 characters; strings that are
# already 6 characters long are left unchanged.
data.loc[:, "ticker"] = data["ticker"].str.zfill(width=6)
data

Unnamed: 0,ticker,date
0,1,2018-08-03
1,18,2019-02-12
2,600201,2020-12-31
