# 一、准备工作

In [11]:
import numpy as np
import pandas as pd

# 二、生成对象

## 1. 使用列表创建一个 Series

In [12]:
# Build a Series from a plain list. The NaN entry makes pandas store every
# value as float64 instead of int64.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

- 一个 Series 是一个一维的带标签数组，可以容纳任何数据类型（整数、字符串、浮点数、Python 对象）；
- 与 Python 列表不同，同一个 Series 中的所有元素共享同一个 dtype：例如上面的整数因为与 NaN 混在一起，被统一转换成了 float64；类型无法统一时则退化为 object。

## 2. 使用列表创建一个 DataFrame

In [14]:
# A flat list becomes a single-column frame. With no labels supplied, both the
# row index and the column name default to integers starting at 0.
lst = [
    'Gamer47',
    'Shox',
    'Simple',
]
df = pd.DataFrame(data=lst)
df

Unnamed: 0,0
0,Gamer47
1,Shox
2,Simple


## 3. 使用 Series 字典对象生成 DataFrame

In [17]:
# Each dict key becomes a column. The scalars ('A', 'B', 'F') are broadcast to
# the length of the array-like values, which all have four entries.
df2 = pd.DataFrame(
    {
        'A': 1,
        'B': pd.Timestamp('20210219'),
        'C': pd.Series(1, index=[0, 1, 2, 3], dtype='float32'),
        'D': np.full(4, 3, dtype='int32'),
        'E': pd.Categorical(['test', 'train', 'test', 'train']),
        'F': 'foo',
    }
)
df2

Unnamed: 0,A,B,C,D,E,F
0,1,2021-02-19,1.0,3,test,foo
1,1,2021-02-19,1.0,3,train,foo
2,1,2021-02-19,1.0,3,test,foo
3,1,2021-02-19,1.0,3,train,foo


## 4. 创建一个空的 DataFrame

In [18]:
# Calling the constructor with no arguments gives a frame with no rows and no
# columns; note that this rebinds `df`, replacing the frame built earlier.
df = pd.DataFrame()
df

# 三、查看数据

In [21]:
# Six consecutive daily timestamps serve as row labels for a 6x4 block of
# standard-normal samples (unseeded, so the values differ on every run).
dates = pd.date_range(start='20210219', periods=6)
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=['A', 'B', 'C', 'D'],
)
df

Unnamed: 0,A,B,C,D
2021-02-19,-0.085578,-1.355016,-0.827,-0.689935
2021-02-20,-0.948585,-0.349289,-0.267233,0.844029
2021-02-21,-1.777166,0.739841,-0.819841,1.020413
2021-02-22,-0.717299,0.364915,2.610492,0.020046
2021-02-23,-0.110647,-0.755049,-0.182103,1.972917
2021-02-24,0.02409,1.309118,-0.184814,0.630202


## 1. 查看头部数据

In [22]:
# First five rows (n=5 is the default, spelled out here for clarity).
df.head(n=5)

Unnamed: 0,A,B,C,D
2021-02-19,-0.085578,-1.355016,-0.827,-0.689935
2021-02-20,-0.948585,-0.349289,-0.267233,0.844029
2021-02-21,-1.777166,0.739841,-0.819841,1.020413
2021-02-22,-0.717299,0.364915,2.610492,0.020046
2021-02-23,-0.110647,-0.755049,-0.182103,1.972917


## 2. 查看尾部数据

In [23]:
# Last five rows (n=5 is the default, spelled out here for clarity).
df.tail(n=5)

Unnamed: 0,A,B,C,D
2021-02-20,-0.948585,-0.349289,-0.267233,0.844029
2021-02-21,-1.777166,0.739841,-0.819841,1.020413
2021-02-22,-0.717299,0.364915,2.610492,0.020046
2021-02-23,-0.110647,-0.755049,-0.182103,1.972917
2021-02-24,0.02409,1.309118,-0.184814,0.630202


## 3. 查看数据的统计摘要

In [24]:
# Per-column summary statistics: count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.602531,-0.00758,0.054917,0.632945
std,0.694915,0.992184,1.28774,0.90729
min,-1.777166,-1.355016,-0.827,-0.689935
25%,-0.890763,-0.653609,-0.681689,0.172585
50%,-0.413973,0.007813,-0.226024,0.737116
75%,-0.091845,0.64611,-0.182781,0.976317
max,0.02409,1.309118,2.610492,1.972917


## 4. 查看索引和列名

In [25]:
# Row labels of the frame (here the DatetimeIndex built with pd.date_range).
df.index

DatetimeIndex(['2021-02-19', '2021-02-20', '2021-02-21', '2021-02-22',
               '2021-02-23', '2021-02-24'],
              dtype='datetime64[ns]', freq='D')

In [26]:
# Column labels of the frame.
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

# 四、索引

## 1. 简述 Pandas Index

pandas 先后提供过四种多轴索引方式，它们统称索引器：

- DataFrame[] 称为索引运算符
- DataFrame.loc[] 用于基于标签的索引
- DataFrame.iloc[] 用于基于整数位置的索引
- DataFrame.ix[] 用于基于标签和整数的混合索引（自 pandas 0.20 起弃用，并已在 pandas 1.0 中移除，请改用 loc 或 iloc）

## 2. 定义重新索引 (Reindexing)

In [33]:
# Sample frame: N daily rows with a date column, a 0..N-1 ramp, and three
# randomly generated columns (unseeded, so values differ on every run).
N = 20
df = pd.DataFrame(
    dict(
        A=pd.date_range(start='20210219', periods=N, freq='D'),
        x=np.linspace(0, N - 1, N),
        y=np.random.rand(N),
        C=np.random.choice(['Low', 'Medium', 'High'], N).tolist(),
        D=np.random.normal(100, 10, size=N).tolist(),
    )
)

# Keep only rows 0, 2 and 5 and columns A, C, B. 'B' is not a column of `df`,
# so reindex adds it filled with NaN instead of raising.
df_reindexed = df.reindex(columns=['A', 'C', 'B'], index=[0, 2, 5])
df_reindexed

Unnamed: 0,A,C,B
0,2021-02-19,Medium,
2,2021-02-21,Medium,
5,2021-02-24,Medium,


## 3. 设置索引

`DataFrame.set_index()` 用于将一个或多个现有列（也可以是与 DataFrame 等长的列表、Series 或 Index）设置为 DataFrame 的索引。

**语法：** `DataFrame.set_index(keys, inplace=False)`

In [34]:
# Promote column 'A' (the dates) to the row index.
# The original used inplace=True, so running the cell a second time raised
# KeyError because 'A' was no longer a column. Guarding on the column and
# rebinding `df` makes the cell safe to re-run; `df` still ends up indexed
# by 'A' for any later cell.
if 'A' in df.columns:
    df = df.set_index('A')
df.head()

Unnamed: 0_level_0,x,y,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-02-19,0.0,0.19956,Medium,111.33336
2021-02-20,1.0,0.602096,High,83.369119
2021-02-21,2.0,0.575078,Medium,99.012966
2021-02-22,3.0,0.860347,Medium,105.831498
2021-02-23,4.0,0.445448,High,113.16763


## 4. 重置索引

`Series.reset_index()` 会把索引重置为默认的整数索引：默认情况下（`drop=False`）原索引会作为一列保留，返回一个新的 DataFrame；当 `drop=True` 时丢弃原索引，返回一个新的 Series。

In [35]:
# Drink names used as labels. Passing them to the constructor labels the
# values directly instead of overwriting the default integer index afterwards.
index_ = ['Coca Cola', 'Sprite', 'Coke', 'Fanta', 'Dew', 'ThumbsUp']
sr = pd.Series([10, 25, 3, 11, 24, 6], index=index_)
sr

Coca Cola    10
Sprite       25
Coke          3
Fanta        11
Dew          24
ThumbsUp      6
dtype: int64

In [36]:
# drop=False (the default) keeps the old labels as a regular 'index' column,
# so the result is a DataFrame; the values land in a column named 0.
result = sr.reset_index(drop=False)
result

Unnamed: 0,index,0
0,Coca Cola,10
1,Sprite,25
2,Coke,3
3,Fanta,11
4,Dew,24
5,ThumbsUp,6
