In [1]:
import pandas as pd
import numpy as np

In [2]:
data = {'animal': ['cat', 'cat', 'snake', 'dog', 'dog', 'cat', 'snake', 'cat', 'dog', 'dog'],
        'age': [2.5, 3, 0.5, np.nan, 5, 2, 4.5, np.nan, 7, 3],
        'visits': [1, 3, 2, 3, 2, 3, 1, 1, 2, 1],
        'priority': ['yes', 'yes', 'no', 'yes', 'no', 'no', 'no', 'yes', 'no', 'no']}
labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']

In [3]:
df = pd.DataFrame(data, index=labels)
df

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
c,snake,0.5,2,no
d,dog,,3,yes
e,dog,5.0,2,no
f,cat,2.0,3,no
g,snake,4.5,1,no
h,cat,,1,yes
i,dog,7.0,2,no
j,dog,3.0,1,no


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10 entries, a to j
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   animal    10 non-null     object 
 1   age       8 non-null      float64
 2   visits    10 non-null     int64  
 3   priority  10 non-null     object 
dtypes: float64(1), int64(1), object(2)
memory usage: 720.0+ bytes


In [5]:
df.iloc[:3]    # 展示前三行

# 第二种方法 df.head(3)

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
c,snake,0.5,2,no


In [7]:
df[['animal', 'age']]

# 第二种方法是 df.loc[:, ['animal', 'age']]

Unnamed: 0,animal,age
a,cat,2.5
b,cat,3.0
c,snake,0.5
d,dog,
e,dog,5.0
f,cat,2.0
g,snake,4.5
h,cat,
i,dog,7.0
j,dog,3.0


In [9]:
df[df['age'].isnull()]    # 选出age缺失的行

Unnamed: 0,animal,age,visits,priority
d,dog,,3,yes
h,cat,,1,yes


In [10]:
df

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
c,snake,0.5,2,no
d,dog,,3,yes
e,dog,5.0,2,no
f,cat,2.0,3,no
g,snake,4.5,1,no
h,cat,,1,yes
i,dog,7.0,2,no
j,dog,3.0,1,no


In [11]:
df['visits'].sum()

19

In [13]:
df.groupby('animal')['age'].mean()    # 通过animal进行分类，取出age这一列，求出平均值

animal
cat      2.5
dog      5.0
snake    2.5
Name: age, dtype: float64

In [14]:
df['animal'].value_counts()

cat      4
dog      4
snake    2
Name: animal, dtype: int64

In [16]:
df.sort_values(by=['age', 'visits'], ascending=[False, True])    # 先按照age进行降序排列，然后按照visits进行升序排列

Unnamed: 0,animal,age,visits,priority
i,dog,7.0,2,no
e,dog,5.0,2,no
g,snake,4.5,1,no
j,dog,3.0,1,no
b,cat,3.0,3,yes
a,cat,2.5,1,yes
f,cat,2.0,3,no
c,snake,0.5,2,no
h,cat,,1,yes
d,dog,,3,yes


In [17]:
df['priority'] = df['priority'].map({'yes': True, 'no': False})
df

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,True
b,cat,3.0,3,True
c,snake,0.5,2,False
d,dog,,3,True
e,dog,5.0,2,False
f,cat,2.0,3,False
g,snake,4.5,1,False
h,cat,,1,True
i,dog,7.0,2,False
j,dog,3.0,1,False


In [19]:
df['animal'] = df['animal'].replace('snake', 'python')    # 将animal中的snake换成python
df

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,True
b,cat,3.0,3,True
c,python,0.5,2,False
d,dog,,3,True
e,dog,5.0,2,False
f,cat,2.0,3,False
g,python,4.5,1,False
h,cat,,1,True
i,dog,7.0,2,False
j,dog,3.0,1,False


In [20]:
df.dtypes

animal       object
age         float64
visits        int64
priority       bool
dtype: object

In [24]:
# 插入新行
df.loc['k'] = [5.5, 'dog', 'no', 2]
df

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,1
b,cat,3,3,1
c,python,0.5,2,0
d,dog,,3,1
e,dog,5,2,0
f,cat,2,3,0
g,python,4.5,1,0
h,cat,,1,1
i,dog,7,2,0
j,dog,3,1,0


In [25]:
df = df.drop('k')    # 删除索引为k的行
df

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,1
b,cat,3.0,3,1
c,python,0.5,2,0
d,dog,,3,1
e,dog,5.0,2,0
f,cat,2.0,3,0
g,python,4.5,1,0
h,cat,,1,1
i,dog,7.0,2,0
j,dog,3.0,1,0


## 进阶操作

### 删除数值重复的行

In [26]:
df = pd.DataFrame({'A': [1, 2, 2, 3, 4 ,5, 6, 6, 7, 7]})
df


Unnamed: 0,A
0,1
1,2
2,2
3,3
4,4
5,5
6,6
7,6
8,7
9,7


In [28]:
df1 = df.drop_duplicates(subset='A')    # 对于‘A'这一列，删除重复的数值
df1

Unnamed: 0,A
0,1
1,2
3,3
4,4
5,5
6,6
8,7


### 一个全数值的df,每个数值减去改行的平均数

In [29]:
df = pd.DataFrame(np.random.random(size=(5, 3)))
df

Unnamed: 0,0,1,2
0,0.429529,0.77173,0.462763
1,0.274116,0.365683,0.428646
2,0.762162,0.554346,0.749627
3,0.431771,0.793596,0.106272
4,0.630109,0.34771,0.279138


In [30]:
df = df.sub(df.mean(axis=1), axis=0)
df

Unnamed: 0,0,1,2
0,-0.125145,0.217056,-0.091911
1,-0.082032,0.009535,0.072498
2,0.073451,-0.134366,0.060915
3,-0.012108,0.349716,-0.337608
4,0.211124,-0.071276,-0.139848
