## <font color='red'>如何修改DataFrame数据？</font>

In [5]:
import pandas as pd
import numpy as np
df = pd.DataFrame(data = np.random.randint(0,150,size = [10,3]),# 计算机科目的考试成绩
                  index = list('ABCDEFGHIJ'),# 行标签，用户
                  columns=['Python','Tensorflow','Keras']) # 考试科目
s = pd.Series(data = np.random.randint(0,150,size = 9),index=list('BCDEFGHIJ'),name = 'PyTorch')
display(df,s)

Unnamed: 0,Python,Tensorflow,Keras
A,108,76,29
B,107,31,98
C,129,141,90
D,70,124,4
E,111,79,107
F,133,111,146
G,75,119,30
H,148,66,95
I,44,119,62
J,40,74,26


B      9
C     76
D     42
E     27
F     39
G     51
H    115
I     24
J    125
Name: PyTorch, dtype: int32

In [6]:
df['PyTorch'] = s
df

Unnamed: 0,Python,Tensorflow,Keras,PyTorch
A,108,76,29,
B,107,31,98,9.0
C,129,141,90,76.0
D,70,124,4,42.0
E,111,79,107,27.0
F,133,111,146,39.0
G,75,119,30,51.0
H,148,66,95,115.0
I,44,119,62,24.0
J,40,74,26,125.0


In [8]:
df.loc['A','Python'] = 256
df

Unnamed: 0,Python,Tensorflow,Keras,PyTorch
A,256,76,29,
B,107,31,98,9.0
C,129,141,90,76.0
D,70,124,4,42.0
E,111,79,107,27.0
F,133,111,146,39.0
G,75,119,30,51.0
H,148,66,95,115.0
I,44,119,62,24.0
J,40,74,26,125.0


In [9]:
df.loc[:,'Keras'] = 128
df

Unnamed: 0,Python,Tensorflow,Keras,PyTorch
A,256,76,128,
B,107,31,128,9.0
C,129,141,128,76.0
D,70,124,128,42.0
E,111,79,128,27.0
F,133,111,128,39.0
G,75,119,128,51.0
H,148,66,128,115.0
I,44,119,128,24.0
J,40,74,128,125.0


In [10]:
df.iloc[2,3] = 512
df

Unnamed: 0,Python,Tensorflow,Keras,PyTorch
A,256,76,128,
B,107,31,128,9.0
C,129,141,128,512.0
D,70,124,128,42.0
E,111,79,128,27.0
F,133,111,128,39.0
G,75,119,128,51.0
H,148,66,128,115.0
I,44,119,128,24.0
J,40,74,128,125.0


In [11]:
df['Python'] += 10
df

Unnamed: 0,Python,Tensorflow,Keras,PyTorch
A,266,76,128,
B,117,31,128,9.0
C,139,141,128,512.0
D,80,124,128,42.0
E,121,79,128,27.0
F,143,111,128,39.0
G,85,119,128,51.0
H,158,66,128,115.0
I,54,119,128,24.0
J,50,74,128,125.0


In [14]:
df[df >= 128] = -df
df

Unnamed: 0,Python,Tensorflow,Keras,PyTorch
A,-266,76,-128,
B,117,31,-128,9.0
C,-139,-141,-128,-512.0
D,80,124,-128,42.0
E,121,79,-128,27.0
F,-143,111,-128,39.0
G,85,119,-128,51.0
H,-158,66,-128,115.0
I,54,119,-128,24.0
J,50,74,-128,125.0


## <font color='red'>多层索引如何筛选数据？</font>

In [49]:
import pandas as pd
# 创建一个有多层索引的DataFrame
data = {
    'Year': [2020, 2020, 2021, 2021, 2022, 2022],
    'Quarter': [1, 2, 1, 2, 1, 2],
    'Sales': [100, 150, 120, 180, 130, 200],
    'profit':[50,80,80,100,30,50]
}
df = pd.DataFrame(data)
df

Unnamed: 0,Year,Quarter,Sales,profit
0,2020,1,100,50
1,2020,2,150,80
2,2021,1,120,80
3,2021,2,180,100
4,2022,1,130,30
5,2022,2,200,50


In [50]:
# 设置多层索引
df.set_index(['Year', 'Quarter'], 
             inplace=True)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Sales,profit
Year,Quarter,Unnamed: 2_level_1,Unnamed: 3_level_1
2020,1,100,50
2020,2,150,80
2021,1,120,80
2021,2,180,100
2022,1,130,30
2022,2,200,50


In [52]:
df.loc[[2020,2022]]

Unnamed: 0_level_0,Unnamed: 1_level_0,Sales,profit
Year,Quarter,Unnamed: 2_level_1,Unnamed: 3_level_1
2020,1,100,50
2020,2,150,80
2022,1,130,30
2022,2,200,50


In [54]:
df.loc[2020].loc[2]

Sales     150
profit     80
Name: 2, dtype: int64

In [55]:
df.loc[(2020,2)]

Sales     150
profit     80
Name: (2020, 2), dtype: int64

In [56]:
df.index

MultiIndex([(2020, 1),
            (2020, 2),
            (2021, 1),
            (2021, 2),
            (2022, 1),
            (2022, 2)],
           names=['Year', 'Quarter'])

In [57]:
df.loc[[(2020,1),(2021,2),(2022,1)]]

Unnamed: 0_level_0,Unnamed: 1_level_0,Sales,profit
Year,Quarter,Unnamed: 2_level_1,Unnamed: 3_level_1
2020,1,100,50
2021,2,180,100
2022,1,130,30


In [58]:
df.loc[(2020,1):(2021,1)]

Unnamed: 0_level_0,Unnamed: 1_level_0,Sales,profit
Year,Quarter,Unnamed: 2_level_1,Unnamed: 3_level_1
2020,1,100,50
2020,2,150,80
2021,1,120,80


In [59]:
df.iloc[0]

Sales     100
profit     50
Name: (2020, 1), dtype: int64

In [61]:
df.iloc[[0,-1]]

Unnamed: 0_level_0,Unnamed: 1_level_0,Sales,profit
Year,Quarter,Unnamed: 2_level_1,Unnamed: 3_level_1
2020,1,100,50
2022,2,200,50


In [63]:
df.iloc[[0,-1],[1]]

Unnamed: 0_level_0,Unnamed: 1_level_0,profit
Year,Quarter,Unnamed: 2_level_1
2020,1,50
2022,2,50


In [64]:
df.iloc[1:4]

Unnamed: 0_level_0,Unnamed: 1_level_0,Sales,profit
Year,Quarter,Unnamed: 2_level_1,Unnamed: 3_level_1
2020,2,150,80
2021,1,120,80
2021,2,180,100


In [67]:
df.loc[[(2020,1),(2022,2)],['Sales']]

Unnamed: 0_level_0,Unnamed: 1_level_0,Sales
Year,Quarter,Unnamed: 2_level_1
2020,1,100
2022,2,200


## <font color='red'>DataFrame简单统计函数有哪些？</font>

In [72]:
import pandas as pd

# 创建示例DataFrame
df = pd.DataFrame(np.random.randint(0,20,size = (20,3)),
                  columns=['A','B','C'])
df

Unnamed: 0,A,B,C
0,3,6,15
1,13,2,1
2,10,6,3
3,17,1,7
4,17,13,5
5,16,2,4
6,12,6,5
7,13,12,4
8,4,7,14
9,17,19,10


In [75]:
# 计算平均值
print("Mean:")
print(df.mean(axis = 1).round(2))

# 计算中位数
print("\nMedian:")
print(df.median())

# 计算总和
print("\nSum:")
print(df.sum())

# 找到最小值
print("\nMin:")
print(df.min())

# 找到最大值
print("\nMax:")
print(df.max())

# 计算标准差
print("\nStandard Deviation:")
print(df.std())

# 计算方差
print("\nVariance:")
print(df.var())

Mean:
0      8.00
1      5.33
2      6.33
3      8.33
4     11.67
5      7.33
6      7.67
7      9.67
8      8.33
9     15.33
10     7.33
11     9.33
12     6.67
13    12.33
14     6.67
15    10.33
16    11.33
17     8.67
18    13.67
19    14.67
dtype: float64

Median:
A    11.5
B     8.5
C     8.0
dtype: float64

Sum:
A    221
B    172
C    174
dtype: int64

Min:
A    3
B    0
C    1
dtype: int32

Max:
A    19
B    19
C    16
dtype: int32

Standard Deviation:
A    4.882568
B    5.679140
C    4.646447
dtype: float64

Variance:
A    23.839474
B    32.252632
C    21.589474
dtype: float64


In [78]:
df.loc[2,'A'] = np.nan
df

Unnamed: 0,A,B,C
0,3.0,6,15
1,13.0,2,1
2,,6,3
3,17.0,1,7
4,17.0,13,5
5,16.0,2,4
6,12.0,6,5
7,13.0,12,4
8,4.0,7,14
9,17.0,19,10


In [79]:
# 计算非缺失值数量
print("\nCount:")
print(df.count())


Count:
A    19
B    20
C    20
dtype: int64


In [80]:
# 计算分位数
print("\nQuantile:")
print(df.quantile([0.25, 0.5, 0.75]))


Quantile:
         A      B     C
0.25   7.0   4.25   5.0
0.50  12.0   8.50   8.0
0.75  15.5  13.00  12.5


In [82]:
# 生成描述性统计信息
print("\nDescribe:")
display(df.describe())


Describe:


Unnamed: 0,A,B,C
count,19.0,20.0,20.0
mean,11.105263,8.6,8.7
std,5.009932,5.67914,4.646447
min,3.0,0.0,1.0
25%,7.0,4.25,5.0
50%,12.0,8.5,8.0
75%,15.5,13.0,12.5
max,19.0,19.0,16.0


In [84]:
df

Unnamed: 0,A,B,C
0,3.0,6,15
1,13.0,2,1
2,,6,3
3,17.0,1,7
4,17.0,13,5
5,16.0,2,4
6,12.0,6,5
7,13.0,12,4
8,4.0,7,14
9,17.0,19,10


In [83]:
# 计算唯一值的频次
print("\nValue Counts:")
print(df['A'].value_counts())


Value Counts:
17.0    3
13.0    2
12.0    2
8.0     2
5.0     2
3.0     1
16.0    1
4.0     1
10.0    1
6.0     1
11.0    1
19.0    1
15.0    1
Name: A, dtype: int64


In [85]:


# 找到最小值和最大值的索引位置
print("\nIndex of Min:")
print(df['B'].idxmin())

print("\nIndex of Max:")
print(df['C'].idxmax())


Index of Min:
12

Index of Max:
19


## <font color='red'>DataFrame高级统计函数有哪些？</font>

In [95]:
import pandas as pd
# 创建示例DataFrame
df = pd.DataFrame(np.random.randint(0,50,size = (10,3)),columns=['A','B','C'])
df

Unnamed: 0,A,B,C
0,29,20,32
1,16,0,31
2,36,9,49
3,42,15,6
4,13,34,10
5,15,32,14
6,19,29,3
7,23,38,2
8,36,43,33
9,22,35,3


In [96]:
# 计算数据的累计和
df.cumsum()

Unnamed: 0,A,B,C
0,29,20,32
1,45,20,63
2,81,29,112
3,123,44,118
4,136,78,128
5,151,110,142
6,170,139,145
7,193,177,147
8,229,220,180
9,251,255,183


In [98]:
df.cummin()

Unnamed: 0,A,B,C
0,29,20,32
1,16,0,31
2,16,0,31
3,16,0,6
4,13,0,6
5,13,0,6
6,13,0,3
7,13,0,2
8,13,0,2
9,13,0,2


In [99]:
df.cumprod()

Unnamed: 0,A,B,C
0,29,20,32
1,464,0,992
2,16704,0,48608
3,701568,0,291648
4,9120384,0,2916480
5,136805760,0,40830720
6,-1695657856,0,122492160
7,-345425024,0,244984320
8,449601024,0,-505452032
9,1301287936,0,-1516356096


In [102]:
import pandas as pd
data = {'A': [10, 20, 60, 40, 50]}
df = pd.DataFrame(data)
# 计算数据的差分
df['diff_A'] = df['A'].diff(periods=2)
df

Unnamed: 0,A,diff_A
0,10,
1,20,
2,60,50.0
3,40,20.0
4,50,-10.0


In [103]:
import pandas as pd

data = {'A': [10, 20, 30, 40, 50]}
df = pd.DataFrame(data)

# 计算数据的百分比变化
df['pct_change_A'] = df['A'].pct_change(periods=2)
df

Unnamed: 0,A,pct_change_A
0,10,
1,20,
2,30,2.0
3,40,1.0
4,50,0.666667


## <font color='red'>DataFrame相关性分析函数有哪些？</font>

In [10]:
import pandas as pd

# 示例数据
data = {
    '销售额': [100, 150, 200, 250, 300],
    '广告费用': [10, 20, 10, 40, 50],
    '利润': [50, 40, 40, 20, 60]}

# 创建DataFrame
df = pd.DataFrame(data)
df

Unnamed: 0,销售额,广告费用,利润
0,100,10,50
1,150,20,40
2,200,10,40
3,250,40,20
4,300,50,60


In [2]:
df.cov()

Unnamed: 0,销售额,广告费用,利润
销售额,6250.0,1250.0,1250.0
广告费用,1250.0,250.0,250.0
利润,1250.0,250.0,250.0


In [4]:
df['销售额'].cov(df['广告费用'])

1250.0

In [6]:
df['销售额'].var()

6250.0

In [11]:
df.corr()

Unnamed: 0,销售额,广告费用,利润
销售额,1.0,0.870388,0.0
广告费用,0.870388,1.0,0.037113
利润,0.0,0.037113,1.0


## <font color='red'>多层索引如何按层计算？</font>

In [5]:
import pandas as pd
data = {'A': [1, 2, 3], 'B': [4, 5, 6]}
df = pd.DataFrame(data)
display(df)
# 沿着每一列（即按列）计算总和
sum_per_column = df.sum(axis=0)
print(sum_per_column)

Unnamed: 0,A,B
0,1,4
1,2,5
2,3,6


A     6
B    15
dtype: int64


In [6]:
df.sum(axis = 1)

0    5
1    7
2    9
dtype: int64

In [7]:
import pandas as pd
import numpy as np

df = pd.DataFrame(np.random.randint(0,151,size = (20,3)),
                  columns=['Python','Math','Chinese'],
                  index = pd.MultiIndex.from_product([list('ABCDEFGHIJ'),['期中','期末']]))
display(df)

Unnamed: 0,Unnamed: 1,Python,Math,Chinese
A,期中,43,31,35
A,期末,107,3,73
B,期中,139,96,148
B,期末,85,37,5
C,期中,69,21,101
C,期末,33,100,67
D,期中,54,128,32
D,期末,72,91,78
E,期中,150,62,59
E,期末,144,41,49


In [9]:
df.mean(level = 1)

  df.mean(level = 1)


Unnamed: 0,Python,Math,Chinese
期中,100.5,71.6,82.0
期末,74.4,72.9,65.1


In [14]:
df.groupby(level=1).max()

Unnamed: 0,Python,Math,Chinese
期中,150,128,150
期末,144,132,118


## <font color='red'>pandas如何删除空数据？</font>

In [5]:
import pandas as pd
import numpy as np

# 创建一个包含空数据的示例DataFrame
data = {
    'A': [1, 2, np.nan, 4],
    'B': [np.nan, np.nan, np.nan, np.nan],
    'C': [9, 10, 11, 12]
}
df = pd.DataFrame(data)
df

Unnamed: 0,A,B,C
0,1.0,,9
1,2.0,,10
2,,,11
3,4.0,,12


In [3]:
df.dropna(axis = 1)

Unnamed: 0,C
0,9
1,10
2,11
3,12


In [7]:
df.dropna(axis = 1,how = 'all')

Unnamed: 0,A,C
0,1.0,9
1,2.0,10
2,,11
3,4.0,12


## <font color='red'>pandas如何填充空数据？</font>

In [11]:
import pandas as pd
import numpy as np

# 创建一个包含空数据的示例DataFrame
df = pd.DataFrame(np.random.randint(0,151,size = (100,3)),columns=['Python','Math','Chinese'])
df[df < 30] = np.nan
display(df)

Unnamed: 0,Python,Math,Chinese
0,61.0,129.0,115.0
1,53.0,127.0,56.0
2,87.0,97.0,
3,141.0,75.0,
4,130.0,102.0,81.0
...,...,...,...
95,,47.0,66.0
96,,,123.0
97,112.0,47.0,112.0
98,104.0,121.0,44.0


In [12]:
df.fillna(value=0)

Unnamed: 0,Python,Math,Chinese
0,61.0,129.0,115.0
1,53.0,127.0,56.0
2,87.0,97.0,0.0
3,141.0,75.0,0.0
4,130.0,102.0,81.0
...,...,...,...
95,0.0,47.0,66.0
96,0.0,0.0,123.0
97,112.0,47.0,112.0
98,104.0,121.0,44.0


In [15]:
df.fillna(df.median().round(1))

Unnamed: 0,Python,Math,Chinese
0,61.0,129.0,115.0
1,53.0,127.0,56.0
2,87.0,97.0,88.0
3,141.0,75.0,88.0
4,130.0,102.0,81.0
...,...,...,...
95,89.0,47.0,66.0
96,89.0,84.5,123.0
97,112.0,47.0,112.0
98,104.0,121.0,44.0


In [18]:
df.mode()

Unnamed: 0,Python,Math,Chinese
0,112.0,47.0,55.0


In [19]:
df.Python.value_counts()

112.0    4
89.0     3
70.0     3
59.0     2
33.0     2
        ..
114.0    1
53.0     1
80.0     1
56.0     1
68.0     1
Name: Python, Length: 61, dtype: int64

In [23]:
'''method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None
    Method to use for filling holes in reindexed Series
    pad / ffill: propagate last valid observation forward to next valid
    backfill / bfill: use next valid observation to fill gap.
axis : {0 or 'index', 1 or 'columns'}
    Axis along which to fill missing values.'''
df.fillna(method='pad',axis = 1)

Unnamed: 0,Python,Math,Chinese
0,61.0,129.0,115.0
1,53.0,127.0,56.0
2,87.0,97.0,97.0
3,141.0,75.0,75.0
4,130.0,102.0,81.0
...,...,...,...
95,,47.0,66.0
96,,,123.0
97,112.0,47.0,112.0
98,104.0,121.0,44.0


## <font color='red'>pandas如何处理异常值？</font>

In [2]:
import numpy as np
import pandas as pd

# 创建一个包含异常值的示例Series
data = pd.Series(np.random.randint(0,100,size = 300))

data.iloc[-1] = 300 # 异常值，300
data.iloc[103] = -100

# 计算均值和标准差
mean = data.mean()
std = data.std()

# 计算上限和下限
upper_limit = mean + 3 * std
lower_limit = mean - 3 * std
print(upper_limit,lower_limit)

148.73822997408274 -51.46489664074942


In [3]:
data

0       32
1       26
2       74
3       93
4        5
      ... 
295     27
296     13
297     78
298     46
299    300
Length: 300, dtype: int32

In [4]:
cond = (data > upper_limit) | (data < lower_limit)
data[cond]

103   -100
299    300
dtype: int32

## <font color='red'>pandas如何删除异常值？</font>

In [7]:
data[~cond]

0      32
1      26
2      74
3      93
4       5
       ..
294    69
295    27
296    13
297    78
298    46
Length: 298, dtype: int32

In [9]:
index = data[cond].index
index

Int64Index([103, 299], dtype='int64')

In [12]:
data.drop(index,inplace=True)

In [13]:
data

0      32
1      26
2      74
3      93
4       5
       ..
294    69
295    27
296    13
297    78
298    46
Length: 298, dtype: int32