## <font color='red'>DataFrame如何数据重塑？</font>

### 转置

In [12]:
import pandas as pd

# Sample data in wide format: one row per student, one score column per subject.
data = dict(
    Name=['张三', '李四', '王五'],
    Math=[90, 85, 78],
    English=[85, 80, 92],
)

df = pd.DataFrame(data=data)
df

Unnamed: 0,Name,Math,English
0,张三,90,85
1,李四,85,80
2,王五,78,92


In [13]:
# Transpose: the column names become the row labels and the row labels become the columns.
df.transpose()

Unnamed: 0,0,1,2
Name,张三,李四,王五
Math,90,85,78
English,85,80,92


### melt长表转换

In [14]:
# Show the wide-format frame (one score column per subject) before it is melted.
df

Unnamed: 0,Name,Math,English
0,张三,90,85
1,李四,85,80
2,王五,78,92


In [15]:
# Wide -> long: keep 'Name' as the identifier column and fold the remaining
# columns into a 'Subject' label column plus a 'score' value column.
pd.melt(
    df,
    id_vars='Name',
    var_name='Subject',
    value_name='score',
)

Unnamed: 0,Name,Subject,score
0,张三,Math,90
1,李四,Math,85
2,王五,Math,78
3,张三,English,85
4,李四,English,80
5,王五,English,92


### pivot宽表转换

In [16]:
# Sample data in long format: one row per (student, subject) pair.
data = dict(
    Name=['张三', '张三', '李四', '李四', '王五', '王五'],
    Subject=['Math', 'English', 'Math', 'English', 'Math', 'English'],
    Score=[90, 85, 85, 80, 78, 92],
)

df = pd.DataFrame.from_dict(data)
df

Unnamed: 0,Name,Subject,Score
0,张三,Math,90
1,张三,English,85
2,李四,Math,85
3,李四,English,80
4,王五,Math,78
5,王五,English,92


In [18]:
# Long -> wide: one row per Name, one column per Subject, cells filled from Score.
df.pivot(index = 'Name',columns='Subject',values='Score')

Subject,English,Math
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
张三,85,90
李四,80,85
王五,92,78


## <font color='red'>DataFrame多层索引如何行列转换？</font>

In [21]:
import pandas as pd
import numpy as np
# Two-level row index: every student ('Name') crossed with every exam ('Term').
multi_index = pd.MultiIndex.from_product(
    iterables=[['张三', '李四', '王五', '赵六'], ['期中', '期末']],
    names=['Name', 'Term'],
)

# Random integer scores in [0, 150] (randint's upper bound 151 is exclusive):
# one row per (Name, Term) pair, one column per subject.
df = pd.DataFrame(
    data=np.random.randint(low=0, high=151, size=(8, 2)),
    index=multi_index,
    columns=['Python', 'Math'],
)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Python,Math
Name,Term,Unnamed: 2_level_1,Unnamed: 3_level_1
张三,期中,115,113
张三,期末,18,51
李四,期中,125,89
李四,期末,139,4
王五,期中,5,125
王五,期末,135,108
赵六,期中,144,149
赵六,期末,70,95


In [24]:
# Move the outer row level ('Name', i.e. level 0) into the columns;
# 'Term' is left as the only row level.
df2 = df.unstack(level='Name')
df2

Unnamed: 0_level_0,Python,Python,Python,Python,Math,Math,Math,Math
Name,张三,李四,王五,赵六,张三,李四,王五,赵六
Term,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
期中,115,125,5,144,113,89,125,149
期末,18,139,135,70,51,4,108,95


In [32]:
# df2 has a single row level (Term), so unstack() yields a Series whose index
# combines the column levels (subject, Name) with Term; to_frame wraps that
# Series as a one-column frame named 'score'.
df2.unstack().to_frame(name = 'score')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,score
Unnamed: 0_level_1,Name,Term,Unnamed: 3_level_1
Python,张三,期中,115
Python,张三,期末,18
Python,李四,期中,125
Python,李四,期末,139
Python,王五,期中,5
Python,王五,期末,135
Python,赵六,期中,144
Python,赵六,期末,70
Math,张三,期中,113
Math,张三,期末,51


In [29]:
# stack(level=0) moves the outer column level (the subject names) into the rows;
# unstack(0) then moves the first row level (Term) into the columns.
# Result: subjects as rows, (Name, Term) pairs as columns.
df2.stack(level=0).unstack(0)

Name,张三,张三,李四,李四,王五,王五,赵六,赵六
Term,期中,期末,期中,期末,期中,期末,期中,期末
Math,113,51,89,4,125,108,149,95
Python,115,18,125,139,5,135,144,70


## <font color='red'>DataFrame如何数据集成？</font>

In [42]:
# numpy is imported here as well: this cell calls np.random.randint, and without
# the import it only works if a cell from an earlier section has already run.
import numpy as np
import pandas as pd

# Class 1 exam scores: 51 students, random integers in [0, 150]
# (randint's upper bound 151 is exclusive).
df1 = pd.DataFrame(np.random.randint(0,151,size = (51,2)),columns=['Python','Math'])
# Class 2 exam scores: 49 students, same columns and score range.
df2 = pd.DataFrame(np.random.randint(0,151,size = (49,2)),columns=['Python','Math'])

In [44]:
# Stack the two classes vertically. Each frame keeps its own 0-based row labels,
# so labels repeat in the result instead of running up to the total row count.
df = pd.concat(objs=[df1, df2], axis=0)
df

Unnamed: 0,Python,Math
0,85,20
1,81,19
2,9,78
3,92,129
4,89,133
...,...,...
44,58,99
45,105,97
46,89,130
47,77,70


In [46]:
# Replace the repeated row labels with a fresh 0..n-1 index; drop=True discards
# the old labels instead of keeping them as a column. The result is a new frame
# (it is only displayed here, not assigned back to df).
df.reset_index(drop=True)

Unnamed: 0,Python,Math
0,85,20
1,81,19
2,9,78
3,92,129
4,89,133
...,...,...
95,58,99
96,105,97
97,89,130
98,77,70


In [47]:
# numpy is imported here as well: this cell calls np.random.randint, and without
# the import it only works if a cell from an earlier section has already run.
import numpy as np
import pandas as pd

# Class 1 academic exam scores: 51 students, random integers in [0, 150]
# (randint's upper bound 151 is exclusive).
df1 = pd.DataFrame(np.random.randint(0,151,size = (51,2)),columns=['Python','Math'])
# Class 1 physical-education exam scores for the same 51 rows.
df2 = pd.DataFrame(np.random.randint(0,151,size = (51,2)),columns=['篮球','足球'])

In [49]:
# Place the two frames side by side: rows are aligned on the index and the
# columns of df2 are appended after those of df1.
pd.concat([df1, df2], axis='columns')

Unnamed: 0,Python,Math,篮球,足球
0,15,4,9,129
1,132,106,47,58
2,4,31,74,23
3,98,4,114,34
4,92,142,71,107
5,51,42,12,84
6,13,102,74,49
7,46,123,98,55
8,29,90,74,77
9,113,105,74,14


## <font color='red'>什么是merge数据融合？</font>

In [58]:
import numpy as np
import pandas as pd
# Left table maps ID -> Name, right table maps id -> Age. The key columns differ
# in case ('ID' vs 'id') and only the key values 2 and 3 occur in both tables.
data1 = dict(ID=[1, 2, 3], Name=['Alice', 'Bob', 'Charlie'])
data2 = dict(id=[2, 3, 4], Age=[25, 30, 22])

df1 = pd.DataFrame(data=data1)
df2 = pd.DataFrame(data=data2)
# Render both frames in one cell output. display is not imported in this cell;
# presumably it is the builtin supplied by the notebook (IPython) environment.
display(df1,df2)

Unnamed: 0,ID,Name
0,1,Alice
1,2,Bob
2,3,Charlie


Unnamed: 0,id,Age
0,2,25
1,3,30
2,4,22


In [63]:
# Outer join on differently named key columns: keys found on only one side are
# kept, and the cells missing from the other side are filled with NaN.
df1.merge(
    df2,
    how='outer',
    left_on='ID',
    right_on='id',
)

Unnamed: 0,ID,Name,id,Age
0,1.0,Alice,,
1,2.0,Bob,2.0,25.0
2,3.0,Charlie,3.0,30.0
3,,,4.0,22.0


In [64]:
import numpy as np
import pandas as pd
# 50 rows of random integer scores in [0, 150] (randint's upper bound 151 is
# exclusive), one column per subject.
df1 = pd.DataFrame(
    data=np.random.randint(low=0, high=151, size=(50, 2)),
    columns=['Python', 'Math'],
)
df1

Unnamed: 0,Python,Math
0,150,96
1,15,55
2,149,70
3,106,44
4,43,20
5,136,39
6,119,1
7,33,120
8,72,80
9,65,141


In [69]:
# Per-row average of the subject scores, wrapped as a one-column frame named
# '平均分' that shares df1's row index.
df2 = (
    df1
    .mean(axis='columns')
    .to_frame(name='平均分')
)
df2

Unnamed: 0,平均分
0,123.0
1,35.0
2,109.5
3,75.0
4,31.5
5,87.5
6,60.0
7,76.5
8,76.0
9,103.0


In [70]:
# Append the average column to the score columns; rows are aligned on the index.
pd.concat([df1,df2],axis = 1)

Unnamed: 0,Python,Math,平均分
0,150,96,123.0
1,15,55,35.0
2,149,70,109.5
3,106,44,75.0
4,43,20,31.5
5,136,39,87.5
6,119,1,60.0
7,33,120,76.5
8,72,80,76.0
9,65,141,103.0


In [72]:
# Join on the row index of both frames instead of on a key column.
df1.merge(df2, left_index=True, right_index=True)

Unnamed: 0,Python,Math,平均分
0,150,96,123.0
1,15,55,35.0
2,149,70,109.5
3,106,44,75.0
4,43,20,31.5
5,136,39,87.5
6,119,1,60.0
7,33,120,76.5
8,72,80,76.0
9,65,141,103.0


## <font color='red'>什么是join数据连接？</font>

In [1]:
import pandas as pd

# Employee master data: five employees keyed by EmployeeID.
employee_data = dict(
    EmployeeID=[101, 102, 103, 104, 105],
    Name=['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    Department=['HR', 'Engineering', 'Marketing', 'Finance', 'HR'],
)
df_employee = pd.DataFrame(data=employee_data)

# Salary data: four records; EmployeeID 103 has no salary entry.
salary_data = dict(
    EmployeeID=[101, 102, 104, 105],
    Salary=[50000, 60000, 55000, 65000],
)
df_salary = pd.DataFrame(data=salary_data)
# Render both frames in one cell output. display is not imported in this cell;
# presumably it is the builtin supplied by the notebook (IPython) environment.
display(df_employee,df_salary)

Unnamed: 0,EmployeeID,Name,Department
0,101,Alice,HR
1,102,Bob,Engineering
2,103,Charlie,Marketing
3,104,David,Finance
4,105,Eve,HR


Unnamed: 0,EmployeeID,Salary
0,101,50000
1,102,60000
2,104,55000
3,105,65000


In [5]:
# Index both frames by EmployeeID, then right-join: the result follows the
# salary table's index, so employees without a salary record are left out.
(
    df_employee
    .set_index('EmployeeID')
    .join(df_salary.set_index('EmployeeID'), how='right')
)

Unnamed: 0_level_0,Name,Department,Salary
EmployeeID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
101,Alice,HR,50000
102,Bob,Engineering,60000
104,David,Finance,55000
105,Eve,HR,65000


## <font color='red'>如何根据字段筛选数据？</font>

In [6]:
import pandas as pd
import numpy as np
# 150 rows of random integer exam scores in [0, 149] (the upper bound 150 is
# exclusive), one column per subject.
df = pd.DataFrame(
    data=np.random.randint(low=0, high=150, size=(150, 3)),
    columns=['Python', 'Math', 'Chinese'],
)
df

Unnamed: 0,Python,Math,Chinese
0,130,18,59
1,142,144,121
2,130,137,36
3,70,30,91
4,53,18,16
...,...,...,...
145,10,125,109
146,71,68,14
147,69,60,128
148,86,37,69


In [7]:
# Attribute-style column access: returns the 'Python' column as a Series.
df.Python

0      130
1      142
2      130
3       70
4       53
      ... 
145     10
146     71
147     69
148     86
149     35
Name: Python, Length: 150, dtype: int32

In [8]:
# Bracket access with a single column name: returns that column as a Series.
df['Math']

0       18
1      144
2      137
3       30
4       18
      ... 
145    125
146     68
147     60
148     37
149    138
Name: Math, Length: 150, dtype: int32

In [9]:
# A list of names (even a single one) returns a DataFrame rather than a Series.
df[['Math']]

Unnamed: 0,Math
0,18
1,144
2,137
3,30
4,18
...,...
145,125
146,68
147,60
148,37


In [10]:
# Select several columns; they appear in the order listed, not in df's own order.
df[['Math','Python']]

Unnamed: 0,Math,Python
0,18,130
1,144,142
2,137,130
3,30,70
4,18,53
...,...,...
145,125,10
146,68,71
147,60,69
148,37,86


## <font color='red'>如何根据标签筛选数据？</font>

In [11]:
import pandas as pd
import numpy as np
# Ten rows labelled 'A'..'J' with random integer scores in [0, 149] (the upper
# bound 150 is exclusive) for three computing subjects.
df = pd.DataFrame(
    data=np.random.randint(low=0, high=150, size=(10, 3)),
    index=list('ABCDEFGHIJ'),  # row labels
    columns=['Python', 'Tensorflow', 'Keras'],
)
df

Unnamed: 0,Python,Tensorflow,Keras
A,36,131,47
B,76,14,12
C,41,20,1
D,125,91,87
E,67,133,31
F,62,125,132
G,61,57,11
H,130,115,146
I,54,38,117
J,74,22,26


In [13]:
# Select the row labelled 'A'; a single label returns a Series indexed by column name.
df.loc['A']

Python         36
Tensorflow    131
Keras          47
Name: A, dtype: int32

In [15]:
# A list of row labels returns a DataFrame containing those rows.
df.loc[['A','E','F']]

Unnamed: 0,Python,Tensorflow,Keras
A,36,131,47
E,67,133,31
F,62,125,132


In [17]:
# Label slices include both endpoints: rows 'A' through 'F' inclusive,
# restricted to the Python and Keras columns.
df.loc['A':'F',['Python','Keras']]

Unnamed: 0,Python,Keras
A,36,47
B,76,12
C,41,1
D,125,87
E,67,31
F,62,132


In [18]:
# All rows, with the two columns returned in the order listed.
df.loc[:,['Keras','Python']]

Unnamed: 0,Keras,Python
A,47,36
B,12,76
C,1,41
D,87,125
E,31,67
F,132,62
G,11,61
H,146,130
I,117,54
J,26,74


In [19]:
# Every second row starting at label 'A' (A, C, E, G, I), restricted to two columns.
df.loc['A'::2,['Python','Tensorflow']]

Unnamed: 0,Python,Tensorflow
A,36,131
C,41,20
E,67,133
G,61,57
I,54,38


In [21]:
# A row label plus a column label returns the single value in that cell.
df.loc['A','Tensorflow']

131

In [24]:
# Plain bracket indexing with a list of names selects columns (all rows are kept).
df[['Python','Keras']]

Unnamed: 0,Python,Keras
A,36,47
B,76,12
C,41,1
D,125,87
E,67,31
F,62,132
G,61,11
H,130,146
I,54,117
J,74,26


## <font color='red'>如何根据位置筛选数据？</font>

In [25]:
import pandas as pd
import numpy as np
# Ten rows labelled 'A'..'J' with random integer scores in [0, 149] (the upper
# bound 150 is exclusive) for three computing subjects.
df = pd.DataFrame(
    data=np.random.randint(low=0, high=150, size=(10, 3)),
    index=list('ABCDEFGHIJ'),  # row labels
    columns=['Python', 'Tensorflow', 'Keras'],
)
df

Unnamed: 0,Python,Tensorflow,Keras
A,97,102,114
B,131,11,76
C,115,35,34
D,97,18,62
E,60,75,30
F,149,89,139
G,4,88,7
H,43,44,46
I,15,50,108
J,73,149,59


In [26]:
# Select the first row by position; a single integer returns a Series.
df.iloc[0]

Python         97
Tensorflow    102
Keras         114
Name: A, dtype: int32

In [27]:
# A list of positions (even a single one) returns a DataFrame rather than a Series.
df.iloc[[0]]

Unnamed: 0,Python,Tensorflow,Keras
A,97,102,114


In [28]:
# Positional slices exclude the stop: rows at positions 2-7 and columns at positions 0-1.
df.iloc[2:8,0:2]

Unnamed: 0,Python,Tensorflow
C,115,35
D,97,18
E,60,75
F,149,89
G,4,88
H,43,44


In [29]:
# Positional lists pick rows 1, 3, 5 and columns 2, 0, 1 in exactly the order given.
df.iloc[[1,3,5],[2,0,1]]

Unnamed: 0,Keras,Python,Tensorflow
B,76,131,11
D,62,97,18
F,139,149,89


In [30]:
# Rows at positions 1 and 2 (the stop position 3 is excluded), all columns.
df.iloc[1:3,:]

Unnamed: 0,Python,Tensorflow,Keras
B,131,11,76
C,115,35,34


In [32]:
# All rows, last two columns (negative positions count from the end).
df.iloc[:,-2:]

Unnamed: 0,Tensorflow,Keras
A,102,114
B,11,76
C,35,34
D,18,62
E,75,30
F,89,139
G,88,7
H,44,46
I,50,108
J,149,59


In [33]:
# A row position plus a column position returns the single value in that cell.
df.iloc[0,2]

114

## <font color='red'>如何根据布尔索引筛选数据？</font>

In [35]:
import pandas as pd
import numpy as np
# 2000 rows of random integer scores in [0, 149] (the upper bound 150 is
# exclusive), one column per exam subject.
df = pd.DataFrame(
    data=np.random.randint(low=0, high=150, size=(2000, 3)),
    columns=['Python', 'Tensorflow', 'Keras'],
)
df

Unnamed: 0,Python,Tensorflow,Keras
0,2,55,122
1,86,0,64
2,128,31,106
3,118,20,33
4,73,148,56
...,...,...,...
1995,67,72,12
1996,141,48,119
1997,49,109,83
1998,78,140,12


In [37]:
# Boolean mask: True where the Python score is at least 147.
cond = df.Python >= 147
# Indexing with the mask keeps only the rows where it is True.
df[cond]

Unnamed: 0,Python,Tensorflow,Keras
11,147,127,133
118,148,83,128
141,149,14,78
180,147,41,34
228,149,120,132
301,148,30,115
371,147,109,45
402,149,54,116
460,147,20,92
517,149,147,15


In [42]:
# Inspect the mask itself: a boolean Series with one entry per row of df.
cond

0       False
1       False
2       False
3       False
4       False
        ...  
1995    False
1996    False
1997    False
1998    False
1999    False
Name: Python, Length: 2000, dtype: bool

In [40]:
# Combine two element-wise conditions with | (or). Each comparison is wrapped in
# parentheses because | has higher precedence than >.
cond2 = (df.Python > 145) | (df['Keras'] > 145)
df[cond2]

Unnamed: 0,Python,Tensorflow,Keras
11,147,127,133
104,146,126,38
118,148,83,128
136,89,100,146
141,149,14,78
...,...,...,...
1909,147,85,93
1913,148,6,64
1951,73,37,149
1952,149,38,10


In [43]:
# Indexing with a boolean DataFrame keeps the original shape: cells that fail
# the condition become NaN instead of being dropped.
df[df > 100]

Unnamed: 0,Python,Tensorflow,Keras
0,,,122.0
1,,,
2,128.0,,106.0
3,118.0,,
4,,148.0,
...,...,...,...
1995,,,
1996,141.0,,119.0
1997,,109.0,
1998,,140.0,


In [47]:
# True where the Python score is one of 99, 119 or 139.
cond3 = df.Python.isin([99,119,139])
# ~ negates the mask, so the rows with those scores are excluded.
df[~cond3]

Unnamed: 0,Python,Tensorflow,Keras
0,2,55,122
1,86,0,64
2,128,31,106
3,118,20,33
4,73,148,56
...,...,...,...
1995,67,72,12
1996,141,48,119
1997,49,109,83
1998,78,140,12


## <font color='red'>如何通过query筛选数据？</font>

In [48]:
import pandas as pd

# Build the sample DataFrame: one row per person with age and salary.
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'David'],
    Age=[25, 30, 22, 28],
    Salary=[50000, 60000, 45000, 70000],
)
df = pd.DataFrame(data=data)
df

Unnamed: 0,Name,Age,Salary
0,Alice,25,50000
1,Bob,30,60000
2,Charlie,22,45000
3,David,28,70000


In [49]:
# Filter rows with a query expression string: column names are referenced
# directly, and `and` requires both conditions to hold for a row to be kept.
df.query('Age >= 25 and Salary >= 55000')

Unnamed: 0,Name,Age,Salary
1,Bob,30,60000
3,David,28,70000
