# 数据清洗-缺失值处理，字符串处理

In [16]:
import pandas as pd

# Load the student score sheet, skipping the first two rows of the workbook
# (presumably blank/title rows above the real header -- verify against the file).
stu_df = pd.read_excel(
    "./datas/student_excel/student_excel.xlsx",
    skiprows=2,
)

In [17]:
# Show the frame exactly as loaded, before any cleaning has been applied.
stu_df

Unnamed: 0.1,Unnamed: 0,姓名,科目,分数
0,,小明,语文,85.0
1,,,数学,80.0
2,,,英语,90.0
3,,,,
4,,小王,语文,85.0
5,,,数学,
6,,,英语,90.0
7,,,,
8,,小刚,语文,85.0
9,,,数学,80.0


In [18]:
# Select only the rows that actually have a score (column '分数' is not missing).
stu_df[stu_df['分数'].notna()]

Unnamed: 0.1,Unnamed: 0,姓名,科目,分数
0,,小明,语文,85.0
1,,,数学,80.0
2,,,英语,90.0
4,,小王,语文,85.0
6,,,英语,90.0
8,,小刚,语文,85.0
9,,,数学,80.0
10,,,英语,90.0


In [19]:
# Remove every column that contains nothing but missing values.
stu_df.dropna(axis=1, how='all', inplace=True)

In [7]:
# Inspect the frame after the all-empty columns have been dropped.
stu_df

Unnamed: 0,姓名,科目,分数
0,小明,语文,85.0
1,,数学,80.0
2,,英语,90.0
3,,,
4,小王,语文,85.0
5,,数学,
6,,英语,90.0
7,,,
8,小刚,语文,85.0
9,,数学,80.0


In [20]:
# Remove every row that contains nothing but missing values.
# (axis='index' and axis='rows' both refer to the row axis.)
stu_df.dropna(axis='index', how='all', inplace=True)

In [10]:
# Inspect the frame after the fully-empty rows have been dropped.
stu_df

Unnamed: 0,姓名,科目,分数
0,小明,语文,85.0
1,,数学,80.0
2,,英语,90.0
4,小王,语文,85.0
5,,数学,
6,,英语,90.0
8,小刚,语文,85.0
9,,数学,80.0
10,,英语,90.0


In [21]:
# Replace missing values in the score column ('分数') with 0.
# inplace=True is what makes the change stick on stu_df; without it the
# filled copy is returned and simply discarded.
# Same effect as: stu_df.loc[:, '分数'] = stu_df['分数'].fillna(0)
stu_df.fillna(value={'分数': 0}, inplace=True)

In [24]:
# Inspect the frame after missing scores were filled with 0.
# NOTE(review): the saved output below already shows the name column
# forward-filled, although the ffill cell comes after this one -- the output
# appears to be from an out-of-order run; restart and run all to refresh it.
stu_df

Unnamed: 0,姓名,科目,分数
0,小明,语文,85.0
1,小明,数学,80.0
2,小明,英语,90.0
4,小王,语文,85.0
5,小王,数学,0.0
6,小王,英语,90.0
8,小刚,语文,85.0
9,小刚,数学,80.0
10,小刚,英语,90.0


In [25]:
# Forward-fill the name column ('姓名'): the sheet gives each student's name
# only on the first row of that student's group, so carry the last seen name
# down to the rows below it.
# Series.ffill() is used instead of fillna(method='ffill'), which is
# deprecated since pandas 2.1 and emits a FutureWarning.
stu_df['姓名'] = stu_df['姓名'].ffill()

In [26]:
# Inspect the frame after forward-filling the name column.
stu_df

Unnamed: 0,姓名,科目,分数
0,小明,语文,85.0
1,小明,数学,80.0
2,小明,英语,90.0
4,小王,语文,85.0
5,小王,数学,0.0
6,小王,英语,90.0
8,小刚,语文,85.0
9,小刚,数学,80.0
10,小刚,英语,90.0


# 字符串处理

In [28]:
# Load the 2018 Beijing daily weather records from CSV.
fpath = './datas/beijing_tianqi/beijing_tianqi_2018.csv'
df = pd.read_csv(filepath_or_buffer=fpath)

In [29]:
# NOTE: without parentheses this evaluates to the bound method object itself,
# not the first rows; its repr is what gets displayed below. Call df.head()
# (as in the next cell) to get the actual preview.
df.head

<bound method NDFrame.head of             ymd bWendu yWendu tianqi fengxiang fengli  aqi aqiInfo  aqiLevel
0    2018-01-01     3℃    -6℃   晴~多云       东北风   1-2级   59       良         2
1    2018-01-02     2℃    -5℃   阴~多云       东北风   1-2级   49       优         1
2    2018-01-03     2℃    -5℃     多云        北风   1-2级   28       优         1
3    2018-01-04     0℃    -8℃      阴       东北风   1-2级   28       优         1
4    2018-01-05     3℃    -6℃   多云~晴       西北风   1-2级   50       优         1
..          ...    ...    ...    ...       ...    ...  ...     ...       ...
360  2018-12-27    -5℃   -12℃   多云~晴       西北风     3级   48       优         1
361  2018-12-28    -3℃   -11℃      晴       西北风     3级   40       优         1
362  2018-12-29    -3℃   -12℃      晴       西北风     2级   29       优         1
363  2018-12-30    -2℃   -11℃   晴~多云       东北风     1级   31       优         1
364  2018-12-31    -2℃   -10℃     多云       东北风     1级   56       良         2

[365 rows x 9 columns]>

In [30]:
# Preview the first five rows of the weather data.
df.head(n=5)

Unnamed: 0,ymd,bWendu,yWendu,tianqi,fengxiang,fengli,aqi,aqiInfo,aqiLevel
0,2018-01-01,3℃,-6℃,晴~多云,东北风,1-2级,59,良,2
1,2018-01-02,2℃,-5℃,阴~多云,东北风,1-2级,49,优,1
2,2018-01-03,2℃,-5℃,多云,北风,1-2级,28,优,1
3,2018-01-04,0℃,-8℃,阴,东北风,1-2级,28,优,1
4,2018-01-05,3℃,-6℃,多云~晴,西北风,1-2级,50,优,1


In [40]:
# Check which dtype the AQI column was parsed as.
df.dtypes['aqi']

dtype('int64')

In [47]:
# String replacement via the .str accessor: strip the '℃' unit from the
# high-temperature strings. The result is a new Series and is not assigned
# back, so df itself is left unchanged.
df['bWendu'].str.replace(pat="℃", repl='')

0       3
1       2
2       2
3       0
4       3
       ..
360    -5
361    -3
362    -3
363    -2
364    -2
Name: bWendu, Length: 365, dtype: object

In [44]:
# Test whether each high-temperature string consists only of numeric
# characters. The stored values still carry the '℃' suffix, so the check
# yields False.
df['bWendu'].str.isnumeric()

0      False
1      False
2      False
3      False
4      False
       ...  
360    False
361    False
362    False
363    False
364    False
Name: bWendu, Length: 365, dtype: bool

In [45]:
# Show the raw high-temperature column: strings (dtype object) with a '℃' suffix.
df['bWendu']

0       3℃
1       2℃
2       2℃
3       0℃
4       3℃
      ... 
360    -5℃
361    -3℃
362    -3℃
363    -2℃
364    -2℃
Name: bWendu, Length: 365, dtype: object

In [49]:
# Preview the first five rows again; the temperature columns still contain
# '℃' because the earlier str.replace result was never assigned back.
df.head(n=5)

Unnamed: 0,ymd,bWendu,yWendu,tianqi,fengxiang,fengli,aqi,aqiInfo,aqiLevel
0,2018-01-01,3℃,-6℃,晴~多云,东北风,1-2级,59,良,2
1,2018-01-02,2℃,-5℃,阴~多云,东北风,1-2级,49,优,1
2,2018-01-03,2℃,-5℃,多云,北风,1-2级,28,优,1
3,2018-01-04,0℃,-8℃,阴,东北风,1-2级,28,优,1
4,2018-01-05,3℃,-6℃,多云~晴,西北风,1-2级,50,优,1


In [50]:
# Boolean mask marking the rows whose date string begins with '2018-03'.
condition = df['ymd'].str.startswith(pat='2018-03')

In [51]:
# Display the boolean mask (True where the date string starts with '2018-03').
condition

0      False
1      False
2      False
3      False
4      False
       ...  
360    False
361    False
362    False
363    False
364    False
Name: ymd, Length: 365, dtype: bool

In [52]:
# First five rows selected by the boolean mask.
df.loc[condition].head()

Unnamed: 0,ymd,bWendu,yWendu,tianqi,fengxiang,fengli,aqi,aqiInfo,aqiLevel
59,2018-03-01,8℃,-3℃,多云,西南风,1-2级,46,优,1
60,2018-03-02,9℃,-1℃,晴~多云,北风,1-2级,95,良,2
61,2018-03-03,13℃,3℃,多云~阴,北风,1-2级,214,重度污染,5
62,2018-03-04,7℃,-2℃,阴~多云,东南风,1-2级,144,轻度污染,3
63,2018-03-05,8℃,-3℃,晴,南风,1-2级,94,良,2


## 如何提取201803这样的月份？

### 1.先将2018-03-01替换成20180301这样的格式

In [54]:
# Remove the dashes, then keep the first six characters (e.g. '201803').
# Each string step needs its own .str accessor: str.replace returns a Series,
# so df['ymd'].str.replace('-', '').slice(0, 6) raises an error.
df['ymd'].str.replace(pat='-', repl='').str.slice(stop=6)

0      201801
1      201801
2      201801
3      201801
4      201801
        ...  
360    201812
361    201812
362    201812
363    201812
364    201812
Name: ymd, Length: 365, dtype: object

In [56]:
# Equivalent form: slice notation applied directly on the .str accessor.
df['ymd'].str.replace('-', '').str[:6]

0      201801
1      201801
2      201801
3      201801
4      201801
        ...  
360    201812
361    201812
362    201812
363    201812
364    201812
Name: ymd, Length: 365, dtype: object