In [1]:
from numpy import nan as NA
import pandas as pd

In [2]:
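# pandas methods for handling missing values:
# dropna filters axis labels according to missing values, fillna fills missing values, isnull / notnull test for them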

In [3]:
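# data.dropna(axis=0) drops every row that contains an NA by default; how='all' drops only the rows that are entirely NA
# thresh=2 keeps the rows that have at least two non-NA values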

In [4]:
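# data.fillna(0) fills every NA with 0; data.fillna({'a': 0, 'b': 1}) fills each column with its own value
# A new object is returned by default; pass inplace=True to modify the original data instead
# method='ffill', limit=2 forward-fills at most 2 consecutive NAs (recent pandas versions prefer data.ffill(limit=2))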

In [5]:
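# df.duplicated(subset) flags each row that repeats an earlier row; pass a list of column names to compare only those columns. df.drop_duplicates() removes the duplicate rows
# keep='last' keeps the last occurrence of each duplicate instead of the first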

In [6]:
# Load only the first four rows of dataset.csv (nrows=4) as the working frame.
ds=pd.read_csv('dataset.csv',nrows=4)

In [7]:
# Display the loaded frame.
ds

Unnamed: 0,WindNumber,Time,WindSpeed,Power,RotorSpeed
0,1,2017/11/1 0:20,4.188333,65.75,5.779167
1,1,2017/11/1 0:30,4.0425,120.7,8.6365
2,1,2017/11/1 0:40,4.137368,135.157895,8.668421
3,1,2017/11/1 0:50,4.274737,157.315789,8.727895


In [8]:
# Transform data with Series.map: Power readings of at least 100 become 100, all others 0.
def _flag_power(power):
    return 100 if power >= 100 else 0


ds['maplist'] = ds['Power'].map(_flag_power)

In [9]:
# Display ds again; it now carries the maplist column.
ds

Unnamed: 0,WindNumber,Time,WindSpeed,Power,RotorSpeed,maplist
0,1,2017/11/1 0:20,4.188333,65.75,5.779167,0
1,1,2017/11/1 0:30,4.0425,120.7,8.6365,100
2,1,2017/11/1 0:40,4.137368,135.157895,8.668421,100
3,1,2017/11/1 0:50,4.274737,157.315789,8.727895,100


In [10]:
# Replace values: every 0 in the frame becomes 10. A new frame is returned; ds itself is not modified.
# to_replace / value may also be given as two lists, or as a single dict mapping old values to new ones.
ds.replace(to_replace=0, value=10)

Unnamed: 0,WindNumber,Time,WindSpeed,Power,RotorSpeed,maplist
0,1,2017/11/1 0:20,4.188333,65.75,5.779167,10
1,1,2017/11/1 0:30,4.0425,120.7,8.6365,100
2,1,2017/11/1 0:40,4.137368,135.157895,8.668421,100
3,1,2017/11/1 0:50,4.274737,157.315789,8.727895,100


In [11]:
# rename() relabels an axis from a dict (old label -> new label) or a function.
# The returned frame is rebound instead of passing inplace=True, which keeps the
# call chainable and avoids mutating an object that earlier cells displayed.
ds = ds.rename(columns={'maplist': 'Maplist'})
# For string labels, ds.rename(index=str.upper) or ds.index.map(lambda x: x.upper()) upper-cases all of them.

In [12]:
# Display ds to confirm the column is now named Maplist.
ds

Unnamed: 0,WindNumber,Time,WindSpeed,Power,RotorSpeed,Maplist
0,1,2017/11/1 0:20,4.188333,65.75,5.779167,0
1,1,2017/11/1 0:30,4.0425,120.7,8.6365,100
2,1,2017/11/1 0:40,4.137368,135.157895,8.668421,100
3,1,2017/11/1 0:50,4.274737,157.315789,8.727895,100


In [13]:
# Binning: pd.cut(data, bins, labels=...) assigns each value to an interval.
#   bins      - list of bin edges, or an integer for that many equal-width bins
#   labels    - names to use for the bins
#   right     - intervals are closed on the right by default; right=False leaves the right edge open
#   precision - number of decimals used for the bin labels
bins = [0, 70, 140, 210]
cats = pd.cut(ds['Power'], bins=bins, labels=['a', 'b', 'c'])
cats

0    a
1    b
2    b
3    c
Name: Power, dtype: category
Categories (3, object): ['a' < 'b' < 'c']

In [14]:
# Count how many readings fall into each bin. The Series method is used because the
# top-level pd.value_counts() helper is deprecated in recent pandas releases.
cats.value_counts()

b    2
a    1
c    1
Name: Power, dtype: int64

In [15]:
# pd.qcut bins by sample quantiles; q=4 requests quartiles.
# q may also be a list of quantile edges between 0 and 1, e.g. [0, 0.1, 0.5, 0.8, 1].
catsq = pd.qcut(ds['Power'], q=4)
catsq

0     (65.749, 106.962]
1    (106.962, 127.929]
2    (127.929, 140.697]
3    (140.697, 157.316]
Name: Power, dtype: category
Categories (4, interval[float64]): [(65.749, 106.962] < (106.962, 127.929] < (127.929, 140.697] < (140.697, 157.316]]

In [16]:
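# ds[(np.abs(ds) > 3).any(axis=1)] selects the rows that contain at least one value whose absolute value exceeds 3 (numeric data only)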

In [17]:
import numpy as np

In [18]:
# np.sign() maps each value to 1 or -1 according to its sign (and 0 to 0)

In [19]:
# A seeded generator makes the shuffle reproducible across "Restart & Run All".
rng = np.random.default_rng(42)
# permutation(4) returns the integers 0..3 in random order, usable as a new row order.
sampler = rng.permutation(4)
sampler

array([1, 2, 3, 0])

In [20]:
# Reorder the rows by position using the shuffled integer array.
ds.iloc[sampler]

Unnamed: 0,WindNumber,Time,WindSpeed,Power,RotorSpeed,Maplist
1,1,2017/11/1 0:30,4.0425,120.7,8.6365,100
2,1,2017/11/1 0:40,4.137368,135.157895,8.668421,100
3,1,2017/11/1 0:50,4.274737,157.315789,8.727895,100
0,1,2017/11/1 0:20,4.188333,65.75,5.779167,0


In [21]:
# Random sampling with DataFrame.sample: draw 3 rows without replacement.
# random_state pins the draw so the cell shows the same rows on every run.
ds.sample(n=3, random_state=42)

Unnamed: 0,WindNumber,Time,WindSpeed,Power,RotorSpeed,Maplist
2,1,2017/11/1 0:40,4.137368,135.157895,8.668421,100
0,1,2017/11/1 0:20,4.188333,65.75,5.779167,0
1,1,2017/11/1 0:30,4.0425,120.7,8.6365,100


In [22]:
# Indicator (dummy) columns: one column per distinct Maplist value.
# Column names are prefix + prefix_sep + value; the prefix 'pre_' followed by the
# default separator '_' is why the resulting names contain a double underscore.
pd.get_dummies(ds['Maplist'], prefix='pre_', prefix_sep='_')

Unnamed: 0,pre__0,pre__100
0,1,0
1,0,1
2,0,1
3,0,1


In [23]:
# String handling
val = 'a,b,  \t c'
# str.split cuts the string at every comma; whitespace around the pieces is kept.
val.split(sep=',')

['a', 'b', '  \t c']

In [24]:
# str.strip() removes leading and trailing whitespace (spaces, tabs, newlines) from each piece.
pieces = list(map(str.strip, val.split(',')))
pieces

['a', 'b', 'c']

In [25]:
'::'.join(pieces)
# join concatenates the pieces, placing '::' between neighbours

'a::b::c'

In [26]:
# find returns -1 when the substring is absent; index raises ValueError instead
val.index('a')

0

In [27]:
# Replace every ',' with '.'; a new string is returned.
val.replace(',','.')

'a.b.  \t c'

In [28]:
# Regular expressions (the re module)
import re

In [29]:
# \s+ matches one or more whitespace characters, so the string is split on each
# whitespace run (here the spaces and the tab). The pattern is a raw string because
# '\s' is not a valid escape in an ordinary string literal and triggers a warning there.
re.split(r'\s+', val)

['a,b,', 'c']

In [30]:
# Compile the pattern once, then reuse the compiled object.
# Raw string: '\s' is not a valid escape in an ordinary string literal.
regex = re.compile(r'\s+')
regex.split(val)

['a,b,', 'c']

In [31]:
regex.findall(val)
# findall returns every match, search returns only the first match, match only matches at the start of the string
# here the single match is the run of spaces and the \t

['  \t ']

In [32]:
regex.sub('q',val)
# sub replaces every part of the string that matches the pattern

'a,b,qc'

In [None]:
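# Wrap parts of the pattern in () to capture groups; the match object returns them via .groups()
# In sub(r'\1', text), \1 refers to the first captured group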