In [None]:
"""
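Transforming selected columns of a pandas DataFrame.
Machine-learning models only accept numeric input, so columns of other data types must be converted to numeric values.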

"""
import pandas as pd
import numpy as np

"""
Field descriptions
   1. X - x-axis spatial coordinate within the Montesinho park map: 1 to 9
   2. Y - y-axis spatial coordinate within the Montesinho park map: 2 to 9
   3. month - month of the year: "jan" to "dec" 
   4. day - day of the week: "mon" to "sun"
   5. FFMC - FFMC index from the FWI system: 18.7 to 96.20
   6. DMC - DMC index from the FWI system: 1.1 to 291.3 
   7. DC - DC index from the FWI system: 7.9 to 860.6 
   8. ISI - ISI index from the FWI system: 0.0 to 56.10
   9. temp - temperature in Celsius degrees: 2.2 to 33.30
   10. RH - relative humidity in %: 15.0 to 100
   11. wind - wind speed in km/h: 0.40 to 9.40 
   12. rain - outside rain in mm/m2 : 0.0 to 6.4 
   13. area - the burned area of the forest (in ha): 0.00 to 1090.84 
   (this output variable is very skewed towards 0.0, thus it may make
    sense to model with the logarithm transform). 
"""

# Load the forest-fires CSV; a comma is read_csv's default separator.
df = pd.read_csv("./input/forestfires.csv")
df.head()


In [None]:
# Print a summary of df (columns, non-null counts, dtypes) to check which
# columns are not yet numeric.
df.info()

In [None]:
# One-hot encode the categorical columns "month" and "day" into indicator
# columns named like "month__aug" / "day__fri".
# dtype=int is passed explicitly: since pandas 2.0 get_dummies defaults to bool
# indicators, which are not numeric and turn the mixed frame into an object
# array on conversion; integer 0/1 columns keep the whole frame numeric.
df_dummy = pd.get_dummies(df, columns=["month", "day"], prefix_sep="__", dtype=int)
df_dummy.head(5)

In [None]:
# Convert the encoded DataFrame to a NumPy ndarray.
# DataFrame.to_numpy() is the accessor pandas recommends over .values, and
# dtype=float forces a purely numeric matrix even if the indicator columns are
# bool (a frame mixing bool and numeric columns otherwise yields an object array).
# NOTE(review): assumes every column of df_dummy is numeric or boolean - confirm.
nparray = df_dummy.to_numpy(dtype=float)
print(type(nparray))
nparray

In [None]:
# Standardising / scaling numeric attributes: see sklearn.preprocessing
# (not used in this cell).

# Binning continuous data - one way to discretise a continuous variable.
self_define_bins = list(range(0, 101, 10))  # edges 0, 10, ..., 100
df["FFMC_3_bin"] = pd.cut(df["FFMC"], 3)  # three equal-width bins
df["FFMC_self_define_bin"] = pd.cut(df["FFMC"], self_define_bins)  # intervals from the custom edges
df["FFMC_label"] = pd.cut(df["FFMC"], self_define_bins, labels=False)  # integer bin codes instead of intervals
df.head()

In [None]:
# Column transformation: derive a new field, then drop the old one.
# Categorical attributes can also be given a custom encoding (usually still one-hot).
def reset_day(day):
    """Map a day-of-week code to a two-way label.

    Returns "second" when ``day`` is "fri", "sun" or "sat", and "first" for
    any other value.
    """
    return "second" if day in ("fri", "sun", "sat") else "first"

# Derive the "part" column from "day", then remove "day" from df.
# The guard keeps the cell re-runnable: once "day" has been dropped, running
# the cell again would otherwise raise KeyError on df["day"].
if "day" in df.columns:
    df["part"] = df["day"].apply(reset_day)
    print(df.head(10))  # still has "day", plus the new "part" column
    # Reassign instead of inplace=True; df ends up without the "day" column.
    df = df.drop(columns=["day"])
print(df.head(10))  # "day" removed, "part" kept