In [10]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

### 1.对类别型特征进行独热编码

In [3]:
# Load the raw daily data set.
train = pd.read_csv('day.csv')

# Categorical columns; each one is expanded into one indicator column per value.
categorical_features = ['season', 'mnth', 'weathersit', 'weekday']

# `columns=` makes get_dummies encode these columns whatever their dtype, so
# `train` does not have to be cast to object in place and stays as loaded.
# The explicit uint8 cast keeps the indicators as 0/1 integers: pandas >= 2.0
# returns bool columns by default, which would be saved as True/False.
X_train_cat = pd.get_dummies(
    train[categorical_features],
    columns=categorical_features,
).astype('uint8')
X_train_cat.head()

Unnamed: 0,season_1,season_2,season_3,season_4,mnth_1,mnth_2,mnth_3,mnth_4,mnth_5,mnth_6,mnth_7,mnth_8,mnth_9,mnth_10,mnth_11,mnth_12,weathersit_1,weathersit_2,weathersit_3,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6
0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1
1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0
2,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0
3,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0
4,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0


### 2.对数值型特征进行归一化（MinMaxScaler），去量纲

In [4]:
# Preprocess the numeric columns with MinMaxScaler.
numerical_features = ['temp', 'atemp', 'hum', 'windspeed']
mn_X = MinMaxScaler()

# Fit the scaler on the numeric columns and transform them in one call.
temp = mn_X.fit_transform(train[numerical_features])

# Wrap the scaled values back into a DataFrame that carries the original
# column labels and the row index of `train`.
X_train_num = pd.DataFrame(temp, index=train.index, columns=numerical_features)
X_train_num.head()

Unnamed: 0,temp,atemp,hum,windspeed
0,0.35517,0.373517,0.82862,0.284606
1,0.379232,0.360541,0.715771,0.466215
2,0.171,0.14483,0.449638,0.46574
3,0.17553,0.174649,0.607131,0.284297
4,0.20912,0.197158,0.449313,0.339143


### 3.将类别型特征和数值型特征连在一起

In [5]:
# Place the one-hot columns, the scaled numeric columns and the holiday /
# workingday columns side by side; the original row index is kept.
X_train = pd.concat(
    [X_train_cat, X_train_num, train[['holiday', 'workingday']]],
    axis=1,
)
X_train.head()

Unnamed: 0,season_1,season_2,season_3,season_4,mnth_1,mnth_2,mnth_3,mnth_4,mnth_5,mnth_6,mnth_7,mnth_8,mnth_9,mnth_10,mnth_11,mnth_12,weathersit_1,weathersit_2,weathersit_3,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,temp,atemp,hum,windspeed,holiday,workingday
0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0.35517,0.373517,0.82862,0.284606,0,0
1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0.379232,0.360541,0.715771,0.466215,0,0
2,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0.171,0.14483,0.449638,0.46574,0,1
3,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0.17553,0.174649,0.607131,0.284297,0,1
4,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0.20912,0.197158,0.449313,0.339143,0,1


### 4.保存特征工程结果

In [9]:
# Assemble the final table: the instant column first, then the engineered
# features, then the yr and cnt columns.
feature_train = pd.concat(
    [train[['instant']], X_train, train[['yr', 'cnt']]],
    axis=1,
)

# Write the table to CSV without the row index, then show a summary and the
# first rows.
feature_train.to_csv('feature_train_day_cnt.csv', index=False)
feature_train.info()
feature_train.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731 entries, 0 to 730
Data columns (total 35 columns):
instant         731 non-null int64
season_1        731 non-null uint8
season_2        731 non-null uint8
season_3        731 non-null uint8
season_4        731 non-null uint8
mnth_1          731 non-null uint8
mnth_2          731 non-null uint8
mnth_3          731 non-null uint8
mnth_4          731 non-null uint8
mnth_5          731 non-null uint8
mnth_6          731 non-null uint8
mnth_7          731 non-null uint8
mnth_8          731 non-null uint8
mnth_9          731 non-null uint8
mnth_10         731 non-null uint8
mnth_11         731 non-null uint8
mnth_12         731 non-null uint8
weathersit_1    731 non-null uint8
weathersit_2    731 non-null uint8
weathersit_3    731 non-null uint8
weekday_0       731 non-null uint8
weekday_1       731 non-null uint8
weekday_2       731 non-null uint8
weekday_3       731 non-null uint8
weekday_4       731 non-null uint8
weekday_5       731 

Unnamed: 0,instant,season_1,season_2,season_3,season_4,mnth_1,mnth_2,mnth_3,mnth_4,mnth_5,mnth_6,mnth_7,mnth_8,mnth_9,mnth_10,mnth_11,mnth_12,weathersit_1,weathersit_2,weathersit_3,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,temp,atemp,hum,windspeed,holiday,workingday,yr,cnt
0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0.35517,0.373517,0.82862,0.284606,0,0,0,985
1,2,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0.379232,0.360541,0.715771,0.466215,0,0,0,801
2,3,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0.171,0.14483,0.449638,0.46574,0,1,0,1349
3,4,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0.17553,0.174649,0.607131,0.284297,0,1,0,1562
4,5,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0.20912,0.197158,0.449313,0.339143,0,1,0,1600
