# 剪枝编码和缩放特征

Pruning, Encoding, and Rescaling Features

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split



## 训练集与测试集拆分

train/test split

In [None]:
# Notebook display settings for this cell.
pd.options.display.width = 75
pd.options.display.max_columns = 7
pd.options.display.max_rows = 25
pd.options.display.float_format = '{:,.0f}'.format

# Load the NLS data and index it by person id.
nls97 = pd.read_csv("data/nls97b.csv").set_index("personid")

feature_cols = [
  'satverbal', 'satmath', 'gpascience',
  'gpaenglish', 'gpamath', 'gpaoverall',
]

# Separate the NLS data into training and testing datasets
# (30% of rows held out for testing, fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
  nls97[feature_cols],
  nls97[['wageincome']],
  test_size=0.3,
  random_state=0,
)

# Compare the total row count with the structure of each split.
nls97.shape[0]
X_train.info()
y_train.info()
X_test.info()
y_test.info()

## 冗余特征

redundant features

In [None]:
import feature_engine.selection as fesel
# Notebook display settings for this cell.
pd.set_option('display.width', 80)
pd.set_option('display.max_columns', 8)
pd.set_option('display.max_rows', 25)
pd.options.display.float_format = '{:,.0f}'.format

# NLS data, indexed by person id.
nls97 = pd.read_csv("data/nls97b.csv")
nls97.set_index("personid", inplace=True)

# Poland land-temperature data, indexed by weather station.
ltpoland = pd.read_csv("data/ltpoland.csv")
ltpoland.set_index("station", inplace=True)
# dropna() returns a new frame, so the result has to be assigned for
# the rows with missing values to actually be removed.
ltpoland = ltpoland.dropna()

feature_cols = [
  'satverbal', 'satmath', 'gpascience',
  'gpaenglish', 'gpamath', 'gpaoverall',
]

# Split the NLS data into train and test datasets.
X_train, X_test, y_train, y_test = train_test_split(
  nls97[feature_cols],
  nls97[['wageincome']],
  test_size=0.3,
  random_state=0,
)

# Inspect pairwise correlations, then drop any feature whose Pearson
# correlation with another feature exceeds the 0.75 threshold.
X_train.corr()
tr = fesel.DropCorrelatedFeatures(
  variables=None,
  method='pearson',
  threshold=0.75,
)
# Fit on the training data only; apply the same selection to both splits.
tr.fit(X_train)
X_train_tr = tr.transform(X_train)
X_test_tr = tr.transform(X_test)
X_train_tr.info()

feature_cols = ['year','month','latabs','latitude','elevation',
  'longitude','country']

# separate temperature data into train and test datasets
X_train, X_test, y_train, y_test =  \
  train_test_split(ltpoland[feature_cols],\
  ltpoland[['temperature']], test_size=0.3, random_state=0)
X_train.sample(5, random_state=99)
# value counts for year and country, and the number of rows where
# latitude and latabs differ
X_train.year.value_counts()
X_train.country.value_counts()
(X_train.latitude!=X_train.latabs).sum()

# drop features with same values throughout dataset
tr = fesel.DropConstantFeatures()
tr.fit(X_train)
X_train_tr = tr.transform(X_train)
X_test_tr = tr.transform(X_test)
X_train_tr.head()

# drop features that have the same values as another feature
tr = fesel.DropDuplicateFeatures()
tr.fit(X_train_tr)
X_train_tr = tr.transform(X_train_tr)
# apply the same selection to the test split so both frames keep
# identical columns
X_test_tr = tr.transform(X_test_tr)
X_train_tr.head()

## 独热编码

one-hot encoding

In [None]:
from feature_engine.encoding import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
# Notebook display settings for this cell.
pd.options.display.width = 80
pd.options.display.max_columns = 8
pd.options.display.max_rows = 100
pd.options.display.float_format = '{:,.0f}'.format

# Load the NLS data and index it by person id.
nls97 = pd.read_csv("data/nls97b.csv").set_index("personid")

# Keep only complete rows for the target and the demographic features.
feature_cols = ['gender', 'maritalstatus', 'colenroct99']
nls97demo = nls97[['wageincome', *feature_cols]].dropna()

# Split the NLS demographic data into train and test datasets.
X_demo_train, X_demo_test, y_demo_train, y_demo_test = train_test_split(
  nls97demo[feature_cols],
  nls97demo[['wageincome']],
  test_size=0.3,
  random_state=0,
)

# Dummy features with pandas: first with every level, then with the
# first level of each feature dropped.
pd.get_dummies(
  X_demo_train, columns=['gender', 'maritalstatus']
).head(2).T
pd.get_dummies(
  X_demo_train, columns=['gender', 'maritalstatus'], drop_first=True
).head(2).T

# Encode gender and marital status with the one hot encoder, fitting on
# the training data and applying the result to both splits.
ohe = OneHotEncoder(
  drop_last=True,
  variables=['gender', 'maritalstatus'],
)
ohe.fit(X_demo_train)
X_demo_train_ohe = ohe.transform(X_demo_train)
X_demo_test_ohe = ohe.transform(X_demo_test)
X_demo_train_ohe.filter(regex='gen|mar', axis="columns").head(2).T


# use the ordinal encoder for college enrollment
X_demo_train.colenroct99.unique()
X_demo_train.head()

# unique() returns values in order of first appearance, which would make
# the integer codes arbitrary. Sort the levels first so the codes follow
# the sorted label order.
# NOTE(review): assumes the labels sort into the intended ordinal order
# (e.g. via a numeric prefix) - confirm against the data.
colenr_levels = X_demo_train.colenroct99.sort_values().unique()
oe = OrdinalEncoder(categories=[colenr_levels])
colenr_enc = \
  pd.DataFrame(oe.fit_transform(X_demo_train[['colenroct99']]),
    columns=['colenroct99'], index=X_demo_train.index)
X_demo_train_enc = \
  X_demo_train[['gender','maritalstatus']].\
  join(colenr_enc)
X_demo_train_enc.head()
# the two sorted frequency tables should line up level-for-level
X_demo_train.colenroct99.value_counts().sort_index()
X_demo_train_enc.colenroct99.value_counts().sort_index()

## 高基数编码

high cardinality encoding

In [None]:
from feature_engine.encoding import OneHotEncoder
from category_encoders.hashing import HashingEncoder
# Notebook display settings for this cell.
pd.options.display.width = 80
pd.options.display.max_columns = 8
pd.options.display.max_rows = 200
pd.options.display.float_format = '{:,.0f}'.format

# Load the COVID totals and keep complete rows for the target and features.
covidtotals = pd.read_csv("data/covidtotals.csv")
feature_cols = ['location', 'population', 'aged_65_older', 'region']
covidtotals = covidtotals[['total_cases', *feature_cols]].dropna()

# Split into train and test sets.
X_train, X_test, y_train, y_test = train_test_split(
  covidtotals[feature_cols],
  covidtotals[['total_cases']],
  test_size=0.3,
  random_state=0,
)

# One hot encode region, keeping only the 6 most frequent categories.
# NOTE(review): the encoder is fit on the full covidtotals frame here,
# not on X_train alone.
X_train.region.value_counts()
ohe = OneHotEncoder(top_categories=6, variables=['region'])
covidtotals_ohe = ohe.fit_transform(covidtotals)
covidtotals_ohe.filter(
  regex='location|region', axis="columns"
).sample(5, random_state=99).T

# Hash-encode region into 6 columns. A copy of the original values is
# kept in region2 so each hash pattern can be tabulated against it.
X_train['region2'] = X_train['region']
he = HashingEncoder(cols=['region'], n_components=6)
X_train_enc = he.fit_transform(X_train)
(
  X_train_enc
  .groupby(['col_0', 'col_1', 'col_2', 'col_3', 'col_4',
            'col_5', 'region2'])
  .size()
  .reset_index()
  .rename(columns={0: 'count'})
)

## 特征转换

feature transform

In [None]:
from feature_engine import transformation as vt
import matplotlib.pyplot as plt
from scipy import stats
# Notebook display settings for this cell.
pd.options.display.width = 200
pd.options.display.max_columns = 100
pd.options.display.max_rows = 200
pd.options.display.float_format = '{:,.0f}'.format

# Load the COVID totals and keep complete rows for the target and features.
covidtotals = pd.read_csv("data/covidtotals.csv")
feature_cols = ['location', 'population', 'aged_65_older', 'region']
covidtotals = covidtotals[['total_cases', *feature_cols]].dropna()

# Split into train and test sets.
X_train, X_test, y_train, y_test = train_test_split(
  covidtotals[feature_cols],
  covidtotals[['total_cases']],
  test_size=0.3,
  random_state=0,
)

# Skew and histogram of the untransformed total cases (in millions).
y_train.total_cases.skew()
plt.hist(y_train.total_cases / 1000000)
plt.title("Total Covid Cases (in millions)")
plt.xlabel('Cases')
plt.ylabel("Number of Countries")
plt.show()

# Log transformation of total cases, then the same skew/histogram check.
tf = vt.LogTransformer(variables=['total_cases'])
y_train_tf = tf.fit_transform(y_train)

y_train_tf.total_cases.skew()
plt.hist(y_train_tf.total_cases)
plt.title("Total Covid Cases (log transformation)")
plt.xlabel('Cases')
plt.ylabel("Number of Countries")
plt.show()

# Box Cox transformation of total cases, then the same check again.
tf = vt.BoxCoxTransformer(variables=['total_cases'])
y_train_tf = tf.fit_transform(y_train)

y_train_tf.total_cases.skew()
plt.hist(y_train_tf.total_cases)
plt.title("Total Covid Cases (Box Cox transformation)")
plt.xlabel('Cases')
plt.ylabel("Number of Countries")
plt.show()

# Second element of scipy's boxcox result: the fitted lambda.
stats.boxcox(y_train.total_cases)[1]

## 特征分箱

feature binning

In [None]:
from feature_engine.discretisation import EqualFrequencyDiscretiser as efd
from feature_engine.discretisation import EqualWidthDiscretiser as ewd
from sklearn.preprocessing import KBinsDiscretizer

# Notebook display settings for this cell.
pd.options.display.width = 200
pd.options.display.max_columns = 100
pd.options.display.max_rows = 200
pd.options.display.float_format = '{:,.3f}'.format

# Load the COVID totals and keep complete rows for the target and features.
covidtotals = pd.read_csv("data/covidtotals.csv")
feature_cols = ['location', 'population', 'aged_65_older', 'region']
covidtotals = covidtotals[['total_cases', *feature_cols]].dropna()

# Split into train and test sets.
X_train, X_test, y_train, y_test = train_test_split(
  covidtotals[feature_cols],
  covidtotals[['total_cases']],
  test_size=0.3,
  random_state=0,
)

# Ten quantile bins via qcut, labelled 0 through 9.
y_train['total_cases_group'] = pd.qcut(
  y_train.total_cases,
  q=10,
  labels=list(range(10)),
)
y_train.total_cases_group.value_counts().sort_index()

# set up function to run the transform
def runtransform(bt, dftrain, dftest):
  """Fit a transformer on the training data and transform both splits.

  Args:
    bt: object exposing ``fit`` and ``transform`` methods.
    dftrain: data used to fit ``bt``; also transformed.
    dftest: data transformed with the already-fitted ``bt``.

  Returns:
    A ``(train_result, test_result)`` tuple of the transformed inputs.
  """
  # Fit once, on the training data only.
  bt.fit(dftrain)
  fitted_train = bt.transform(dftrain)
  fitted_test = bt.transform(dftest)
  return fitted_train, fitted_test

# set up bins based on equal frequency
# (remove the qcut column first so y_train holds only total_cases)
y_train.drop(['total_cases_group'], axis=1, inplace=True)
bintransformer = efd(q=10, variables=['total_cases'])
y_train_bins, y_test_bins = runtransform(bintransformer, y_train, y_test)
y_train_bins.total_cases.value_counts().sort_index()

# set up bins based on equal width
bintransformer = ewd(bins=10, variables=['total_cases'])
y_train_bins, y_test_bins = runtransform(bintransformer, y_train, y_test)
y_train_bins.total_cases.value_counts().sort_index()

# put each bin label next to the original value and show the range of
# total_cases covered by each bin
pd.options.display.float_format = '{:,.0f}'.format
y_train_bins = y_train_bins.\
  rename(columns={'total_cases':'total_cases_group'}).\
  join(y_train)
y_train_bins.groupby("total_cases_group")["total_cases"].agg(['min','max'])

# use k means clustering
kbins = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='kmeans')
# fit_transform returns a bare array; reattach y_train's index so the
# bin labels stay aligned with the rows they were computed from
y_train_bins = \
  pd.DataFrame(kbins.fit_transform(y_train),
  columns=['total_cases'], index=y_train.index)
y_train_bins.total_cases.value_counts().sort_index()

# compare skew and kurtosis before and after binning
y_train.total_cases.agg(['skew','kurtosis'])
y_train_bins.total_cases.agg(['skew','kurtosis'])

## 特征缩放

feature scaling

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

# Notebook display settings for this cell.
pd.options.display.width = 200
pd.options.display.max_columns = 100
pd.options.display.max_rows = 200
pd.options.display.float_format = '{:,.2f}'.format

# Load the COVID totals and keep complete rows for the target and features.
covidtotals = pd.read_csv("data/covidtotals.csv")
feature_cols = ['population', 'total_deaths', 'aged_65_older']
covidtotals = covidtotals[['total_cases', *feature_cols]].dropna()

# Split into train and test sets.
X_train, X_test, y_train, y_test = train_test_split(
  covidtotals[feature_cols],
  covidtotals[['total_cases']],
  test_size=0.3,
  random_state=0,
)

# Each scaler returns a bare array, so the result is wrapped back into a
# DataFrame carrying X_train's column names and index.

# Min-max scaling.
scaler = MinMaxScaler()
X_train_mms = pd.DataFrame(
  scaler.fit_transform(X_train),
  columns=X_train.columns,
  index=X_train.index,
)
X_train_mms.describe()

# Standard scaling.
scaler = StandardScaler()
X_train_ss = pd.DataFrame(
  scaler.fit_transform(X_train),
  columns=X_train.columns,
  index=X_train.index,
)
X_train_ss.describe()

# Robust scaling.
scaler = RobustScaler()
X_train_rs = pd.DataFrame(
  scaler.fit_transform(X_train),
  columns=X_train.columns,
  index=X_train.index,
)
X_train_rs.describe()

## 散列编码器

In [None]:
from feature_engine.encoding import OneHotEncoder
from category_encoders.hashing import HashingEncoder

# Notebook display settings for this cell.
pd.set_option('display.width', 200)
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_rows', 200)
pd.options.display.float_format = '{:,.0f}'.format

covidtotals = pd.read_csv("data/covidtotals.csv")
feature_cols = ['location','population',
    'aged_65_older','region']
covidtotals = covidtotals[['total_cases'] + feature_cols].dropna()

# separate into train and test sets
X_train, X_test, y_train, y_test =  \
  train_test_split(covidtotals[feature_cols],\
  covidtotals[['total_cases']], test_size=0.3, random_state=0)

# use the one hot encoder for region
# NOTE(review): the encoder is fit on the full covidtotals frame here,
# not on X_train alone.
X_train.region.value_counts()
ohe = OneHotEncoder(top_categories=6, variables=['region'])
covidtotals_ohe = ohe.fit_transform(covidtotals)
covidtotals_ohe.filter(regex='location|region',
  axis="columns").sample(5, random_state=99).T

# use the hashing encoder for region
# The number of hash columns is named once so the groupby below always
# matches the encoder's output width.
n_components = 16
he = HashingEncoder(cols=['region'], n_components=n_components)
# keep a copy of the original region values to tabulate against the hashes
covidtotals['region2'] = covidtotals.region
covidtotals_enc = he.fit_transform(covidtotals)

covidtotals_enc.filter(regex='col|reg', axis="columns")
# the encoder's output columns are named col_0 .. col_{n_components-1}
hash_cols = [f'col_{i}' for i in range(n_components)]
covidtotals_enc.groupby(hash_cols + ['region2']).size().reset_index()