# Preprocessing Features

## 将数据拆分为训练集和测试集

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
# Console display settings for the output printed in this cell.
pd.options.display.width = 75
pd.options.display.max_columns = 7
pd.options.display.max_rows = 25
pd.options.display.float_format = '{:,.0f}'.format

# Load the NLS data and index it by personid.
nls97 = pd.read_csv("data/nls97g.csv", low_memory=False).set_index("personid")

feature_cols = [
  'satverbal', 'satmath', 'gpascience',
  'gpaenglish', 'gpamath', 'gpaoverall',
]

# Split the NLS data into training and test sets (test_size=0.3) with
# wageincome20 as the target; random_state=0 fixes the shuffle seed.
X_train, X_test, y_train, y_test = train_test_split(
  nls97[feature_cols],
  nls97[['wageincome20']],
  test_size=0.3,
  random_state=0,
)

# Inspect the total row count and the structure of each split.
nls97.shape[0]
X_train.info()
y_train.info()
X_test.info()
y_test.info()

## 冗余特征

In [None]:
import pandas as pd
import feature_engine.selection as fesel
from sklearn.model_selection import train_test_split
# Console display settings for the output printed in this cell.
pd.set_option('display.width', 62)
pd.set_option('display.max_columns', 7)
pd.set_option('display.max_rows', 25)
pd.options.display.float_format = '{:,.0f}'.format

# Load the NLS data (indexed by personid) and the ltpoland temperature
# data (indexed by station).
nls97 = pd.read_csv("data/nls97g.csv", low_memory=False)
nls97.set_index("personid", inplace=True)
ltpoland = pd.read_csv("data/ltpoland.csv")
ltpoland.set_index("station", inplace=True)

# Drop every row of ltpoland that has a missing value.
ltpoland.dropna(inplace=True)

feature_cols = ['satverbal','satmath','gpascience',
  'gpaenglish','gpamath','gpaoverall']

# Split the NLS data into training and test sets.
X_train, X_test, y_train, y_test = train_test_split(
  nls97[feature_cols],
  nls97[['wageincome20']], test_size=0.3, random_state=0)

# Drop features that are highly correlated with another feature
# (Pearson correlation above the 0.75 threshold). The selector is fit on
# the training data only and then applied to both splits.
X_train.corr()
tr = fesel.DropCorrelatedFeatures(
  variables=None, method='pearson', threshold=0.75)
tr.fit(X_train)
X_train_tr = tr.transform(X_train)
X_test_tr = tr.transform(X_test)
X_train_tr.info()

feature_cols = ['year','month','latabs','latitude','elevation',
  'longitude','country']

# Split the temperature data into training and test sets.
X_train, X_test, y_train, y_test = train_test_split(
  ltpoland[feature_cols],
  ltpoland[['temperature']], test_size=0.3, random_state=0)
X_train.sample(5, random_state=99)
X_train.year.value_counts()
X_train.country.value_counts()
# Number of training rows where latitude and latabs differ.
(X_train.latitude!=X_train.latabs).sum()

# Drop features that have the same value throughout the data set.
tr = fesel.DropConstantFeatures()
tr.fit(X_train)
X_train_tr = tr.transform(X_train)
X_test_tr = tr.transform(X_test)
X_train_tr.head()

# Drop features that duplicate the values of another feature. The fitted
# selector is applied to the test split as well, so that X_train_tr and
# X_test_tr keep the same columns (previously only the training split was
# transformed here).
tr = fesel.DropDuplicateFeatures()
tr.fit(X_train_tr)
X_train_tr = tr.transform(X_train_tr)
X_test_tr = tr.transform(X_test_tr)
X_train_tr.head()

## One-hot编码

In [None]:
import pandas as pd
from feature_engine.encoding import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
# Console display settings for the output printed in this cell.
pd.options.display.width = 62
pd.options.display.max_columns = 7
pd.options.display.max_rows = 200
pd.options.display.float_format = '{:,.0f}'.format

# Load the NLS data and index it by personid.
nls97 = pd.read_csv("data/nls97g.csv", low_memory=False).set_index("personid")

feature_cols = ['gender', 'maritalstatus', 'colenroct99']
# Keep only rows with no missing value in the target or the features.
nls97demo = nls97[['wageincome20'] + feature_cols].dropna()

# Split the NLS data into training and test sets.
X_demo_train, X_demo_test, y_demo_train, y_demo_test = train_test_split(
  nls97demo[feature_cols],
  nls97demo[['wageincome20']],
  test_size=0.3,
  random_state=0,
)

# Create dummy features with get_dummies: first with every level, then
# with drop_first=True.
(
  pd.get_dummies(
    X_demo_train, columns=['gender', 'maritalstatus'], dtype=float)
  .head(2)
  .T
)
(
  pd.get_dummies(
    X_demo_train, columns=['gender', 'maritalstatus'], dtype=float,
    drop_first=True)
  .head(2)
  .T
)

# Use the one-hot encoder to create encoded features for gender and
# marital status; fit on the training split, applied to both splits.
ohe = OneHotEncoder(
  drop_last=True, variables=['gender', 'maritalstatus'])
ohe.fit(X_demo_train)
X_demo_train_ohe = ohe.transform(X_demo_train)
X_demo_test_ohe = ohe.transform(X_demo_test)
X_demo_train_ohe.filter(regex='gen|mar', axis="columns").head(2).T


# Use the ordinal encoder for college enrollment (colenroct99).
X_demo_train.colenroct99.sort_values().unique()
X_demo_train.head()
X_demo_train.info()

# The category order handed to the encoder is the sorted unique values of
# colenroct99 in the training split.
oe = OrdinalEncoder(
  categories=[X_demo_train.colenroct99.sort_values().unique()])
colenr_enc = pd.DataFrame(
  oe.fit_transform(X_demo_train[['colenroct99']]),
  columns=['colenroct99'],
  index=X_demo_train.index,
)
# Re-attach the encoded column to the untouched gender / maritalstatus.
X_demo_train_enc = X_demo_train[['gender', 'maritalstatus']].join(colenr_enc)
X_demo_train_enc.head()
# Compare value counts before and after encoding.
X_demo_train.colenroct99.value_counts().sort_index()
X_demo_train_enc.colenroct99.value_counts().sort_index()

X_demo_train.head()

nls97.loc[764231, :]

## 高基数特征编码

In [None]:
import pandas as pd
from feature_engine.encoding import OneHotEncoder
from category_encoders.hashing import HashingEncoder
from sklearn.model_selection import train_test_split
# Console display settings for the output printed in this cell.
pd.options.display.width = 72
pd.options.display.max_columns = 10
pd.options.display.max_rows = 200
pd.options.display.float_format = '{:,.0f}'.format

covidtotals = pd.read_csv("data/covidtotals.csv")
feature_cols = [
  'location', 'population',
  'aged_65_older', 'life_expectancy', 'region',
]
# Keep only the target and feature columns, dropping incomplete rows.
covidtotals = covidtotals[['total_cases'] + feature_cols].dropna()

# Split into training and test sets.
X_train, X_test, y_train, y_test = train_test_split(
  covidtotals[feature_cols],
  covidtotals[['total_cases']],
  test_size=0.3,
  random_state=0,
)

# Use the one-hot encoder on region, limited to top_categories=6.
# Note that the encoder is fit on the whole covidtotals frame here.
X_train.region.value_counts()
ohe = OneHotEncoder(top_categories=6, variables=['region'])
covidtotals_ohe = ohe.fit_transform(covidtotals)
(
  covidtotals_ohe
  .filter(regex='location|region', axis="columns")
  .sample(5, random_state=2)
  .T
)

# Use the hashing encoder on region. region2 keeps a copy of the original
# values so they can be shown next to the hashed columns.
X_train['region2'] = X_train.region
he = HashingEncoder(cols=['region'], n_components=6)
X_train_enc = he.fit_transform(X_train)
hashed_and_region = [
  'col_0', 'col_1', 'col_2', 'col_3', 'col_4', 'col_5', 'region2']
X_train_enc.groupby(hashed_and_region).size().reset_index(name="count")

## 特征转换

In [None]:
import pandas as pd
from feature_engine import transformation as vt
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from scipy import stats
# Console display settings for the output printed in this cell.
pd.options.display.width = 200
pd.options.display.max_columns = 100
pd.options.display.max_rows = 200
pd.options.display.float_format = '{:,.0f}'.format

covidtotals = pd.read_csv("data/covidtotals.csv")

feature_cols = [
  'location', 'population',
  'aged_65_older', 'life_expectancy', 'region',
]
# Keep only the target and feature columns, dropping incomplete rows.
covidtotals = covidtotals[['total_cases'] + feature_cols].dropna()

# Split into training and test sets.
X_train, X_test, y_train, y_test = train_test_split(
  covidtotals[feature_cols],
  covidtotals[['total_cases']],
  test_size=0.3,
  random_state=0,
)

# Show the skew and a histogram of total cases (plotted in millions).
y_train.total_cases.skew()
plt.hist(y_train.total_cases / 1_000_000)
plt.title("Total Covid Cases (in millions)")
plt.xlabel('Cases')
plt.ylabel("Number of Countries")
plt.show()

# Apply a log transformation to total cases.
tf = vt.LogTransformer(variables=['total_cases'])
y_train_tf = tf.fit_transform(y_train)

y_train_tf.total_cases.skew()
plt.hist(y_train_tf.total_cases)
plt.title("Total Covid Cases (log transformation)")
plt.xlabel('Cases')
plt.ylabel("Number of Countries")
plt.show()

# Apply a Box-Cox transformation to total cases.
tf = vt.BoxCoxTransformer(variables=['total_cases'])
y_train_tf = tf.fit_transform(y_train)

y_train_tf.total_cases.skew()

plt.hist(y_train_tf.total_cases)
plt.title("Total Covid Cases (Box Cox transformation)")
plt.xlabel('Cases')
plt.ylabel("Number of Countries")
plt.show()

# Second element of scipy's boxcox result: the lambda it fitted.
stats.boxcox(y_train.total_cases)[1]


## 特征分箱

In [None]:
import pandas as pd
from feature_engine.discretisation import EqualFrequencyDiscretiser as efd
from feature_engine.discretisation import EqualWidthDiscretiser as ewd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import KBinsDiscretizer
# Console display settings for the output printed in this cell.
pd.options.display.width = 200
pd.options.display.max_columns = 100
pd.options.display.max_rows = 200
pd.options.display.float_format = '{:,.3f}'.format

covidtotals = pd.read_csv("data/covidtotals.csv")

feature_cols = [
  'location', 'population',
  'aged_65_older', 'life_expectancy', 'region',
]
# Keep only the target and feature columns, dropping incomplete rows.
covidtotals = covidtotals[['total_cases'] + feature_cols].dropna()

# Split into training and test sets.
X_train, X_test, y_train, y_test = train_test_split(
  covidtotals[feature_cols],
  covidtotals[['total_cases']],
  test_size=0.3,
  random_state=0,
)

# Bin total cases into ten quantile groups with qcut, labelled 0-9.
y_train['total_cases_group'] = pd.qcut(
  y_train.total_cases, q=10, labels=list(range(10)))
y_train.total_cases_group.value_counts().sort_index()
# Helper that fits a transformer and applies it to both splits.
def runtransform(bt, dftrain, dftest):
  """Fit bt on dftrain, then transform dftrain and dftest with it.

  bt must provide fit() and transform(). Returns a tuple of
  (transformed dftrain, transformed dftest).
  """
  bt.fit(dftrain)
  return bt.transform(dftrain), bt.transform(dftest)

# Equal-frequency bins. The qcut group column is removed first so that
# y_train holds only total_cases again.
y_train.drop(['total_cases_group'], axis=1, inplace=True)
bintransformer = efd(q=10, variables=['total_cases'])
y_train_bins, y_test_bins = runtransform(
  bintransformer, y_train, y_test)
y_train_bins.total_cases.value_counts().sort_index()

# Equal-width bins.
bintransformer = ewd(bins=10, variables=['total_cases'])
y_train_bins, y_test_bins = runtransform(
  bintransformer, y_train, y_test)
y_train_bins.total_cases.value_counts().sort_index()

# Show the min and max of the original values inside each equal-width bin.
pd.options.display.float_format = '{:,.0f}'.format
y_train_bins = y_train_bins.rename(
  columns={'total_cases': 'total_cases_group'})
y_train_bins = y_train_bins.join(y_train)
y_train_bins.groupby("total_cases_group")["total_cases"].agg(['min', 'max'])

# Bins from k-means clustering.
kbins = KBinsDiscretizer(
  n_bins=10,
  encode='ordinal',
  strategy='kmeans',
  subsample=None,
  random_state=123,
)
y_train_bins = pd.DataFrame(
  kbins.fit_transform(y_train),
  columns=['total_cases'],
  index=y_train.index,
)
y_train_bins.total_cases.value_counts().sort_index()


# Skew and kurtosis of the raw values versus the bin codes.
y_train.total_cases.agg(['skew', 'kurtosis'])
y_train_bins.total_cases.agg(['skew', 'kurtosis'])

# Min, max and size of the original values inside each k-means bin.
y_train_bins = y_train_bins.rename(
  columns={'total_cases': 'total_cases_bin'})
(
  y_train.join(y_train_bins)
  .groupby(['total_cases_bin'])['total_cases']
  .agg(['min', 'max', 'size'])
)


## 特征缩放

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

# Console display settings for the output printed in this cell.
pd.options.display.width = 69
pd.options.display.max_columns = 7
pd.options.display.max_rows = 200
pd.options.display.float_format = '{:,.2f}'.format

covidtotals = pd.read_csv("data/covidtotals.csv")

feature_cols = [
  'population', 'total_deaths',
  'aged_65_older', 'life_expectancy',
]
# Keep only the target and feature columns, dropping incomplete rows.
covidtotals = covidtotals[['total_cases'] + feature_cols].dropna()

# Split into training and test sets.
X_train, X_test, y_train, y_test = train_test_split(
  covidtotals[feature_cols],
  covidtotals[['total_cases']],
  test_size=0.3,
  random_state=0,
)

# Min-max scaling. Each scaler returns an array, so the result is wrapped
# back into a DataFrame with the original column names and index.
scaler = MinMaxScaler()
X_train_mms = pd.DataFrame(
  scaler.fit_transform(X_train),
  columns=X_train.columns,
  index=X_train.index,
)
X_train_mms.describe()

# Standard scaling.
scaler = StandardScaler()
X_train_ss = pd.DataFrame(
  scaler.fit_transform(X_train),
  columns=X_train.columns,
  index=X_train.index,
)
X_train_ss.describe()

# Robust scaling.
scaler = RobustScaler()
X_train_rs = pd.DataFrame(
  scaler.fit_transform(X_train),
  columns=X_train.columns,
  index=X_train.index,
)
X_train_rs.describe()


## 哈希编码

In [None]:
import pandas as pd
from feature_engine.encoding import OneHotEncoder
from category_encoders.hashing import HashingEncoder
from sklearn.model_selection import train_test_split
from feature_engine.encoding import OrdinalEncoder
from sklearn.feature_extraction import FeatureHasher
# Hasher for string features. With input_type='string' every sample handed
# to transform() must itself be an iterable of strings (e.g. a list).
h = FeatureHasher(n_features=8, input_type='string', alternate_sign=False)
# Console display settings for the output printed in this cell.
pd.set_option('display.width', 80)
pd.set_option('display.max_columns', 8)
pd.set_option('display.max_rows', 200)
pd.options.display.float_format = '{:,.0f}'.format

covidtotals = pd.read_csv("data/covidtotals.csv")
# 'life_expectancy' is selected instead of 'diabetes_prevalence': selecting
# the latter raised KeyError "['diabetes_prevalence'] not in index", and
# the column is not used by anything below.
feature_cols = ['location','population',
    'aged_65_older','life_expectancy','region']
covidtotals = covidtotals[['total_cases'] + feature_cols].dropna()

# Split into training and test sets.
X_train, X_test, y_train, y_test = train_test_split(
  covidtotals[feature_cols],
  covidtotals[['total_cases']], test_size=0.3, random_state=0)

# Replace region in the training split with arbitrary integer codes.
oe = OrdinalEncoder(encoding_method='arbitrary',
  variables=['region'])

X_train = oe.fit_transform(X_train)
X_train.region.value_counts().sort_index()

X_train.region

# region now holds integer codes. Passing the Series directly raised
# "TypeError: 'int' object is not iterable", so each code is wrapped as a
# one-element list of str, giving the hasher one string feature per row.
f = h.transform([[str(code)] for code in X_train.region])
f.toarray()
temp = pd.DataFrame(f.toarray(), index=X_train.index)
type(temp)
temp.shape
temp
X_train

# Small string example. Each value is wrapped in a list so the hasher
# treats it as a single token rather than a bare string sample.
test = pd.DataFrame({'type': ['a', 'b', 'a', 'c', 'b']})
test
f = h.transform([[val] for val in test['type']])
f.toarray()

# Use the one-hot encoder on region, limited to top_categories=6.
X_train.region.value_counts()
ohe = OneHotEncoder(top_categories=6, variables=['region'])
covidtotals_ohe = ohe.fit_transform(covidtotals)
covidtotals_ohe.filter(regex='location|region',
  axis="columns").sample(5, random_state=99).T

# Use the hashing encoder on region, then join the original region column
# back on so it can be shown next to the hashed columns.
he = HashingEncoder(cols=['region'], n_components=6)
covidtotals_enc = he.fit_transform(covidtotals)
covidtotals_enc = covidtotals_enc.join(covidtotals[['region']])
covidtotals_enc[['col_0','col_1','col_2','col_3',
  'col_4','col_5','region']].sample(5, random_state=1)

data = pd.DataFrame([
    ['value_1', 23],
    ['value_2', 13],
    ['value_3', 42],
    ['value_4', 13],
    ['value_2', 46],
    ['value_1', 28],
    ['value_2', 32],
    ['value_3', 87],
    ['value_4', 98],
    ['value_5', 86],
    ['value_3', 45],
    ['value_2', 73],
    ['value_1', 36],
    ['value_3', 93]
], columns = ['feature1', 'feature2'])

# Copy of the original strings, for display next to the hashed columns.
data['feature1b'] = data.feature1

feature_hasher = FeatureHasher(n_features = 3, input_type = 'string')

# One single-token sample per row (see the note on h above).
feature1_tokens = [[val] for val in data['feature1']]

temp2 = pd.DataFrame(feature_hasher.fit_transform(feature1_tokens).toarray())
temp2

pd.concat([
pd.DataFrame(feature_hasher.fit_transform(feature1_tokens).toarray()),
data[['feature1b','feature2']]], axis = 1)

# Dict-record input needs a hasher built with input_type='dict'. The
# original call selected a 'country' column that data does not have;
# 'feature1' is the string column that actually exists.
h_dict = FeatureHasher(n_features=8, input_type='dict', alternate_sign=False)
h_dict.fit_transform(data[['feature1']].to_dict(orient='records'))

In [None]:
import pandas as pd
from feature_engine.encoding import OneHotEncoder
from category_encoders.hashing import HashingEncoder
from sklearn.model_selection import train_test_split
# Console display settings for the output printed in this cell.
pd.options.display.width = 200
pd.options.display.max_columns = 20
pd.options.display.max_rows = 200
pd.options.display.float_format = '{:,.0f}'.format

covidtotals = pd.read_csv("data/covidtotals.csv")
# NOTE(review): confirm that 'diabetes_prevalence' exists in
# covidtotals.csv; other cells that read this file select
# 'life_expectancy' in this position.
feature_cols = [
  'location', 'population',
  'aged_65_older', 'diabetes_prevalence', 'region',
]
# Keep only the target and feature columns, dropping incomplete rows.
covidtotals = covidtotals[['total_cases'] + feature_cols].dropna()

# Split into training and test sets.
X_train, X_test, y_train, y_test = train_test_split(
  covidtotals[feature_cols],
  covidtotals[['total_cases']],
  test_size=0.3,
  random_state=0,
)

# Use the one-hot encoder on region, limited to top_categories=6.
X_train.region.value_counts()
ohe = OneHotEncoder(top_categories=6, variables=['region'])
covidtotals_ohe = ohe.fit_transform(covidtotals)
(
  covidtotals_ohe
  .filter(regex='location|region', axis="columns")
  .sample(5, random_state=99)
  .T
)

# Use the hashing encoder on region with 16 components. region2 keeps a
# copy of the original values so they can be grouped with the hashed
# columns.
he = HashingEncoder(cols=['region'], n_components=16)
covidtotals['region2'] = covidtotals.region
covidtotals_enc = he.fit_transform(covidtotals)

covidtotals_enc.filter(regex='col|reg', axis="columns")
# Count rows per combination of the hashed columns col_0 .. col_15 and
# the original region.
hashed_cols = [f'col_{i}' for i in range(16)]
covidtotals_enc.groupby(hashed_cols + ['region2']).size().reset_index()

