# 为分类选择特征

selecting features for classification

In [None]:
import pandas as pd
from feature_engine.encoding import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest,\
  mutual_info_classif, f_classif

# Console display settings for the tables produced below.
pd.set_option('display.width', 75)
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_rows', 100)
pd.options.display.float_format = '{:,.3f}'.format


# Load the NLS data.
nls97compba = pd.read_csv("data/nls97compba.csv")

feature_cols = [
    'gender', 'satverbal', 'satmath', 'gpascience', 'gpaenglish',
    'gpamath', 'gpaoverall', 'motherhighgrade', 'fatherhighgrade',
    'parentincome',
]

# Split the NLS data into training and testing sets (70% / 30%).
X_train, X_test, y_train, y_test = train_test_split(
    nls97compba[feature_cols], nls97compba[['completedba']],
    test_size=0.3, random_state=0)

# One-hot encode gender, then standardize the remaining features.
# standcols is every column except the last one; this relies on the
# encoded gender_Female column being the final column of the encoder
# output, since it is joined back on unscaled.
ohe = OneHotEncoder(drop_last=True, variables=['gender'])
X_train_enc = ohe.fit_transform(X_train)
scaler = StandardScaler()
standcols = X_train_enc.columns[:-1]
X_train_enc = pd.DataFrame(
    scaler.fit_transform(X_train_enc[standcols]),
    columns=standcols, index=X_train_enc.index,
).join(X_train_enc[['gender_Female']], how="left", on=None,
       validate="many_to_many")

# Use mutual information to select the 5 best features for predicting
# college completion. The score function is seeded so that the scores,
# and therefore the selected features, are the same on every run.
from functools import partial
ksel = SelectKBest(
    score_func=partial(mutual_info_classif, random_state=0), k=5)
ksel.fit(X_train_enc, y_train.values.ravel())
selcols = X_train_enc.columns[ksel.get_support()]
selcols
pd.DataFrame(
    {'feature': X_train_enc.columns, 'score': ksel.scores_}
).sort_values('score', ascending=False)
X_train_analysis = X_train_enc[selcols]
X_train_analysis.dtypes

# Use the ANOVA F-statistic to select the 5 best features for
# predicting college completion.
ksel = SelectKBest(score_func=f_classif, k=5)
ksel.fit(X_train_enc, y_train.values.ravel())
selcols = X_train_enc.columns[ksel.get_support()]
selcols
pd.DataFrame(
    {'feature': X_train_enc.columns, 'score': ksel.scores_}
).sort_values('score', ascending=False)

Unnamed: 0,feature,score
5,gpaoverall,119.471
3,gpaenglish,108.006
2,gpascience,96.824
1,satmath,84.901
0,satverbal,77.363
4,gpamath,60.93
7,fatherhighgrade,37.481
6,motherhighgrade,29.377
8,parentincome,22.266
9,gender_Female,15.098


# 为回归选择特征

selecting features for regression

In [None]:
import pandas as pd
from feature_engine.encoding import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression,\
  mutual_info_regression
# Console display settings for the tables produced below.
pd.set_option('display.width', 75)
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_rows', 100)
pd.options.display.float_format = '{:,.3f}'.format

# Load the NLS wage data.
nls97wages = pd.read_csv("data/nls97wages.csv")

feature_cols = [
    'satverbal', 'satmath', 'gpascience', 'gpaenglish', 'gpamath',
    'gpaoverall', 'gender', 'motherhighgrade', 'fatherhighgrade',
    'parentincome', 'completedba',
]

# Split into training and testing sets (70% / 30%).
X_train, X_test, y_train, y_test = train_test_split(
    nls97wages[feature_cols], nls97wages[['wageincome']],
    test_size=0.3, random_state=0)

# Encode gender and scale the data. Every column except the last is
# standardized; the last column (gender_Male) is joined back unscaled.
ohe = OneHotEncoder(drop_last=True, variables=['gender'])
X_train_enc = ohe.fit_transform(X_train)
scaler = StandardScaler()
standcols = X_train_enc.columns[:-1]
X_train_enc = pd.DataFrame(
    scaler.fit_transform(X_train_enc[standcols]),
    columns=standcols, index=X_train_enc.index,
).join(X_train_enc[['gender_Male']], how="left", on=None,
       validate="many_to_many")

# Standardize the target as well. NOTE: this refits the same `scaler`
# object, so afterwards it holds the target's statistics, not the
# features'.
y_train = pd.DataFrame(
    scaler.fit_transform(y_train),
    columns=['wageincome'], index=y_train.index)

# Select the 5 best features for predicting wage income using the
# F-statistic from univariate regressions.
ksel = SelectKBest(score_func=f_regression, k=5)
ksel.fit(X_train_enc, y_train.values.ravel())
selcols = X_train_enc.columns[ksel.get_support()]
selcols
pd.DataFrame(
    {'feature': X_train_enc.columns, 'score': ksel.scores_}
).sort_values('score', ascending=False)

# Select the 5 best features using mutual information; random_state is
# bound through partial so the scores are reproducible.
from functools import partial
ksel = SelectKBest(
    score_func=partial(mutual_info_regression, random_state=0), k=5)
ksel.fit(X_train_enc, y_train.values.ravel())
selcols = X_train_enc.columns[ksel.get_support()]
selcols
pd.DataFrame(
    {'feature': X_train_enc.columns, 'score': ksel.scores_}
).sort_values('score', ascending=False)

Unnamed: 0,feature,score
1,satmath,0.101
10,gender_Male,0.074
7,fatherhighgrade,0.047
2,gpascience,0.044
9,completedba,0.044
4,gpamath,0.016
8,parentincome,0.015
6,motherhighgrade,0.012
0,satverbal,0.0
3,gpaenglish,0.0


# 向前向后选择

In [None]:
import pandas as pd
from feature_engine.encoding import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from mlxtend.feature_selection import SequentialFeatureSelector

# Console display settings for the output produced below.
pd.set_option('display.width', 75)
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_rows', 100)
pd.options.display.float_format = '{:,.0f}'.format

# Load the NLS data.
nls97compba = pd.read_csv("data/nls97compba.csv")

feature_cols = [
    'satverbal', 'satmath', 'gpascience', 'gpaenglish', 'gpamath',
    'gpaoverall', 'gender', 'motherhighgrade', 'fatherhighgrade',
    'parentincome',
]

# Split the NLS data into training and testing sets (70% / 30%).
X_train, X_test, y_train, y_test = train_test_split(
    nls97compba[feature_cols], nls97compba[['completedba']],
    test_size=0.3, random_state=0)

# One-hot encode gender, then standardize every column except the last
# (gender_Female), which is joined back unscaled.
ohe = OneHotEncoder(drop_last=True, variables=['gender'])
X_train_enc = ohe.fit_transform(X_train)
scaler = StandardScaler()
standcols = X_train_enc.columns[:-1]
X_train_enc = pd.DataFrame(
    scaler.fit_transform(X_train_enc[standcols]),
    columns=standcols, index=X_train_enc.index,
).join(X_train_enc[['gender_Female']], how="left", on=None,
       validate="many_to_many")

# Random forest classifier used as the estimator inside the selector.
rfc = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=0)

# Forward sequential selection (forward=True, floating=False) of 5
# features, scored by 5-fold cross-validated accuracy.
sfs = SequentialFeatureSelector(
    rfc, k_features=5, forward=True, floating=False,
    verbose=2, scoring='accuracy', cv=5)

# Run the forward selection and look up the chosen column names.
sfs.fit(X_train_enc, y_train.values.ravel())
selcols = X_train_enc.columns[list(sfs.k_feature_idx_)]
selcols

# Backward sequential selection (forward=False, floating=False) with a
# fresh classifier and otherwise identical settings.
rfc = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=0)
sfs = SequentialFeatureSelector(
    rfc, k_features=5, forward=False, floating=False,
    verbose=2, scoring='accuracy', cv=5)

# Run the backward selection and look up the chosen column names.
sfs.fit(X_train_enc, y_train.values.ravel())
selcols = X_train_enc.columns[list(sfs.k_feature_idx_)]
selcols

# exhaustive feature selection

In [None]:
import pandas as pd
from feature_engine.encoding import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from mlxtend.feature_selection import ExhaustiveFeatureSelector
from sklearn.metrics import accuracy_score
# Console display settings for the output produced below.
pd.set_option('display.width', 75)
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_rows', 100)
pd.options.display.float_format = '{:,.0f}'.format

# Load the NLS data.
nls97compba = pd.read_csv("data/nls97compba.csv")

feature_cols = [
    'satverbal', 'satmath', 'gpascience', 'gpaenglish', 'gpamath',
    'gpaoverall', 'gender', 'motherhighgrade', 'fatherhighgrade',
    'parentincome',
]

# Split the NLS data into training and testing sets (70% / 30%).
X_train, X_test, y_train, y_test = train_test_split(
    nls97compba[feature_cols], nls97compba[['completedba']],
    test_size=0.3, random_state=0)

# Fit the one-hot encoder on the training data only, then apply it to
# both splits.
ohe = OneHotEncoder(drop_last=True, variables=['gender'])
ohe.fit(X_train)
X_train_enc = ohe.transform(X_train)
X_test_enc = ohe.transform(X_test)

# Fit the scaler on the training data only. Every column except the
# last (gender_Female) is standardized; gender_Female is joined back
# unscaled.
scaler = StandardScaler()
standcols = X_train_enc.columns[:-1]
scaler.fit(X_train_enc[standcols])
X_train_enc = pd.DataFrame(
    scaler.transform(X_train_enc[standcols]),
    columns=standcols, index=X_train_enc.index,
).join(X_train_enc[['gender_Female']])
X_test_enc = pd.DataFrame(
    scaler.transform(X_test_enc[standcols]),
    columns=standcols, index=X_test_enc.index,
).join(X_test_enc[['gender_Female']])


# Shallow random forest classifier used as the estimator inside the
# selector.
rfc = RandomForestClassifier(
    n_estimators=100, max_depth=2, n_jobs=-1, random_state=0)

# Exhaustive search over feature subsets of size 1 to 5, scored by
# 5-fold cross-validated accuracy.
efs = ExhaustiveFeatureSelector(
    rfc, max_features=5, min_features=1,
    scoring='accuracy', print_progress=True, cv=5)

# Run the search and show the best subset.
efs.fit(X_train_enc, y_train.values.ravel())
efs.best_feature_names_

# Refit the random forest on the selected features and predict on the
# test split.
X_train_efs = efs.transform(X_train_enc)
X_test_efs = efs.transform(X_test_enc)

rfc.fit(X_train_efs, y_train.values.ravel())
y_pred = rfc.predict(X_test_efs)

# Accuracy computed by hand: share of test rows where the prediction
# equals the actual completedba value.
confusion = pd.DataFrame(
    y_pred, columns=['pred'], index=y_test.index,
).join(y_test)
len(confusion[confusion.pred == confusion.completedba]) / len(confusion)

# Same metric from scikit-learn.
accuracy_score(y_test, y_pred)

# Repeat the exhaustive search with a logistic regression estimator.
lr = LogisticRegression(solver='liblinear')
efs = ExhaustiveFeatureSelector(
    lr, max_features=5, min_features=1,
    scoring='accuracy', print_progress=True, cv=5)
efs.fit(X_train_enc, y_train.values.ravel())
efs.best_feature_names_


# Refit the logistic model on its selected features and score it on the
# test split.
X_train_efs = efs.transform(X_train_enc)
X_test_efs = efs.transform(X_test_enc)

lr.fit(X_train_efs, y_train.values.ravel())
y_pred = lr.predict(X_test_efs)

accuracy_score(y_test, y_pred)

# RFE 回归

rfe regression

In [None]:
import pandas as pd
from feature_engine.encoding import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
# Console display settings for the tables produced below.
pd.set_option('display.width', 75)
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_rows', 100)
pd.options.display.float_format = '{:,.3f}'.format

# Load the NLS wage data.
nls97wages = pd.read_csv("data/nls97wages.csv")

feature_cols = [
    'satverbal', 'satmath', 'gpascience', 'gpaenglish', 'gpamath',
    'gpaoverall', 'motherhighgrade', 'fatherhighgrade', 'parentincome',
    'gender', 'completedba',
]

# Split into training and testing sets (70% / 30%).
X_train, X_test, y_train, y_test = train_test_split(
    nls97wages[feature_cols], nls97wages[['weeklywage']],
    test_size=0.3, random_state=0)

# Fit the one-hot encoder on the training data only, then apply it to
# both splits.
ohe = OneHotEncoder(drop_last=True, variables=['gender'])
ohe.fit(X_train)
X_train_enc = ohe.transform(X_train)
X_test_enc = ohe.transform(X_test)

# Standardize all features except the last two entries of feature_cols
# (gender and completedba); gender_Male and completedba are joined back
# unscaled.
scaler = StandardScaler()
standcols = feature_cols[:-2]
scaler.fit(X_train_enc[standcols])
X_train_enc = pd.DataFrame(
    scaler.transform(X_train_enc[standcols]),
    columns=standcols, index=X_train_enc.index,
).join(X_train_enc[['gender_Male', 'completedba']])
X_test_enc = pd.DataFrame(
    scaler.transform(X_test_enc[standcols]),
    columns=standcols, index=X_test_enc.index,
).join(X_test_enc[['gender_Male', 'completedba']])

# Standardize the target using training-set statistics. This refits the
# same `scaler` object, which is safe here only because the features
# have already been transformed above.
scaler.fit(y_train)
y_train = pd.DataFrame(
    scaler.transform(y_train),
    columns=['weeklywage'], index=y_train.index)
y_test = pd.DataFrame(
    scaler.transform(y_test),
    columns=['weeklywage'], index=y_test.index)


# Use a random forest regressor for recursive feature elimination.
# random_state is fixed so the ranking and the scores below are
# reproducible from run to run.
rfr = RandomForestRegressor(max_depth=2, random_state=0)

treesel = RFE(estimator=rfr, n_features_to_select=5)
treesel.fit(X_train_enc, y_train.values.ravel())
selcols = X_train_enc.columns[treesel.get_support()]
selcols
pd.DataFrame(
    {'feature': X_train_enc.columns, 'ranking': treesel.ranking_}
).sort_values('ranking', ascending=True)

# Refit on the selected features and score on the train and test splits.
rfr.fit(treesel.transform(X_train_enc), y_train.values.ravel())
rfr.score(treesel.transform(X_train_enc), y_train.values.ravel())
rfr.score(treesel.transform(X_test_enc), y_test)


# Use linear regression for recursive feature elimination.
lr = LinearRegression()

lrsel = RFE(estimator=lr, n_features_to_select=5)
lrsel.fit(X_train_enc, y_train)
selcols = X_train_enc.columns[lrsel.get_support()]
selcols
pd.DataFrame(
    {'feature': X_train_enc.columns, 'ranking': lrsel.ranking_}
).sort_values('ranking', ascending=True)

# Refit on the selected features and score on the train and test splits.
lr.fit(lrsel.transform(X_train_enc), y_train)
lr.score(lrsel.transform(X_train_enc), y_train)
lr.score(lrsel.transform(X_test_enc), y_test)

# RFE 分类

In [None]:
import pandas as pd
from feature_engine.encoding import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score
# Console display settings for the output produced below.
pd.set_option('display.width', 75)
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_rows', 100)
pd.options.display.float_format = '{:,.0f}'.format

# Load the NLS data.
nls97compba = pd.read_csv("data/nls97compba.csv")

feature_cols = [
    'satverbal', 'satmath', 'gpascience', 'gpaenglish', 'gpamath',
    'gpaoverall', 'gender', 'motherhighgrade', 'fatherhighgrade',
    'parentincome',
]

# Split the NLS data into training and testing sets (70% / 30%).
X_train, X_test, y_train, y_test = train_test_split(
    nls97compba[feature_cols], nls97compba[['completedba']],
    test_size=0.3, random_state=0)

# Fit the one-hot encoder on the training data only, then apply it to
# both splits.
ohe = OneHotEncoder(drop_last=True, variables=['gender'])
ohe.fit(X_train)
X_train_enc = ohe.transform(X_train)
X_test_enc = ohe.transform(X_test)

# Fit the scaler on the training data only. Every column except the
# last (gender_Female) is standardized; gender_Female is joined back
# unscaled.
scaler = StandardScaler()
standcols = X_train_enc.columns[:-1]
scaler.fit(X_train_enc[standcols])
X_train_enc = pd.DataFrame(
    scaler.transform(X_train_enc[standcols]),
    columns=standcols, index=X_train_enc.index,
).join(X_train_enc[['gender_Female']], how="left", on=None,
       validate="many_to_many")
X_test_enc = pd.DataFrame(
    scaler.transform(X_test_enc[standcols]),
    columns=standcols, index=X_test_enc.index,
).join(X_test_enc[['gender_Female']], how="left", on=None,
       validate="many_to_many")


# Shallow random forest classifier used as the estimator inside RFE.
rfc = RandomForestClassifier(
    n_estimators=100, max_depth=2, n_jobs=-1, random_state=0)

# Recursive feature elimination down to 5 features, followed by a table
# of every feature's elimination ranking.
treesel = RFE(estimator=rfc, n_features_to_select=5)
treesel.fit(X_train_enc, y_train.values.ravel())
selcols = X_train_enc.columns[treesel.get_support()]
selcols
pd.DataFrame(
    {'feature': X_train_enc.columns, 'ranking': treesel.ranking_}
).sort_values('ranking', ascending=True)

# Refit the random forest on the selected features and predict on the
# test split.
rfc.fit(treesel.transform(X_train_enc), y_train.values.ravel())
y_pred = rfc.predict(treesel.transform(X_test_enc))

# Accuracy computed by hand: share of test rows where the prediction
# equals the actual completedba value.
confusion = pd.DataFrame(
    y_pred, columns=['pred'], index=y_test.index,
).join(y_test, how="left", on=None, validate="many_to_many")
len(confusion[confusion.pred == confusion.completedba]) / len(confusion)

# Same metric from scikit-learn.
accuracy_score(y_test, y_pred)

# boruta 分类

In [None]:
import pandas as pd
from feature_engine.encoding import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy
from sklearn.metrics import accuracy_score
# Console display settings for the output produced below.
pd.set_option('display.width', 75)
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_rows', 100)
pd.options.display.float_format = '{:,.0f}'.format

# Load the NLS data.
nls97compba = pd.read_csv("data/nls97compba.csv")

feature_cols = [
    'satverbal', 'satmath', 'gpascience', 'gpaenglish', 'gpamath',
    'gpaoverall', 'gender', 'motherhighgrade', 'fatherhighgrade',
    'parentincome',
]

# Split the NLS data into training and testing sets (70% / 30%).
X_train, X_test, y_train, y_test = train_test_split(
    nls97compba[feature_cols], nls97compba[['completedba']],
    test_size=0.3, random_state=0)

# Fit the one-hot encoder on the training data only, then apply it to
# both splits.
ohe = OneHotEncoder(drop_last=True, variables=['gender'])
ohe.fit(X_train)
X_train_enc = ohe.transform(X_train)
X_test_enc = ohe.transform(X_test)

# Fit the scaler on the training data only. Every column except the
# last (gender_Female) is standardized; gender_Female is joined back
# unscaled.
scaler = StandardScaler()
standcols = X_train_enc.columns[:-1]
scaler.fit(X_train_enc[standcols])
X_train_enc = pd.DataFrame(
    scaler.transform(X_train_enc[standcols]),
    columns=standcols, index=X_train_enc.index,
).join(X_train_enc[['gender_Female']])
X_test_enc = pd.DataFrame(
    scaler.transform(X_test_enc[standcols]),
    columns=standcols, index=X_test_enc.index,
).join(X_test_enc[['gender_Female']])


# Shallow random forest classifier used as the estimator inside Boruta.
rfc = RandomForestClassifier(
    n_estimators=100, max_depth=2, n_jobs=-1, random_state=0)

# Boruta feature selection. BorutaPy is handed NumPy arrays (.values)
# here rather than DataFrames.
borsel = BorutaPy(rfc, random_state=0, verbose=2)
borsel.fit(X_train_enc.values, y_train.values.ravel())

# Names of the supported features, then a table of every feature's
# ranking.
selcols = X_train_enc.columns[borsel.support_]
selcols
pd.DataFrame(
    {'feature': X_train_enc.columns, 'ranking': borsel.ranking_}
).sort_values('ranking', ascending=True)

# Refit the random forest on the selected features and predict on the
# test split.
rfc.fit(borsel.transform(X_train_enc.values), y_train.values.ravel())
y_pred = rfc.predict(borsel.transform(X_test_enc.values))

# Accuracy computed by hand: share of test rows where the prediction
# equals the actual completedba value.
confusion = pd.DataFrame(
    y_pred, columns=['pred'], index=y_test.index,
).join(y_test)
len(confusion[confusion.pred == confusion.completedba]) / len(confusion)

# Same metric from scikit-learn.
accuracy_score(y_test, y_pred)

# 规范化

regularization

In [None]:
# import pandas plus the feature_engine and scikit-learn tools used below
import pandas as pd
from feature_engine.encoding import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score

# Console display settings for the output produced below.
pd.set_option('display.width', 75)
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_rows', 100)
pd.options.display.float_format = '{:,.0f}'.format

# Load the NLS data.
nls97compba = pd.read_csv("data/nls97compba.csv")

feature_cols = [
    'satverbal', 'satmath', 'gpascience', 'gpaenglish', 'gpamath',
    'gpaoverall', 'gender', 'motherhighgrade', 'fatherhighgrade',
    'parentincome',
]

# Split the NLS data into training and testing sets (70% / 30%).
X_train, X_test, y_train, y_test = train_test_split(
    nls97compba[feature_cols], nls97compba[['completedba']],
    test_size=0.3, random_state=0)

# Fit the one-hot encoder on the training data only, then apply it to
# both splits.
ohe = OneHotEncoder(drop_last=True, variables=['gender'])
ohe.fit(X_train)
X_train_enc = ohe.transform(X_train)
X_test_enc = ohe.transform(X_test)

# Fit the scaler on the training data only. Every column except the
# last (gender_Female) is standardized; gender_Female is joined back
# unscaled.
scaler = StandardScaler()
standcols = X_train_enc.columns[:-1]
scaler.fit(X_train_enc[standcols])
X_train_enc = pd.DataFrame(
    scaler.transform(X_train_enc[standcols]),
    columns=standcols, index=X_train_enc.index,
).join(X_train_enc[['gender_Female']])
X_test_enc = pd.DataFrame(
    scaler.transform(X_test_enc[standcols]),
    columns=standcols, index=X_test_enc.index,
).join(X_test_enc[['gender_Female']])

# L1-penalized logistic regression as the model behind SelectFromModel,
# keeping at most 5 features.
lr = LogisticRegression(C=1, penalty="l1", solver='liblinear')
regsel = SelectFromModel(lr, max_features=5)
regsel.fit(X_train_enc, y_train.values.ravel())
selcols = X_train_enc.columns[regsel.get_support()]
selcols

# Refit the logistic model on the selected features and score it on the
# test split.
lr.fit(regsel.transform(X_train_enc), y_train.values.ravel())
y_pred = lr.predict(regsel.transform(X_test_enc))

accuracy_score(y_test, y_pred)

# Same procedure with a shallow random forest classifier as the model
# behind SelectFromModel.
rfc = RandomForestClassifier(
    n_estimators=100, max_depth=2, n_jobs=-1, random_state=0)

rfcsel = SelectFromModel(rfc, max_features=5)
rfcsel.fit(X_train_enc, y_train.values.ravel())
selcols = X_train_enc.columns[rfcsel.get_support()]
selcols

# Refit the forest on the selected features and score it on the test
# split.
rfc.fit(rfcsel.transform(X_train_enc), y_train.values.ravel())
y_pred = rfc.predict(rfcsel.transform(X_test_enc))

accuracy_score(y_test, y_pred)

# pca

In [None]:
import pandas as pd
import numpy as np
from feature_engine.encoding import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Console display settings for the tables produced below.
pd.set_option('display.width', 75)
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_rows', 100)
pd.options.display.float_format = '{:,.2f}'.format

# Load the NLS data.
nls97compba = pd.read_csv("data/nls97compba.csv")

feature_cols = [
    'satverbal', 'satmath', 'gpascience', 'gpaenglish', 'gpamath',
    'gpaoverall', 'gender', 'motherhighgrade', 'fatherhighgrade',
    'parentincome',
]

# Split the NLS data into training and testing sets (70% / 30%).
X_train, X_test, y_train, y_test = train_test_split(
    nls97compba[feature_cols], nls97compba[['completedba']],
    test_size=0.3, random_state=0)

# Fit the one-hot encoder on the training data only, then apply it to
# both splits.
ohe = OneHotEncoder(drop_last=True, variables=['gender'])
ohe.fit(X_train)
X_train_enc = ohe.transform(X_train)
X_test_enc = ohe.transform(X_test)

# Fit the scaler on the training data only. Every column except the
# last (gender_Female) is standardized; gender_Female is joined back
# unscaled.
scaler = StandardScaler()
standcols = X_train_enc.columns[:-1]
scaler.fit(X_train_enc[standcols])
X_train_enc = pd.DataFrame(
    scaler.transform(X_train_enc[standcols]),
    columns=standcols, index=X_train_enc.index,
).join(X_train_enc[['gender_Female']])
X_test_enc = pd.DataFrame(
    scaler.transform(X_test_enc[standcols]),
    columns=standcols, index=X_test_enc.index,
).join(X_test_enc[['gender_Female']])


# Fit a 5-component PCA on the encoded, scaled training features.
pca = PCA(n_components=5)
pca.fit(X_train_enc)

# Component loadings, transposed so each row is an input feature and
# each column a component.
pd.DataFrame(pca.components_, columns=X_train_enc.columns).T

# Share of variance explained per component, and the running total.
pca.explained_variance_ratio_
np.cumsum(pca.explained_variance_ratio_)

# Project both splits onto the components (the results are NumPy
# arrays) and peek at the first six training rows.
X_train_pca = pca.transform(X_train_enc)
X_train_pca.shape
np.round(X_train_pca[0:6], 2)
X_test_pca = pca.transform(X_test_enc)

# Train a shallow random forest on the component scores and measure its
# accuracy on the test split.
rfc = RandomForestClassifier(
    n_estimators=100, max_depth=2, n_jobs=-1, random_state=0)

rfc.fit(X_train_pca, y_train.values.ravel())
y_pred = rfc.predict(X_test_pca)

accuracy_score(y_test, y_pred)