In [24]:
import os

import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.pipeline import Pipeline

# Column labels for the header-less data file, listed in file order.
adult_columns = [
    "Age", "Work-Class", "fnlwgt", "Education", "Education-Num",
    "Marital-Status", "Occupation", "Relationship", "Race", "Sex",
    "Capital-gain", "Capital-loss", "Hours-per-week", "Native-Country",
    "Earning-Raw",
]
adult_filename = "adult.data"
# header=None: the first line of the file is data, not a header row.
adult = pd.read_csv(adult_filename, names=adult_columns, header=None)

In [25]:
# Discard rows in which every column is missing; the frame is modified in place.
adult.dropna(how="all", inplace=True)
# Display the column labels as a quick check of the names assigned at load time.
adult.columns

Index(['Age', 'Work-Class', 'fnlwgt', 'Education', 'Education-Num',
       'Marital-Status', 'Occupation', 'Relationship', 'Race', 'Sex',
       'Capital-gain', 'Capital-loss', 'Hours-per-week', 'Native-Country',
       'Earning-Raw'],
      dtype='object')

In [26]:
# Summary statistics (count, mean, std, min, quartiles, max) of the weekly hours column.
adult["Hours-per-week"].describe()

count    32561.000000
mean        40.437456
std         12.347429
min          1.000000
25%         40.000000
50%         40.000000
75%         45.000000
max         99.000000
Name: Hours-per-week, dtype: float64

In [27]:
# Distinct Work-Class labels; the output shows each value keeps a leading space
# and that ' ?' appears as one of the categories.
adult["Work-Class"].unique()

array([' State-gov', ' Self-emp-not-inc', ' Private', ' Federal-gov',
       ' Local-gov', ' ?', ' Self-emp-inc', ' Without-pay',
       ' Never-worked'], dtype=object)

In [28]:
# Boolean feature: True for rows reporting strictly more than 40 hours per week.
adult["LongHours"] = adult["Hours-per-week"].gt(40)

In [29]:
import numpy as np
# Toy matrix: consecutive integers starting at 0, laid out row by row.
n_rows, n_cols = 10, 3
X = np.arange(n_rows * n_cols).reshape(n_rows, n_cols)

In [30]:
from sklearn.feature_selection import VarianceThreshold

# Make the middle column constant so that it has zero variance.
X[:, 1] = 1
# Fit the selector on the toy matrix, then apply it to the same matrix.
vt = VarianceThreshold()
xt = vt.fit(X).transform(X)

In [31]:
# Show the transformed matrix; in the output only the first and last columns remain.
print(xt)

[[ 0  2]
 [ 3  5]
 [ 6  8]
 [ 9 11]
 [12 14]
 [15 17]
 [18 20]
 [21 23]
 [24 26]
 [27 29]]


In [32]:
# Per-column variances stored on the fitted selector; the constant column shows 0.
print(vt.variances_)

[74.25  0.   74.25]


In [33]:
# Columns used as the feature matrix for the feature-selection experiments.
numeric_features = ["Age", "Education-Num", "Capital-gain", "Capital-loss", "Hours-per-week"]
X = adult[numeric_features].values
# Peek at the raw target labels.
print(adult["Earning-Raw"].values)

[' <=50K' ' <=50K' ' <=50K' ... ' <=50K' ' <=50K' ' >50K']


In [34]:
# Boolean target: True where the raw label is exactly ' >50K' (leading space included).
y = adult["Earning-Raw"].eq(' >50K').values

In [35]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
# Keep the three columns with the highest chi-squared score against y.
transformer = SelectKBest(chi2, k=3)
xt_chi2 = transformer.fit(X, y).transform(X)
# One score per input column, in column order.
print(transformer.scores_)

[8.60061182e+03 2.40142178e+03 8.21924671e+07 1.37214589e+06
 6.47640900e+03]


In [36]:
from scipy.stats import pearsonr

In [37]:
def multivariate_pearsonr(X, y):
    """Score every column of X by its absolute Pearson correlation with y.

    The return value follows the ``(scores, pvalues)`` convention used by
    univariate selectors such as ``SelectKBest``.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix. Any 2-D array-like is accepted (ndarray, nested
        lists, ...); it is converted with ``np.asarray``.
    y : array-like of shape (n_samples,)
        Target values.

    Returns
    -------
    scores : ndarray of shape (n_features,)
        ``abs(r)`` for each column, so strong negative correlations rank as
        highly as strong positive ones.
    pvalues : ndarray of shape (n_features,)
        The p-value reported by ``scipy.stats.pearsonr`` for each column.
    """
    # Coerce up front so the column slicing below also works for inputs that
    # do not support ``X[:, i]`` or ``.shape`` (e.g. plain nested lists).
    X = np.asarray(X)
    y = np.asarray(y)
    scores, pvalues = [], []
    for column in range(X.shape[1]):
        cur_score, cur_p = pearsonr(X[:, column], y)
        # The sign of the correlation is irrelevant for ranking features.
        scores.append(abs(cur_score))
        pvalues.append(cur_p)
    return (np.array(scores), np.array(pvalues))

In [38]:
# Same selection as the chi-squared run, but ranking columns by absolute Pearson correlation.
transformer = SelectKBest(multivariate_pearsonr, k=3)
xt_pearson = transformer.fit(X, y).transform(X)
# One score per input column, in column order.
print(transformer.scores_)

[0.2340371  0.33515395 0.22332882 0.15052631 0.22968907]


In [39]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
clf = DecisionTreeClassifier(random_state=14)
# Evaluate each selector *inside* a pipeline. Scoring the pre-selected
# matrices would let the selector see the labels of every test fold before
# cross-validation starts (data leakage); in a pipeline the selector is
# refitted on the training part of each fold only.
pipeline_chi2 = Pipeline([('select', SelectKBest(score_func=chi2, k=3)),
                          ('classifier', clf)])
pipeline_pearson = Pipeline([('select', SelectKBest(score_func=multivariate_pearsonr, k=3)),
                             ('classifier', clf)])
# cross_val_score clones the estimator per fold, so sharing `clf` is safe.
scores_chi2 = cross_val_score(pipeline_chi2, X, y, scoring='accuracy')
scores_pearson = cross_val_score(pipeline_pearson, X, y, scoring='accuracy')



In [40]:
# Mean cross-validated accuracy: chi-squared selection first, then Pearson.
for cv_scores in (scores_chi2, scores_pearson):
    print(cv_scores.mean())

0.8285986761581544
0.7706459415969852


In [46]:
from sklearn.base import TransformerMixin
from sklearn.utils import as_float_array
class MeanDiscrete(TransformerMixin, BaseEstimator):
    """Binarise each feature by comparing it with its mean seen during ``fit``.

    ``fit`` stores the per-column mean of the training data in ``self.mean``;
    ``transform`` returns a boolean array that is True where a value is
    strictly greater than the stored mean of its column.

    Inheriting from ``BaseEstimator`` supplies ``get_params``/``set_params``
    so the transformer can be cloned and used inside ``Pipeline`` and the
    cross-validation helpers like any other scikit-learn estimator.
    """

    def fit(self, X, y=None):
        """Learn the column means of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data.
        y : ignored
            Accepted only so the transformer fits the estimator API.

        Returns
        -------
        self
        """
        # copy=False: X is only read here, so a defensive copy is unnecessary.
        X = as_float_array(X, copy=False)
        self.mean = X.mean(axis=0)
        return self

    def transform(self, X):
        """Return a boolean array marking values above the fitted column mean.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Data with the same number of columns as the data passed to ``fit``.

        Returns
        -------
        ndarray of bool, same shape as ``X``
        """
        X = as_float_array(X, copy=False)
        assert X.shape[1] == self.mean.shape[0], (
            "X has {0} features, but MeanDiscrete was fitted with {1}".format(
                X.shape[1], self.mean.shape[0]))
        return X > self.mean

In [47]:
# Learn the column means from X, then binarise that same matrix.
mean_discrete = MeanDiscrete()
X_mean = mean_discrete.fit(X).transform(X)

In [48]:
# Show the binarised matrix: one boolean per original value (True = above the column mean).
print(X_mean)

[[ True  True  True False False]
 [ True  True False False False]
 [False False False False False]
 ...
 [ True False False False False]
 [False False False False False]
 [ True False  True False False]]


In [49]:
from sklearn.pipeline import Pipeline
# Chain the binarisation step and the tree so that cross-validation refits
# both steps on the training part of every fold.
steps = [('mean_discrete', MeanDiscrete()),
         ('classifier', DecisionTreeClassifier(random_state=14))]
pipeline = Pipeline(steps)
scores_mean_discrete = cross_val_score(pipeline, X, y, scoring='accuracy')
mean_accuracy = scores_mean_discrete.mean()
print("Mean Discrete performance: {0:.3f}".format(mean_accuracy))

Mean Discrete performance: 0.803


