In [93]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Categorical data

In [94]:
# The file has no headers naming the columns, so we pass header=None
# and provide the column names explicitly in "names"
data = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data", na_values=[" ?"], 
    header=None, index_col=False,
    names=['age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'gender',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'income'])
# For illustration purposes, we only select some of the columns
data = data[['workclass', 'age', 'education', 'education-num', 'occupation', 'capital-gain','gender', 'hours-per-week',  'income']]
# IPython.display allows nice output formatting within the Jupyter notebook
# add some none
data['education-num'][0]=None
display(data.head())

Unnamed: 0,workclass,age,education,education-num,occupation,capital-gain,gender,hours-per-week,income
0,State-gov,39,Bachelors,,Adm-clerical,2174,Male,40,<=50K
1,Self-emp-not-inc,50,Bachelors,13.0,Exec-managerial,0,Male,13,<=50K
2,Private,38,HS-grad,9.0,Handlers-cleaners,0,Male,40,<=50K
3,Private,53,11th,7.0,Handlers-cleaners,0,Male,40,<=50K
4,Private,28,Bachelors,13.0,Prof-specialty,0,Female,40,<=50K


In [95]:
data = data[0:1000]

In [96]:
data.isnull().sum()

workclass         62
age                0
education          0
education-num      1
occupation        62
capital-gain       0
gender             0
hours-per-week     0
income             0
dtype: int64

Teraz rzućmy okiem na wszystkie atrybuty kategoryczne:

In [97]:
data["workclass"].value_counts()

 Private             698
 Self-emp-not-inc     81
 Local-gov            68
 State-gov            37
 Self-emp-inc         33
 Federal-gov          21
Name: workclass, dtype: int64

In [98]:
data["education"].value_counts()

 HS-grad         321
 Some-college    225
 Bachelors       166
 Masters          54
 Assoc-voc        48
 11th             46
 Assoc-acdm       35
 10th             21
 9th              16
 7th-8th          15
 Doctorate        14
 5th-6th          11
 Prof-school      10
 12th              9
 1st-4th           7
 Preschool         2
Name: education, dtype: int64

In [99]:
data["gender"].value_counts()

 Male      671
 Female    329
Name: gender, dtype: int64

In [100]:
data["occupation"].value_counts()

 Craft-repair         126
 Prof-specialty       124
 Exec-managerial      124
 Sales                112
 Other-service        107
 Adm-clerical          94
 Machine-op-inspct     61
 Transport-moving      52
 Tech-support          44
 Handlers-cleaners     43
 Farming-fishing       31
 Protective-serv       16
 Priv-house-serv        3
 Armed-Forces           1
Name: occupation, dtype: int64

Sprawdźmy, czy etykiety przyjmują wartości 0 lub 1.

Jak nie to musimy jes troszkę przerobić.

In [101]:
X = data.drop(['income'], axis=1)
y = data['income'].values
# np.unique(y)
# y[ y == ' <=50K'] = 0
# y[ y == ' >50K'] = 1

from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

print("X.shape: {} y.shape: {}".format(X.shape, y.shape))

X.shape: (1000, 8) y.shape: (1000,)


In [102]:
# y

Podzielmy zbiór na train/test

In [103]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

Teraz zbudujmy nasze **pipeline** preprocessingu. 

Wykorzystamy DataframeSelector aby wybrać określone atrybuty z DataFrame:

In [104]:
from sklearn.base import BaseEstimator, TransformerMixin

# A class to select numerical or categorical columns 
# since Scikit-Learn doesn't handle DataFrames yet
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names]

Zbudujmy **pipeline** dla atrybutów numerycznych:

In [105]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer

imputer = Imputer(strategy="median")

num_pipeline = Pipeline([
        ("select_numeric", DataFrameSelector(["education-num"])),
        ("imputer", Imputer(strategy="median")),
    ])



In [106]:
num_pipeline.fit_transform(X_train)

array([[ 9.],
       [ 8.],
       [12.],
       [ 9.],
       [ 9.],
       [10.],
       [10.],
       [ 9.],
       [ 9.],
       [ 9.],
       [ 9.],
       [10.],
       [ 9.],
       [ 9.],
       [10.],
       [13.],
       [10.],
       [ 9.],
       [13.],
       [ 9.],
       [ 9.],
       [10.],
       [10.],
       [11.],
       [10.],
       [ 9.],
       [10.],
       [13.],
       [ 9.],
       [ 2.],
       [10.],
       [ 9.],
       [10.],
       [13.],
       [10.],
       [10.],
       [ 9.],
       [ 6.],
       [13.],
       [ 6.],
       [ 7.],
       [ 9.],
       [ 9.],
       [13.],
       [12.],
       [ 9.],
       [ 9.],
       [ 6.],
       [ 9.],
       [15.],
       [ 9.],
       [11.],
       [ 9.],
       [15.],
       [13.],
       [ 7.],
       [13.],
       [14.],
       [ 3.],
       [14.],
       [10.],
       [ 9.],
       [10.],
       [10.],
       [11.],
       [10.],
       [14.],
       [ 9.],
       [ 6.],
       [ 3.],
       [ 9.],
      

Będziemy także potrzebować imputera do kategorycznych kolumn napisowych (zwykły Imputer nie działa na tych kolumnach):

In [107]:
# Inspired from stackoverflow.com/questions/25239958
class MostFrequentImputer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.most_frequent_ = pd.Series([X[c].value_counts().index[0] for c in X],
                                        index=X.columns)
        return self
    def transform(self, X, y=None):
        return X.fillna(self.most_frequent_)

Teraz możemy zbudować pipeline dla atrybutów kategorycznych.

We can convert each categorical value to a one-hot vector using a OneHotEncoder. Right now this class can only handle integer categorical inputs, but in Scikit-Learn 0.20 it will also handle string categorical inputs (see PR https://github.com/scikit-learn/scikit-learn/issues/10521). So for now we import it from future_encoders.py

In [108]:
# from future_encoders import OneHotEncoder
from sklearn.preprocessing import OneHotEncoder

cat_pipeline = Pipeline([
        ("select_cat", DataFrameSelector(["workclass", "education", "occupation", "gender"])),
        ("imputer", MostFrequentImputer()),
        ("cat_encoder", OneHotEncoder(sparse=False, handle_unknown = 'ignore')),
    ])

In [109]:
cat_pipeline.fit_transform(X_train)

array([[0., 0., 1., ..., 0., 0., 1.],
       [0., 1., 0., ..., 0., 0., 1.],
       [0., 0., 1., ..., 0., 0., 1.],
       ...,
       [0., 0., 1., ..., 0., 1., 0.],
       [0., 0., 1., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 1., 0.]])

Na koniec połączmy powyższe podejścia:

In [110]:
from sklearn.pipeline import FeatureUnion
preprocess_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ])

# Zad

Robimy StratifiedKFold i znajdujemy optymalne parametry dla

* SVM z jądrem rbf
* SVM z jądrem poly
* SVM liniowego
* Regresji logistycznej

In [111]:
from sklearn.model_selection import StratifiedKFold

seed=123
kfold = StratifiedKFold(n_splits=5, random_state=seed)

In [112]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
pipe = Pipeline([
    ('preprocessing', preprocess_pipeline), 
    ('classifier', SVC(kernel='linear'))])


param_grid = {
            'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100]
}

grid_1 = GridSearchCV(pipe, param_grid, cv=kfold)

grid_1.fit(X_train, y_train)
grid_1.best_params_





{'classifier__C': 10}

In [117]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

pipe = Pipeline([
    ('preprocessing', preprocess_pipeline), 
    ('classifier', SVC(kernel='rbf'))])


param_grid = {
            'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100] ,   
            'classifier__gamma': [0.001, 0.01, 0.1, 1, 10, 100]
}

grid_2 = GridSearchCV(pipe, param_grid, cv=kfold)

grid_2.fit(X_train, y_train)
grid_2.best_params_















{'classifier__C': 1, 'classifier__gamma': 0.1}

In [120]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

pipe = Pipeline([
    ('preprocessing', preprocess_pipeline), 
    ('classifier', RandomForestClassifier())])


param_grid = {
            'classifier__n_estimators': [100, 500, 1000],   
            'classifier__min_samples_leaf': [0.1, 0.01]       
}

grid_3 = GridSearchCV(pipe, param_grid, cv=kfold)

grid_3.fit(X_train, y_train)
grid_3.best_params_





{'classifier__min_samples_leaf': 0.01, 'classifier__n_estimators': 500}

In [121]:
from sklearn import  metrics


models = []
models.append(('SVM linear', grid_1.best_estimator_))
models.append(('SVM rbf', grid_2.best_estimator_))
models.append(('RF', grid_3.best_estimator_))


precision_score = []
recall_score = []
f1_score = []
accuracy_score = []
for name, model in models:
    print(name)
    print("precision_score: {}".format(metrics.precision_score(y_test, model.predict(X_test)) ))
    print("recall_score: {}".format( metrics.recall_score(y_test, model.predict(X_test)) ))
    print("f1_score: {}".format( metrics.f1_score(y_test, model.predict(X_test)) ))
    print("accuracy_score: {}".format( metrics.accuracy_score(y_test, model.predict(X_test)) ))
    precision_score.append(metrics.precision_score(y_test, model.predict(X_test)))
    recall_score.append(metrics.recall_score(y_test, model.predict(X_test)))
    f1_score.append( metrics.f1_score(y_test, model.predict(X_test)))
    accuracy_score.append(metrics.accuracy_score(y_test, model.predict(X_test)))

SVM linear
precision_score: 0.7
recall_score: 0.2916666666666667
f1_score: 0.4117647058823529
accuracy_score: 0.8
SVM rbf
precision_score: 1.0
recall_score: 0.20833333333333334
f1_score: 0.3448275862068966
accuracy_score: 0.81
RF
precision_score: 1.0
recall_score: 0.1875
f1_score: 0.3157894736842105
accuracy_score: 0.805


In [122]:
import pandas as pd
d = {'precision_score': precision_score, 
     'recall_score': recall_score, 
     'f1_score': f1_score,
     'accuracy_score' : accuracy_score
    }
df = pd.DataFrame(data=d)
df.insert(loc=0, column='Method', value=['SVM linear','SVM rbf','RF'])
df

Unnamed: 0,Method,precision_score,recall_score,f1_score,accuracy_score
0,SVM linear,0.7,0.291667,0.411765,0.8
1,SVM rbf,1.0,0.208333,0.344828,0.81
2,RF,1.0,0.1875,0.315789,0.805


In [63]:
pipe_final = Pipeline([
    ('preprocessing', preprocess_pipeline), 
    ('classifier', SVC(kernel='linear', C=10))])

In [64]:
pipe_final.fit(X, y)

Pipeline(memory=None,
     steps=[('preprocessing', FeatureUnion(n_jobs=None,
       transformer_list=[('num_pipeline', Pipeline(memory=None,
     steps=[('select_numeric', DataFrameSelector(attribute_names=['education-num'])), ('imputer', Imputer(axis=0, copy=True, missing_values='NaN', strategy='median', verbose=0))])), ('c...r', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False))])

In [86]:
# The file has no headers naming the columns, so we pass header=None
# and provide the column names explicitly in "names"
test_data = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test", na_values=[" ?"], 
    header=None, index_col=False,
    names=['age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'gender',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'income'])
# For illustration purposes, we only select some of the columns
test_data = test_data[['workclass', 'age', 'education', 'education-num', 'occupation', 'capital-gain','gender', 'hours-per-week',  'income']]
# IPython.display allows nice output formatting within the Jupyter notebook
# add some none
test_data['education-num'][0]=None
display(test_data.head())

Unnamed: 0,workclass,age,education,education-num,occupation,capital-gain,gender,hours-per-week,income
0,,|1x3 Cross validator,,,,,,,
1,Private,25,11th,7.0,Machine-op-inspct,0.0,Male,40.0,<=50K.
2,Private,38,HS-grad,9.0,Farming-fishing,0.0,Male,50.0,<=50K.
3,Local-gov,28,Assoc-acdm,12.0,Protective-serv,0.0,Male,40.0,>50K.
4,Private,44,Some-college,10.0,Machine-op-inspct,7688.0,Male,40.0,>50K.


In [88]:
test_data.dropna(inplace=True)
display(test_data.head())

Unnamed: 0,workclass,age,education,education-num,occupation,capital-gain,gender,hours-per-week,income
1,Private,25,11th,7.0,Machine-op-inspct,0.0,Male,40.0,<=50K.
2,Private,38,HS-grad,9.0,Farming-fishing,0.0,Male,50.0,<=50K.
3,Local-gov,28,Assoc-acdm,12.0,Protective-serv,0.0,Male,40.0,>50K.
4,Private,44,Some-college,10.0,Machine-op-inspct,7688.0,Male,40.0,>50K.
6,Private,34,10th,6.0,Other-service,0.0,Male,30.0,<=50K.


In [89]:
X_tes_final = test_data.drop(['income'], axis=1)
y_tes_final = label_encoder.fit_transform(test_data['income'].values)

In [90]:
pipe_final.predict(X_tes_final)

array([0, 0, 0, ..., 0, 0, 1])

In [91]:
metrics.accuracy_score(y_pred= pipe_final.predict(X_tes_final), y_true = y_tes_final)

0.7563826314071173

In [92]:
metrics.f1_score(y_pred= pipe_final.predict(X_tes_final), y_true = y_tes_final)

0.3957894736842105