# import

In [8]:
from collections import Counter

from sklearn import feature_selection
from sklearn.feature_selection import SelectPercentile
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split as tts
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.metrics import classification_report, accuracy_score

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

# Dataset

In [3]:
### Build a synthetic, imbalanced two-class classification dataset ###
# Size: 1000 samples with 20 features, of which 3 are informative and 1 is redundant.
# Classes: two well-separated classes (class_sep=2), one cluster each, weighted 0.1 / 0.9.
# flip_y=0 disables random label flipping; random_state fixes the generated data.
X, y = make_classification(n_samples=1000,
                           n_features=20,
                           n_informative=3,
                           n_redundant=1,
                           n_classes=2,
                           n_clusters_per_class=1,
                           weights=[0.1, 0.9],
                           class_sep=2,
                           flip_y=0,
                           random_state=10)

# Show how many samples fall into each class.
print('Original dataset shape {}'.format(Counter(y)))

Original dataset shape Counter({1: 900, 0: 100})


# With and without Pipeline

In [6]:
### Using imblearn Pipeline ###
# A Pipeline chains several steps so that they can be cross-validated together.
# It applies the listed samplers and transformers in order, ending with a final estimator.

# Hold out a test split (default split size, fixed seed).
X_train, X_test, y_train, y_test = tts(X, y, random_state=42)

# Steps, in execution order: oversample -> project -> classify.
smote = SMOTE(random_state=42)
pca = PCA()
knn = KNN()
pipeline = Pipeline([
    ('smote', smote),
    ('pca', pca),
    ('knn', knn),
])
pipeline.fit(X_train, y_train)

# Score the fitted pipeline on the held-out split.
y_hat = pipeline.predict(X_test)
print("Classification accuracy:", accuracy_score(y_test, y_hat))

Classification accuracy: 0.984


In [7]:
### Using traditional approach ###
# Reproduce the pipeline by hand, in the same order: split -> SMOTE -> PCA -> KNN.
# The split comes first: fitting PCA on the full X would let information from the
# test rows leak into the transform that is later used to evaluate the model.
X_train2, X_test2, y_train2, y_test2 = tts(X, y, random_state=42)

# Oversample on the training split only; the test split is left untouched.
X_smote2, y_smote2 = SMOTE(random_state=42).fit_resample(X_train2, y_train2)

# Fit PCA on the resampled training data, then apply that same fitted projection
# to the test split (transform only, no refit).
pca2 = PCA(random_state=42)
X_smote_pca2 = pca2.fit_transform(X_smote2)
X_test_pca2 = pca2.transform(X_test2)

knn2 = KNN().fit(X_smote_pca2, y_smote2)
y_hat2 = knn2.predict(X_test_pca2)
print("Classification accuracy:", accuracy_score(y_test2, y_hat2))

Classification accuracy: 0.984


In [11]:
# Fresh estimator instances for a second pipeline that uses univariate feature
# selection (SelectPercentile scored with f_classif) in place of PCA.
# NOTE: `pca` is re-created here but is not one of this pipeline's steps.
pca = PCA()
smote = SMOTE(random_state=42)
knn = KNN()
pipeline = Pipeline([
    ('smote', smote),
    ('feat_sel', SelectPercentile(feature_selection.f_classif)),
    ('knn', knn),
])

# Candidate settings keyed by step name / `<step>__<param>`.
# Presumably meant as a parameter grid for a search over the pipeline; no search is run in this cell.
para_steps = {
    'feat_sel': [SelectPercentile(feature_selection.f_classif)],
    'feat_sel__percentile': [100],
}

In [12]:
# Display the candidate-settings dict for inspection.
para_steps

{'feat_sel': [SelectPercentile(percentile=10,
           score_func=<function f_classif at 0x7f7dbc34fa60>)],
 'feat_sel__percentile': [100]}