In [170]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
#from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, Normalizer
from xgboost import XGBClassifier
from sklearn.metrics import recall_score, precision_score, roc_auc_score, accuracy_score, f1_score
import matplotlib.pyplot as plt

%matplotlib inline

In [171]:
data = pd.read_csv("/Users/svetlanaskobeltcyna/Edu/ML_in_business/lesson_6/BankNote_Authentication.csv")
data.head(3)

Unnamed: 0,variance,skewness,curtosis,entropy,class
0,3.6216,8.6661,-2.8073,-0.44699,0
1,4.5459,8.1674,-2.4586,-1.4621,0
2,3.866,-2.6383,1.9242,0.10645,0


In [172]:
print(data.shape)

(1372, 5)


In [173]:
data.iloc[:, -1].value_counts()

0    762
1    610
Name: class, dtype: int64

In [174]:
x_data = data.iloc[:,:-1]
y_data = data.iloc[:,-1]

x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=13)

In [175]:
model = xgb.XGBClassifier()

model.fit(x_train, y_train)
y_predict = model.predict(x_test)





In [176]:
def evaluate_results(y_test, y_predict):
    print('Classification results:')
    f1 = f1_score(y_test, y_predict)
    print("f1: %.2f%%" % (f1 * 100.0)) 
    roc = roc_auc_score(y_test, y_predict)
    print("roc: %.2f%%" % (roc * 100.0)) 
    rec = recall_score(y_test, y_predict, average='binary')
    print("recall: %.2f%%" % (rec * 100.0)) 
    prc = precision_score(y_test, y_predict, average='binary')
    print("precision: %.2f%%" % (prc * 100.0)) 

    
evaluate_results(y_test, y_predict)

Classification results:
f1: 99.17%
roc: 99.26%
recall: 99.17%
precision: 99.17%


### Теперь очередь за PU learning

In [177]:
mod_data = data.copy()
#get the indices of the positives samples
pos_ind = np.where(mod_data.iloc[:,-1].values == 1)[0]
#shuffle them
np.random.shuffle(pos_ind)
# leave just 25% of the positives marked
pos_sample_len = int(np.ceil(0.25 * len(pos_ind)))
print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
pos_sample = pos_ind[:pos_sample_len]

Using 153/610 as positives and unlabeling the rest


Создаем столбец для новой целевой переменной, где у нас два класса - P (1) и U (-1)

In [178]:
mod_data['class_test'] = -1
mod_data.loc[pos_sample,'class_test'] = 1
print('target variable:\n', mod_data.iloc[:,-1].value_counts())

target variable:
 -1    1219
 1     153
Name: class_test, dtype: int64


* We now have just 153 positive samples labeled as 1 in the 'class_test' col while the rest is unlabeled as -1.

* Recall that col 4 still holds the actual label

In [179]:
mod_data.head(10)

Unnamed: 0,variance,skewness,curtosis,entropy,class,class_test
0,3.6216,8.6661,-2.8073,-0.44699,0,-1
1,4.5459,8.1674,-2.4586,-1.4621,0,-1
2,3.866,-2.6383,1.9242,0.10645,0,-1
3,3.4566,9.5228,-4.0112,-3.5944,0,-1
4,0.32924,-4.4552,4.5718,-0.9888,0,-1
5,4.3684,9.6718,-3.9606,-3.1625,0,-1
6,3.5912,3.0129,0.72888,0.56421,0,-1
7,2.0922,-6.81,8.4636,-0.60216,0,-1
8,3.2032,5.7588,-0.75345,-0.61251,0,-1
9,1.5356,9.1772,-2.2718,-0.73535,0,-1


Remember that this data frame (x_data) includes the former target variable that we keep here just to compare the results

[:-2] is the original class label for positive and negative data [:-1] is the new class for positive and unlabeled data

In [180]:
x_data = mod_data.iloc[:,:-2].values # just the X 
y_labeled = mod_data.iloc[:,-1].values # new class (just the P & U)
y_positive = mod_data.iloc[:,-2].values # original class

### 1. random negative sampling

In [181]:
mod_data = mod_data.sample(frac=1)
neg_sample = mod_data[mod_data['class_test']==-1][:len(mod_data[mod_data['class_test']==1])]
sample_test = mod_data[mod_data['class_test']==-1][len(mod_data[mod_data['class_test']==1]):]
pos_sample = mod_data[mod_data['class_test']==1]
print(neg_sample.shape, pos_sample.shape)
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)

(153, 6) (153, 6)


In [182]:
model = xgb.XGBClassifier()

model.fit(sample_train.iloc[:,:-2].values, 
          sample_train.iloc[:,-2].values)
y_predict = model.predict(sample_test.iloc[:,:-2].values)
evaluate_results(sample_test.iloc[:,-2].values, y_predict)

Classification results:
f1: 96.23%
roc: 97.27%
recall: 98.02%
precision: 94.51%




### Домашнее задание

1. взять любой набор данных для бинарной классификации (можно скачать один из модельных с https://archive.ics.uci.edu/ml/datasets.php)


In [183]:
data = pd.read_csv("/Users/svetlanaskobeltcyna/Edu/ML_in_business/lesson_6/winequality-red.csv")
data.head(3)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5


In [184]:
print(data.shape)

(1599, 12)


In [185]:
data.iloc[:, -1].value_counts()

5    681
6    638
7    199
4     53
8     18
3     10
Name: quality, dtype: int64

In [186]:
data1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
fixed acidity           1599 non-null float64
volatile acidity        1599 non-null float64
citric acid             1599 non-null float64
residual sugar          1599 non-null float64
chlorides               1599 non-null float64
free sulfur dioxide     1599 non-null float64
total sulfur dioxide    1599 non-null float64
density                 1599 non-null float64
pH                      1599 non-null float64
sulphates               1599 non-null float64
alcohol                 1599 non-null float64
quality                 1599 non-null int64
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


3. сделать feature engineering


In [187]:
data1 = data.copy()

data1['quality'] = np.where(data1['quality'] >= 7, 1, 0)
data1.head(3)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,0


In [188]:
data1['quality'].value_counts()

0    1382
1     217
Name: quality, dtype: int64

In [189]:
features = ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 
            'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 
           'sulphates', 'alcohol']
x_data = data1[features]
y_data = data1['quality']

x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=13)

In [190]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X[self.column]
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]

In [191]:
final_transformers = list()

for cont_col in features:
    cont_transformer = Pipeline([
                ('selector', NumberSelector(key=cont_col)),
                ('standard', StandardScaler())
            ])
    final_transformers.append((cont_col, cont_transformer))

In [192]:
feats = FeatureUnion(final_transformers)

feature_processing = Pipeline([('feats', feats)])

In [193]:
feature_processing.fit_transform(x_train)

array([[ 0.02872658,  0.37227979, -0.87944031, ..., -1.02113691,
         0.02741263, -0.59067052],
       [ 1.33888127,  2.00767684,  0.3453918 , ..., -0.57071375,
         0.14506617, -1.05462196],
       [-0.4269794 , -0.5544452 ,  0.09021845, ...,  0.58751721,
        -0.09024091, -0.86904138],
       ...,
       [ 0.7692488 , -1.15409079,  1.36608524, ..., -0.18463677,
         0.96864096, -0.77625109],
       [-0.59786914, -0.82701138, -0.31805893, ...,  0.13709406,
        -1.09029601, -0.86904138],
       [ 2.30725648, -0.2273658 ,  1.57022392, ..., -0.95679074,
         2.02752283,  0.98676437]])

4. обучить любой классификатор (какой вам нравится)


In [194]:
pipeline = Pipeline([
    ('features',feats),
    ('classifier', XGBClassifier()),
])

In [195]:
pipeline.fit(x_train, y_train)





Pipeline(memory=None,
         steps=[('features',
                 FeatureUnion(n_jobs=None,
                              transformer_list=[('fixed acidity',
                                                 Pipeline(memory=None,
                                                          steps=[('selector',
                                                                  NumberSelector(key='fixed '
                                                                                     'acidity')),
                                                                 ('standard',
                                                                  StandardScaler(copy=True,
                                                                                 with_mean=True,
                                                                                 with_std=True))],
                                                          verbose=False)),
                                                ('volatile

In [196]:
y_predict = pipeline.predict(x_test)
y_predict

array([1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [200]:
def evaluate_results(y_test, y_predict):
    print('Classification results:')
    f1 = f1_score(y_test, y_predict)
    print("f1: %.2f%%" % (f1 * 100.0)) 
    roc = roc_auc_score(y_test, y_predict)
    print("roc: %.2f%%" % (roc * 100.0)) 
    rec = recall_score(y_test, y_predict, average='binary')
    print("recall: %.2f%%" % (rec * 100.0)) 
    prc = precision_score(y_test, y_predict, average='binary')
    print("precision: %.2f%%" % (prc * 100.0)) 
    return {'f1': f1, 'roc_auc': roc, 'recall': rec, 'precision': prc}
    
XGBClf = evaluate_results(y_test, y_predict)

Classification results:
f1: 61.54%
roc: 76.94%
recall: 58.54%
precision: 64.86%


5. далее разделить ваш набор данных на два множества: P (positives) и U (unlabeled). Причем брать нужно не все положительные (класс 1) примеры, а только лишь часть


In [201]:
mod_data = data1.copy()
#get the indices of the positives samples
pos_ind = np.where(mod_data.iloc[:,-1].values == 1)[0]
#shuffle them
np.random.shuffle(pos_ind)
# leave just 25% of the positives marked
pos_sample_len = int(np.ceil(0.25 * len(pos_ind)))
print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
pos_sample = pos_ind[:pos_sample_len]

Using 55/217 as positives and unlabeling the rest


Создаем столбец для новой целевой переменной, где у нас два класса - P (1) и U (-1)

In [202]:
mod_data['class_test'] = -1
mod_data.loc[pos_sample,'class_test'] = 1
print('target variable:\n', mod_data.iloc[:,-1].value_counts())

target variable:
 -1    1544
 1      55
Name: class_test, dtype: int64


In [203]:
mod_data

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,class_test
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,0,-1
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,0,-1
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,0,-1
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,0,-1
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,0,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,0,-1
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,0,-1
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,0,-1
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,0,-1


* We now have just 55 positive samples labeled as 1 in the 'class_test' col while the rest is unlabeled as -1.

* Recall that col 'quality' still holds the actual label

In [204]:
x_data = mod_data[features].values # just the X 
y_labeled = mod_data['class_test'].values # new class (just the P & U)
y_positive = mod_data['quality'].values # original class

6. применить random negative sampling для построения классификатора в новых условиях


In [205]:
mod_data = mod_data.sample(frac=1)
neg_sample = mod_data[mod_data['class_test']==-1][:len(mod_data[mod_data['class_test']==1])]
sample_test = mod_data[mod_data['class_test']==-1][len(mod_data[mod_data['class_test']==1]):]
pos_sample = mod_data[mod_data['class_test']==1]
print(neg_sample.shape, pos_sample.shape)
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)

(55, 13) (55, 13)


In [206]:
sample_train.head(5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,class_test
1043,9.5,0.39,0.41,8.9,0.069,18.0,39.0,0.99859,3.29,0.81,10.9,1,1
840,11.1,0.42,0.47,2.65,0.085,9.0,34.0,0.99736,3.24,0.77,12.1,1,1
1552,6.3,0.68,0.01,3.7,0.103,32.0,54.0,0.99586,3.51,0.66,11.3,0,-1
611,13.2,0.38,0.55,2.7,0.081,5.0,16.0,1.0006,2.98,0.54,9.4,0,-1
588,5.0,0.42,0.24,2.0,0.06,19.0,50.0,0.9917,3.72,0.74,14.0,1,1


In [220]:
feature_processing.fit_transform(sample_train)

In [208]:
pipeline.fit(sample_train[features], sample_train['quality'])





Pipeline(memory=None,
         steps=[('features',
                 FeatureUnion(n_jobs=None,
                              transformer_list=[('fixed acidity',
                                                 Pipeline(memory=None,
                                                          steps=[('selector',
                                                                  NumberSelector(key='fixed '
                                                                                     'acidity')),
                                                                 ('standard',
                                                                  StandardScaler(copy=True,
                                                                                 with_mean=True,
                                                                                 with_std=True))],
                                                          verbose=False)),
                                                ('volatile

In [209]:
y_predict = pipeline.predict(sample_test[features])
y_predict

array([0, 1, 1, ..., 0, 1, 1])

In [212]:
XGBClf_pu = evaluate_results(sample_test['quality'], y_predict)

Classification results:
f1: 39.39%
roc: 76.93%
recall: 81.76%
precision: 25.95%


7. сравнить качество с решением из пункта 4 (построить отчет - таблицу метрик)


In [216]:
metrics = [XGBClf, XGBClf_pu]

df = pd.DataFrame(metrics, index=['XGBClf','XGBClf_pu'])
df

Unnamed: 0,f1,roc_auc,recall,precision
XGBClf,0.615385,0.769385,0.585366,0.648649
XGBClf_pu,0.393939,0.769331,0.81761,0.259481


8. поэкспериментировать с долей P на шаге 5 (как будет меняться качество модели при уменьшении/увеличении размера P)

In [217]:
mod_data = data1.copy()
#get the indices of the positives samples
pos_ind = np.where(mod_data.iloc[:,-1].values == 1)[0]
#shuffle them
np.random.shuffle(pos_ind)
# leave just 40% of the positives marked
pos_sample_len = int(np.ceil(0.4 * len(pos_ind)))
print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
pos_sample = pos_ind[:pos_sample_len]

Using 87/217 as positives and unlabeling the rest


In [223]:
mod_data['class_test'] = -1
mod_data.loc[pos_sample,'class_test'] = 1
print('target variable:\n', mod_data.iloc[:,-1].value_counts())

target variable:
 -1    1512
 1      87
Name: class_test, dtype: int64


In [224]:
x_data = mod_data[features].values # just the X 
y_labeled = mod_data['class_test'].values # new class (just the P & U)
y_positive = mod_data['quality'].values # original class

In [225]:
mod_data = mod_data.sample(frac=1)
neg_sample = mod_data[mod_data['class_test']==-1][:len(mod_data[mod_data['class_test']==1])]
sample_test = mod_data[mod_data['class_test']==-1][len(mod_data[mod_data['class_test']==1]):]
pos_sample = mod_data[mod_data['class_test']==1]
print(neg_sample.shape, pos_sample.shape)
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)

(87, 13) (87, 13)


In [226]:
feature_processing.fit_transform(sample_train)

array([[-0.93243597, -1.02001391, -0.19527316, ..., -0.28199276,
        -1.90895996,  0.75394506],
       [ 1.25668693, -0.86050305,  0.82405274, ..., -1.15373688,
         2.22988875, -0.21682577],
       [ 0.16212548, -1.23269506,  0.33865945, ..., -1.15373688,
        -0.59503973, -1.1875966 ],
       ...,
       [-0.78317759,  0.6814353 , -0.97190242, ...,  1.33696061,
        -0.3979517 , -0.56983335],
       [ 0.95817017, -0.27562988,  1.64922132, ..., -0.40652764,
         0.45609645,  0.75394506],
       [-0.88268317,  1.90435191, -1.1175204 , ...,  0.46521648,
        -1.51478389, -1.09934471]])

In [227]:
pipeline.fit(sample_train[features], sample_train['quality'])





Pipeline(memory=None,
         steps=[('features',
                 FeatureUnion(n_jobs=None,
                              transformer_list=[('fixed acidity',
                                                 Pipeline(memory=None,
                                                          steps=[('selector',
                                                                  NumberSelector(key='fixed '
                                                                                     'acidity')),
                                                                 ('standard',
                                                                  StandardScaler(copy=True,
                                                                                 with_mean=True,
                                                                                 with_std=True))],
                                                          verbose=False)),
                                                ('volatile

In [228]:
y_predict = pipeline.predict(sample_test[features])
y_predict

array([1, 1, 0, ..., 1, 1, 0])

In [229]:
XGBClf_pu_40 = evaluate_results(sample_test['quality'], y_predict)

Classification results:
f1: 39.36%
roc: 82.11%
recall: 89.52%
precision: 25.23%


In [230]:
metrics = [XGBClf, XGBClf_pu, XGBClf_pu_40]

df = pd.DataFrame(metrics, index=['XGBClf','XGBClf_pu_25', 'XGBClf_pu_40'])
df

Unnamed: 0,f1,roc_auc,recall,precision
XGBClf,0.615385,0.769385,0.585366,0.648649
XGBClf_pu_25,0.393939,0.769331,0.81761,0.259481
XGBClf_pu_40,0.393617,0.821139,0.895161,0.252273
