1. взять любой набор данных для бинарной классификации (можно скачать один из модельных с https://archive.ics.uci.edu/ml/datasets.php)
3. сделать feature engineering
4. обучить любой классификатор (какой вам нравится)
5. далее разделить ваш набор данных на два множества: P (positives) и U (unlabeled). Причем брать нужно не все положительные (класс 1) примеры, а только лишь часть
6. применить random negative sampling для построения классификатора в новых условиях
7. сравнить качество с решением из пункта 4 (построить отчет - таблицу метрик)
8. поэкспериментировать с долей P на шаге 5 (как будет меняться качество модели при уменьшении/увеличении размера P)

#### Загружаем датасет

In [98]:
import pandas as pd
import numpy as np
data = pd.read_csv("adult.data", header=None)
data.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K


In [99]:
print(data.shape)

(32561, 15)


Задача: спрогнозировать, превышает ли доход 50 тысяч долларов в год. Наша целевая переменная столбец номер 14. Надо привести его
к бинарному виду

#### Feature engeneering

In [100]:
data[14] = data[14].map({' <=50K': 0, ' >50K': 1}) 
data[14]

0        0
1        0
2        0
3        0
4        0
        ..
32556    0
32557    1
32558    0
32559    0
32560    1
Name: 14, Length: 32561, dtype: int64

In [101]:
data.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0


In [102]:
data.iloc[:, -1].value_counts()

0    24720
1     7841
Name: 14, dtype: int64

#### Делим на train и test выборки

In [103]:
from sklearn.model_selection import train_test_split

x_data = pd.get_dummies(data.iloc[:,:-1])
y_data = data.iloc[:,-1]

x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=7)

#### Обучаем модель xgboost

In [104]:
import xgboost as xgb

model = xgb.XGBClassifier()

model.fit(x_train, y_train)
y_predict = model.predict(x_test)





#### Создаем функцию расчета f1, recall, precision, roc_auc и 

In [172]:
from sklearn.metrics import recall_score, precision_score, roc_auc_score, accuracy_score, f1_score

def evaluate_results(y_test, y_predict):
    
    f1 = f1_score(y_test, y_predict) * 100

    roc = roc_auc_score(y_test, y_predict) * 100

    rec = recall_score(y_test, y_predict, average='binary') * 100

    prc = precision_score(y_test, y_predict, average='binary') * 100

    return f1, roc, rec, prc


#### Делим наш набор данных на два множества: P (positives) и U (unlabeled)

In [173]:
mod_data = pd.get_dummies(data.copy())
#get the indices of the positives samples
pos_ind = np.where(mod_data.loc[:, 14].values == 1)[0]
#shuffle them
np.random.shuffle(pos_ind)
# leave just 25% of the positives marked
pos_sample_len = int(np.ceil(0.25 * len(pos_ind)))
print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
pos_sample = pos_ind[:pos_sample_len]

Using 1961/7841 as positives and unlabeling the rest


Создаем столбец для новой целевой переменной, где у нас два класса - P (1) и U (-1)

In [174]:
mod_data['class_test'] = -1
mod_data.loc[pos_sample,'class_test'] = 1
print('target variable:\n', mod_data.iloc[:,-1].value_counts())

target variable:
 -1    30600
 1     1961
Name: class_test, dtype: int64


In [175]:
mod_data.head(10)

Unnamed: 0,0,2,4,10,11,12,14,1_ ?,1_ Federal-gov,1_ Local-gov,...,13_ Puerto-Rico,13_ Scotland,13_ South,13_ Taiwan,13_ Thailand,13_ Trinadad&Tobago,13_ United-States,13_ Vietnam,13_ Yugoslavia,class_test
0,39,77516,13,2174,0,40,0,0,0,0,...,0,0,0,0,0,0,1,0,0,-1
1,50,83311,13,0,0,13,0,0,0,0,...,0,0,0,0,0,0,1,0,0,-1
2,38,215646,9,0,0,40,0,0,0,0,...,0,0,0,0,0,0,1,0,0,-1
3,53,234721,7,0,0,40,0,0,0,0,...,0,0,0,0,0,0,1,0,0,-1
4,28,338409,13,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-1
5,37,284582,14,0,0,40,0,0,0,0,...,0,0,0,0,0,0,1,0,0,-1
6,49,160187,5,0,0,16,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-1
7,52,209642,9,0,0,45,1,0,0,0,...,0,0,0,0,0,0,1,0,0,-1
8,31,45781,14,14084,0,50,1,0,0,0,...,0,0,0,0,0,0,1,0,0,-1
9,42,159449,13,5178,0,40,1,0,0,0,...,0,0,0,0,0,0,1,0,0,1


In [176]:
x_data = mod_data.drop([14, 'class_test'], axis = 1) # just the X 
y_labeled = mod_data.loc[:,'class_test'].values # new class (just the P & U)
y_positive = mod_data.loc[:,14].values # original class

#### random negative sampling

In [177]:
mod_data = mod_data.sample(frac=1)
neg_sample = mod_data[mod_data['class_test']==-1][:len(mod_data[mod_data['class_test']==1])]
sample_test = mod_data[mod_data['class_test']==-1][len(mod_data[mod_data['class_test']==1]):]
pos_sample = mod_data[mod_data['class_test']==1]
print(neg_sample.shape, pos_sample.shape)
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)

(1961, 110) (1961, 110)


In [178]:
model_rns = xgb.XGBClassifier()

model_rns.fit(sample_train.drop([14, 'class_test'], axis=1).values,
          sample_train.loc[:, 14].values)
y_predict_rns = model_rns.predict(sample_test.drop([14, 'class_test'], axis=1).values)






In [179]:
pd.DataFrame([evaluate_results(y_test, y_predict), evaluate_results(sample_test.loc[:, 14].values, y_predict_rns)
             ], columns={'f1', 'precision', 'recall', 'roc'}, index={'xgboost', 'rns'}).round(3)

Unnamed: 0,roc,recall,f1,precision
xgboost,70.905,79.602,64.981,78.017
rns,60.634,82.212,89.662,45.805


#### Экспериментируем с долей р.

In [180]:
p_len_percent = [0.10, 0.20, 0.25, 0.30, 0.40]

In [181]:
results = []
for i in p_len_percent:
    mod_data = pd.get_dummies(data.copy())
    # mod_data['income'] = mod_data[14].copy()
    # mod_data = mod_data.drop([14], axis = 1)
    #get the indices of the positives samples
    pos_ind = np.where(mod_data.loc[:, 14].values == 1)[0]
    #shuffle them
    np.random.shuffle(pos_ind)
    # leave just 25% of the positives marked
    pos_sample_len = int(np.ceil(i * len(pos_ind)))
    print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
    pos_sample = pos_ind[:pos_sample_len]
    mod_data['class_test'] = -1
    mod_data.loc[pos_sample,'class_test'] = 1
    print('target variable:\n', mod_data.iloc[:,-1].value_counts())
    x_data = mod_data.drop([14, 'class_test'], axis = 1) # just the X 
    y_labeled = mod_data.loc[:,'class_test'].values # new class (just the P & U)
    y_positive = mod_data.loc[:,14].values # original class
    mod_data = mod_data.sample(frac=1)
    neg_sample = mod_data[mod_data['class_test']==-1][:len(mod_data[mod_data['class_test']==1])]
    sample_test = mod_data[mod_data['class_test']==-1][len(mod_data[mod_data['class_test']==1]):]
    pos_sample = mod_data[mod_data['class_test']==1]
    print(neg_sample.shape, pos_sample.shape)
    sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)
    model_rns = xgb.XGBClassifier()

    model_rns.fit(sample_train.drop([14, 'class_test'], axis=1).values,
              sample_train.loc[:, 14].values)
    y_predict_rns = model_rns.predict(sample_test.drop([14, 'class_test'], axis=1).values)
   
    
    result = evaluate_results(sample_test.loc[:, 14].values, y_predict_rns)
    results.append(result)

Using 785/7841 as positives and unlabeling the rest
target variable:
 -1    31776
 1      785
Name: class_test, dtype: int64
(785, 110) (785, 110)




Using 1569/7841 as positives and unlabeling the rest
target variable:
 -1    30992
 1     1569
Name: class_test, dtype: int64
(1569, 110) (1569, 110)




Using 1961/7841 as positives and unlabeling the rest
target variable:
 -1    30600
 1     1961
Name: class_test, dtype: int64
(1961, 110) (1961, 110)




Using 2353/7841 as positives and unlabeling the rest
target variable:
 -1    30208
 1     2353
Name: class_test, dtype: int64
(2353, 110) (2353, 110)




Using 3137/7841 as positives and unlabeling the rest
target variable:
 -1    29424
 1     3137
Name: class_test, dtype: int64
(3137, 110) (3137, 110)




In [182]:
pd.DataFrame(results, columns={'f1', 'precision', 'recall', 'roc'})

Unnamed: 0,roc,recall,f1,precision
0,62.005987,80.431236,88.706634,47.660271
1,61.574535,81.594568,88.106022,47.323842
2,61.538462,82.572057,89.034132,47.018217
3,60.355189,82.684626,88.582755,45.770176
4,57.39077,83.063295,89.188544,42.307257


С уменьшением доли р качество модели улучшается, а с увеличением - ухудшается.