
    Взять любой набор данных для бинарной классификации (можно скачать один из модельных с https://archive.ics.uci.edu/ml/datasets.php)
    
    Обучить любой классификатор (какой вам нравится)
    
    Разделить ваш набор данных на два множества: P (positives) и U (unlabeled). Причем брать нужно не все положительные примеры (класс 1), а только лишь часть
        
    Применить random negative sampling для построения классификатора в новых условиях
    
    Сравнить качество с решением из пункта 3 (построить отчет - таблицу метрик)
    
    *Поэкспериментировать с долей P на шаге 5 (как будет меняться качество модели при уменьшении/увеличении размера P)



In [1]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import itertools

import matplotlib.pyplot as plt


df = pd.read_csv("adult.data", header=None)
df.rename(columns = {0 : 'age', 1 : 'workclass', 2:'fnlwgt',3:'education',4:'education-num',5:'marital-status',6:'occupation',7:'relationship',8:'race',9:'sex',10:'capital-gain',11:'capital-loss',12:'hours-per-week',13:'native-country',14:'life_len',}, inplace = True) 
# df.rename(columns={'index':'hhh'}, inplace = True)
df['user_id'] = df.index
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,life_len,user_id
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K,1
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K,2
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K,3
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K,4


In [2]:
df['life_len'].value_counts()

 <=50K    24720
 >50K      7841
Name: life_len, dtype: int64

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education-num   32561 non-null  int64 
 5   marital-status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital-gain    32561 non-null  int64 
 11  capital-loss    32561 non-null  int64 
 12  hours-per-week  32561 non-null  int64 
 13  native-country  32561 non-null  object
 14  life_len        32561 non-null  object
 15  user_id         32561 non-null  int64 
dtypes: int64(7), object(9)
memory usage: 4.0+ MB


7841 долгожителей (нам нужно 11000 по условию задачи). 

Давайте разметим наш датасет (долгожители - метка 1) - это класс 1. 

Мы предполагаем, что людям со схожими на класс 1 данными (но не долгажители), понравится наше предложение. 

In [4]:
df['y'] = df['life_len'].apply(lambda x: 1 if x == ' >50K' else 0)
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,life_len,user_id,y
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K,0,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K,1,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K,2,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K,3,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K,4,0


In [5]:
df['y'].value_counts()

0    24720
1     7841
Name: y, dtype: int64

In [6]:
df['all_cat'] = df[['workclass','education','marital-status','occupation','relationship','race','sex','native-country']].apply(lambda x: " ".join(x), 1)
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,life_len,user_id,y,all_cat
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K,0,0,State-gov Bachelors Never-married Adm-cler...
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K,1,0,Self-emp-not-inc Bachelors Married-civ-spou...
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K,2,0,Private HS-grad Divorced Handlers-cleaners...
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K,3,0,Private 11th Married-civ-spouse Handlers-c...
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K,4,0,Private Bachelors Married-civ-spouse Prof-...


In [7]:
X = df[['user_id', 'all_cat', 'y']]
X

Unnamed: 0,user_id,all_cat,y
0,0,State-gov Bachelors Never-married Adm-cler...,0
1,1,Self-emp-not-inc Bachelors Married-civ-spou...,0
2,2,Private HS-grad Divorced Handlers-cleaners...,0
3,3,Private 11th Married-civ-spouse Handlers-c...,0
4,4,Private Bachelors Married-civ-spouse Prof-...,0
...,...,...,...
32556,32556,Private Assoc-acdm Married-civ-spouse Tech...,0
32557,32557,Private HS-grad Married-civ-spouse Machine...,1
32558,32558,Private HS-grad Widowed Adm-clerical Unma...,0
32559,32559,Private HS-grad Never-married Adm-clerical...,0


Посмотрим на соотношение классов (P vs U), где P - позитивы и U - все остальные неразмеченные данные

In [8]:
X['y'].value_counts()

0    24720
1     7841
Name: y, dtype: int64

Будем используем обычный random negative sample

In [9]:
# разделим данные на train/test
X_train, X_test, y_train, y_test = train_test_split(X, X['y'], random_state=0)

In [10]:
# соберем наш простой pipeline, но нам понадобится написать класс для выбора нужного поля

class FeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X[self.column]

pipeline = Pipeline([('all_selector', FeatureSelector(column='all_cat')), 
                     ('all_tfidf', TfidfVectorizer()), 
                     ('clf', LogisticRegression())])

In [11]:
# обучим наш пайплайн
pipeline.fit(X_train, y_train)

Pipeline(steps=[('all_selector', FeatureSelector(column='all_cat')),
                ('all_tfidf', TfidfVectorizer()),
                ('clf', LogisticRegression())])

In [12]:
tfidf = pipeline.named_steps['all_tfidf']
len(tfidf.get_feature_names())



140

In [13]:
df_word = pd.DataFrame.sparse.from_spmatrix(tfidf.transform(X_test['all_cat'].iloc[:10]))
df_word = pd.DataFrame(df_word)
df_word.columns = tfidf.get_feature_names_out()
df_word

Unnamed: 0,10th,11th,12th,1st,4th,5th,6th,7th,8th,9th,...,us,usvi,vietnam,voc,white,widowed,wife,without,worked,yugoslavia
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.147304,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.151477,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.147336,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.389046,0.389046,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.474177,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.126674,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.151781,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.141355,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.419725,0.11769,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.144093,0.0,0.0,0.0,0.0,0.0


Получим прогнозы для "тестовой" выборки

In [14]:
# наши прогнозы для тестовой выборки
preds = pipeline.predict_proba(X_test)[:, 1]
preds[:10]

array([0.03426237, 0.07610895, 0.5087254 , 0.10523298, 0.0536911 ,
       0.27450939, 0.01895413, 0.05005484, 0.18976825, 0.00563125])

In [15]:
from sklearn.metrics import f1_score, roc_auc_score, precision_score, classification_report, precision_recall_curve, confusion_matrix

precision, recall, thresholds = precision_recall_curve(y_test, preds)
fscore = (2 * precision * recall) / (precision + recall)
# locate the index of the largest f score
ix = np.argmax(fscore)
print(f'Best Threshold={thresholds[ix]}, F-Score={fscore[ix]:.3f}, Precision={precision[ix]:.3f}, Recall={recall[ix]:.3f}')

Best Threshold=0.3183753522904963, F-Score=0.662, Precision=0.599, Recall=0.740


Средние метрики. Давайте теперь проскорим всю выборку и отсортируем по убыванию прогноза (ведь нам в итоге нужно взять топ 11000 пользователей)

In [16]:
X['y_pred'] = pipeline.predict_proba(X)[:, 1]
X = X.sort_values('y_pred', ascending=False)
X.head(15)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['y_pred'] = pipeline.predict_proba(X)[:, 1]


Unnamed: 0,user_id,all_cat,y,y_pred
10831,10831,Private Doctorate Married-civ-spouse Exec-...,1,0.934491
21641,21641,Private Doctorate Married-civ-spouse Exec-...,1,0.934491
25456,25456,Private Doctorate Married-civ-spouse Exec-...,1,0.934491
6281,6281,Private Doctorate Married-civ-spouse Exec-...,1,0.934491
11021,11021,Private Doctorate Married-civ-spouse Exec-...,1,0.934491
7777,7777,Private Doctorate Married-civ-spouse Exec-...,1,0.934491
413,413,Private Doctorate Married-civ-spouse Exec-...,1,0.934491
28267,28267,Private Doctorate Married-civ-spouse Exec-...,1,0.934491
14707,14707,Private Doctorate Married-civ-spouse Exec-...,0,0.934491
8234,8234,Private Doctorate Married-civ-spouse Exec-...,1,0.934491


Заметим что высокую вероятность получил 14707 у которого y=0

возьмем наши топ 11000 пользователей

In [17]:
selected_users = X.iloc[:11000]['user_id'].values

Можно попытаться проанализировать, чем они отличаются от всех остальных (если отличаются вообще)

In [18]:
df[df['user_id'].isin(selected_users)]['all_cat'].value_counts(normalize=True)

 Private  HS-grad  Married-civ-spouse  Craft-repair  Husband  White  Male  United-States               0.073000
 Private  Bachelors  Married-civ-spouse  Exec-managerial  Husband  White  Male  United-States          0.040727
 Private  Some-college  Married-civ-spouse  Craft-repair  Husband  White  Male  United-States          0.031364
 Private  Bachelors  Married-civ-spouse  Prof-specialty  Husband  White  Male  United-States           0.028091
 Private  HS-grad  Married-civ-spouse  Sales  Husband  White  Male  United-States                      0.024909
                                                                                                         ...   
 Federal-gov  Bachelors  Widowed  Adm-clerical  Not-in-family  White  Male  United-States              0.000091
 Private  9th  Married-civ-spouse  Exec-managerial  Wife  Asian-Pac-Islander  Female  United-States    0.000091
 ?  Assoc-acdm  Married-AF-spouse  ?  Wife  White  Female  United-States                               0

In [19]:
df[~df['user_id'].isin(selected_users)]['all_cat'].value_counts(normalize=True)

 Private  HS-grad  Married-civ-spouse  Machine-op-inspct  Husband  White  Male  United-States              0.017068
 Private  HS-grad  Married-civ-spouse  Transport-moving  Husband  White  Male  United-States               0.010992
 ?  HS-grad  Married-civ-spouse  ?  Husband  White  Male  United-States                                    0.006957
 Private  HS-grad  Married-civ-spouse  Handlers-cleaners  Husband  White  Male  United-States              0.006632
 Self-emp-not-inc  HS-grad  Married-civ-spouse  Craft-repair  Husband  White  Male  United-States          0.006447
                                                                                                             ...   
 Private  Some-college  Married-civ-spouse  Protective-serv  Other-relative  Black  Female  Haiti          0.000046
 Self-emp-not-inc  HS-grad  Never-married  Craft-repair  Other-relative  White  Male  United-States        0.000046
 ?  5th-6th  Married-civ-spouse  ?  Husband  Black  Male  United-States 

Посмотрим на feature_importances

In [20]:
feature_names = pipeline.named_steps["all_tfidf"].get_feature_names_out()
feature_weights = pipeline.named_steps["clf"].coef_[0]

weights = pd.DataFrame({
    'feature_name': feature_names,
    'feature_weight': feature_weights
}).sort_values('feature_weight', ascending=False)

# топ 100 слов
weights['feature_name'].iloc[:100].values

array(['doctorate', 'wife', 'school', 'masters', 'civ', 'spouse', 'prof',
       'private', 'exec', 'managerial', 'federal', 'sales', 'bachelors',
       'protective', 'italy', 'philippines', 'support', 'tech', 'af',
       'cambodia', 'self', 'inc', 'emp', 'gov', 'male', 'canada', 'serv',
       'cuba', 'in', 'family', 'specialty', 'japan', 'craft', 'repair',
       'taiwan', 'germany', 'moving', 'transport', 'yugoslavia',
       'clerical', 'adm', 'france', 'england', 'poland', 'iran', 'states',
       'united', 'ireland', 'pac', 'islander', 'asian', 'acdm',
       'scotland', 'assoc', 'widowed', 'local', 'op', 'machine', 'inspct',
       'thailand', 'south', 'india', 'voc', 'hong', 'nicaragua',
       'netherlands', 'holand', 'el', 'salvador', 'ecuador', 'armed',
       'forces', 'haiti', 'husband', 'worked', 'laos', 'honduras',
       'fishing', 'farming', 'peru', 'hungary', 'guatemala', 'outlying',
       'guam', 'usvi', 'etc', 'us', 'trinadad', 'tobago', 'jamaica',
       'greece

### Второй метод

In [54]:
df = pd.read_csv("adult.data", header=None)
df['y'] = df[14].apply(lambda x: 1 if x == ' >50K' else 0)
df.drop(columns=[14], inplace=True)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,y
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


In [55]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       32561 non-null  int64 
 1   1       32561 non-null  object
 2   2       32561 non-null  int64 
 3   3       32561 non-null  object
 4   4       32561 non-null  int64 
 5   5       32561 non-null  object
 6   6       32561 non-null  object
 7   7       32561 non-null  object
 8   8       32561 non-null  object
 9   9       32561 non-null  object
 10  10      32561 non-null  int64 
 11  11      32561 non-null  int64 
 12  12      32561 non-null  int64 
 13  13      32561 non-null  object
 14  y       32561 non-null  int64 
dtypes: int64(7), object(8)
memory usage: 3.7+ MB


In [56]:
for cat_colname in df.select_dtypes(include='object').columns[1:]:
    df = pd.concat([df, pd.get_dummies(df[cat_colname], prefix=cat_colname)], axis=1)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13_ Portugal,13_ Puerto-Rico,13_ Scotland,13_ South,13_ Taiwan,13_ Thailand,13_ Trinadad&Tobago,13_ United-States,13_ Vietnam,13_ Yugoslavia
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,...,0,0,0,0,0,0,0,1,0,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,...,0,0,0,0,0,0,0,1,0,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,...,0,0,0,0,0,0,0,1,0,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,...,0,0,0,0,0,0,0,1,0,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,...,0,0,0,0,0,0,0,0,0,0


In [57]:
df.drop(columns=[1, 3, 5, 6, 7, 8, 9, 13],inplace=True)
df

Unnamed: 0,0,2,4,10,11,12,y,3_ 10th,3_ 11th,3_ 12th,...,13_ Portugal,13_ Puerto-Rico,13_ Scotland,13_ South,13_ Taiwan,13_ Thailand,13_ Trinadad&Tobago,13_ United-States,13_ Vietnam,13_ Yugoslavia
0,39,77516,13,2174,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,50,83311,13,0,0,13,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,38,215646,9,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,53,234721,7,0,0,40,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
4,28,338409,13,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,257302,12,0,0,38,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
32557,40,154374,9,0,0,40,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
32558,58,151910,9,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
32559,22,201490,9,0,0,20,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [58]:
from sklearn.model_selection import train_test_split

x_data = df.drop(columns=['y',])
y_data = df['y']

x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=7)

In [59]:
import xgboost as xgb

model = xgb.XGBClassifier()

model.fit(x_train, y_train)
y_predict = model.predict(x_test)





Проверяем качество

In [60]:
from sklearn.metrics import recall_score, precision_score, roc_auc_score, accuracy_score, f1_score

def evaluate_results(y_test, y_predict):
    print('Classification results:')
    f1 = f1_score(y_test, y_predict)
    print(f"f1: {f1 * 100.0:.2f}%") 
    rec = recall_score(y_test, y_predict, average='binary')
    print(f"recall: {rec * 100.0:.2f}%") 
    prc = precision_score(y_test, y_predict, average='binary')
    print(f"precision: {prc * 100.0:.2f}%" ) 

    
evaluate_results(y_test, y_predict)

Classification results:
f1: 70.90%
recall: 65.04%
precision: 77.91%


### Теперь очередь PU learning

Представим, что нам неизвестны негативы и часть позитивов

In [61]:
mod_data = x_train.copy()
mod_data['label'] = y_train
mod_data = mod_data.reset_index(drop=True)

# mod_data = data.copy()
# get the indices of the positives samples
pos_ind = np.where(mod_data.iloc[:, -1].values == 1)[0]

# shuffle them
np.random.shuffle(pos_ind)
# leave just 25% of the positives marked
perc = 0.15
pos_sample_len = int(np.ceil(perc * len(pos_ind)))

print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
pos_sample = pos_ind[:pos_sample_len]

Using 942/6279 as positives and unlabeling the rest


Создаем столбец для новой целевой переменной, где у нас два класса - P (1) и U (-1)

In [62]:
mod_data['class_test'] = -1
mod_data.loc[pos_sample,'class_test'] = 1
print('target variable:\n', mod_data.iloc[:,-1].value_counts())

target variable:
 -1    25106
 1      942
Name: class_test, dtype: int64


* 1570 позитивных примеров (1)
* 24478 без разметки (-1)

In [63]:
mod_data.head(10)

Unnamed: 0,0,2,4,10,11,12,3_ 10th,3_ 11th,3_ 12th,3_ 1st-4th,...,13_ Scotland,13_ South,13_ Taiwan,13_ Thailand,13_ Trinadad&Tobago,13_ United-States,13_ Vietnam,13_ Yugoslavia,label,class_test
0,51,169364,10,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-1
1,51,254211,14,0,0,50,0,0,0,0,...,0,0,0,0,0,0,0,0,1,-1
2,21,202373,11,0,0,40,0,0,0,0,...,0,0,0,0,0,1,0,0,0,-1
3,25,391192,10,0,0,24,0,0,0,0,...,0,0,0,0,0,1,0,0,0,-1
4,48,193775,13,0,0,38,0,0,0,0,...,0,0,0,0,0,1,0,0,1,1
5,29,19793,10,0,0,8,0,0,0,0,...,0,0,0,0,0,1,0,0,0,-1
6,42,32533,10,0,0,45,0,0,0,0,...,0,0,0,0,0,1,0,0,0,-1
7,31,194901,9,0,0,50,0,0,0,0,...,0,0,0,0,0,1,0,0,1,-1
8,49,189885,9,0,0,40,0,0,0,0,...,0,0,0,0,0,1,0,0,0,-1
9,41,121055,9,0,0,40,0,0,0,0,...,0,0,0,0,0,1,0,0,0,-1


### random negative sampling

Помним, что (x_data) содержит целевой признак, который будем использовать для оценки качества

Отделими [:-2] как истиный класс для проверки, и [:-1] как данные для входной разметки PUL


In [64]:
mod_data = mod_data.sample(frac=1)


data_N = mod_data[mod_data['class_test'] == -1]
data_P = mod_data[mod_data['class_test'] == 1]

neg_sample = data_N[:data_P.shape[0]]
sample_test = data_N[data_P.shape[0]:]
pos_sample = data_P.copy()

print(neg_sample.shape, pos_sample.shape)
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)

(942, 101) (942, 101)


In [65]:
sample_train

Unnamed: 0,0,2,4,10,11,12,3_ 10th,3_ 11th,3_ 12th,3_ 1st-4th,...,13_ Scotland,13_ South,13_ Taiwan,13_ Thailand,13_ Trinadad&Tobago,13_ United-States,13_ Vietnam,13_ Yugoslavia,label,class_test
19230,40,193882,9,0,0,45,0,0,0,0,...,0,0,0,0,0,1,0,0,1,1
25845,33,150570,13,3103,0,43,0,0,0,0,...,0,0,0,0,0,1,0,0,1,-1
386,60,198170,10,0,0,40,0,0,0,0,...,0,0,0,0,0,1,0,0,1,1
23799,65,115880,16,0,0,40,0,0,0,0,...,0,0,0,0,0,1,0,0,1,1
10586,27,278720,9,0,0,45,0,0,0,0,...,0,0,0,0,0,1,0,0,1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12993,56,95763,9,0,0,55,0,0,0,0,...,0,0,0,0,0,1,0,0,0,-1
16676,59,91384,16,0,0,60,0,0,0,0,...,0,0,0,0,0,1,0,0,1,1
5245,23,60409,9,0,0,40,0,0,0,0,...,0,0,0,0,0,1,0,0,0,-1
7644,42,200574,10,0,0,40,0,0,0,0,...,0,0,0,0,0,1,0,0,0,-1


In [66]:
model = xgb.XGBClassifier()
sample_train.loc[sample_train['class_test'] == -1, 'class_test'] = 0

model.fit(sample_train.drop(columns=['class_test', 'label']), 
          sample_train['class_test'])

y_predict = model.predict(x_test)
evaluate_results(y_test, y_predict)



Classification results:
f1: 62.43%
recall: 76.06%
precision: 52.94%


при perc = 0.35
f1: 66.90%
recall: 80.03%
precision: 57.47%

при perc = 0.25
   Classification results:
f1: 66.58%
recall: 81.69%
precision: 56.19%


при perc = 0.15 f1: 62.43%
recall: 76.06%
precision: 52.94%

Вывод: при увеличении  perc precision растет, а recall падает. при уменьшении наоборот