### Домашнее задание

1. взять любой набор данных для бинарной классификации (можно скачать один из модельных с https://archive.ics.uci.edu/ml/datasets.php)
2. сделать feature engineering
3. обучить любой классификатор (какой вам нравится)
4. далее разделить ваш набор данных на два множества: P (positives) и U (unlabeled). Причём брать нужно не все положительные (класс 1) примеры, а только часть
5. применить random negative sampling для построения классификатора в новых условиях
6. сравнить качество с решением из пункта 3 (построить отчёт — таблицу метрик)
7. поэкспериментировать с долей P на шаге 4 (как будет меняться качество модели при уменьшении/увеличении размера P)

In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, precision_score, roc_auc_score, accuracy_score, f1_score

import xgboost as xgb

### п.1. - взять любой набор данных для бинарной классификации (можно скачать один из модельных с https://archive.ics.uci.edu/ml/datasets.php)

In [48]:
# Column names supplied to read_csv via `names=`, in file order.
adult_columns = [
    'age', 'workclass', 'fnlwgt', 'education', 'education_num',
    'marital_status', 'occupation', 'relationship', 'race', 'sex',
    'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
    'target',
]
df = pd.read_csv("./HW_data/adult.data", names=adult_columns)
# Preview the first rows.
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,target
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [49]:
# Summary statistics for every column (numeric and categorical), one row per column.
df.describe(include='all').transpose()

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
age,32561.0,,,,38.581647,13.640433,17.0,28.0,37.0,48.0,90.0
workclass,32561.0,9.0,Private,22696.0,,,,,,,
fnlwgt,32561.0,,,,189778.366512,105549.977697,12285.0,117827.0,178356.0,237051.0,1484705.0
education,32561.0,16.0,HS-grad,10501.0,,,,,,,
education_num,32561.0,,,,10.080679,2.57272,1.0,9.0,10.0,12.0,16.0
marital_status,32561.0,7.0,Married-civ-spouse,14976.0,,,,,,,
occupation,32561.0,15.0,Prof-specialty,4140.0,,,,,,,
relationship,32561.0,6.0,Husband,13193.0,,,,,,,
race,32561.0,5.0,White,27816.0,,,,,,,
sex,32561.0,2.0,Male,21790.0,,,,,,,


In [50]:
# Number of missing (NaN) entries in each column.
df.isna().sum()

age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
target            0
dtype: int64

In [51]:
# Distinct raw label values, in order of first appearance.
pd.unique(df['target'])

array([' <=50K', ' >50K'], dtype=object)

In [52]:
# Encode the income label: 1 for the '>50K' class, 0 otherwise.
# The guard makes the cell safe to re-run: once the column is numeric, comparing
# it with the string again would silently turn every label into 0.
if not pd.api.types.is_numeric_dtype(df['target']):
    # The raw labels carry a leading space (' >50K'); strip before comparing so
    # the match does not depend on how whitespace was parsed.
    df['target'] = (df['target'].str.strip() == '>50K').astype('int64')

### п.2. сделать feature engineering

Посмотрим на категории

In [54]:
# Report how many distinct values each object-typed (categorical) column holds.
# dropna=False keeps NaN counted as a value, matching len(unique()).
cardinality = df.select_dtypes(include='object').nunique(dropna=False)
for name, n_unique in cardinality.items():
    print(f'column: {name}, uniques: {n_unique}')

column: workclass, uniques: 9
column: education, uniques: 16
column: marital_status, uniques: 7
column: occupation, uniques: 15
column: relationship, uniques: 6
column: race, uniques: 5
column: sex, uniques: 2
column: native_country, uniques: 42


Уникальных значений немного — можно просто применить one-hot-кодирование

In [55]:
# One-hot encode the categorical columns; numeric columns pass through unchanged.
df = pd.get_dummies(data=df)

In [56]:
df.shape  # (rows, columns) of the frame after one-hot encoding

(32561, 109)

### п.3 обучить любой классификатор (какой вам нравится)

In [57]:
# Label column and feature matrix (every column except the label).
y_data = df['target']
x_data = df.drop(columns=['target'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x_data,
    y_data,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

In [94]:
# Baseline: XGBoost with default hyper-parameters trained on the fully labeled data.
# fit() returns the estimator itself, so construction and training can be chained.
model = xgb.XGBClassifier().fit(x_train, y_train)

# Hard 0/1 predictions for the held-out rows.
y_predict = model.predict(x_test)

In [95]:
def evaluate_results(y_test, y_predict, y_score=None):
    """Print F1, ROC AUC, recall and precision for a binary classifier.

    Parameters
    ----------
    y_test : array-like
        Ground-truth binary labels.
    y_predict : array-like
        Hard (0/1) predictions; used for F1, recall and precision.
    y_score : array-like, optional
        Continuous scores for the positive class (for example the second
        column of ``predict_proba``). When given, ROC AUC is computed from
        these scores. When omitted, ROC AUC is computed from ``y_predict``
        exactly as before; with hard labels the ROC curve has a single
        operating point, so the value equals the mean of recall and
        specificity rather than a ranking-quality AUC.

    Returns
    -------
    None
        The metrics are only printed, as percentages with two decimals.
    """
    print('Classification results:')
    f1 = f1_score(y_test, y_predict)
    print("f1: %.2f%%" % (f1 * 100.0))
    # Prefer continuous scores for ROC AUC when the caller supplies them.
    roc_input = y_predict if y_score is None else y_score
    roc = roc_auc_score(y_test, roc_input)
    print("roc: %.2f%%" % (roc * 100.0))
    rec = recall_score(y_test, y_predict, average='binary')
    print("recall: %.2f%%" % (rec * 100.0))
    prc = precision_score(y_test, y_predict, average='binary')
    print("precision: %.2f%%" % (prc * 100.0))


evaluate_results(y_test, y_predict)

Classification results:
f1: 72.60%
roc: 80.82%
recall: 67.54%
precision: 78.48%


### п.4 разделить ваш набор данных на два множества: P (positives) и U (unlabeled). Причем брать нужно не все положительные (класс 1) примеры, а только лишь часть

In [80]:
# Share of the true positives that keep their label; the rest become unlabeled.
# Change this value to experiment with the size of P.
POS_FRACTION = 0.25
# Fixed seed so the same positives are selected on every run.
PU_SEED = 42

mod_data = df.copy()
# positional indices of the positive samples
pos_ind = np.flatnonzero(mod_data['target'].values == 1)
# shuffle them with a dedicated, seeded generator (reproducible on Restart & Run All)
np.random.default_rng(PU_SEED).shuffle(pos_ind)
# keep only POS_FRACTION of the positives marked (rounded up)
pos_sample_len = int(np.ceil(POS_FRACTION * len(pos_ind)))
print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
pos_sample = pos_ind[:pos_sample_len]

Using 1961/7841 as positives and unlabeling the rest


### п.5 применить random negative sampling для построения классификатора в новых условиях

In [81]:
# Build the PU label: 1 for the selected (labeled) positives, 0 for everything else.
# pos_sample holds row *positions*, so the flags are written into a positional
# array instead of going through label-based .loc, which only matches positions
# when the frame has the default 0..n-1 index.
pu_labels = np.zeros(len(mod_data), dtype='int64')
pu_labels[pos_sample] = 1
mod_data['class_test'] = pu_labels

# Sanity check: every row marked as labeled must be a true positive.
assert (mod_data.loc[mod_data['class_test'] == 1, 'target'] == 1).all(), \
    'a row labeled as positive is not a true positive'

# Refer to the column by name rather than by "last column" position.
print('target variable:\n', mod_data['class_test'].value_counts())

target variable:
 0    30600
1     1961
Name: class_test, dtype: int64


In [82]:
# Plain NumPy views of the PU data set.
# NOTE(review): this rebinds x_data, which earlier held the feature DataFrame;
# none of the three names is read again in the cells visible here.
y_positive = mod_data['target'].to_numpy()       # original class
y_labeled = mod_data['class_test'].to_numpy()    # PU class (labeled positive vs unlabeled)
x_data = mod_data.drop(columns=['target', 'class_test']).to_numpy()  # features only

In [83]:
# Shuffle once (seeded) so that taking the first rows below is a reproducible
# uniform random draw from the unlabeled set.
mod_data = mod_data.sample(frac=1, random_state=42)

# Split the frame into labeled positives and unlabeled rows a single time.
is_labeled = mod_data['class_test'] == 1
pos_sample = mod_data[is_labeled]
unlabeled = mod_data[~is_labeled]

# Random negative sampling: treat as many unlabeled rows as there are labeled
# positives as negatives; the remaining unlabeled rows form the evaluation set.
n_pos = len(pos_sample)
neg_sample = unlabeled[:n_pos]
sample_test = unlabeled[n_pos:]
print(neg_sample.shape, pos_sample.shape)

# Balanced training set, shuffled with a fixed seed so the reported metrics
# do not change between runs.
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1, random_state=42)

(1961, 110) (1961, 110)


In [92]:
# Columns that must not be fed to the model: the true label and the PU label.
label_cols = ['target', 'class_test']

# Train on the balanced sample against the PU label (labeled positive vs sampled negative).
model = xgb.XGBClassifier().fit(
    sample_train.drop(columns=label_cols),
    sample_train['class_test'],
)

# Predict on the remaining unlabeled rows and score against the true label.
y_predict_1 = model.predict(sample_test.drop(columns=label_cols))
evaluate_results(sample_test['target'].values, y_predict_1)

Classification results:
f1: 60.79%
roc: 79.94%
recall: 79.15%
precision: 49.34%


In [98]:
# Experiments to compare: row label -> (true labels, hard predictions).
# The row labels are kept exactly as before.
runs = {
    'source': (y_test, y_predict),
    ' random negative sampling': (sample_test['target'].values, y_predict_1),
}

# Build the table in one go instead of growing it row by row. The loop variable
# is deliberately not called `model`, so the trained classifier held in `model`
# is no longer overwritten by a string.
res = pd.DataFrame(
    [
        [
            f1_score(true, pred),
            roc_auc_score(true, pred),
            recall_score(true, pred, average='binary'),
            precision_score(true, pred, average='binary'),
        ]
        for true, pred in runs.values()
    ],
    index=list(runs),
    columns=['f1 score', 'roc_auc_score', 'recall', 'precision'],
)


In [99]:
res  # metrics table: one row per experiment, one column per metric

Unnamed: 0,f1 score,roc_auc_score,recall,precision
source,0.725966,0.808241,0.675366,0.784763
random negative sampling,0.607887,0.799352,0.791515,0.493417
