Урок 6. #Задача lookalike (Positive Unlabeled Learning)

взять любой набор данных для бинарной классификации (можно скачать один из модельных с https://archive.ics.uci.edu/ml/datasets.php)

сделать feature engineering

обучить любой классификатор (какой вам нравится)

далее разделить ваш набор данных на два множества: P (positives) и U (unlabeled). Причем брать нужно не все положительные (класс 1) примеры, а только лишь часть

применить random negative sampling для построения классификатора в новых условиях

сравнить качество с решением из пункта 4 (построить отчет - таблицу метрик)

<b>Бонусный вопрос:</b>
Как вы думаете, какой из методов на практике является более предпочтительным: random negative sampling или 2-step approach?
Ссылки:
1. https://arxiv.org/pdf/1811.04820.pdf
2. https://habr.com/ru/company/JetBrains-education/blog/512032/
3. https://en.wikipedia.org/wiki/Bootstrap_aggregating
4. https://www.cs.uic.edu/~liub/publications/EMNLP-2010-no-negative.pdf

In [159]:
import numpy as np
import pandas as pd

In [248]:
# Какой-то датасет с Kaggle... про дожди в Австралии
data = pd.read_csv('weatherAUS.csv')
data.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,0.0,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,25.0,1010.6,1007.8,,,17.2,24.3,No,0.0,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,0.0,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,16.0,1017.6,1012.8,,,18.1,26.5,No,1.0,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,0.2,No


In [249]:
data.shape

(142193, 24)

In [250]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142193 entries, 0 to 142192
Data columns (total 24 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Date           142193 non-null  object 
 1   Location       142193 non-null  object 
 2   MinTemp        141556 non-null  float64
 3   MaxTemp        141871 non-null  float64
 4   Rainfall       140787 non-null  float64
 5   Evaporation    81350 non-null   float64
 6   Sunshine       74377 non-null   float64
 7   WindGustDir    132863 non-null  object 
 8   WindGustSpeed  132923 non-null  float64
 9   WindDir9am     132180 non-null  object 
 10  WindDir3pm     138415 non-null  object 
 11  WindSpeed9am   140845 non-null  float64
 12  WindSpeed3pm   139563 non-null  float64
 13  Humidity9am    140419 non-null  float64
 14  Humidity3pm    138583 non-null  float64
 15  Pressure9am    128179 non-null  float64
 16  Pressure3pm    128212 non-null  float64
 17  Cloud9am       88536 non-null

In [251]:
data['RainTomorrow'].value_counts(normalize=True)

No     0.775819
Yes    0.224181
Name: RainTomorrow, dtype: float64

In [252]:
# Преобразуем Yes/No в 1/0 для целевой переменной
def to_digit(source):
    if type(source) is str:
        return 1 if 'Yes' in source else 0
    else:
        return 0

data.RainTomorrow = data.apply(lambda x: to_digit(x.RainTomorrow), axis=1)

In [253]:
data.head(3)

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,0.0,0
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,25.0,1010.6,1007.8,,,17.2,24.3,No,0.0,0
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,0.0,0


In [254]:
# Пайплайнизируем
categorical_columns = ['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday']
continuous_columns = ['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine', 'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm']

In [255]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer

In [256]:
cat_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')), 
    ('lab', OrdinalEncoder()),
])

In [257]:
cont_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')), 
    ('scaler', StandardScaler())
])

In [258]:
feat_reg_prep = ColumnTransformer(
        transformers=[
            ('cat', cat_transformer, categorical_columns),
            ('cont', cont_transformer, continuous_columns)]
             ,remainder='drop'
)

In [259]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [260]:
log_reg = make_pipeline(feat_reg_prep, LogisticRegression(max_iter=1000))

In [261]:
#разделим данные на train/test
X_train, X_test, y_train, y_test = train_test_split(data.drop(['RainTomorrow'], 1), data['RainTomorrow'], random_state=0)

In [262]:
log_reg.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('lab',
                                                                   OrdinalEncoder())]),
                                                  ['Location', 'WindGustDir',
                                                   'WindDir9am', 'WindDir3pm',
                                                   'RainToday']),
                                                 ('cont',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                  

In [263]:
#наши прогнозы для тестовой выборки
normal_result = log_reg.predict_proba(X_test)[:, 1]

In [264]:
from sklearn.metrics import f1_score, roc_auc_score, precision_score, classification_report, precision_recall_curve, confusion_matrix
precision, recall, thresholds = precision_recall_curve(y_test, normal_result)
fscore = (2 * precision * recall) / (precision + recall)
# locate the index of the largest f score
ix = np.argmax(fscore)
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))

Best Threshold=0.316602, F-Score=0.636, Precision=0.603, Recall=0.674


In [231]:
# Окей - такой получился скор
# Теперь пробуем скрыть часть позитивов
# Делать это будем в X_train, чтобы потом можно было на тесте провериться
# Сколько их там у нас?
y_train.value_counts(normalize=False)

0    88664
1    17980
Name: RainTomorrow, dtype: int64

In [234]:
# 23871. Скрываем часть из них (и ставим им класс 0)
def obfuscate_positives(df, index):
    df.loc[index] = 0
    
obfuscate_positives(y_train, np.random.choice(X_train.index, 30000))

In [235]:
y_train.value_counts(normalize=False)

0    96458
1    10186
Name: RainTomorrow, dtype: int64

In [236]:
# Окей - близко к половине
# Берем подвыборку со всеми оставшимися положительными, и каким-то количеством новых (и старых) отрицательных
# Вернем целевую переменную в датасет
new_data = X_train.copy()
new_data['RainTomorrow'] = y_train.values

In [237]:
new_data.head(3)

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM,RainTomorrow
12826,2011-09-04,Moree,7.7,21.9,0.0,4.6,11.0,NNE,41.0,NE,...,,1029.0,1024.8,3.0,2.0,16.1,21.5,No,0.0,0
32391,2016-04-09,Sydney,16.0,25.0,0.4,1.2,9.5,ESE,26.0,WNW,...,57.0,1022.0,1018.8,2.0,7.0,19.3,24.0,No,0.0,0
185,2009-06-05,Albury,2.8,16.1,0.0,,,W,20.0,NNE,...,57.0,1017.9,1015.1,,,7.6,15.6,No,0.2,0


In [238]:
# Отделим оставшиеся положительные
new_data_P = new_data[new_data['RainTomorrow'] == 1]
new_data_N = new_data[new_data['RainTomorrow'] == 0]

In [239]:
# Делаем подвыборку из новых (и старых) отрицательных
new_data_N.shape

(96458, 24)

In [240]:
data_for_random = new_data_P.append(new_data_N.sample(50000))

In [241]:
data_for_random.shape

(60186, 24)

In [242]:
# Обучим модель на новом сете
log_reg.fit(data_for_random.drop(['RainTomorrow'], 1), data_for_random['RainTomorrow'])

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('lab',
                                                                   OrdinalEncoder())]),
                                                  ['Location', 'WindGustDir',
                                                   'WindDir9am', 'WindDir3pm',
                                                   'RainToday']),
                                                 ('cont',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                  

In [246]:
# Проверим результаты на тесте
rand_neg_result = log_reg.predict_proba(X_test)[:, 1]

In [247]:
precision, recall, thresholds = precision_recall_curve(y_test, rand_neg_result)
fscore = (2 * precision * recall) / (precision + recall)
# locate the index of the largest f score
ix = np.argmax(fscore)
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))

Best Threshold=0.202816, F-Score=0.633, Precision=0.599, Recall=0.671


### Немного съехал скор. Как точность так и полнота. 
Незначительная в данном случае получилась плата за возможность работать с неклассифицированными данными

In [265]:
def stats(y, preds):
    precision, recall, thresholds = precision_recall_curve(y, preds)
    fscore = (2 * precision * recall) / (precision + recall)
    ix = np.argmax(fscore)
    return fscore[ix], precision[ix], recall[ix], roc_auc_score(y, preds)

In [266]:
rows = []
for (method, yy) in zip([normal_result, rand_neg_result], [y_test, y_test]):
    rows.append(stats(yy, method))
table = pd.DataFrame(rows)
table.columns = ['fscore', 'precission', 'recall', 'roc_auc']
table['method'] = ['Normal', 'Random Negative']
table = table[['method'] + ['fscore', 'precission', 'recall', 'roc_auc']]
table

Unnamed: 0,method,fscore,precission,recall,roc_auc
0,Normal,0.636433,0.602838,0.673995,0.864645
1,Random Negative,0.633043,0.599152,0.670997,0.862376
