In [1]:
import lightgbm as lgbm
import pandas as pd
import numpy as np
import re

from pymorphy2 import MorphAnalyzer

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import CountVectorizer

from matplotlib import pyplot as plt
from tqdm import tqdm
from scipy import sparse
from scipy.stats.mstats import gmean

import wordbatch
from wordbatch.extractors import WordBag

%matplotlib inline

In [2]:
train_df = pd.read_csv('../dataset/RoboMed/new_train.csv')
test_df = pd.read_csv('../dataset/RoboMed/new_test.csv')

In [3]:
train_df.head(3)

Unnamed: 0.1,Unnamed: 0,ID,Код_диагноза,Диагноз,Возраст,Пол,Общее состояние,аллергия,Анамнез заболевания,Внешний осмотр,Revisit,Типичные_жалобы,Типичные_услуги,Типичные_источники_рекламы
0,0,0,J06.0,Острый ларингофарингит,29,2,,,,На жевательной поверхности 2.6- кариозная поло...,1,"на першение, осиплость | на периодическое затр...","Прием врача-оториноларинголога повторный, амбу...",Другое | Рекомендации знакомых
1,1,1,N76.1,Подострый и хронический вагинит,45,2,,,считает себя больной на протяжении многих лет....,,1,"на выделения из половых путей ,периодические т...","Прием врача-акушера-гинеколога повторный, амбу...",Другое | Рекомендации знакомых | Интернет
2,2,2,L23.9,"Аллергический контактный дерматит, причина не ...",63,2,,,"Болен около 2-х дней, когда появились описанны...",,1,,,


In [4]:
def preprocess_text_feature(feature):
    return feature.str.lower() \
                  .str.replace('\n', '') \
                  .apply(lambda string: re.sub(r"([.\!?,'/()\-№:;])", ' \\1 ', string))

In [5]:
def preprocess_dataset(df):
    df['Состояние_неизвестно'] = df['Общее состояние'].isna()
    df['Аллергия_неизвестна'] = df['аллергия'].isna()
    df['Общее_состояние_неизвестно'] = df['Общее состояние'].isna()
    df['Анамнез_заболевания_неизвестен'] = df['Анамнез заболевания'].isna()
    df['Внешний_осмотр_неизвестен'] = df['Внешний осмотр'].isna()
    df['Типичные_жалобы_неизвестны'] = df['Типичные_жалобы'].isna()
    df['Типичные_услуги_неизвестны'] = df['Типичные_услуги'].isna()
    df['Типичные_источники_рекламы_неизвестны'] = df['Типичные_источники_рекламы'].isna()
    
    df['Общее состояние'].fillna('общее_состояние_неизвестно', inplace=True)
    df['аллергия'].fillna('аллергии_неизвестны', inplace=True)
    df['Анамнез заболевания'].fillna('нет_анамнеза', inplace=True)
    df['Внешний осмотр'].fillna('нет_осмотра', inplace=True)
    df['Типичные_жалобы'].fillna('типичные_жалобы_неизвестны', inplace=True)
    df['Типичные_услуги'].fillna('типичные_услги_неизвестны', inplace=True)
    df['Типичные_источники_рекламы'].fillna('типичные_источники_рекламы_неизвестны', inplace=True)
    
    df['Анамнез заболевания'] = preprocess_text_feature(df['Анамнез заболевания'])
    df['Внешний осмотр'] = preprocess_text_feature(df['Внешний осмотр'])
    df['Диагноз'] = preprocess_text_feature(df['Диагноз'])
    df['Общее состояние'] = preprocess_text_feature(df['Общее состояние'])
    df['Типичные_услуги'] = preprocess_text_feature(df['Типичные_услуги'])
    df['Типичные_жалобы'] = preprocess_text_feature(df['Типичные_жалобы'])
    df['Типичные_источники_рекламы'] = preprocess_text_feature(df['Типичные_источники_рекламы'])
    df['аллергия'] = preprocess_text_feature(df['аллергия'])
    
    df['Возраст_7'] = np.round(df['Возраст'] / 7).astype('category')
    df['Возраст_5'] = np.round(df['Возраст'] / 5).astype('category')
    df['Возраст_3'] = np.round(df['Возраст'] / 3).astype('category')
    df['Возраст_2'] = np.round(df['Возраст'] / 2).astype('category')   
    df['Возраст'] = df['Возраст'].astype('category')
    
    df['Пол'] = df['Пол'].astype('category')
    df['ID'] = df['ID'].astype('category')
    df['Код_диагноза'] = df['Код_диагноза'].astype('category')
    df['аллергия'] = df['аллергия'].astype('category')
    df['Анамнез заболевания'] = df['Анамнез заболевания'].astype('category')
    df['Внешний осмотр'] = df['Внешний осмотр'].astype('category')
    return df

In [6]:
train_df = preprocess_dataset(train_df)
test_df = preprocess_dataset(test_df)

In [7]:
y_train = train_df.Revisit
train_df = train_df.drop('Revisit', axis='columns')

In [8]:
whole_df = pd.concat([train_df, test_df])

In [9]:
def label_with_pos(text):
    analyzer = MorphAnalyzer()
    tokens = re.compile('[\W+]').sub(' ', text).lower().strip().split()
    tokens = [x for x in tokens if len(x) > 1]
    result = []
    for token in tokens:
        parsed_result = analyzer.parse(token)
        pos = parsed_result[0].tag.POS
        pos_token = token + ('' if pos is None else '_' + pos)
        result.append(pos_token)
    return ' '.join(result)

In [10]:
def one_hot_texts(text_feature):
    wb = wordbatch.WordBatch(extractor=(WordBag, {"hash_ngrams": 2, "hash_ngrams_weights": [1.0, 1.0],
                                                  "hash_size": 2 ** 22, "norm": None,
                                                  "tf": 5.0, "idf": 5.0,}), procs=32)
    wb.dictionary_freeze = True
    transformed_feature = wb.fit_transform(text_feature)
    nnz_features_mask = np.clip(transformed_feature.getnnz(axis=0) - 1, 0, 1).astype(bool)
    transformed_feature = transformed_feature[:, nnz_features_mask]
    return transformed_feature

In [11]:
text_feature_names = ['Общее состояние', 'аллергия', 'Анамнез заболевания', 'Внешний осмотр',
                      'Диагноз', 'Типичные_жалобы', 'Типичные_услуги', 'Типичные_источники_рекламы']
transformed_matrices = [one_hot_texts(whole_df[feature_name])
                        for feature_name in tqdm(text_feature_names)]
text_features_matrix = sparse.hstack(transformed_matrices)

  0%|          | 0/8 [00:00<?, ?it/s]

Normalize text
Extract wordbags


 12%|█▎        | 1/8 [00:12<01:25, 12.26s/it]

Normalize text
Extract wordbags


 25%|██▌       | 2/8 [00:25<01:15, 12.63s/it]

Normalize text
Extract wordbags


 38%|███▊      | 3/8 [00:37<01:02, 12.53s/it]

Normalize text
Extract wordbags


 50%|█████     | 4/8 [00:49<00:49, 12.29s/it]

Normalize text
Extract wordbags


 62%|██████▎   | 5/8 [01:01<00:37, 12.33s/it]

Normalize text
Extract wordbags


 75%|███████▌  | 6/8 [01:16<00:25, 12.72s/it]

Normalize text
Extract wordbags


 88%|████████▊ | 7/8 [01:29<00:12, 12.72s/it]

Normalize text
Extract wordbags


100%|██████████| 8/8 [01:42<00:00, 12.77s/it]


In [12]:
patient_dummies = sparse.csr_matrix(
    pd.get_dummies(whole_df[['Возраст', 'Код_диагноза', 'Пол']], sparse=True).values)
age_dummies = sparse.csr_matrix(
    pd.get_dummies(whole_df[['Возраст_2', 'Возраст_3', 'Возраст_5', 'Возраст_7']], sparse=True).values)

In [13]:
nan_feature_names = ['Состояние_неизвестно', 'Аллергия_неизвестна', 'Общее_состояние_неизвестно',
                     'Анамнез_заболевания_неизвестен', 'Внешний_осмотр_неизвестен',
                     'Типичные_жалобы_неизвестны', 'Типичные_услуги_неизвестны',
                     'Типичные_источники_рекламы_неизвестны']
nan_dummies = sparse.csr_matrix(pd.get_dummies(whole_df[nan_feature_names], sparse=True).values)

In [14]:
vectorizer = CountVectorizer()
diagnosis_counts = vectorizer.fit_transform(whole_df['Код_диагноза'].apply(str))

vectorizer = CountVectorizer(analyzer='char', ngram_range=(1, 2))
diagnosis_counts_char = vectorizer.fit_transform(whole_df['Код_диагноза'].apply(str))

In [15]:
nnz_patient_dummies_mask = np.clip(patient_dummies.getnnz(axis=0) - 1, 0, 1).astype(bool)
patient_dummies = patient_dummies[:, nnz_patient_dummies_mask]

nnz_nan_dummies_mask = np.clip(nan_dummies.getnnz(axis=0) - 1, 0, 1).astype(bool)
nan_dummies = nan_dummies[:, nnz_nan_dummies_mask]

nnz_age_dummies_mask = np.clip(age_dummies.getnnz(axis=0) - 1, 0, 1).astype(bool)
age_dummies = age_dummies[:, nnz_age_dummies_mask]

In [16]:
whole_features_matrix = sparse.hstack([
    text_features_matrix, patient_dummies, age_dummies,
    nan_dummies, diagnosis_counts, diagnosis_counts_char
]).tocsr()

In [17]:
features_train = whole_features_matrix[:len(train_df)]
features_test = whole_features_matrix[len(train_df):]

In [20]:
predictions = []

for random_state in tqdm(range(0, 100, 10)):
    params = {
        'application': 'binary',
        'verbosity': -1,
        'metric': 'auc',
        'feature_fraction_seed' : random_state + 10,
        'bagging_seed' : random_state + 20
    }
    stratified = KFold(n_splits=10, shuffle=True, random_state=random_state)
    for train_index, test_index in stratified.split(features_train, y_train):
        x_t, x_v = features_train[train_index], features_train[test_index]
        y_t, y_v = y_train[train_index], y_train[test_index]

        dataset_train = lgbm.Dataset(x_t, label=y_t)
        dataset_valid = lgbm.Dataset(x_v, label=y_v)
        watchlist = [dataset_train, dataset_valid]
        
        model = lgbm.train(params, train_set=dataset_train, valid_sets=watchlist,
                          num_boost_round=400, early_stopping_rounds=30, verbose_eval=1000)
        preds = model.predict(x_v)
        print("roc_auc_score: ", roc_auc_score(y_v, preds))
        predictions.append(model.predict(features_test))
        
preds = gmean(np.array(predictions), axis=0)

  0%|          | 0/10 [00:00<?, ?it/s]

Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[130]	training's auc: 0.739225	valid_1's auc: 0.723141
roc_auc_score:  0.7231408746208969
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[139]	training's auc: 0.74055	valid_1's auc: 0.718682
roc_auc_score:  0.7186824811572663
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[121]	training's auc: 0.738223	valid_1's auc: 0.72236
roc_auc_score:  0.7223599665820786
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[162]	training's auc: 0.742155	valid_1's auc: 0.724649
roc_auc_score:  0.7246485371636582
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[166]	training's auc: 0.74282	valid_1's auc: 0.721735
roc_auc_score:  0.7217351725704735
Training until validation scores don't improve for 30 rounds.
Early 

 10%|█         | 1/10 [01:44<15:37, 104.20s/it]

Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[75]	training's auc: 0.731086	valid_1's auc: 0.726152
roc_auc_score:  0.7261515276424817
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[118]	training's auc: 0.737028	valid_1's auc: 0.727124
roc_auc_score:  0.7271237380051813
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[182]	training's auc: 0.744791	valid_1's auc: 0.715802
roc_auc_score:  0.7158018933006086
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[124]	training's auc: 0.739206	valid_1's auc: 0.71637
roc_auc_score:  0.7163702824794401
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[196]	training's auc: 0.745204	valid_1's auc: 0.7282
roc_auc_score:  0.7282000750858241
Training until validation scores don't improve for 30 rounds.
Early s

 20%|██        | 2/10 [03:34<14:19, 107.39s/it]

Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[182]	training's auc: 0.744503	valid_1's auc: 0.72586
roc_auc_score:  0.7258603407626247
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[115]	training's auc: 0.737335	valid_1's auc: 0.721904
roc_auc_score:  0.7219044411302655
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[164]	training's auc: 0.742675	valid_1's auc: 0.722453
roc_auc_score:  0.7224528956434901
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[166]	training's auc: 0.743062	valid_1's auc: 0.720982
roc_auc_score:  0.7209815555015612
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[168]	training's auc: 0.742944	valid_1's auc: 0.723052
roc_auc_score:  0.72305170655123
Training until validation scores don't improve for 30 rounds.
Early 

 30%|███       | 3/10 [05:22<12:31, 107.40s/it]

Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[226]	training's auc: 0.748149	valid_1's auc: 0.723399
roc_auc_score:  0.7233990708372898
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[121]	training's auc: 0.737664	valid_1's auc: 0.725093
roc_auc_score:  0.7250926446067238
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[148]	training's auc: 0.740944	valid_1's auc: 0.722125
roc_auc_score:  0.7221248960400357
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[105]	training's auc: 0.735983	valid_1's auc: 0.722382
roc_auc_score:  0.7223817298195662
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[104]	training's auc: 0.736023	valid_1's auc: 0.722774
roc_auc_score:  0.7227735518600291
Training until validation scores don't improve for 30 rounds.
Ear

 40%|████      | 4/10 [07:13<10:50, 108.39s/it]

Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[129]	training's auc: 0.739426	valid_1's auc: 0.72102
roc_auc_score:  0.7210196143672641
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[150]	training's auc: 0.741258	valid_1's auc: 0.724891
roc_auc_score:  0.7248910367526303
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[142]	training's auc: 0.740029	valid_1's auc: 0.726844
roc_auc_score:  0.7268443999244492
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[141]	training's auc: 0.739939	valid_1's auc: 0.726478
roc_auc_score:  0.7264777354217622
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[211]	training's auc: 0.747591	valid_1's auc: 0.717351
roc_auc_score:  0.7173506825566577
Training until validation scores don't improve for 30 rounds.
Earl

 50%|█████     | 5/10 [09:15<09:15, 111.12s/it]

Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[155]	training's auc: 0.742099	valid_1's auc: 0.721094
roc_auc_score:  0.7210938298422984
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[131]	training's auc: 0.739563	valid_1's auc: 0.722869
roc_auc_score:  0.7228689986626126
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[124]	training's auc: 0.738349	valid_1's auc: 0.723494
roc_auc_score:  0.7234944888150361
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[154]	training's auc: 0.74167	valid_1's auc: 0.725496
roc_auc_score:  0.7254962892319533
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[140]	training's auc: 0.739769	valid_1's auc: 0.72843
roc_auc_score:  0.7284301233476563
Training until validation scores don't improve for 30 rounds.
Early

 60%|██████    | 6/10 [10:59<07:19, 109.96s/it]

Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[145]	training's auc: 0.741451	valid_1's auc: 0.720307
roc_auc_score:  0.720306770885514
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[126]	training's auc: 0.73841	valid_1's auc: 0.727286
roc_auc_score:  0.7272860723783866
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[130]	training's auc: 0.739015	valid_1's auc: 0.723582
roc_auc_score:  0.7235819824719804
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[119]	training's auc: 0.737967	valid_1's auc: 0.723141
roc_auc_score:  0.7231409097291361
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[108]	training's auc: 0.736378	valid_1's auc: 0.720639
roc_auc_score:  0.7206387164083923
Training until validation scores don't improve for 30 rounds.
Early

 70%|███████   | 7/10 [12:42<05:26, 108.97s/it]

Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[130]	training's auc: 0.73986	valid_1's auc: 0.71506
roc_auc_score:  0.7150599314463779
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[140]	training's auc: 0.739863	valid_1's auc: 0.726211
roc_auc_score:  0.726210967975287
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[185]	training's auc: 0.744682	valid_1's auc: 0.72594
roc_auc_score:  0.7259395174956931
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[133]	training's auc: 0.739085	valid_1's auc: 0.727183
roc_auc_score:  0.7271829306981477
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[184]	training's auc: 0.744297	valid_1's auc: 0.725642
roc_auc_score:  0.7256423108077217
Training until validation scores don't improve for 30 rounds.
Early s

 80%|████████  | 8/10 [14:42<03:40, 110.37s/it]

Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[160]	training's auc: 0.742248	valid_1's auc: 0.724959
roc_auc_score:  0.7249594130250968
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[218]	training's auc: 0.74789	valid_1's auc: 0.72296
roc_auc_score:  0.7229599496206427
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[113]	training's auc: 0.737221	valid_1's auc: 0.719427
roc_auc_score:  0.7194271302535483
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[141]	training's auc: 0.740203	valid_1's auc: 0.724689
roc_auc_score:  0.7246887002166993
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[151]	training's auc: 0.741156	valid_1's auc: 0.726683
roc_auc_score:  0.7266834002614209
Training until validation scores don't improve for 30 rounds.
Early

 90%|█████████ | 9/10 [16:40<01:51, 111.15s/it]

Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[205]	training's auc: 0.746647	valid_1's auc: 0.722154
roc_auc_score:  0.7221543498601839
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[118]	training's auc: 0.737419	valid_1's auc: 0.722929
roc_auc_score:  0.7229294631281878
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[137]	training's auc: 0.739751	valid_1's auc: 0.723692
roc_auc_score:  0.7236915342812741
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[151]	training's auc: 0.741435	valid_1's auc: 0.723277
roc_auc_score:  0.7232772761680731
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[162]	training's auc: 0.742124	valid_1's auc: 0.724547
roc_auc_score:  0.7245474761497128
Training until validation scores don't improve for 30 rounds.
Ear

100%|██████████| 10/10 [18:29<00:00, 110.94s/it]


In [21]:
submission = pd.read_csv('/root/dataset/RoboMed/sample_submission.csv')
submission.proba = preds
submission.to_csv('submission_boosting.csv', index=False)