In [1]:
import lightgbm as lgbm
import pandas as pd
import numpy as np
import re

from pymorphy2 import MorphAnalyzer
from nltk import word_tokenize

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import CountVectorizer

from matplotlib import pyplot as plt
from tqdm import tqdm
from scipy import sparse
from scipy.stats.mstats import gmean

import wordbatch
from wordbatch.extractors import WordBag, WordHash

%matplotlib inline

In [2]:
# Read the RoboMed train and test tables; both paths are relative to the
# directory the notebook is started from.
train_df, test_df = map(
    pd.read_csv,
    ('../dataset/RoboMed/new_train.csv', '../dataset/RoboMed/new_test.csv'),
)

In [3]:
# Preview the first three training rows. The displayed frame carries
# 'Unnamed: 0.1' and 'Unnamed: 0' columns next to 'ID' — presumably row indices
# that were written into the CSV on earlier saves; TODO confirm and drop them.
train_df.head(3)

Unnamed: 0.1,Unnamed: 0,ID,Код_диагноза,Диагноз,Возраст,Пол,Общее состояние,аллергия,Анамнез заболевания,Внешний осмотр,Revisit,Типичные_жалобы,Типичные_услуги,Типичные_источники_рекламы
0,0,0,J06.0,Острый ларингофарингит,29,2,,,,На жевательной поверхности 2.6- кариозная поло...,1,"на першение, осиплость | на периодическое затр...","Прием врача-оториноларинголога повторный, амбу...",Другое | Рекомендации знакомых
1,1,1,N76.1,Подострый и хронический вагинит,45,2,,,считает себя больной на протяжении многих лет....,,1,"на выделения из половых путей ,периодические т...","Прием врача-акушера-гинеколога повторный, амбу...",Другое | Рекомендации знакомых | Интернет
2,2,2,L23.9,"Аллергический контактный дерматит, причина не ...",63,2,,,"Болен около 2-х дней, когда появились описанны...",,1,,,


In [4]:
# Punctuation marks that are padded with spaces so they become stand-alone tokens.
_PUNCTUATION_RE = re.compile(r"([.\!?,'/()\-№:;])")


def preprocess_text_feature(feature):
    """Normalise a free-text column before tokenisation.

    Steps, applied element-wise:
      1. lower-case the text;
      2. delete newline characters (nothing is inserted in their place);
      3. surround each punctuation mark matched by ``_PUNCTUATION_RE`` with
         one space on either side.

    Parameters
    ----------
    feature : pandas.Series
        Series of strings. Missing entries (NaN/None) are allowed.

    Returns
    -------
    pandas.Series
        Series with the same index; missing entries are returned unchanged
        instead of raising ``TypeError`` from ``re.sub``.
    """
    def pad_punctuation(value):
        # The .str accessor turns missing entries into NaN (a float); regex
        # substitution only makes sense for real strings.
        if not isinstance(value, str):
            return value
        return _PUNCTUATION_RE.sub(' \\1 ', value)

    return feature.str.lower() \
                  .str.replace('\n', '') \
                  .apply(pad_punctuation)

In [5]:
def preprocess_dataset(df):
    """Add missing-value flags, fill and normalise text columns, cast categoricals.

    The frame is modified in place (columns are added and overwritten) and the
    same object is returned, so callers may use either the argument or the
    return value.

    Processing order matters:
      1. boolean "is missing" flags are taken from the raw columns;
      2. missing text is replaced by a per-column placeholder token;
      3. text columns are normalised with ``preprocess_text_feature``;
      4. age is bucketed by rounding ``age / width`` for widths 7, 5, 3 and 2;
      5. identifier-like and selected text columns are cast to ``category``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw RoboMed frame containing the columns referenced below.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the extra and converted columns.
    """
    # (flag column, source column). 'Состояние_неизвестно' and
    # 'Общее_состояние_неизвестно' are computed from the same source and are
    # therefore identical; both are kept because later cells select both names.
    missing_flag_sources = [
        ('Состояние_неизвестно', 'Общее состояние'),
        ('Аллергия_неизвестна', 'аллергия'),
        ('Общее_состояние_неизвестно', 'Общее состояние'),
        ('Анамнез_заболевания_неизвестен', 'Анамнез заболевания'),
        ('Внешний_осмотр_неизвестен', 'Внешний осмотр'),
        ('Типичные_жалобы_неизвестны', 'Типичные_жалобы'),
        ('Типичные_услуги_неизвестны', 'Типичные_услуги'),
        ('Типичные_источники_рекламы_неизвестны', 'Типичные_источники_рекламы'),
    ]
    for flag_column, source_column in missing_flag_sources:
        df[flag_column] = df[source_column].isna()

    # (column, placeholder token). The misspelt 'типичные_услги_неизвестны' is
    # kept verbatim so the generated token does not change.
    placeholders = [
        ('Общее состояние', 'общее_состояние_неизвестно'),
        ('аллергия', 'аллергии_неизвестны'),
        ('Анамнез заболевания', 'нет_анамнеза'),
        ('Внешний осмотр', 'нет_осмотра'),
        ('Типичные_жалобы', 'типичные_жалобы_неизвестны'),
        ('Типичные_услуги', 'типичные_услги_неизвестны'),
        ('Типичные_источники_рекламы', 'типичные_источники_рекламы_неизвестны'),
    ]
    for column, placeholder in placeholders:
        # Assign the result back: `df[column].fillna(..., inplace=True)` acts on
        # an intermediate object and does not update `df` when pandas
        # Copy-on-Write is active.
        df[column] = df[column].fillna(placeholder)

    # NOTE(review): 'Диагноз' gets no placeholder above, so a missing diagnosis
    # reaches preprocess_text_feature as NaN — confirm that helper tolerates it.
    text_columns = [
        'Анамнез заболевания', 'Внешний осмотр', 'Диагноз', 'Общее состояние',
        'Типичные_услуги', 'Типичные_жалобы', 'Типичные_источники_рекламы', 'аллергия',
    ]
    for column in text_columns:
        df[column] = preprocess_text_feature(df[column])

    # Coarser age buckets must be derived while 'Возраст' is still numeric.
    for bucket_width in (7, 5, 3, 2):
        bucket_column = 'Возраст_%d' % bucket_width
        df[bucket_column] = np.round(df['Возраст'] / bucket_width).astype('category')

    categorical_columns = [
        'Возраст', 'Пол', 'ID', 'Код_диагноза',
        'аллергия', 'Анамнез заболевания', 'Внешний осмотр',
    ]
    for column in categorical_columns:
        df[column] = df[column].astype('category')
    return df

In [None]:
# Run the shared preprocessing on both splits (train first, then test).
# preprocess_dataset overwrites columns of the frame it receives, so this cell
# is meant to run once per fresh load: a second run would compute the
# missing-value flags from columns that have already been filled.
train_df, test_df = [preprocess_dataset(split) for split in (train_df, test_df)]

In [None]:
# Separate the 'Revisit' target from the training features.
y_train = train_df['Revisit']
train_df = train_df.drop(columns=['Revisit'])

In [None]:
# Stack train on top of test so the feature extractors below are fitted on both
# splits at once; the later `[:len(train_df)]` slicing relies on this row order.
whole_df = pd.concat([train_df, test_df])

In [None]:
# Compiled once; any non-word character (and '+') separates tokens.
_NON_WORD_RE = re.compile(r'[\W+]')
# Shared analyzer, created on first use so it is not rebuilt for every text.
_DEFAULT_MORPH_ANALYZER = None


def label_with_pos(text, analyzer=None):
    """Append a part-of-speech suffix to every token of ``text``.

    The text is split on non-word characters, lower-cased, and tokens of
    length one are dropped. Each remaining token is parsed with the analyzer
    and the POS of the first parse is appended as ``token_POS``; tokens whose
    POS is ``None`` are kept unchanged.

    Parameters
    ----------
    text : str
        Raw text to label.
    analyzer : object, optional
        Anything exposing ``parse(token)`` that returns a sequence whose first
        item has a ``tag.POS`` attribute. Defaults to a module-wide
        ``MorphAnalyzer`` that is created once and then reused.

    Returns
    -------
    str
        Space-joined labelled tokens (empty string when no token survives).
    """
    global _DEFAULT_MORPH_ANALYZER
    if analyzer is None:
        if _DEFAULT_MORPH_ANALYZER is None:
            _DEFAULT_MORPH_ANALYZER = MorphAnalyzer()
        analyzer = _DEFAULT_MORPH_ANALYZER

    tokens = _NON_WORD_RE.sub(' ', text).lower().strip().split()
    labelled = []
    for token in tokens:
        if len(token) <= 1:
            continue
        # Only the first parse returned by the analyzer is used.
        pos = analyzer.parse(token)[0].tag.POS
        labelled.append(token if pos is None else token + '_' + pos)
    return ' '.join(labelled)

In [None]:
def one_hot_texts(text_feature):
    """Vectorise a text column with wordbatch and drop rarely used columns.

    A ``WordBatch`` with the ``WordBag`` extractor (settings below) is fitted
    on ``text_feature`` and used to transform it. Columns of the resulting
    sparse matrix that are non-zero in at most one row are removed.

    Parameters
    ----------
    text_feature : iterable of str
        Texts to vectorise (a pandas Series in this notebook).

    Returns
    -------
    scipy sparse matrix
        One row per input text, restricted to columns present in >= 2 rows.
    """
    bag_settings = {
        "hash_ngrams": 2, "hash_ngrams_weights": [1.0, 1.0],
        "hash_size": 2 ** 22, "norm": None,
        "tf": 5.0, "idf": 5.0,
    }
    batcher = wordbatch.WordBatch(extractor=(WordBag, bag_settings), procs=32)
    batcher.dictionary_freeze = True
    bag_matrix = batcher.fit_transform(text_feature)

    # A column used by a single row cannot generalise, so keep only columns
    # whose non-zero count exceeds one.
    seen_more_than_once = bag_matrix.getnnz(axis=0) > 1
    return bag_matrix[:, seen_more_than_once]

In [None]:
# Vectorise every free-text column separately, then place the resulting sparse
# blocks side by side.
text_feature_names = [
    'Общее состояние', 'аллергия', 'Анамнез заболевания', 'Внешний осмотр',
    'Диагноз', 'Типичные_жалобы', 'Типичные_услуги', 'Типичные_источники_рекламы',
]
transformed_matrices = []
for feature_name in tqdm(text_feature_names):
    transformed_matrices.append(one_hot_texts(whole_df[feature_name]))
text_features_matrix = sparse.hstack(transformed_matrices)

In [None]:
# Indicator encodings of the patient attributes and of the bucketed ages.
# NOTE(review): `.values` presumably materialises a dense array before it is
# converted back to CSR — confirm memory use on the full data set.
patient_dummies, age_dummies = (
    sparse.csr_matrix(pd.get_dummies(whole_df[columns], sparse=True).values)
    for columns in (
        ['Возраст', 'Код_диагноза', 'Пол'],
        ['Возраст_2', 'Возраст_3', 'Возраст_5', 'Возраст_7'],
    )
)

In [None]:
# Missing-value indicator columns created by preprocess_dataset, as a CSR block.
nan_feature_names = [
    'Состояние_неизвестно',
    'Аллергия_неизвестна',
    'Общее_состояние_неизвестно',
    'Анамнез_заболевания_неизвестен',
    'Внешний_осмотр_неизвестен',
    'Типичные_жалобы_неизвестны',
    'Типичные_услуги_неизвестны',
    'Типичные_источники_рекламы_неизвестны',
]
nan_flags_frame = pd.get_dummies(whole_df[nan_feature_names], sparse=True)
nan_dummies = sparse.csr_matrix(nan_flags_frame.values)

In [None]:
# Diagnosis codes as plain strings, vectorised twice: once with the default
# word-level analyzer and once as character 1- and 2-grams.
diagnosis_codes = whole_df['Код_диагноза'].apply(str)

vectorizer = CountVectorizer()
diagnosis_counts = vectorizer.fit_transform(diagnosis_codes)

vectorizer = CountVectorizer(analyzer='char', ngram_range=(1, 2))
diagnosis_counts_char = vectorizer.fit_transform(diagnosis_codes)

In [None]:
# In each indicator block keep only the columns that are non-zero in more than
# one row (a count of 0 or 1 is filtered out).
nnz_patient_dummies_mask = patient_dummies.getnnz(axis=0) > 1
patient_dummies = patient_dummies[:, nnz_patient_dummies_mask]

nnz_nan_dummies_mask = nan_dummies.getnnz(axis=0) > 1
nan_dummies = nan_dummies[:, nnz_nan_dummies_mask]

nnz_age_dummies_mask = age_dummies.getnnz(axis=0) > 1
age_dummies = age_dummies[:, nnz_age_dummies_mask]

In [None]:
# Column-wise concatenation of every feature block, stored as CSR so that the
# matrix can be sliced by rows afterwards.
feature_blocks = [
    text_features_matrix,
    patient_dummies,
    age_dummies,
    nan_dummies,
    diagnosis_counts,
    diagnosis_counts_char,
]
whole_features_matrix = sparse.hstack(feature_blocks).tocsr()

In [None]:
# Rows were stacked train-first, so the first len(train_df) rows are the
# training set and the remainder is the test set.
train_row_count = len(train_df)
features_train, features_test = (
    whole_features_matrix[:train_row_count],
    whole_features_matrix[train_row_count:],
)

In [None]:
# Progress marker: reaching this line means all feature-building cells above ran.
print('Done preprocessing!')

In [27]:
# Repeated 10-fold cross-validation: for each of 10 shuffle seeds, train one
# LightGBM model per fold, report its validation AUC, and collect its test-set
# predictions. The final `preds` is the geometric mean over all collected models.
predictions = []

for random_state in tqdm(range(0, 100, 10)):
    params = {
        'application': 'binary',
        'verbosity': -1,
        'metric': 'auc',
        # NOTE(review): no feature_fraction / bagging_fraction is set here, so
        # these two seeds presumably have no effect and the repetitions differ
        # only through the fold shuffling — confirm against the LightGBM docs.
        'feature_fraction_seed' : random_state + 10,
        'bagging_seed' : random_state + 20
    }
    # Plain shuffled K-fold; `y_train` passed to split() does not stratify it.
    folds = KFold(n_splits=10, shuffle=True, random_state=random_state)
    for train_index, test_index in folds.split(features_train, y_train):
        x_t, x_v = features_train[train_index], features_train[test_index]
        # KFold yields positional indices, so select labels by position:
        # `y_train[...]` would look them up by index label, which is only
        # equivalent while the Series keeps a default 0..n-1 index.
        y_t, y_v = y_train.iloc[train_index], y_train.iloc[test_index]

        dataset_train = lgbm.Dataset(x_t, label=y_t)
        dataset_valid = lgbm.Dataset(x_v, label=y_v)
        watchlist = [dataset_train, dataset_valid]

        model = lgbm.train(params, train_set=dataset_train, valid_sets=watchlist,
                          num_boost_round=400, early_stopping_rounds=30, verbose_eval=1000)
        # The same fold drives early stopping and is scored here, so the
        # printed AUC is an optimistic estimate.
        valid_preds = model.predict(x_v)
        print("roc_auc_score: ", roc_auc_score(y_v, valid_preds))
        predictions.append(model.predict(features_test))

preds = gmean(np.array(predictions), axis=0)


  0%|          | 0/10 [00:00<?, ?it/s][A

Training until validation scores don't improve for 30 rounds.



Exception in thread Thread-56:
Traceback (most recent call last):
  File "/root/miniconda3/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/root/miniconda3/lib/python3.6/site-packages/tqdm/_tqdm.py", line 148, in run
    for instance in self.tqdm_cls._instances:
  File "/root/miniconda3/lib/python3.6/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration



Early stopping, best iteration is:
[172]	training's auc: 0.717086	valid_1's auc: 0.698855
roc_auc_score:  0.6988551115505431
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[188]	training's auc: 0.719313	valid_1's auc: 0.689454
roc_auc_score:  0.689454252998775
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[140]	training's auc: 0.71374	valid_1's auc: 0.691911
roc_auc_score:  0.6919110938796764
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[230]	training's auc: 0.722107	valid_1's auc: 0.697193
roc_auc_score:  0.6971925149006019
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[150]	training's auc: 0.71469	valid_1's auc: 0.69664
roc_auc_score:  0.6966396604918168
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[178]	training's auc: 0.717834	val

 10%|█         | 1/10 [01:25<12:49, 85.51s/it]

Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[157]	training's auc: 0.71564	valid_1's auc: 0.697658
roc_auc_score:  0.6976580157809318
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[187]	training's auc: 0.71819	valid_1's auc: 0.699267
roc_auc_score:  0.6992673161753
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[171]	training's auc: 0.71765	valid_1's auc: 0.689123
roc_auc_score:  0.6891232837342443
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[108]	training's auc: 0.710698	valid_1's auc: 0.686567
roc_auc_score:  0.6865670725931968
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[184]	training's auc: 0.717851	valid_1's auc: 0.699661
roc_auc_score:  0.6996608053848924
Training until validation scores don't improve for 30 rounds.
Early sto

 20%|██        | 2/10 [02:53<11:35, 86.90s/it]

Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[223]	training's auc: 0.721903	valid_1's auc: 0.696323
roc_auc_score:  0.6963233423273333
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[151]	training's auc: 0.714782	valid_1's auc: 0.695008
roc_auc_score:  0.695007985021346
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[114]	training's auc: 0.710488	valid_1's auc: 0.697404
roc_auc_score:  0.6974036709583177
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[242]	training's auc: 0.723366	valid_1's auc: 0.694233
roc_auc_score:  0.6942333394859024
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[130]	training's auc: 0.71236	valid_1's auc: 0.695774
roc_auc_score:  0.6957741028832083
Training until validation scores don't improve for 30 rounds.
Early

 30%|███       | 3/10 [04:16<09:58, 85.48s/it]

Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[230]	training's auc: 0.721937	valid_1's auc: 0.699031
roc_auc_score:  0.6990305562264686
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[193]	training's auc: 0.71871	valid_1's auc: 0.698183
roc_auc_score:  0.6981828670156899
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[161]	training's auc: 0.716497	valid_1's auc: 0.691679
roc_auc_score:  0.6916790742960777
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[155]	training's auc: 0.71517	valid_1's auc: 0.696215
roc_auc_score:  0.696214707591456
Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[176]	training's auc: 0.717911	valid_1's auc: 0.693624
roc_auc_score:  0.6936242146096948
Training until validation scores don't improve for 30 rounds.
Early 




KeyboardInterrupt: 