In [1]:
%matplotlib inline

import re
import numpy as np
import pandas as pd
import catboost as cb
import matplotlib.pyplot as plt

from fuzzywuzzy import fuzz
from tqdm import tqdm_notebook
from txt_feat import get_features
from sklearn.cluster import KMeans
from utils import get_disease_class
from lstm import KTokenizer, get_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Input CSV locations inside the local Kaggle competition download directory
# (paths are relative to the current user's home via '~').
TRAIN_PATH = '~/.kaggle/competitions/ai-hack-2018-spb-robomed/train_data.csv'
TEST_PATH = '~/.kaggle/competitions/ai-hack-2018-spb-robomed/test_data.csv'
# Output file names, relative to the notebook's working directory.
# NOTE(review): presumably the prediction file and the serialized model;
# their use is not visible in this part of the notebook — confirm downstream.
SUBMISSION_PATH = 'submission.csv'
MODEL_SAVE_PATH = 'model.pkl'

In [3]:
def initial_pre(df):
    """Return a new frame without the ``ID`` column and with English column names.

    The Russian column headers of the raw data set are mapped to short
    English identifiers; any column not listed in the mapping keeps its
    original name. The frame passed in is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data; must contain an ``ID`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``ID`` removed and the known columns renamed.

    Raises
    ------
    KeyError
        If ``df`` has no ``ID`` column.
    """
    # (raw header, normalised name) pairs, in the order they are expected
    # to appear in the raw file.
    header_pairs = (
        ('Код_диагноза', 'diag_code'),
        ('Диагноз', 'diag'),
        ('Возраст', 'age'),
        ('Пол', 'sex'),
        ('Общее состояние', 'state'),
        ('аллергия', 'allergies'),
        ('Анамнез заболевания', 'anamnesis'),
        ('Внешний осмотр', 'checkup'),
        ('Revisit', 'revisit'),
    )
    # drop() builds a new frame, so the caller's object is never mutated.
    without_id = df.drop(columns=['ID'])
    return without_id.rename(columns=dict(header_pairs))

In [4]:
# Read the training CSV, drop 'ID' and rename the columns via initial_pre,
# then display the number of rows loaded.
df = initial_pre(pd.read_csv(TRAIN_PATH))
len(df)

283086

In [6]:
# Summary statistics over every column (numeric and non-numeric) for the
# rows whose `revisit` value is 0.
df[df.revisit == 0].describe(include='all')

Unnamed: 0,diag_code,diag,age,sex,state,allergies,anamnesis,checkup,revisit
count,48510,48510,48510.0,48510.0,3161,2943,43030,557,48510.0
unique,2328,2312,,,470,619,34702,87,
top,M42.1,Остеохондроз позвоночника у взрослых,,,Общее самочувствие удовлетворительное. Кожные ...,не отягощен,Без отрицательной динамики в неврологическом с...,удовлетворительное.,
freq,2576,2576,,,735,1255,618,300,
mean,,,43.16677,1.613296,,,,,0.0
std,,,37.410713,0.487,,,,,0.0
min,,,1.0,1.0,,,,,0.0
25%,,,31.0,1.0,,,,,0.0
50%,,,39.0,2.0,,,,,0.0
75%,,,54.0,2.0,,,,,0.0


In [7]:
# Summary statistics over every column (numeric and non-numeric) for the
# rows whose `revisit` value is 1.
df[df.revisit == 1].describe(include='all')

Unnamed: 0,diag_code,diag,age,sex,state,allergies,anamnesis,checkup,revisit
count,234576,234576,234576.0,234576.0,38535,37965,160626,12738,234576.0
unique,3113,3077,,,2800,4900,121844,767,
top,N77.1*,"Вагинит, вульвит и вульвовагинит при инфекцион...",,,Общее самочувствие удовлетворительное. Кожные ...,не отягощен,консультация по проведенному обследованию и ко...,удовлетворительное.,
freq,10721,10721,,,8225,16106,2464,6322,
mean,,,41.392939,1.728003,,,,,1.0
std,,,16.431538,0.444989,,,,,0.0
min,,,1.0,1.0,,,,,1.0
25%,,,31.0,1.0,,,,,1.0
50%,,,38.0,2.0,,,,,1.0
75%,,,50.0,2.0,,,,,1.0
