## Imports

In [2]:
import sys
sys.path.append('/Users/shams/logic-lab/TextPreprocessing/')
import __preprocessing as pre
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
from collections import Counter, defaultdict
import re
from math import log
import _pickle as pkl
pd.options.display.max_rows = 100000

## Datafile Path and Configuration

In [3]:
DATAFILE = '/Users/shams/querent-datascience/Diagnosis_ICD/data/icd_10_2017.csv'
COLUMNS = ['icd_code', 'xyz', 'short_desc', 'long_desc']
HEADERLINE = None
DOCUMENT_COLUMN = 'long_desc'
df_data = pd.DataFrame.from_csv(DATAFILE, header=HEADERLINE)
if len(COLUMNS):
    df_data.columns = COLUMNS
df_data.head()

Unnamed: 0_level_0,icd_code,xyz,short_desc,long_desc
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,A00,0,Cholera,Cholera
2,A000,1,"Cholera due to Vibrio cholerae 01, biovar chol...","Cholera due to Vibrio cholerae 01, biovar chol..."
3,A001,1,"Cholera due to Vibrio cholerae 01, biovar eltor","Cholera due to Vibrio cholerae 01, biovar eltor"
4,A009,1,"Cholera, unspecified","Cholera, unspecified"
5,A01,0,Typhoid and paratyphoid fevers,Typhoid and paratyphoid fevers


In [None]:
df_data[DOCUMENT_COLUMN] = df_data[DOCUMENT_COLUMN].apply(pre._text_preprocessing)

In [14]:
documents = df_data[DOCUMENT_COLUMN].tolist()

# UMLS Concepts Import from Pickle File

In [5]:
# documents = [pre._text_preprocessing(i) for i in pkl.load(open('./../../querent-datascience/Diagnosis_ICD/ipynb/umls_concepts.dump.pkl', 'rb'))]
documents = pkl.load(open('./../../querent-datascience/Diagnosis_ICD/ipynb/with_acab_concepts.pkl', 'rb'))
print('%d documents imported from pickle file.' % len(documents))

792156 documents imported from pickle file.


In [None]:
pkl.dump(documents, open('./../../querent-datascience/Diagnosis_ICD/ipynb/umls_concepts.dump.pkl', 'wb'))

## Collection Analysis 

In [15]:
tokens = Counter(re.findall(r'\w+', " ".join(documents)))

In [16]:
def F(word):
    return tokens.get(word)
TOKEN_C = len(tokens)
def P_c(word):
    return float(F(word))/TOKEN_C

## Inverted Indexing

In [17]:
def create_index(data):
    index = defaultdict(list)
    for i, document in enumerate(data):
        for token in document.strip().split():
            index[token].append(i)
    return index
inv_index = create_index(documents)

## Sample Analysis

In [18]:
def P_x(word):
    sample = [documents[i] for i in inv_index[word]]
    tokens_sample = Counter(re.findall(r'\w+', ' '.join(sample)))
    L_x = 0
    for k, v in tokens_sample.items():
        L_x += v
    return float(tokens_sample[word])/L_x


## Kullback Leibler Divergence

In [19]:
def kl_div(word):
    p_x = P_x(word)
    p_c = P_c(word)
    return p_x * log(p_x/p_c, 2)

In [20]:
terms = list(tokens.keys())
kl_div_val = []
not_found = []
for t in terms:
    try:
        kl_div_val.append(kl_div(t))
    except:
        kl_div_val.append(-100000)

In [21]:
print(len(terms))
print(len(kl_div_val))

7115
7115


In [22]:
df_kl_div = pd.DataFrame({'term': terms, 'kl_div': kl_div_val})
df_kl_div = df_kl_div.loc[df_kl_div.kl_div > -100000]
df_kl_div.sort_values('kl_div')

Unnamed: 0,kl_div,term
5292,-0.888336,of
2478,-0.655505,fracture
2454,-0.579058,unspecified
3528,-0.501387,other
6311,-0.464344,and
573,-0.457399,encounter
1744,-0.41036,sequela
791,-0.409353,in
3945,-0.403453,with
2480,-0.396988,left


In [None]:
df_kl_div.loc[df_kl_div.term == 'acute']