# AI Model Notebook

## Imports

In [3]:
import numpy as np
from spacy.lang.en import English
import pandas as pd
import os
os.chdir('../')

## Data Processing

In [4]:
data = "./data/main_data.csv"
read_csv = pd.read_csv(data)
read_csv

Unnamed: 0,code,display
0,303921000119109,Chronic tophaceous gout of hand due to renal i...
1,450309001,Major systemic to pulmonary collateral artery ...
2,428799009,Rotatory subluxation of atlantoaxial joint
3,93936002,Primary malignant neoplasm of palatine bone
4,164510000,O/E - elbow joint abnormal
5,200370007,Obstetric nipple infection with postnatal comp...
6,247002006,Anterior chamber pseudo-exfoliation deposits
7,280948003,Normal sense of identity
8,288642003,Does initiate conversation
9,783619003,DYRK1A-related intellectual disability syndrom...


In [5]:
data_frame = pd.DataFrame(read_csv)
data_frame.head()

Unnamed: 0,code,display
0,303921000119109,Chronic tophaceous gout of hand due to renal i...
1,450309001,Major systemic to pulmonary collateral artery ...
2,428799009,Rotatory subluxation of atlantoaxial joint
3,93936002,Primary malignant neoplasm of palatine bone
4,164510000,O/E - elbow joint abnormal


In [6]:
feature = data_frame["display"]
feature.head()

0    Chronic tophaceous gout of hand due to renal i...
1    Major systemic to pulmonary collateral artery ...
2           Rotatory subluxation of atlantoaxial joint
3          Primary malignant neoplasm of palatine bone
4                           O/E - elbow joint abnormal
Name: display, dtype: object

In [7]:
X_train = data_frame["display"]
Y_train = data_frame["code"]

## NLP Imports

In [8]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import nltk
from nltk.tokenize import word_tokenize

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

## Word Tokenizing

In [9]:
tokenized_doc = []
doc = X_train.values.tolist()
for d in doc:
    d = str(d)
    tokenized_doc.append(word_tokenize(d.lower()))
print(tokenized_doc[:10])

[['chronic', 'tophaceous', 'gout', 'of', 'hand', 'due', 'to', 'renal', 'impairment'], ['major', 'systemic', 'to', 'pulmonary', 'collateral', 'artery', 'supplying', 'part', 'of', 'right', 'lung'], ['rotatory', 'subluxation', 'of', 'atlantoaxial', 'joint'], ['primary', 'malignant', 'neoplasm', 'of', 'palatine', 'bone'], ['o/e', '-', 'elbow', 'joint', 'abnormal'], ['obstetric', 'nipple', 'infection', 'with', 'postnatal', 'complication'], ['anterior', 'chamber', 'pseudo-exfoliation', 'deposits'], ['normal', 'sense', 'of', 'identity'], ['does', 'initiate', 'conversation'], ['dyrk1a-related', 'intellectual', 'disability', 'syndrome', 'due', 'to', '21q22.13q22.2', 'microdeletion']]


## Converting to Tagged Documents

In [10]:
features = Y_train.values.tolist()
tagged_data = []
for value in range(len(doc)):
    feature = features[value]
    tokenized_sent = tokenized_doc[value]
    aj = TaggedDocument(tokenized_sent, [str(feature)])
    tagged_data.append(aj)
print(tagged_data[:100])
tagged_data_mini = tagged_data[3:5]
print(len(tagged_data_mini))

[TaggedDocument(words=['chronic', 'tophaceous', 'gout', 'of', 'hand', 'due', 'to', 'renal', 'impairment'], tags=['303921000119109']), TaggedDocument(words=['major', 'systemic', 'to', 'pulmonary', 'collateral', 'artery', 'supplying', 'part', 'of', 'right', 'lung'], tags=['450309001']), TaggedDocument(words=['rotatory', 'subluxation', 'of', 'atlantoaxial', 'joint'], tags=['428799009']), TaggedDocument(words=['primary', 'malignant', 'neoplasm', 'of', 'palatine', 'bone'], tags=['93936002']), TaggedDocument(words=['o/e', '-', 'elbow', 'joint', 'abnormal'], tags=['164510000']), TaggedDocument(words=['obstetric', 'nipple', 'infection', 'with', 'postnatal', 'complication'], tags=['200370007']), TaggedDocument(words=['anterior', 'chamber', 'pseudo-exfoliation', 'deposits'], tags=['247002006']), TaggedDocument(words=['normal', 'sense', 'of', 'identity'], tags=['280948003']), TaggedDocument(words=['does', 'initiate', 'conversation'], tags=['288642003']), TaggedDocument(words=['dyrk1a-related', 'i

## Building The Model

In [11]:
max_epochs = 30
vec_size = 20
alpha = 0.025

model = Doc2Vec(vec_size=vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                dm =1)
print("trainings started")
model.build_vocab(tagged_data)
print("build end")
for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.iter)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha

model.save("test_doc2vec.model")
print("Model Saved")

trainings started
build end
iteration 0




iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19
iteration 20
iteration 21
iteration 22
iteration 23
iteration 24
iteration 25
iteration 26
iteration 27
iteration 28
iteration 29
Model Saved


## Loading the Model

In [12]:
model= Doc2Vec.load("test_doc2vec.model")
## Print model vocabulary
model.wv.vocab

{'chronic': <gensim.models.keyedvectors.Vocab at 0x140a4c02908>,
 'tophaceous': <gensim.models.keyedvectors.Vocab at 0x140be9baf60>,
 'gout': <gensim.models.keyedvectors.Vocab at 0x140bd02f5f8>,
 'of': <gensim.models.keyedvectors.Vocab at 0x140bd02f6d8>,
 'hand': <gensim.models.keyedvectors.Vocab at 0x140bd02f748>,
 'due': <gensim.models.keyedvectors.Vocab at 0x140bd02f5c0>,
 'to': <gensim.models.keyedvectors.Vocab at 0x140bd02f7b8>,
 'renal': <gensim.models.keyedvectors.Vocab at 0x140bd02f7f0>,
 'impairment': <gensim.models.keyedvectors.Vocab at 0x140bd02f828>,
 'major': <gensim.models.keyedvectors.Vocab at 0x140bd02f940>,
 'systemic': <gensim.models.keyedvectors.Vocab at 0x140bd02f978>,
 'pulmonary': <gensim.models.keyedvectors.Vocab at 0x140bd02f8d0>,
 'collateral': <gensim.models.keyedvectors.Vocab at 0x140bd02f908>,
 'artery': <gensim.models.keyedvectors.Vocab at 0x140bd02f9e8>,
 'supplying': <gensim.models.keyedvectors.Vocab at 0x140bd02fa90>,
 'part': <gensim.models.keyedvectors

## Testing the model

In [35]:
test_doc = word_tokenize("I can understand he is suffering from Chronic tophaceous gout of hand due to renal impairment".lower())
model.docvecs.most_similar(positive=[model.infer_vector(test_doc)],topn=5)
best_result = model.docvecs.most_similar(positive=[model.infer_vector(test_doc)],topn=1)
print("Best Result: ",best_result[0][0])
print("Sureity percntage: ",best_result[0][1])

Best Result:  288371008
Sureity percntage:  0.36000412702560425


Thank you