# AI Model Notebook

## Imports

In [4]:
import numpy as np
from spacy.lang.en import English
import pandas as pd
import os
os.chdir('../')

## Data Processing

In [5]:
data = "./data/main_data.csv"
read_csv = pd.read_csv(data)
read_csv[:5]

Unnamed: 0,code,display
0,303921000119109,Chronic tophaceous gout of hand due to renal i...
1,450309001,Major systemic to pulmonary collateral artery ...
2,428799009,Rotatory subluxation of atlantoaxial joint
3,93936002,Primary malignant neoplasm of palatine bone
4,164510000,O/E - elbow joint abnormal


In [6]:
data_frame = pd.DataFrame(read_csv)
data_frame.head()

Unnamed: 0,code,display
0,303921000119109,Chronic tophaceous gout of hand due to renal i...
1,450309001,Major systemic to pulmonary collateral artery ...
2,428799009,Rotatory subluxation of atlantoaxial joint
3,93936002,Primary malignant neoplasm of palatine bone
4,164510000,O/E - elbow joint abnormal


In [7]:
feature = data_frame["display"]
feature.head()

0    Chronic tophaceous gout of hand due to renal i...
1    Major systemic to pulmonary collateral artery ...
2           Rotatory subluxation of atlantoaxial joint
3          Primary malignant neoplasm of palatine bone
4                           O/E - elbow joint abnormal
Name: display, dtype: object

In [8]:
X_train = data_frame["display"]
Y_train = data_frame["code"]

## NLP Imports

In [7]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import nltk
from nltk.tokenize import word_tokenize

## Word Tokenizing

In [9]:
tokenized_doc = []
doc = X_train.values.tolist()
for d in doc:
    d = str(d)
    tokenized_doc.append(word_tokenize(d.lower()))
print(tokenized_doc[:10])

[['chronic', 'tophaceous', 'gout', 'of', 'hand', 'due', 'to', 'renal', 'impairment'], ['major', 'systemic', 'to', 'pulmonary', 'collateral', 'artery', 'supplying', 'part', 'of', 'right', 'lung'], ['rotatory', 'subluxation', 'of', 'atlantoaxial', 'joint'], ['primary', 'malignant', 'neoplasm', 'of', 'palatine', 'bone'], ['o/e', '-', 'elbow', 'joint', 'abnormal'], ['obstetric', 'nipple', 'infection', 'with', 'postnatal', 'complication'], ['anterior', 'chamber', 'pseudo-exfoliation', 'deposits'], ['normal', 'sense', 'of', 'identity'], ['does', 'initiate', 'conversation'], ['dyrk1a-related', 'intellectual', 'disability', 'syndrome', 'due', 'to', '21q22.13q22.2', 'microdeletion']]


## Converting to Tagged Documents

In [10]:
features = Y_train.values.tolist()
tagged_data = []
for value in range(len(doc)):
    feature = features[value]
    tokenized_sent = tokenized_doc[value]
    aj = TaggedDocument(tokenized_sent, [str(feature)])
    tagged_data.append(aj)
print(tagged_data[:100])
tagged_data_mini = tagged_data[3:5]
print(len(tagged_data_mini))

[TaggedDocument(words=['chronic', 'tophaceous', 'gout', 'of', 'hand', 'due', 'to', 'renal', 'impairment'], tags=['303921000119109']), TaggedDocument(words=['major', 'systemic', 'to', 'pulmonary', 'collateral', 'artery', 'supplying', 'part', 'of', 'right', 'lung'], tags=['450309001']), TaggedDocument(words=['rotatory', 'subluxation', 'of', 'atlantoaxial', 'joint'], tags=['428799009']), TaggedDocument(words=['primary', 'malignant', 'neoplasm', 'of', 'palatine', 'bone'], tags=['93936002']), TaggedDocument(words=['o/e', '-', 'elbow', 'joint', 'abnormal'], tags=['164510000']), TaggedDocument(words=['obstetric', 'nipple', 'infection', 'with', 'postnatal', 'complication'], tags=['200370007']), TaggedDocument(words=['anterior', 'chamber', 'pseudo-exfoliation', 'deposits'], tags=['247002006']), TaggedDocument(words=['normal', 'sense', 'of', 'identity'], tags=['280948003']), TaggedDocument(words=['does', 'initiate', 'conversation'], tags=['288642003']), TaggedDocument(words=['dyrk1a-related', 'i

## Building The Model

In [11]:
max_epochs = 30
vec_size = 20
alpha = 0.025

model = Doc2Vec(vec_size=vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                dm =1)
print("trainings started")
model.build_vocab(tagged_data)
print("build end")
for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.iter)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha

model.save("test_doc2vec.model")
print("Model Saved")

trainings started
build end
iteration 0




iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19
iteration 20
iteration 21
iteration 22
iteration 23
iteration 24
iteration 25
iteration 26
iteration 27
iteration 28
iteration 29
Model Saved


## Loading the Model

In [10]:
model= Doc2Vec.load("./storage/doc2vec.model")

## Testing the model

In [10]:
data_frame.head()
try:
    data_frame.set_index("code", inplace=True)
except KeyError:
    pass
data_frame.head()

Unnamed: 0_level_0,display
code,Unnamed: 1_level_1
303921000119109,Chronic tophaceous gout of hand due to renal i...
450309001,Major systemic to pulmonary collateral artery ...
428799009,Rotatory subluxation of atlantoaxial joint
93936002,Primary malignant neoplasm of palatine bone
164510000,O/E - elbow joint abnormal


In [18]:

for _ in range(2):
    test_doc = word_tokenize("I think you are sufferring from major systematic to Pulmonary collateral artery supplyiing right lung".lower())
    print(model.docvecs.most_similar(positive=[model.infer_vector(test_doc)],topn=5))
    best_result = model.docvecs.most_similar(positive=[model.infer_vector(test_doc)],topn=1)
    print(list(data_frame.loc[int(best_result[0][0]), "display"]))
    print("Best Result: ",best_result[0][0])
    print("Sureity percntage: ",best_result[0][1])

[('719098007', 0.11558675020933151), ('707467004', 0.10146358609199524), ('369881000', 0.09773489832878113), ('450309001', 0.08361873030662537), ('707530009', 0.0750858336687088)]
['Primary salivary gland type carcinoma of lung', 'Primary salivary gland type carcinoma lung ']
Best Result:  707467004
Sureity percntage:  0.2959973216056824
[('450309001', 0.2442864179611206), ('719098007', 0.2084171324968338), ('450308009', 0.19240570068359375), ('697910001', 0.19021475315093994), ('450307004', 0.1856318563222885)]
['Major systemic to pulmonary collateral artery supplying part of right lung', 'Major pulmonary collateral artery right lung ']
Best Result:  450309001
Sureity percntage:  0.18912406265735626


Thank you

In [11]:
test_doc = word_tokenize("major systematic to Pulmonary collateral artery supplying right lung".lower())
tokenized_Doc = []
id_results=[]
for _ in range(2):
    model.docvecs.most_similar(positive=[model.infer_vector(test_doc)],topn=5)
    best_result = model.docvecs.most_similar(positive=[model.infer_vector(test_doc)],topn=1)
    most_similar = data_frame.loc[int(best_result[0][0]), "display"]
    lst = list(most_similar)
    print(lst)
    id_results.append(best_result[0][0])
    tokenized_Doc.append(word_tokenize(lst[0]))
    print("Best Result: ",best_result[0][0])
    print("Sureity percntage: ",best_result[0][1])
i = 0
j = 0
for _ in test_doc:
    if _ in tokenized_Doc[0]:
        i+=1
    if _ in tokenized_Doc[1]:
        j+=1
print(i, j)
print(id_results)
print(tokenized_Doc, test_doc)

['Primary salivary gland type carcinoma of lung', 'Primary salivary gland type carcinoma lung ']
Best Result:  707467004
Sureity percntage:  0.6447983980178833
['Major systemic to pulmonary collateral artery supplying part of right lung', 'Major pulmonary collateral artery right lung ']
Best Result:  450309001
Sureity percntage:  0.6174723505973816
1 7
['707467004', '450309001']
[['Primary', 'salivary', 'gland', 'type', 'carcinoma', 'of', 'lung'], ['Major', 'systemic', 'to', 'pulmonary', 'collateral', 'artery', 'supplying', 'part', 'of', 'right', 'lung']] ['major', 'systematic', 'to', 'pulmonary', 'collateral', 'artery', 'supplying', 'right', 'lung']


In [35]:
minimum_len = min(len(tokenized_Doc[0]),len(tokenized_Doc[1]), len(test_doc))
if minimum_len >3:
    if i >=3 or j >= 3:
        if i >= j: 
            print(id_results[0])
        else: print(id_results[1])
            
    else:
        print("true")
else:
    if i>=2 or j>=2:
        if i >= j: print(id_results[0])
        else: print(id_results[1])
    else:
        print("true")

450309001
