In [1]:
#Importing the required packages.
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [2]:
# Read the input workbooks (paths are relative to the notebook's working directory).
# `key` supplies the 'keyword' and 'code' columns used by later cells; the two columns
# are consumed in row order, so row i pairs a keyword with its code.
key = pd.read_excel('dxcode.xlsx')
# NOTE(review): `sentence` is not referenced by any other cell visible in this notebook —
# confirm it is still needed.
sentence = pd.read_excel('proc.xlsx')



In [3]:
# Training table: read 'Order Procedure' text plus one column per keyword from proc2.xlsx.
df = pd.read_excel('proc2.xlsx')

# Drop trailing whitespace from the keywords before using them as column names of df.
key['keyword'] = key['keyword'].str.rstrip()

# Target labels, in the same order as the rows of `key`.
cols_target = key['keyword'].tolist()


In [4]:

# Vectorise the procedure descriptions with TF-IDF over unigrams and bigrams.
# max_df=0.9 leaves out terms that appear in more than 90% of the descriptions.
vec = TfidfVectorizer(
    max_df=0.9,
    ngram_range=(1, 2),
)
X_dtm = vec.fit_transform(df['Order Procedure'])

In [5]:
# Logistic-regression estimator shared by every label: the training loop refits this same
# object once per target column, so after the loop it holds only the last label's model.
# NOTE(review): C=12.0 is larger than scikit-learn's default of C=1.0 (i.e. weaker
# regularisation) — confirm the value was chosen deliberately.
logreg = LogisticRegression(C=12.0)


In [6]:
# Prediction matrix: one row per procedure in df, one column per target label.
# Every entry starts at zero; the training loop overwrites the columns it trains.
preds = np.zeros(shape=(df.shape[0], len(cols_target)))

In [9]:

# Classifier chains: the multi-label problem is handled as a sequence of single-label
# problems, each label being appended to the features seen by the labels after it.
def add_feature(X, feature_to_add):
    """Return ``X`` with ``feature_to_add`` appended as one extra column.

    Parameters
    ----------
    X : sparse matrix
        Feature matrix with one row per sample; it is not modified.
    feature_to_add : 1-D array-like
        One value per row of ``X`` (a list, NumPy array or pandas Series).

    Returns
    -------
    scipy.sparse matrix in CSR format
        The columns of ``X`` followed by the new column.
    """
    # A 1-D input becomes a single-row sparse matrix; transposing it gives the
    # column that hstack needs.
    new_column = csr_matrix(feature_to_add).T
    return hstack([X, new_column], 'csr')

In [10]:
# Classifier-chain training pass: fit one binary model per target label and, after each
# fit, append that label's true values to the feature matrix used for the next label.
#
# The loop grows X_dtm by one column per trained label, so running this cell a second
# time used to start from the already-extended matrix and stack the label columns again.
# Cutting X_dtm back to the vectoriser's own columns first makes the cell re-runnable;
# on a first run the slice keeps every column.
n_text_features = len(vec.vocabulary_)
X_dtm = X_dtm[:, :n_text_features]

for i,label in enumerate(cols_target):
    # Labels with no positive rows are skipped: their column in preds keeps its
    # initial value and no feature is added to the chain for them.
    if df[label].sum()>0:

        print('... Processing {}'.format(label))
        y = df[label]
        # train the model using X_dtm & y
        logreg.fit(X_dtm,y)
        # accuracy on the same rows the model was just fitted on (no held-out data)
        y_pred_X = logreg.predict(X_dtm)
        print('Training Accuracy is {}'.format(accuracy_score(y,y_pred_X)))

        # second predict_proba column = second entry of logreg.classes_
        # (presumably the positive class for 0/1 labels — verify)
        preds[:,i] = logreg.predict_proba(X_dtm)[:,1]
        # chain current label to X_dtm
        X_dtm = add_feature(X_dtm, y)
        print('Shape of X_dtm is now {}'.format(X_dtm.shape))

... Processing spine
Training Accuracy is 1.0
Shape of X_dtm is now (27, 93)
... Processing lumbar
Training Accuracy is 1.0
Shape of X_dtm is now (27, 94)
... Processing mammo
Training Accuracy is 1.0
Shape of X_dtm is now (27, 95)
... Processing ct head
Training Accuracy is 1.0
Shape of X_dtm is now (27, 96)
... Processing contrast
Training Accuracy is 1.0
Shape of X_dtm is now (27, 97)
... Processing abdomen
Training Accuracy is 1.0
Shape of X_dtm is now (27, 98)
... Processing chest
Training Accuracy is 1.0
Shape of X_dtm is now (27, 99)
... Processing xr chest
Training Accuracy is 1.0
Shape of X_dtm is now (27, 100)
... Processing foot
Training Accuracy is 1.0
Shape of X_dtm is now (27, 101)
... Processing xr
Training Accuracy is 0.9629629629629629
Shape of X_dtm is now (27, 102)
... Processing bil
Training Accuracy is 0.9629629629629629
Shape of X_dtm is now (27, 103)
... Processing ct
Training Accuracy is 1.0
Shape of X_dtm is now (27, 104)
... Processing mr
Training Accuracy is 

In [11]:
# Build the results table: the procedure text followed by one score column per label.
# The score columns are created under their keyword names and then relabelled with the
# matching entries of key['code'].
submid = pd.DataFrame({'Procedure': df['Order Procedure']})
columns = ['Procedure'] + list(key['code'])
label_scores = pd.DataFrame(preds, columns=cols_target)
submission = pd.concat([submid, label_scores], axis=1)
submission.columns = columns

In [12]:
#To print respective code for the keyword
for j,lab in enumerate(submission['Procedure']):
    print(lab)
    print(list(submission.iloc[:,1:].columns[(submission.iloc[:,1:] > 0.5).iloc[j]]))

ct spine lumbar without contrast
['A10.0', 'A10.1', 'S900.87', 'P54.7']
mg mammo screen digtal bil wtomo
['B23.9', 'V07.90']
ct head without contrast
['B11.7', 'S900.87']
xr chest 2 views 
['G19.98']
ct head without contrast
['B11.7', 'S900.87']
us abdomen 
['T47.1']
xr chest portable
['G19.98']
xr clavicle right
['Z78.0']
xr spine lumbar 2 or 3 views rade eap xr spine lumbar 2 or 3v
['A10.0', 'A10.1']
us abdomen limited
['T47.1']
mr lumbar w + wo contrast
['A10.1']
mg mammo screen digtal bil wtomo
['B23.9', 'V07.90']
xr chest 2 views 
['G19.98']
xr foot 3 views
['L92.3', 'Z78.0']
ct spine lumbar without contrast
['A10.0', 'A10.1', 'S900.87', 'P54.7']
mr spine cervical without contrast
['A10.0', 'S900.87', 'Q19.01']
ct spine lumbar without contrast
['A10.0', 'A10.1', 'S900.87', 'P54.7']
xr spine lumbar 2 or 3 views rade eap xr spine lumbar 2 or 3v
['A10.0', 'A10.1']
mg mammo screen digtal bil wtomo
['B23.9', 'V07.90']
xr chest portable
['G19.98']
ct head without contrast
['B11.7', 'S90