# Import Libraries

In [1]:
# import pandas as pd
import numpy as np
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
import pickle
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from collections import OrderedDict
import gradio as gr
from fuzzywuzzy import fuzz
import re
from collections import Counter



# Install Dependencies

In [2]:
!pip install fuzz



In [3]:
!pip install textblob



In [4]:
!pip install fuzzywuzzy



In [5]:
!python -m pip install -U symspellpy

Requirement already up-to-date: symspellpy in c:\users\ag95028\appdata\local\conda\conda\envs\myenv\lib\site-packages (6.7.0)


In [6]:
df = pd.read_csv('medicare_train_dataset_BOI_input_manual_v3.csv')

In [7]:
df.head()

Unnamed: 0,sentence #,pattern,benefit_type,requirement_text,word,POS,tags
0,sentence 1,"Includes programs such as group exercise, adul...",ADCS,Adult Day Care Services,Includes,NNS,O
1,sentence 1,"Includes programs such as group exercise, adul...",ADCS,Adult Day Care Services,programs,NNS,O
2,sentence 1,"Includes programs such as group exercise, adul...",ADCS,Adult Day Care Services,such,JJ,O
3,sentence 1,"Includes programs such as group exercise, adul...",ADCS,Adult Day Care Services,as,IN,O
4,sentence 1,"Includes programs such as group exercise, adul...",ADCS,Adult Day Care Services,group,NN,B_INCLUSION


# Handle Missing Values

In [8]:
df = df.fillna(method='ffill')

In [9]:
df['sentence #'].nunique(), df.word.nunique(), df["tags"].nunique()

(250, 1677, 8)

In [10]:
df.groupby('tags').size().reset_index(name='counts')

Unnamed: 0,tags,counts
0,B_EXCLUSION,92
1,B_INCLUSION,391
2,I_EXCLUSION,238
3,I_INCLUSION,869
4,O,4874
5,U_EXCLUSION,31
6,U_INCLUSION,121
7,i_EXCLUSION,6


In [11]:
X = df.drop('tags', axis=1)
X.head()

Unnamed: 0,sentence #,pattern,benefit_type,requirement_text,word,POS
0,sentence 1,"Includes programs such as group exercise, adul...",ADCS,Adult Day Care Services,Includes,NNS
1,sentence 1,"Includes programs such as group exercise, adul...",ADCS,Adult Day Care Services,programs,NNS
2,sentence 1,"Includes programs such as group exercise, adul...",ADCS,Adult Day Care Services,such,JJ
3,sentence 1,"Includes programs such as group exercise, adul...",ADCS,Adult Day Care Services,as,IN
4,sentence 1,"Includes programs such as group exercise, adul...",ADCS,Adult Day Care Services,group,NN


In [12]:
X.columns

Index(['sentence #', 'pattern', 'benefit_type', 'requirement_text', 'word',
       'POS'],
      dtype='object')

# Data Preprocessing

In [13]:
v = DictVectorizer(sparse=False)
X = v.fit_transform(X.to_dict('records'))
X.shape

(6622, 2361)

In [14]:
y = df["tags"].values

In [15]:
classes = np.unique(y)

In [16]:
classes = classes.tolist()
classes

['B_EXCLUSION',
 'B_INCLUSION',
 'I_EXCLUSION',
 'I_INCLUSION',
 'O',
 'U_EXCLUSION',
 'U_INCLUSION',
 'i_EXCLUSION']

In [17]:
X.shape, y.shape

((6622, 2361), (6622,))

In [18]:
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state=0)

In [19]:
#X_train.shape, y_train.shape

In [20]:
new_classes = classes.copy()
new_classes.pop()
new_classes

['B_EXCLUSION',
 'B_INCLUSION',
 'I_EXCLUSION',
 'I_INCLUSION',
 'O',
 'U_EXCLUSION',
 'U_INCLUSION']

# Feature Extraction

In [21]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w,b,r, p, t) for w,b,r, p, t in zip(s['word'].values.tolist(),
                                                           s['benefit_type'].values.tolist(),
                                                           s['requirement_text'].values.tolist(),
                                                           s['POS'].values.tolist(), 
                                                           s['tags'].values.tolist())]
        self.grouped = self.data.groupby('sentence #').apply(agg_func)
        self.sentences = [s for s in self.grouped]
        
    def get_next(self):
        try: 
            s = self.grouped['Sentence: {}'.format(self.n_sent)]
            self.n_sent += 1
            return s 
        except:
            return None

In [22]:
getter = SentenceGetter(df)

In [23]:
sent = getter.get_next()
print(sent)

None


In [24]:
sentences = getter.sentences

In [25]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    #benefit_type = sent[i][1]
    #requirement_text = sent[i][2]
    
    features = {
        'bias': 1.0, 
        'word.lower()': word.lower(), 
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        #'benefit_type':benefit_type,
        #'requirement_text':requirement_text,
        'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True

    return features

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token,benefit_type,requirement_text, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

In [26]:
X = [sent2features(s) for s in sentences]
y = [sent2labels(s) for s in sentences]

# Splitting the data

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [28]:
X_test

[[{'bias': 1.0,
   'word.lower()': 'this',
   'word[-3:]': 'his',
   'word[-2:]': 'is',
   'word.isupper()': False,
   'word.istitle()': True,
   'word.isdigit()': False,
   'postag': 'LARC',
   'postag[:2]': 'LA',
   'BOS': True,
   '+1:word.lower()': 'service',
   '+1:word.istitle()': False,
   '+1:word.isupper()': False,
   '+1:postag': 'LARC',
   '+1:postag[:2]': 'LA'},
  {'bias': 1.0,
   'word.lower()': 'service',
   'word[-3:]': 'ice',
   'word[-2:]': 'ce',
   'word.isupper()': False,
   'word.istitle()': False,
   'word.isdigit()': False,
   'postag': 'LARC',
   'postag[:2]': 'LA',
   '-1:word.lower()': 'this',
   '-1:word.istitle()': True,
   '-1:word.isupper()': False,
   '-1:postag': 'LARC',
   '-1:postag[:2]': 'LA',
   '+1:word.lower()': 'usually',
   '+1:word.istitle()': False,
   '+1:word.isupper()': False,
   '+1:postag': 'LARC',
   '+1:postag[:2]': 'LA'},
  {'bias': 1.0,
   'word.lower()': 'usually',
   'word[-3:]': 'lly',
   'word[-2:]': 'ly',
   'word.isupper()': False

# Model Training

In [29]:

crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)



CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.1, c2=0.1,
    keep_tempfiles=None, max_iterations=100)

In [30]:
filename = 'medicare_ner_inc_exc_model_updated.pkl'
pickle.dump(crf, open(filename, 'wb'))

In [31]:
#####prediction on test set#####
crf1 = pickle.load(open('medicare_ner_inc_exc_model_updated.pkl', 'rb'))

In [32]:
mapping_sheet = pd.read_excel("GBD Medicare Benefit Names and Categories - Inclusion Or Exclusions.xlsx")

# Spell Correction Check

In [33]:
mapping_sheet['Benefit Name'].values

array(['Abortion Ambulatory Surgical Center',
       'Abortion Inpatient Professional',
       'Abortion Observation Room Outpatient Institutional',
       'Abortion Office Professional - PCP',
       'Abortion Office Professional - Specialist',
       'Abortion Surgery Outpatient Institutional',
       'Abortion Office Professional - Specialist - Designated Care Program',
       'Acupuncture',
       'Mental Health - Crisis Intervention/Crisis Stabilization',
       'Substance Abuse - Crisis Intervention/Crisis Stabilization',
       'Mental Health - Intensive Outpatient Program (IOP) Institutional',
       'Substance Abuse - Intensive Outpatient Program (IOP) Institutional',
       'Mental Health - Inpatient Institutional - Hospital Days ',
       'Mental Health - Inpatient Professional',
       'Mental Health - Inpatient Psychiatric Facility - Hospital Days',
       'Substance Abuse - Inpatient Institutional - Hospital Days ',
       'Substance Abuse - Inpatient Professional',
     

In [34]:
befitname_value = np.savetxt(r'train.txt', mapping_sheet['Benefit Name'].values, fmt='%s')

In [35]:
def words(text): return re.findall(r'\w+', text.lower())

WORDS = Counter(words(open('train.txt').read()))

def P(word, N=sum(WORDS.values())): 
    "Probability of `word`."
    return WORDS[word] / N

def correction(word): 
    "Most probable spelling correction for word."
    return max(candidates(word), key=P)

def candidates(word): 
    "Generate possible spelling corrections for word."
    return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word])

def known(words): 
    "The subset of `words` that appear in the dictionary of WORDS."
    return set(w for w in words if w in WORDS)

def edits1(word):
    "All edits that are one edit away from `word`."
    letters    = 'abcdefghijklmnopqrstuvwxyz'
    splits     = [(word[:i], word[i:])    for i in range(len(word) + 1)]
    deletes    = [L + R[1:]               for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
    replaces   = [L + c + R[1:]           for L, R in splits if R for c in letters]
    inserts    = [L + c + R               for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)

def edits2(word): 
    "All edits that are two edits away from `word`."
    return (e2 for e1 in edits1(word) for e2 in edits1(e1))

# Model Prediction

In [36]:
# Test Prediction COde
def inc_exc_prediction(benefit_name):
    import csv
    import json
#     benefit_name = correction(benefit_name)
#     sentence = mapping_sheet[mapping_sheet["Benefit Name"]==benefit_name]["Requirement Text"].iloc[0]
#     print(sentence)
    mapping_sheet_inclusions = []
    if str(mapping_sheet[mapping_sheet["Benefit Name"]==benefit_name]["Inclusions"].iloc[0])!="nan":
        mapping_sheet_inclusions = mapping_sheet[mapping_sheet["Benefit Name"]==benefit_name]["Inclusions"].iloc[0].split("\n")
        print(mapping_sheet_inclusions)
    mapping_sheet_exclusions = []
    if str(mapping_sheet[mapping_sheet["Benefit Name"]==benefit_name]["Exclusions"].iloc[0])!="nan":
        mapping_sheet_exclusions = mapping_sheet[mapping_sheet["Benefit Name"]==benefit_name]["Exclusions"].iloc[0].split("\n")
        print(mapping_sheet_exclusions)
    
#     df_new = df[df['requirement_text']==sentence] 
    #df_new = df[]
    getter = SentenceGetter(benefit_name)
    sentences = getter.sentences
    X = [sent2features(s) for s in sentences]
    y = [sent2labels(s) for s in sentences]
    predictions = crf1.predict(X)
    #print(predictions)
    predictions = crf1.predict(X)
    final_predictions = []
    for sentence_index,each_sentence_pred in enumerate(predictions):
        inc_string = ""
        exc_string = ""
        sentence_id = "sentence "+str(sentence_index)
        pattern = ""
        score = metrics.flat_accuracy_score([predictions[sentence_index]], [y[sentence_index]])
        f1_score =  metrics.flat_f1_score([predictions[sentence_index]], [y[sentence_index]], average='weighted', labels=new_classes)

        for word_index,each_iob in enumerate(each_sentence_pred):
            pattern = pattern+" "+str(X[sentence_index][word_index]['word.lower()'])
            if each_iob == "B_INCLUSION":
                inc_string = inc_string+","+str(X[sentence_index][word_index]['word.lower()'])
            elif each_iob == "I_INCLUSION":
                inc_string = inc_string+" "+str(X[sentence_index][word_index]['word.lower()'])
            elif each_iob == "B_EXCLUSION":
                exc_string = exc_string+","+str(X[sentence_index][word_index]['word.lower()'])
            elif each_iob == "I_EXCLUSION":
                exc_string = exc_string+" "+str(X[sentence_index][word_index]['word.lower()'])
        
        pred_inclusions = inc_string[1:].split(",")
        pred_inclusions = [i for i in pred_inclusions if i]
        for i in range(len(pred_inclusions)):
            for j in range(len(mapping_sheet_inclusions)):
                val = fuzz.ratio(pred_inclusions[i],mapping_sheet_inclusions[j])
                if val>80:
                    print(val)
                    pred_inclusions[i] = mapping_sheet_inclusions[j]
        
        pred_exclusions = exc_string[1:].split(",")
        pred_exclusions = [i for i in pred_exclusions if i]
        for i in range(len(pred_exclusions)):
            for j in range(len(mapping_sheet_exclusions)):
                val = fuzz.ratio(pred_exclusions[i],mapping_sheet_exclusions[j])
                if val>70:
                    print(val)
                    pred_exclusions[i] = mapping_sheet_exclusions[j]


        
        json_dict_pred = {}
        json_dict_pred["Inclusion"] = pred_inclusions
        json_dict_pred["Exclusion"] = pred_exclusions
        json_dict_pred["accuracy_score"] = score
        json_dict_pred["f1-score"] = f1_score

        if json_dict_pred["Inclusion"]!=[] or json_dict_pred["Exclusion"]!=[]:    
            json_dict_pred = json.dumps(json_dict_pred, sort_keys=True)
            final_predictions.append(json_dict_pred)

    return final_predictions

In [37]:
inc_exc_prediction("Dental Services - Designated Care Program")

['Jaw Reconstruction', 'Oral Exams']
['Dental Care', 'Dental Treatment', 'Removal/Replacement Of Teeth ', 'Removal/Replacement Of Structures Directly Supporting Teeth', 'Dental Appliances', 'Dentures', 'Alveolar Process', 'Filings', 'Other dental devices', 'Dental plates']


AttributeError: 'str' object has no attribute 'groupby'

In [38]:
inc_exc_prediction("Dental Services - Designated Care Program")

['Jaw Reconstruction', 'Oral Exams']
['Dental Care', 'Dental Treatment', 'Removal/Replacement Of Teeth ', 'Removal/Replacement Of Structures Directly Supporting Teeth', 'Dental Appliances', 'Dentures', 'Alveolar Process', 'Filings', 'Other dental devices', 'Dental plates']


AttributeError: 'str' object has no attribute 'groupby'

In [39]:
benefit_iface = gr.Interface(fn=inc_exc_prediction, inputs=gr.inputs.Textbox(default="Please enter the Benefit Text."), outputs="json")
benefit_iface.launch()

Running locally at: http://127.0.0.1:7860/
To create a public link, set `share=True` in `launch()`.
Interface loading below...


Tip: Add interpretation to your model by simply adding `interpretation="default"` to `Interface()`


(<Flask 'gradio.networking'>, 'http://127.0.0.1:7860/', None)

[2021-05-13 13:35:12,651] ERROR in app: Exception on /api/predict/ [POST]
Traceback (most recent call last):
  File "C:\Users\AG95028\AppData\Local\conda\conda\envs\myenv\lib\site-packages\flask\app.py", line 2447, in wsgi_app
    response = self.full_dispatch_request()
  File "C:\Users\AG95028\AppData\Local\conda\conda\envs\myenv\lib\site-packages\flask\app.py", line 1952, in full_dispatch_request
    rv = self.handle_user_exception(e)
  File "C:\Users\AG95028\AppData\Local\conda\conda\envs\myenv\lib\site-packages\flask_cors\extension.py", line 165, in wrapped_function
    return cors_after_request(app.make_response(f(*args, **kwargs)))
  File "C:\Users\AG95028\AppData\Local\conda\conda\envs\myenv\lib\site-packages\flask\app.py", line 1821, in handle_user_exception
    reraise(exc_type, exc_value, tb)
  File "C:\Users\AG95028\AppData\Local\conda\conda\envs\myenv\lib\site-packages\flask\_compat.py", line 39, in reraise
    raise value
  File "C:\Users\AG95028\AppData\Local\conda\conda\

['Jaw Reconstruction', 'Oral Exams']
['Dental Care', 'Dental Treatment', 'Removal/Replacement Of Teeth ', 'Removal/Replacement Of Structures Directly Supporting Teeth', 'Dental Appliances', 'Dentures', 'Alveolar Process', 'Filings', 'Other dental devices', 'Dental plates']


[2021-05-13 13:35:13,792] ERROR in app: Exception on /api/predict/ [POST]
Traceback (most recent call last):
  File "C:\Users\AG95028\AppData\Local\conda\conda\envs\myenv\lib\site-packages\flask\app.py", line 2447, in wsgi_app
    response = self.full_dispatch_request()
  File "C:\Users\AG95028\AppData\Local\conda\conda\envs\myenv\lib\site-packages\flask\app.py", line 1952, in full_dispatch_request
    rv = self.handle_user_exception(e)
  File "C:\Users\AG95028\AppData\Local\conda\conda\envs\myenv\lib\site-packages\flask_cors\extension.py", line 165, in wrapped_function
    return cors_after_request(app.make_response(f(*args, **kwargs)))
  File "C:\Users\AG95028\AppData\Local\conda\conda\envs\myenv\lib\site-packages\flask\app.py", line 1821, in handle_user_exception
    reraise(exc_type, exc_value, tb)
  File "C:\Users\AG95028\AppData\Local\conda\conda\envs\myenv\lib\site-packages\flask\_compat.py", line 39, in reraise
    raise value
  File "C:\Users\AG95028\AppData\Local\conda\conda\

['Jaw Reconstruction', 'Oral Exams']
['Dental Care', 'Dental Treatment', 'Removal/Replacement Of Teeth ', 'Removal/Replacement Of Structures Directly Supporting Teeth', 'Dental Appliances', 'Dentures', 'Alveolar Process', 'Filings', 'Other dental devices', 'Dental plates']


[2021-05-13 13:36:04,607] ERROR in app: Exception on /api/predict/ [POST]
Traceback (most recent call last):
  File "C:\Users\AG95028\AppData\Local\conda\conda\envs\myenv\lib\site-packages\flask\app.py", line 2447, in wsgi_app
    response = self.full_dispatch_request()
  File "C:\Users\AG95028\AppData\Local\conda\conda\envs\myenv\lib\site-packages\flask\app.py", line 1952, in full_dispatch_request
    rv = self.handle_user_exception(e)
  File "C:\Users\AG95028\AppData\Local\conda\conda\envs\myenv\lib\site-packages\flask_cors\extension.py", line 165, in wrapped_function
    return cors_after_request(app.make_response(f(*args, **kwargs)))
  File "C:\Users\AG95028\AppData\Local\conda\conda\envs\myenv\lib\site-packages\flask\app.py", line 1821, in handle_user_exception
    reraise(exc_type, exc_value, tb)
  File "C:\Users\AG95028\AppData\Local\conda\conda\envs\myenv\lib\site-packages\flask\_compat.py", line 39, in reraise
    raise value
  File "C:\Users\AG95028\AppData\Local\conda\conda\

['Jaw Reconstruction', 'Oral Exams']
['Dental Care', 'Dental Treatment', 'Removal/Replacement Of Teeth ', 'Removal/Replacement Of Structures Directly Supporting Teeth', 'Dental Appliances', 'Dentures', 'Alveolar Process', 'Filings', 'Other dental devices', 'Dental plates']


[2021-05-13 13:36:05,520] ERROR in app: Exception on /api/predict/ [POST]
Traceback (most recent call last):
  File "C:\Users\AG95028\AppData\Local\conda\conda\envs\myenv\lib\site-packages\flask\app.py", line 2447, in wsgi_app
    response = self.full_dispatch_request()
  File "C:\Users\AG95028\AppData\Local\conda\conda\envs\myenv\lib\site-packages\flask\app.py", line 1952, in full_dispatch_request
    rv = self.handle_user_exception(e)
  File "C:\Users\AG95028\AppData\Local\conda\conda\envs\myenv\lib\site-packages\flask_cors\extension.py", line 165, in wrapped_function
    return cors_after_request(app.make_response(f(*args, **kwargs)))
  File "C:\Users\AG95028\AppData\Local\conda\conda\envs\myenv\lib\site-packages\flask\app.py", line 1821, in handle_user_exception
    reraise(exc_type, exc_value, tb)
  File "C:\Users\AG95028\AppData\Local\conda\conda\envs\myenv\lib\site-packages\flask\_compat.py", line 39, in reraise
    raise value
  File "C:\Users\AG95028\AppData\Local\conda\conda\

['Jaw Reconstruction', 'Oral Exams']
['Dental Care', 'Dental Treatment', 'Removal/Replacement Of Teeth ', 'Removal/Replacement Of Structures Directly Supporting Teeth', 'Dental Appliances', 'Dentures', 'Alveolar Process', 'Filings', 'Other dental devices', 'Dental plates']
