In [1]:
import numpy as np
import pandas as pd

import math
from numpy.linalg import norm

from sklearn.metrics.pairwise import cosine_similarity

import warnings
warnings.filterwarnings("ignore")

import re
import nltk
from nltk.corpus import wordnet as wn
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sentence_transformers import SentenceTransformer

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.ensemble import RandomForestClassifier
import statsmodels.api as sm # We can fit logistic regression using statsmodel as well
from sklearn.linear_model import LogisticRegression 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import metrics
from sklearn.metrics import f1_score
from sklearn.metrics import cohen_kappa_score

In [2]:
def cosine_sim(src, tgt):
    similarity = cosine_similarity(src, tgt)
    return similarity

In [3]:
test = pd.read_excel('drugs.xlsx')
test_Interpretation= test.loc[test['HC_Subpart'] == 'Interpretation']
test_General = test.loc[test['HC_Subpart'] == 'General']
test_removal = pd.concat([test_Interpretation,test_General])
cond = test['HC_Subpart'].isin(test_removal['HC_Subpart'])
test.drop(test[cond].index, inplace = True)
test.reset_index(drop=True, inplace=True)
print(test.shape)
test.head(3)

(3225, 12)


Unnamed: 0,FDA_Chapter,FDA_Section,FDA_Code,FDA_Subpart,FDA_Description,matched,score,HC_Code,HC_Chapter,HC_Section,HC_Subpart,HC_Description
0,GENERAL ENFORCEMENT REGULATIONS,General Provisions,"""1.3""",Definitions.,"(a) Labeling includes all written, printed,...",1,0.635591,C.01.031,Drugs,Cautionary Statements and Child Resistant Pack...,Cautionary Statements and Child Resistant Pack...,"(1) Subject to section C.01.031.2, (a) no pers..."
1,GENERAL ENFORCEMENT REGULATIONS,General Labeling Requirements,"""1.20""",Presence of mandatory label information.,In the regulations specified in 1.1(c) of this...,1,0.779747,C.03.208,Drugs,Good Manufacturing Practices,"Drugs, Other than Radionuclides, Sold or Repre...",Every kit shall be labelled to show (a) its pr...
2,GENERAL ENFORCEMENT REGULATIONS,General Labeling Requirements,"""1.21""",Failure to reveal material facts.,"(a) Labeling of a food, drug, device, cosmetic...",1,0.729136,C.01.029,Drugs,Cautionary Statements and Child Resistant Pack...,Cautionary Statements and Child Resistant Pack...,(1) Subject to subsections C.01.031.2(1) and (...


In [4]:
train = pd.read_excel('fda_ hc RVP review 11172020.xlsx')
train.drop(['Unnamed: 13'],axis=1,inplace=True)
train.reset_index(drop=True, inplace=True)
print(train.shape)
train.head(3)

(100, 13)


Unnamed: 0,FDA_Chapter,FDA_Section,FDA_Code,FDA_Subpart,FDA_Description,matched,score,HC_Code,HC_Chapter,HC_Section,HC_Subpart,HC_Description,Yes(y) / No(n)
0,CURRENT GOOD MANUFACTURING PRACTICE FOR FINISH...,Buildings and Facilities,"""211.56""",Sanitation.,"(a) Any building used in the manufacture, proc...",1,0.773877,C.02.004,Drugs,Good Manufacturing Practices,Premises,The premises in which a lot or batch of a drug...,Y
1,CURRENT GOOD MANUFACTURING PRACTICE FOR TYPE A...,Construction and Maintenance of Facilities and...,"""226.20""",Buildings.,Buildings in which Type A medicated article(s)...,1,0.724563,C.02.004,Drugs,Good Manufacturing Practices,Premises,The premises in which a lot or batch of a drug...,Y
2,CURRENT GOOD MANUFACTURING PRACTICE FOR FINISH...,Buildings and Facilities,"""211.58""",Maintenance.,"Any building used in the manufacture, processi...",1,0.717186,C.02.004,Drugs,Good Manufacturing Practices,Premises,The premises in which a lot or batch of a drug...,Y


In [5]:
stopwords_lst = ["a","about","above","after","again","against","ain","all","am","an","and","any","are","aren","aren't","as",
       "at","be","because","been","before","being","below","between","both","but","by","can","couldn","couldn't",
       "d","did","didn","didn't","do","does","doesn","doesn't","doing","don","don't","down","during",
       "each","few","for","from","further","had","hadn","hadn't","has","hasn","hasn't","have","haven",
       "haven't","having","he","her","here","hers","herself","him","himself","his","how","i","if","in",
       "into","is","isn","isn't","it","it's","its","itself","just","ll","m","ma","me","mightn","mightn't",
       "more","most","mustn","mustn't","my","myself","needn","needn't","no","nor","not","now","o","of","off","on",
       "once","only","or","other","our","ours","ourselves","out","over","own","re","s","same","shan","shan't",
       "she","she's","should","should've","shouldn","shouldn't","so","some","such","t","than","that","that'll",
       "the","their","theirs","them","themselves","then","there","these","they","this","those","through","to","too",
       "under","until","up","ve","very","was","wasn","wasn't","we","were","weren","weren't","what","when","where",
       "which","while","who","whom","why","will","with","won","won't","wouldn","wouldn't","y","you","you'd","you'll",
       "you're","you've","your","yours","yourself","yourselves","could","he'd","he'll","he's","here's","how's","i'd",
       "i'll","i'm","i've","let's","ought","she'd","shall","she'll","that's","there's","they'd","they'll","they're","they've",
       "we'd","we'll","we're","we've","what's","when's","where's","who's","why's","would","able","abst",
       "accordance","according","accordingly","across","act","actually","added","adj","affected","affecting",
       "affects","afterwards","ah","almost","alone","along","already","also","although","always","among","amongst",
       "announce","another","anybody","anyhow","anymore","anyone","anything","anyway","anyways","anywhere","apparently",
       "approximately","arent","arise","around","aside","ask","asking","auth","available","away","awfully","b","back",
       "became","become","becomes","becoming","beforehand","begin","beginning","beginnings","begins","behind","believe",
       "beside","besides","beyond","biol","brief","briefly","c","ca","came","cannot","can't","cause","causes","certain",
       "certainly","co","com","come","comes","contain","containing","contains","couldnt","date","different","done",
       "downwards","due","e","ed","edu","effect","eg","eight","eighty","either","else","elsewhere","end","ending","enough",
       "especially","et","etc","even","ever","every","everybody","everyone","everything","everywhere","ex","except","f",
       "far","ff","fifth","first","five","fix","followed","following","follows","former","formerly","forth","found","four","furthermore","g","gave",
       "get","gets","getting","give","given","gives","giving","go","goes","gone","got","gotten","h","happens","hardly","hed","hence","hereafter",
             "hereby","herein","heres","hereupon","hes","hi","hid","hither","home","howbeit","however","hundred","id","ie","im","inc","indeed","index","instead","invention","inward",
             "itd","it'll","j","k","keep","keeps","kept","kg","km","know","known","knows","l","largely","last","lately","later","latter",
             "latterly","least","less","lest","let","lets","like","liked","likely","line","little","'ll","look",
             "looking","looks","ltd","made","mainly","make","makes","many","may","maybe","mean","means","meantime",
             "meanwhile","merely","mg","might","million","miss","ml","moreover","mostly","mr","mrs","much","mug",
             "must","n","na","name","namely","nay","nd","near","nearly","necessarily","necessary","need","needs",
             "neither","never","nevertheless","next","nine","ninety","nobody","non","none","nonetheless","noone",
             "normally","nos","noted","nothing","nowhere","obtain","obtained","obviously","often","oh","ok","okay",
             "old","omitted","one","ones","onto","ord","others","otherwise","outside","overall","owing","p","page",
             "pages","part","particular","particularly","past","per","perhaps","placed","please","plus",
             "possibly","potentially","pp","predominantly","present","previously","primarily","probably",
             "promptly","proud","provides","put","q","que","quickly","quite","qv","r","ran","rather","rd","readily",
             "really","recent","recently","ref","refs","regarding","regardless","regards","related","relatively","research",
             "respectively","resulted","resulting","right","run","said","saw","say","saying","says","sec","section","see",
             "seeing","seem","seemed","seeming","seems","seen","self","selves","sent","seven","several","shall","shed","shes",
             "show","showed","shown","showns","shows","significantly","similarly","since","six","slightly","somebody","somehow",
             "someone","somethan","something","sometime","sometimes","somewhat","somewhere","soon","sorry","specifically","specified","specify","specifying","still","stop","strongly","sub","substantially","successfully","sufficiently","suggest","sup","sure","take","taken","taking","tell","tends","th","thank","thanks",
             "thanx","thats","that've","thence","thereafter","thereby","thered","therefore","therein","there'll","thereof","therere","theres","thereto","thereupon","there've","theyd","theyre","think","thou","though","thoughh","thousand","throug","throughout","thru","thus","til","tip","together","took","toward","towards","tried","tries","truly","try","trying","ts","twice","two","u","un","unfortunately","unless","unlike","unlikely","unto","upon","ups","us","use","used","useful","usefully","usefulness","uses","using","usually","v","value","various","'ve","via","viz","vol","vols","vs","w","want","wants","wasnt","way","wed","welcome","went","werent","whatever","what'll","whats","whence","whenever","whereafter","whereas","whereby","wherein","wheres","whereupon","wherever","whether","whim","whither","whod","whoever","whole","who'll","whomever","whos","whose","widely","willing","wish","within","without","wont","words","world","wouldnt","www","x","yes","yet","youd","youre","z","zero","a's","ain't","allow","allows","apart","appear","appreciate","appropriate","associated","best","better","c'mon","c's","cant","changes","clearly","concerning","consequently","consider","considering","corresponding","course","currently","definitely","described","despite","entirely","exactly","example","going","greetings","hello","help","hopefully","ignored","inasmuch","indicate","indicated","indicates","inner","insofar","it'd","keep","keeps","novel","presumably","reasonably","second","secondly","sensible","serious","seriously","sure","t's","third","thorough","thoroughly","three","well","wonder","a", "about", "above", "above", "across", "after", "afterwards", "again", "against", "all", "almost", "alone", "along", "already", "also","although","always","am","among", "amongst", "amoungst", "amount", "an", "and", "another", "any","anyhow","anyone","anything","anyway", "anywhere", "are", "around", "as", "at", "back","be","became", "because","become","becomes", "becoming", "been", "before", "beforehand", "behind", "being", "below", "beside", "besides", "between", "beyond", "bill", "both", "bottom","but", "by", "call", "can", "cannot", "cant", "co", "con", "could", "couldnt", "cry", "de", "describe", "detail", "do", "done", "down", "due", "during", "each", "eg", "eight", "either", "eleven","else", "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone", "everything", "everywhere", "except", "few", "fifteen", "fify", "fill", "find", "fire", "first", "five", "for", "former", "formerly", "forty", "found", "four", "from", "front", "full", "further", "get", "give", "go", "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his", "how", "however", "hundred", "ie", "if", "in", "inc", "indeed", "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter", "latterly", "least", "less", "ltd", "made", "many", "may", "me", "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly", "move", "much", "must", "my", "myself", "name", "namely", "neither", "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone", "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on", "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves", "out", "over", "own","part", "per", "perhaps", "please", "put", "rather", "re", "same", "see", "seem", "seemed", "seeming", "seems", "serious", "several", "she", "should", "show", "side", "since", "sincere", "six", "sixty", "so", "some", "somehow", "someone", "something", "sometime", "sometimes", "somewhere", "still", "such", "system", "take", "ten", "than", "that", "the", "their", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein", "thereupon", "these", "they", "thickv", "thin", "third", "this", "those", "though", "three", "through", "throughout", "thru", "thus", "to", "together", "too", "top", "toward", "towards", "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us", "very", "via", "was", "we", "well", "were", "what", "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", "who", "whoever", "whole", "whom", "whose", "why", "will", "with", "within", "without", "would", "yet", "you", "your", "yours", "yourself", "yourselves", "the","a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p",
"q", "r", "s", "t", "u", "v", "w", "x", "y", "z","A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z",'co','op','research-articl', 'pagecount','cit','ibid','les','le','au','que','est','pas','vol','el','los','pp','u201d','well-b', 'http', 'volumtype', 'par', '0o', '0s', '3a', '3b', '3d', '6b', '6o', 'a1', 'a2', 'a3', 'a4', 'ab', 'ac', 'ad', 'ae', 'af', 'ag', 'aj', 'al', 'an', 'ao', 'ap', 'ar', 'av', 'aw', 'ax', 'ay', 'az', 'b1', 'b2', 'b3', 'ba', 'bc', 'bd', 'be', 'bi', 'bj', 'bk', 'bl', 'bn', 'bp', 'br', 'bs', 'bt', 'bu', 'bx', 'c1', 'c2', 'c3', 'cc', 'cd', 'ce', 'cf', 'cg', 'ch', 'ci', 'cj', 'cl', 'cm', 'cn', 'cp', 'cq', 'cr', 'cs', 'ct', 'cu', 'cv', 'cx', 'cy', 'cz', 'd2', 'da', 'dc', 'dd', 'de', 'df', 'di', 'dj', 'dk', 'dl', 'do', 'dp', 'dr', 'ds', 'dt', 'du', 'dx', 'dy', 'e2', 'e3', 'ea', 'ec', 'ed', 'ee', 'ef', 'ei', 'ej', 'el', 'em', 'en', 'eo', 'ep', 'eq', 'er', 'es', 'et', 'eu', 'ev', 'ex', 'ey', 'f2', 'fa', 'fc', 'ff', 'fi', 'fj', 'fl', 'fn', 'fo', 'fr', 'fs', 'ft', 'fu', 'fy', 'ga', 'ge', 'gi', 'gj', 'gl', 'go', 'gr', 'gs', 'gy', 'h2', 'h3', 'hh', 'hi', 'hj', 'ho', 'hr', 'hs', 'hu', 'hy', 'i', 'i2', 'i3', 'i4', 'i6', 'i7', 'i8', 'ia', 'ib', 'ic', 'ie', 'ig', 'ih', 'ii', 'ij', 'il', 'in', 'io', 'ip', 'iq', 'ir', 'iv', 'ix', 'iy', 'iz', 'jj', 'jr', 'js', 'jt', 'ju', 'ke', 'kg', 'kj', 'km', 'ko', 'l2', 'la', 'lb', 'lc', 'lf', 'lj', 'ln', 'lo', 'lr', 'ls', 'lt', 'm2', 'ml', 'mn', 'mo', 'ms', 'mt', 'mu', 'n2', 'nc', 'nd', 'ne', 'ng', 'ni', 'nj', 'nl', 'nn', 'nr', 'ns', 'nt', 'ny', 'oa', 'ob', 'oc', 'od', 'of', 'og', 'oi', 'oj', 'ol', 'om', 'on', 'oo', 'oq', 'or', 'os', 'ot', 'ou', 'ow', 'ox', 'oz', 'p1', 'p2', 'p3', 'pc', 'pd', 'pe', 'pf', 'ph', 'pi', 'pj', 'pk', 'pl', 'pm', 'pn', 'po', 'pq', 'pr', 'ps', 'pt', 'pu', 'py', 'qj', 'qu', 'r2', 'ra', 'rc', 'rd', 'rf', 'rh', 'ri', 'rj', 'rl', 'rm', 'rn', 'ro', 'rq', 'rr', 'rs', 'rt', 'ru', 'rv', 'ry', 's2', 'sa', 'sc', 'sd', 'se', 'sf', 'si', 'sj', 'sl', 'sm', 'sn', 'sp', 'sq', 'sr', 'ss', 'st', 'sy', 'sz', 't1', 't2', 't3', 'tb', 'tc', 'td', 'te', 'tf', 'th', 'ti', 'tj', 'tl', 'tm', 'tn', 'tp', 'tq', 'tr', 'ts', 'tt', 'tv', 'tx', 'ue', 'ui', 'uj', 'uk', 'um', 'un', 'uo', 'ur', 'ut', 'va', 'wa', 'vd', 'wi', 'vj', 'vo', 'wo', 'vq', 'vt', 'vu', 'x1', 'x2', 'x3', 'xf', 'xi', 'xj', 'xk', 'xl', 'xn', 'xo', 'xs', 'xt', 'xv', 'xx', 'y2', 'yj', 'yl', 'yr', 'ys', 'yt', 'zi', 'zz']

In [6]:
train['fda_desc_cleaned']=''
train['hc_desc_cleaned']=''

# Pre-processing FDA Descriptions
lemmatizer = WordNetLemmatizer() # stem instead
pattern1 = re.compile(r'\b(' + r'|'.join(stopwords_lst) + r')\b\s*')
pattern2 = '[0-9]'
for i in range(0,len(train)):
        sent_ref = train.iloc[i]['FDA_Description']
#         sent_ref = sent_ref.lower() # convert to lower case
        word_list = nltk.word_tokenize(sent_ref)
        sent_ref = ' '.join([lemmatizer.lemmatize(w) for w in word_list]) # lemmatize
        sent_ref = re.sub(r'\([^)]*\)', '', sent_ref) # remove characters which are within () and []
        sent_ref = pattern1.sub('', sent_ref) # remove stopwords
        sent_ref = re.sub(r'[^\w\s]','',sent_ref) # remove spaces (new line, tabs etc)
        sent_ref = re.sub(pattern2, '', sent_ref) # remove numbers
        sent_ref = " ".join(sent_ref.split()) # remove whitespaces, if any
        train['fda_desc_cleaned'][i] = sent_ref
        
# Pre-processing Health Canada Descriptions
for i in range(0,len(train)):
        sent_ref = train.iloc[i]['HC_Description']
#         sent_ref = sent_ref.lower() # convert to lower case
        word_list = nltk.word_tokenize(sent_ref)
        sent_ref = ' '.join([lemmatizer.lemmatize(w) for w in word_list]) # lemmatize
        sent_ref = re.sub(r'\([^)]*\)', '', sent_ref) # remove characters which are within parenthesis
        sent_ref = pattern1.sub('', sent_ref) # remove stopwords
        sent_ref = re.sub(r'[^\w\s]','',sent_ref) # remove spaces (new line, tabs etc)
        sent_ref = re.sub(pattern2, '', sent_ref) # remove numbers
        sent_ref = " ".join(sent_ref.split()) # remove whitespaces, if any
        train['hc_desc_cleaned'][i] = sent_ref

In [7]:
from collections import OrderedDict
import numpy as np
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

nlp = spacy.load('en_core_web_sm')

class TextRank4Keyword():
    """Extract keywords from text"""
    
    def __init__(self):
        self.d = 0.85 # damping coefficient, usually is .85
        self.min_diff = 1e-5 # convergence threshold
        self.steps = 10 # iteration steps
        self.node_weight = None # save keywords and its weight

    
    def set_stopwords(self, stopwords):  
        """Set stop words"""
        for word in STOP_WORDS.union(set(stopwords)):
            lexeme = nlp.vocab[word]
            lexeme.is_stop = True
    
    def sentence_segment(self, doc, candidate_pos, lower):
        """Store those words only in cadidate_pos"""
        sentences = []
        for sent in doc.sents:
            selected_words = []
            for token in sent:
                # Store words only with cadidate POS tag
                if token.pos_ in candidate_pos and token.is_stop is False:
                    if lower is True:
                        selected_words.append(token.text.lower())
                    else:
                        selected_words.append(token.text)
            sentences.append(selected_words)
        return sentences
        
    def get_vocab(self, sentences):
        """Get all tokens"""
        vocab = OrderedDict()
        i = 0
        for sentence in sentences:
            for word in sentence:
                if word not in vocab:
                    vocab[word] = i
                    i += 1
        return vocab
    
    def get_token_pairs(self, window_size, sentences):
        """Build token_pairs from windows in sentences"""
        token_pairs = list()
        for sentence in sentences:
            for i, word in enumerate(sentence):
                for j in range(i+1, i+window_size):
                    if j >= len(sentence):
                        break
                    pair = (word, sentence[j])
                    if pair not in token_pairs:
                        token_pairs.append(pair)
        return token_pairs
        
    def symmetrize(self, a):
        return a + a.T - np.diag(a.diagonal())
    
    def get_matrix(self, vocab, token_pairs):
        """Get normalized matrix"""
        # Build matrix
        vocab_size = len(vocab)
        g = np.zeros((vocab_size, vocab_size), dtype='float')
        for word1, word2 in token_pairs:
            i, j = vocab[word1], vocab[word2]
            g[i][j] = 1
            
        # Get Symmeric matrix
        g = self.symmetrize(g)
        
        # Normalize matrix by column
        norm = np.sum(g, axis=0)
        g_norm = np.divide(g, norm, where=norm!=0) # this is ignore the 0 element in norm
        
        return g_norm

    
    def get_keywords(self, number=10):
        """Print top number keywords"""
        node_weight = OrderedDict(sorted(self.node_weight.items(), key=lambda t: t[1], reverse=True))
        for i, (key, value) in enumerate(node_weight.items()):
            print(key + ' - ' + str(value))
            if i > number:
                break
        
        
    def analyze(self, text, 
                candidate_pos=['NOUN', 'PROPN','VERB', 'ADJ'], 
                window_size=4, lower=False, stopwords=list()):
        """Main function to analyze text"""
        
        # Set stop words
        self.set_stopwords(stopwords)
        
        # Pair text by spaCy
        doc = nlp(text)
        
        # Filter sentences
        sentences = self.sentence_segment(doc, candidate_pos, lower) # list of list of words
        
        # Build vocabulary
        vocab = self.get_vocab(sentences)
        
        # Get token_pairs from windows
        token_pairs = self.get_token_pairs(window_size, sentences)
        
        # Get normalized matrix
        g = self.get_matrix(vocab, token_pairs)
        
        # Initionlization for weight(pagerank value)
        pr = np.array([1] * len(vocab))
        
        # Iteration
        previous_pr = 0
        for epoch in range(self.steps):
            pr = (1-self.d) + self.d * np.dot(g, pr)
            if abs(previous_pr - sum(pr))  < self.min_diff:
                break
            else:
                previous_pr = sum(pr)

        # Get weight for each node
        node_weight = dict()
        for word, index in vocab.items():
            node_weight[word] = pr[index]
        
        self.node_weight = node_weight

In [8]:
sent1 = train['fda_desc_cleaned'][0]
sent2 = train['hc_desc_cleaned'][0]

In [9]:
train['FDA_Description'][0]

'(a) Any building used in the manufacture, processing, packing, or holding of a drug product shall be maintained in a clean and sanitary condition, Any such building shall be free of infestation by rodents, birds, insects, and other vermin (other than laboratory animals). Trash and organic waste matter shall be held and disposed of in a timely and sanitary manner.  (b) There shall be written procedures assigning responsibility for sanitation and describing in sufficient detail the cleaning schedules, methods, equipment, and materials to be used in cleaning the buildings and facilities; such written procedures shall be followed.  (c) There shall be written procedures for use of suitable rodenticides, insecticides, fungicides, fumigating agents, and cleaning and sanitizing agents. Such written procedures shall be designed to prevent the contamination of equipment, components, drug product containers, closures, packaging, labeling materials, or drug products and shall be followed. Rodenti

In [10]:
tr4w = TextRank4Keyword()
tr4w.analyze(sent1, candidate_pos = ['NOUN', 'PROPN','VERB'], window_size=4, lower=False)
tr4w.get_keywords(10)

procedure - 2.2239304783950615
cleaning - 2.0509368619420703
drug - 1.946237678062678
product - 1.704226816239316
building - 1.6228804249762585
equipment - 1.5274125712250712
material - 1.4635425569800566
performed - 1.3645765432098766
work - 1.3640649691358027
fungicide - 1.2933797260802469
written - 1.260117901234568
Trash - 1.1766111111111113


In [11]:
train['HC_Description'][0]

'The premises in which a lot or batch of a drug is fabricated, packaged/labelled or stored shall be designed, constructed and maintained in a manner that (a) permits the operations therein to be performed under clean, sanitary and orderly conditions; (b) permits the effective cleaning of all surfaces therein; and (c) prevents the contamination of the drug and the addition of extraneous material to the drug. '

In [12]:
tr4w.analyze(sent2, candidate_pos = ['NOUN', 'PROPN','VERB'], window_size=4, lower=False)
tr4w.get_keywords(10)

drug - 2.1147745181405897
permit - 1.6705218253968255
fabricated - 1.0468439625850339
packagedlabelled - 1.017296343537415
stored - 0.9846624149659863
surface - 0.9640026927437642
prevents - 0.9587913832199545
designed - 0.9535535714285713
constructed - 0.9503829365079364
maintained - 0.9478025793650793
manner - 0.9392013888888888
contamination - 0.9315880102040816


In [13]:
# source:https://towardsdatascience.com/textrank-for-keyword-extraction-by-python-c0bae21bcec0

In [14]:
sent3 = train['fda_desc_cleaned'][1]
sent4 = train['hc_desc_cleaned'][1]

In [15]:
train['FDA_Description'][1]

'Buildings in which Type A medicated article(s) are manufactured, processed, packaged, labeled, or held shall be maintained in a clear and orderly manner and shall be of suitable size, construction and location in relation to surroundings to facilitate maintenance and operation for their intended purpose. The building shall:  (a) Provide adequate space for the orderly placement of equipment and materials used in any of the following operations for which they are employed to minimize risk of mixups between different Type A medicated article(s), their components, packaging, or labeling:  (1) The receipt, sampling, control, and storage of components.  (2) Manufacturing and processing operations performed on the Type A medicated article(s).  (3) Packaging and labeling operations.  (4) Storage of containers, packaging materials, labeling, and finished products.  (5) Control laboratory operations.  (b) Provide adequate lighting and ventilation, and when necessary for the intended production 

In [16]:
tr4w.analyze(sent3, candidate_pos = ['NOUN', 'PROPN','VERB'], window_size=4, lower=False)
tr4w.get_keywords(10)

operation - 3.0798367107805316
article - 2.8834302194852164
Type - 2.608401366351779
material - 2.505234934763389
component - 2.1401698980798525
production - 2.129634272822847
Provide - 1.9341169009103094
intended - 1.7573903710527676
control - 1.6852009975438298
storage - 1.670819714022421
medicated - 1.65323888405689
equipment - 1.4269192391338463


In [17]:
train['HC_Description'][1]

'The premises in which a lot or batch of a drug is fabricated, packaged/labelled or stored shall be designed, constructed and maintained in a manner that (a) permits the operations therein to be performed under clean, sanitary and orderly conditions; (b) permits the effective cleaning of all surfaces therein; and (c) prevents the contamination of the drug and the addition of extraneous material to the drug. '

In [18]:
tr4w.analyze(sent4, candidate_pos = ['NOUN', 'PROPN','VERB'], window_size=4, lower=False)
tr4w.get_keywords(10)

drug - 2.1147745181405897
permit - 1.6705218253968255
fabricated - 1.0468439625850339
packagedlabelled - 1.017296343537415
stored - 0.9846624149659863
surface - 0.9640026927437642
prevents - 0.9587913832199545
designed - 0.9535535714285713
constructed - 0.9503829365079364
maintained - 0.9478025793650793
manner - 0.9392013888888888
contamination - 0.9315880102040816
