# Using Rake-NLTK

In [1]:
!pip install rake-nltk

Collecting rake-nltk
  Downloading rake_nltk-1.0.6-py3-none-any.whl (9.1 kB)
Collecting nltk<4.0.0,>=3.6.2
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: nltk, rake-nltk
  Attempting uninstall: nltk
    Found existing installation: nltk 3.2.4
    Uninstalling nltk-3.2.4:
      Successfully uninstalled nltk-3.2.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
preprocessing 0.1.13 requires nltk==3.2.4, but you have nltk 3.8.1 which is incompatible.[0m[31m
[0mSuccessfully installed nltk-3.8.1 rake-nltk-1.0.6
[0m

In [2]:
from rake_nltk import Rake
import numpy as np 
import pandas as pd 

# Toy Example for keyword extraction

In [3]:
# Run RAKE on a single product review and show only the strongest phrases.
text = "Of course you feel buying a expensive will be more sturdy and long lasting with all other features that an inexpensive product would have.But belive it or not after almost weeks of researchi came across this fan which not only has good rpm, blowing right amount of air to cook your laptop but also is silent!!!. I've had others too but nothing compared to this. It's almost affordable and give required features that costly ones may have....I would say JUST PERFECT for my every day office work and little gaming.."
r = Rake()
r.extract_keywords_from_text(text)
for score, phrase in r.get_ranked_phrases_with_scores():
    # Skip anything that does not score strictly above 5.
    if score <= 5:
        continue
    print(score, phrase)

16.0 every day office work
9.0 researchi came across
9.0 little gaming ..
9.0 costly ones may
9.0 blowing right amount
8.5 inexpensive product would
8.0 give required features


# Load Data

In [4]:
# Load the patient notes CSV from the Kaggle input directory into a DataFrame.
notes = pd.read_csv("/kaggle/input/nbme-score-clinical-patient-notes/patient_notes.csv")

In [5]:
# Preview the first rows to see which columns the file provides.
notes.head()

Unnamed: 0,pn_num,case_num,pn_history
0,0,0,"17-year-old male, has come to the student heal..."
1,1,0,17 yo male with recurrent palpitations for the...
2,2,0,Dillon Cleveland is a 17 y.o. male patient wit...
3,3,0,a 17 yo m c/o palpitation started 3 mos ago; \...
4,4,0,17yo male with no pmh here for evaluation of p...


## Drop unwanted columns

In [6]:
# Keep only the note text. Rebinding `notes` (rather than inplace=True) together
# with errors="ignore" makes this cell safe to re-run: a second execution no
# longer raises KeyError because the columns are already gone.
notes = notes.drop(columns=["pn_num", "case_num"], errors="ignore")

In [7]:
# Inspect the free-text note column that the keyword extraction will run on.
notes["pn_history"]

0        17-year-old male, has come to the student heal...
1        17 yo male with recurrent palpitations for the...
2        Dillon Cleveland is a 17 y.o. male patient wit...
3        a 17 yo m c/o palpitation started 3 mos ago; \...
4        17yo male with no pmh here for evaluation of p...
                               ...                        
42141    Ms. Madden is a 20 yo female presenting w/ the...
42142    A 20 YO F CAME COMPLAIN A DULL 8/10 HEADACHE T...
42143    Ms. Madden is a 20yo female who presents with ...
42144    Stephanie madden is a 20 year old woman compla...
42145    patient is a 20 yo F who presents with a heada...
Name: pn_history, Length: 42146, dtype: object

In [8]:
def extract_keywords(pn_history, min_score=5):
    """Return the RAKE key phrases of one note that score above ``min_score``.

    Args:
        pn_history: Text of a single patient note.
        min_score: A phrase is kept only when its RAKE score is strictly
            greater than this value. Defaults to 5, the threshold the
            original (ineffective) filter line used.

    Returns:
        list of str: The kept phrases, in the order
        ``get_ranked_phrases_with_scores()`` yields them.
    """
    # Custom delimiter list handed to RAKE as its `punctuations` argument.
    r = Rake(punctuations = [')','(',',',':','),',').','.'])
    r.extract_keywords_from_text(pn_history)
    # The previous version evaluated `phrase_df.loc[phrase_df.score>5]` and
    # discarded the result, so every phrase was returned regardless of score.
    # Filtering the (score, phrase) pairs directly applies the threshold and
    # avoids building a DataFrame for every note.
    return [
        phrase
        for score, phrase in r.get_ranked_phrases_with_scores()
        if score > min_score
    ]

In [9]:
# Run the RAKE extraction on every note; each row receives its own list of phrases.
notes["keywords"] = notes["pn_history"].map(extract_keywords)

In [10]:
# Fixed seed so the same eight rows are shown on every run.
notes.sample(8, random_state=42)

Unnamed: 0,pn_history,keywords
36041,HPI: a 67 y o f c/o not able to sleep. onset ...,[concentrate normal bowel habits pmh ; htn psh...
20528,"Ms. Whelan, a 26-year-old female, has come to ...","[palpitations - started 3 weeks ago, 26 - year..."
13292,"-35 yo M consults for epigastric pain, describ...","[intensity 5 / 10 - started 2 months, 1 - 2 be..."
23096,Patient is a 26 year old caucasian female pres...,"[26 year old caucasian female presents, caffei..."
16349,Karin Moore is a 45 y/o woman who presents wit...,"[home nearby - patient drinks 5 - 6 cups, drug..."
17536,45 yo F comes to clinic complaining of feeling...,"[last pap smear 10 months ago, skin / hair cha..."
22906,26 yr F with presents for f/u after ED visit 2...,"[sleeps 8 hours per night feels rested, regula..."
6275,"HPI : 35 YO MALE COMES WITH EPIGASTRIC PAIN, S...","[started two weeks ago, 35 yo male comes, pain..."


# Using Bag of Words

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer configured with scikit-learn's 'english' stop-word list.
bow = CountVectorizer(stop_words='english')

## Using Toy Example

In [12]:
# Three short sentences used to illustrate the bag-of-words representation.
text_data = ['I am interested in NLP', 'This is a good tutorial with good topic', 'Feature extraction is very important topic']

In [13]:
# Learn the vocabulary from the toy sentences.
bow.fit(text_data)

CountVectorizer(stop_words='english')

In [14]:
# Get the vocabulary list, ordered by column index. `get_feature_names()` was
# removed in scikit-learn 1.2, whereas the fitted `vocabulary_` mapping
# (term -> column index) is available in every version.
sorted(bow.vocabulary_, key=bow.vocabulary_.get)



['extraction',
 'feature',
 'good',
 'important',
 'interested',
 'nlp',
 'topic',
 'tutorial']

In [15]:
# Encode the toy sentences with the fitted vocabulary.
bow_features = bow.transform(text_data)
#bow_features

In [16]:
# Convert to a dense array so each sentence's row can be iterated below.
bow_feature_array = bow_features.toarray()
#bow_feature_array

In [17]:
# Build the column-index -> term lookup once. The earlier loop called
# bow.get_feature_names() for every non-zero count, which rebuilt the whole
# vocabulary list each time and no longer exists in scikit-learn 1.2+.
index_to_term = {column: term for term, column in bow.vocabulary_.items()}
for sentence, feature in zip(text_data, bow_feature_array):
    # A term is reported when its count in this sentence is non-zero.
    lst = [
        index_to_term[index]
        for index, element in enumerate(feature.tolist())
        if element > 0
    ]
    print(lst)

['interested', 'nlp']
['good', 'topic', 'tutorial']
['extraction', 'feature', 'important', 'topic']


## Using Real dataset

In [18]:
# Display the frame as it currently stands (see the column header in the output).
notes

Unnamed: 0,pn_history,keywords
0,"17-year-old male, has come to the student heal...","[treatment - began 2 - 3 months ago, non - all..."
1,17 yo male with recurrent palpitations for the...,"[baskeball game two days ago light headedness,..."
2,Dillon Cleveland is a 17 y.o. male patient wit...,"[smoking ; 3 - 4 drinks, weekend per sitting ;..."
3,a 17 yo m c/o palpitation started 3 mos ago; \...,[nausea vomiting ; headache ; abdominal pain ;...
4,17yo male with no pmh here for evaluation of p...,"[endorse theses attacks occuring 1 - 2 times, ..."
...,...,...
42141,Ms. Madden is a 20 yo female presenting w/ the...,"[dad w / hpl social alcohol use, 20 yo female ..."
42142,A 20 YO F CAME COMPLAIN A DULL 8/10 HEADACHE T...,[ocp oby ; nmp 2wks ago menarche 14 occurs eve...
42143,Ms. Madden is a 20yo female who presents with ...,"[smokes marijuanna 2 - 3 joints per day, uri 1..."
42144,Stephanie madden is a 20 year old woman compla...,"[marijuna 3 - 4 joints, etoh 2 - 3 drinks, 20 ..."


In [19]:
# Remove the RAKE keywords before the bag-of-words experiment. Rebinding `notes`
# with errors="ignore" keeps the cell re-runnable: running it twice no longer
# raises KeyError for the already-removed column.
notes = notes.drop(columns=["keywords"], errors="ignore")

In [20]:
# Display the frame again to check which columns remain.
notes

Unnamed: 0,pn_history
0,"17-year-old male, has come to the student heal..."
1,17 yo male with recurrent palpitations for the...
2,Dillon Cleveland is a 17 y.o. male patient wit...
3,a 17 yo m c/o palpitation started 3 mos ago; \...
4,17yo male with no pmh here for evaluation of p...
...,...
42141,Ms. Madden is a 20 yo female presenting w/ the...
42142,A 20 YO F CAME COMPLAIN A DULL 8/10 HEADACHE T...
42143,Ms. Madden is a 20yo female who presents with ...
42144,Stephanie madden is a 20 year old woman compla...


In [21]:
# Pull the note texts out of the frame as a plain Python list and report its size.
text_data = notes["pn_history"].to_list()
print(len(text_data))

42146


In [22]:
# Work on the first 50 notes only: learn their vocabulary, encode them, and
# densify the resulting count matrix.
first_notes = text_data[:50]
bow.fit(first_notes)
bow_features = bow.transform(first_notes)
bow_feature_array = bow_features.toarray()

In [23]:
# Build the column-index -> term lookup once. The earlier loop called
# bow.get_feature_names() for every non-zero count, which rebuilt the whole
# vocabulary list each time and no longer exists in scikit-learn 1.2+.
index_to_term = {column: term for term, column in bow.vocabulary_.items()}

# zip() stops at the shorter input, so only the notes that have a row in
# bow_feature_array are printed even though text_data holds every note.
for sentence, feature in zip(text_data, bow_feature_array):
    print("Sentence: ")
    print(sentence)
    # A term is reported when its count in this note is non-zero.
    lst = [
        index_to_term[index]
        for index, element in enumerate(feature.tolist())
        if element > 0
    ]
    print("Keywords: ")
    print(lst)
    print("____________________________________________")

Sentence: 
17-year-old male, has come to the student health clinic complaining of heart pounding. Mr. Cleveland's mother has given verbal consent for a history, physical examination, and treatment
-began 2-3 months ago,sudden,intermittent for 2 days(lasting 3-4 min),worsening,non-allev/aggrav
-associated with dispnea on exersion and rest,stressed out about school
-reports fe feels like his heart is jumping out of his chest
-ros:denies chest pain,dyaphoresis,wt loss,chills,fever,nausea,vomiting,pedal edeam
-pmh:non,meds :aderol (from a friend),nkda
-fh:father had MI recently,mother has thyroid dz
-sh:non-smoker,mariguana 5-6 months ago,3 beers on the weekend, basketball at school
-sh:no std
Keywords: 
['17', 'aderol', 'aggrav', 'ago', 'allev', 'associated', 'basketball', 'beers', 'began', 'chest', 'chills', 'cleveland', 'clinic', 'come', 'complaining', 'consent', 'days', 'denies', 'dispnea', 'dyaphoresis', 'dz', 'edeam', 'examination', 'exersion', 'father', 'fe', 'feels', 'fever



Keywords: 
['17', '1x', 'acknowledges', 'adderal', 'adjusting', 'ago', 'alcohol', 'allergies', 'apparently', 'bladder', 'bowel', 'breath', 'cannabis', 'changes', 'chest', 'clinic', 'college', 'come', 'complaining', 'days', 'does', 'episode', 'episodes', 'episodic', 'feels', 'fevers', 'frequency', 'freshman', 'habits', 'health', 'heart', 'help', 'increase', 'lasting', 'life', 'lightheadedness', 'male', 'medications', 'minutes', 'months', 'nausea', 'nkda', 'numbness', 'occasionally', 'parent', 'patient', 'permission', 'pmh', 'pounding', 'precipicated', 'precription', 'prescribed', 'presenting', 'pressure', 'recent', 'roomate', 'ros', 'sh', 'shortness', 'smoking', 'social', 'started', 'state', 'states', 'stress', 'student', 'studying', 'suddenly', 'taker', 'taking', 'tests', 'tingling', 'total', 'use', 'vision', 'worst', 'yo']
____________________________________________
Sentence: 
17 yr old boy complaining of heart pounding for the last 4 months.not associated with nausea,vomiting,sweati

# Using TF-IDF

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer configured with scikit-learn's 'english' stop-word list.
tfidf = TfidfVectorizer(stop_words='english')

In [25]:
# Re-create the toy sentences: `text_data` was reassigned to the patient notes
# in an earlier cell.
text_data = ['I am interested in NLP', 'This is a good tutorial with good topic', 'Feature extraction is very important topic']

In [26]:
# Fit on the toy sentences, then show the learned term -> column-index mapping.
tfidf.fit(text_data)
tfidf.vocabulary_

{'interested': 4,
 'nlp': 5,
 'good': 2,
 'tutorial': 7,
 'topic': 6,
 'feature': 1,
 'extraction': 0,
 'important': 3}

In [27]:
# Flip the term -> column mapping so a column index can be turned back into its term.
tfidf_inverse_dict = dict(zip(tfidf.vocabulary_.values(), tfidf.vocabulary_.keys()))
tfidf_inverse_dict

{4: 'interested',
 5: 'nlp',
 2: 'good',
 7: 'tutorial',
 6: 'topic',
 1: 'feature',
 0: 'extraction',
 3: 'important'}

In [28]:
# Encode the toy sentences as a sparse TF-IDF matrix and show its summary.
tfidf_features = tfidf.transform(text_data)
tfidf_features

<3x8 sparse matrix of type '<class 'numpy.float64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [29]:
# Densify the sparse matrix so the individual weights can be displayed.
tfidf_feature_array = tfidf_features.toarray()
tfidf_feature_array

array([[0.        , 0.        , 0.        , 0.        , 0.70710678,
        0.70710678, 0.        , 0.        ],
       [0.        , 0.        , 0.84678897, 0.        , 0.        ,
        0.        , 0.32200242, 0.42339448],
       [0.52863461, 0.52863461, 0.        , 0.52863461, 0.        ,
        0.        , 0.40204024, 0.        ]])

In [30]:
# Print each toy sentence followed by its row of TF-IDF weights.
for sentence, feature in zip(text_data, tfidf_feature_array):
    print(sentence)
    print(feature)

I am interested in NLP
[0.         0.         0.         0.         0.70710678 0.70710678
 0.         0.        ]
This is a good tutorial with good topic
[0.         0.         0.84678897 0.         0.         0.
 0.32200242 0.42339448]
Feature extraction is very important topic
[0.52863461 0.52863461 0.         0.52863461 0.         0.
 0.40204024 0.        ]


In [31]:
# For every toy sentence, list the terms that received a non-zero TF-IDF weight.
for sentence, keywords in zip(text_data, tfidf_feature_array):
    print("Sentence: ")
    print(sentence)
    lst = [
        tfidf_inverse_dict[index]
        for index, element in enumerate(keywords.tolist())
        if element > 0
    ]
    print("Keywords: ")
    print(lst)
    print("____________________________________________")

Sentence: 
I am interested in NLP
Keywords: 
['interested', 'nlp']
____________________________________________
Sentence: 
This is a good tutorial with good topic
Keywords: 
['good', 'topic', 'tutorial']
____________________________________________
Sentence: 
Feature extraction is very important topic
Keywords: 
['extraction', 'feature', 'important', 'topic']
____________________________________________
