In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm

pd.options.display.max_colwidth = 200

import warnings
warnings.filterwarnings('ignore')


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list up to five files from each directory under the input directory

import os
# Walk the input tree and print the full path of at most the first five
# files found in each directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names[:5]:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# !pip install spacy==2.3.5

In [None]:
!pip install https://med7.s3.eu-west-2.amazonaws.com/en_core_med7_lg.tar.gz

### Syntax

Syntax is the structure of a language, which is governed by its grammar. Not every ordering of words forms a valid sentence; hence, we need syntactic analysis for natural languages.

## Table of Contents

* [Parts of Speech Tagging](#pos)
* [Dependency Parsing](#parsing)
* [Named Entity Recognition](#ner)

<a id='pos'></a>

# 1. Parts of Speech Tagging

Parts of speech (POS) are specific lexical categories to which words are assigned, based on their syntactic context and role. Usually, words can fall into one of the following major categories.

* <strong>Nouns</strong>
* <strong>Verb</strong>
* <strong>Adjective</strong>
* <strong>Adverb</strong>

Besides these four major categories of parts of speech, there are other categories that occur frequently in the English language. These include pronouns, prepositions, interjections, conjunctions, determiners, and many others. The process of classifying words and labeling them with POS tags is called parts of speech tagging, or POS tagging. POS tags are used to annotate words and depict their POS, which is really helpful for performing specific analyses, such as narrowing down on nouns and seeing which ones are the most prominent, word sense disambiguation, and grammar analysis.


In [None]:
# Read the cleaned medical-notes CSV into a DataFrame
# (for an Excel workbook, pd.read_excel would be used instead).
data = pd.read_csv(
    '/kaggle/input/nlp-specialization-data/Cleaned_POS_Medical_Notes.csv'
)
data

In [None]:
import nltk
import spacy
import en_core_med7_lg #en_core_web_sm
import re

# Build the pipeline through the model package's own load() helper.
# (An earlier variant went through spacy.load('en_core_med7_lg', ...) instead.)
nlp = en_core_med7_lg.load()

In [None]:
# Use the cleaned text of the row at position 1 as the running example.
sample_text = data['clean_text'].iloc[1]
print(sample_text)

In [None]:
# Run the pipeline on the sample note, then print every token next to its POS tag.
text_tokenized = nlp(sample_text)

for tok in text_tokenized:
    print("{} ---> {}".format(tok, tok.pos_))

Usually, POS tags are used for analysis, feature engineering, or feature selection. In this analysis, let us keep only the tokens tagged as nouns, proper nouns, numbers, adjectives, or verbs, along with punctuation.

In [None]:
def get_selected_pos(text, keep_pos=('NOUN', 'PROPN', 'NUM', 'ADJ', 'VERB', 'PUNCT')):
    """Return `text` reduced to the tokens whose POS tag is in `keep_pos`.

    The text is run through the module-level `nlp` pipeline, tokens with a
    matching `pos_` are kept in their original order, joined with single
    spaces, and any run of spaces is collapsed to one.

    Parameters
    ----------
    text : str
        Raw text to filter.
    keep_pos : collection of str, optional
        POS tags to retain. Defaults to nouns, proper nouns, numbers,
        adjectives, verbs and punctuation.

    Returns
    -------
    str
        The space-joined surviving tokens.
    """
    doc = nlp(text)
    # Token.string was a deprecated alias of Token.text_with_ws and no longer
    # exists in spaCy v3; text_with_ws yields the same value on both versions.
    kept = [token.text_with_ws for token in doc if token.pos_ in keep_pos]
    # text_with_ws carries each token's trailing whitespace, so joining with
    # " " can produce doubled spaces; collapse them.
    return re.sub(' +', ' ', " ".join(kept))

In [None]:
# Drop rows with no clean_text, then replace clean_text with its POS-filtered version.
data = (
    data
    .dropna(subset=['clean_text'])
    .assign(clean_text=lambda frame: frame['clean_text'].apply(get_selected_pos))
)

In [None]:
# Display the frame again now that clean_text has been passed through get_selected_pos.
data

<a id='parsing'></a>

# 2. Dependency Parsing

In dependency parsing, we use dependency-based grammars to analyze and infer both the structure and the semantic dependencies and relationships between tokens in a sentence. The basic principle behind a dependency grammar is that, in any sentence in the language, all words except one have some relationship with or dependency on other words in the sentence. The word that has no dependency is called the root of the sentence. The verb is taken as the root of the sentence in most cases. All the other words are directly or indirectly linked to the root verb using links, which are the dependencies.

In [None]:
from spacy import displacy

# Parse the POS-filtered text of the row at position 1.
text_tokenized = nlp(data.clean_text.iloc[1])

options = {"compact": True}
# displacy.serve() starts a web server and blocks the cell until interrupted,
# which stalls "Run All". Inside a notebook, render the markup inline instead.
displacy.render(text_tokenized, style="dep", options=options, jupyter=True)

<a id='ner'></a>

# 3. Named Entity Recognition (NER)

In any text document, there are particular terms that represent specific entities that are more informative and have a unique context. These entities are known as named entities, which more specifically refer to terms that represent real-world objects like people, places, organizations, and so on, which are often denoted by proper names. A naive approach could be to find these by looking at the noun phrases in text documents. Named entity recognition (NER), also known as entity chunking/extraction, is a popular technique used in information extraction to identify and segment the named entities and classify or categorize them under various predefined classes.

In [None]:
# Run the Med7 pipeline on the `text` column of the row at position 1
# (presumably the unfiltered note; confirm against the dataset).
text_tokenized_orig = nlp(data.text.iloc[1])

# Plain-text alternative: loop over text_tokenized_orig.ents and print
# ent.text / ent.label_ for each entity.

# displacy.serve() would start a blocking web server; render inline in the notebook instead.
displacy.render(text_tokenized_orig, style="ent", jupyter=True)

In [None]:
# Entities found by the Med7 pipeline in the POS-filtered note
# (text_tokenized was produced in the dependency-parsing cell).
# Plain-text alternative: loop over text_tokenized.ents and print
# ent.text / ent.label_ for each entity.

# displacy.serve() would start a blocking web server; render inline in the notebook instead.
displacy.render(text_tokenized, style="ent", jupyter=True)

In [None]:
# English language model, applied below to the same rows as the Med7 pipeline.
# spacy.load() has no parse/tag/entity parameters: spaCy v2 silently ignores
# them and spaCy v3 rejects unknown keyword arguments, so they are dropped.
nlp_eng = spacy.load('en_core_web_sm')

In [None]:
# Run the English model on the `text` column of the row at position 1.
text_tokenized_orig_eng = nlp_eng(data.text.iloc[1])

# Plain-text alternative: loop over text_tokenized_orig_eng.ents and print
# ent.text / ent.label_ for each entity.

# displacy.serve() would start a blocking web server; render inline in the notebook instead.
displacy.render(text_tokenized_orig_eng, style="ent", jupyter=True)

In [None]:
# Run the English model on the `clean_text` column of the row at position 1.
text_tokenized_eng = nlp_eng(data.clean_text.iloc[1])

# Plain-text alternative: loop over text_tokenized_eng.ents and print
# ent.text / ent.label_ for each entity.

# displacy.serve() would start a blocking web server; render inline in the notebook instead.
displacy.render(text_tokenized_eng, style="ent", jupyter=True)

## References for further reading

<strong> POS tagging </strong>

* https://www.nltk.org/book/ch05.html

<strong> Medical named entity recognition </strong>

* https://github.com/kormilitzin/med7

* https://github.com/NLPatVCU/medaCy

* https://github.com/text-machine-lab/CliNER
