###  Please upvote this notebook if you find anything useful.

In this competition, you’ll identify specific clinical concepts in patient notes. Specifically, you'll develop an automated method to map clinical concepts from an exam rubric (e.g., “diminished appetite”) to various ways in which these concepts are expressed in clinical patient notes written by medical students (e.g., “eating less,” “clothes fit looser”). Great solutions will be both accurate and reliable.

## Import the required libraries

In [None]:
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from wordcloud import WordCloud, STOPWORDS
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
import nltk
import re

## - Import dataset

In [None]:
# Load the four competition CSV files from the Kaggle input directory.
# train: annotated note/feature pairs
train =pd.read_csv('../input/nbme-score-clinical-patient-notes/train.csv')
# patiens_note: raw patient notes (columns used below: case_num, pn_history)
patiens_note = pd.read_csv('../input/nbme-score-clinical-patient-notes/patient_notes.csv')
# features: rubric features per case (columns used below: case_num, feature_num, feature_text)
features = pd.read_csv('../input/nbme-score-clinical-patient-notes/features.csv')
# test: loaded here but not used in the cells of this notebook section
test= pd.read_csv('../input/nbme-score-clinical-patient-notes/test.csv')

### Train CSV
Feature annotations for 1000 of the patient notes, 100 for each of ten cases.
- `id` - Unique identifier for each patient note / feature pair.
- `pn_num` - The patient note annotated in this row.
- `feature_num` - The feature annotated in this row.
- `case_num` - The case to which this patient note belongs.
- `annotation` - The text(s) within a patient note indicating a feature. A feature may be indicated multiple times within a single note.
- `location` - Character spans indicating the location of each annotation within the note. Multiple spans may be needed to represent an annotation, in which case the spans are delimited by a semicolon

In [None]:
# Show the column names of the train DataFrame
train.columns

In [None]:
# (number of rows, number of columns) of the train DataFrame
train.shape

In [None]:
# Total number of null values across all columns of train
# (first sum() is per column, second sum() adds the columns together)
train.isnull().sum().sum()

### Patient notes

In [None]:
# Preview the first five rows of the patient notes
patiens_note.head(5)

In [None]:
# (number of rows, number of columns) of the patient notes DataFrame
patiens_note.shape

### Value counts from Each Cases

In [None]:
# Number of non-null pn_history entries per case_num, largest first
temp = (
    patiens_note
    .groupby('case_num')
    .count()['pn_history']
    .reset_index()
    .sort_values(by='pn_history', ascending=False)
)
temp.style.background_gradient(cmap='Purples')

In [None]:
# Bar chart of how many notes belong to each case_num
plt.figure(figsize=(12, 6))
sns.countplot(data=patiens_note, x='case_num')

In [None]:
# Funnel-area chart built from the per-case table `temp`
# (segment text = case_num, segment size = pn_history count)
fig = go.Figure(
    data=go.Funnelarea(
        values=temp['pn_history'],
        text=temp['case_num'],
        title=dict(position="top center", text="Funnel-chart on Case Number Distribution"),
    )
)
fig.show()

### Most Common word 

In [None]:
# Tokenise every note on whitespace and keep the token list in a helper column
patiens_note['temp'] = patiens_note['pn_history'].map(lambda note: str(note).split())
# Count every token across all notes
top = Counter(token for tokens in patiens_note['temp'] for token in tokens)
# Keep the 20 most frequent tokens as a two-column table
temp = pd.DataFrame(top.most_common(20))
temp.columns = ["Common_words", 'count']
temp.style.background_gradient(cmap='Blues')

In [None]:
# Horizontal bar chart of the most common tokens, one colour per token
fig = px.bar(
    temp,
    x="count",
    y="Common_words",
    color='Common_words',
    orientation='h',
    title="Common words in Selected Text",
    width=700,
    height=700,
)
fig.show()

### Remove Stop words and check common word again

In [None]:
def remove_stopword(x, stop_words=None):
    """Return the tokens of ``x`` that are not stop words.

    Args:
        x: Iterable of tokens (strings).
        stop_words: Optional collection of words to drop. When omitted, the
            NLTK English stop-word list is used.

    Returns:
        A list with the tokens of ``x`` that are not in the stop-word
        collection, in their original order. Matching is exact, so tokens
        are not lower-cased before comparison.
    """
    if stop_words is None:
        # Build the default set once and cache it on the function: calling
        # stopwords.words('english') for every token is very slow, and a set
        # gives O(1) membership tests instead of a list scan.
        stop_words = getattr(remove_stopword, "_english_stop_words", None)
        if stop_words is None:
            # Local import so the function does not depend on the global name
            # `stopwords`, which later cells rebind to a plain set.
            from nltk.corpus import stopwords as nltk_stopwords
            stop_words = frozenset(nltk_stopwords.words('english'))
            remove_stopword._english_stop_words = stop_words
    return [y for y in x if y not in stop_words]
# Replace each token list with its stop-word-free version
patiens_note['temp'] = patiens_note['temp'].apply(remove_stopword)

In [None]:
# Recount tokens now that stop words have been removed
top = Counter(token for tokens in patiens_note['temp'] for token in tokens)
# Take the top 20, then drop the first (most frequent) row
# NOTE(review): the reason for skipping the top-ranked token is not stated - confirm
temp = pd.DataFrame(top.most_common(20)).iloc[1:, :]
temp.columns = ["Common_words", 'count']
temp.style.background_gradient(cmap='Purples')

In [None]:
# Treemap where each tile is a token sized by its count
fig = px.treemap(
    temp,
    path=['Common_words'],
    values='count',
    title='Tree of Most Common words',
)
fig.show()

### Most common words in the most frequent case

In [None]:
# Keep only the notes whose case_num is 3 (hard-coded case number)
most_popular_case = patiens_note.loc[patiens_note['case_num'].eq(3)]

In [None]:
# Most common tokens within the selected case
top = Counter(token for tokens in most_popular_case['temp'] for token in tokens)
temp_positive = pd.DataFrame(top.most_common(20))
temp_positive.columns = ['Common_words', 'count']
temp_positive.style.background_gradient(cmap='Greens')

In [None]:
# Horizontal bar chart of the most common tokens in the selected case
fig = px.bar(
    temp_positive,
    x="count",
    y="Common_words",
    color='Common_words',
    orientation='h',
    title='Most Commmon Words popular cases Words',
    width=700,
    height=700,
)
fig.show()

### Sample patient note

In [None]:
# Print the full text of the note at positional index 15
print(patiens_note["pn_history"].iat[15])

### Distribution of top n-grams for pn_history

In [None]:
# n_words=1 -> unigrams, n_words=2 -> bigrams, and so on
def get_top_Nwords(corpus, n=None, remove_stop_words=False, n_words=1):
    """Return the most frequent n-grams of ``corpus`` with their counts.

    Args:
        corpus: Collection of text documents; it is iterated twice (once to
            fit the vocabulary and once to count).
        n: Number of entries to return; ``None`` returns all of them.
        remove_stop_words: When true, CountVectorizer's built-in 'english'
            stop-word list is applied.
        n_words: Size of the n-grams to count.

    Returns:
        A list of ``(ngram, count)`` tuples sorted by count, highest first.
    """
    stop_list = 'english' if remove_stop_words else None
    vectorizer = CountVectorizer(stop_words=stop_list, ngram_range=(n_words, n_words))
    vectorizer.fit(corpus)

    # Column-wise sum of the document-term matrix gives one total per n-gram
    totals = vectorizer.transform(corpus).sum(axis=0)
    ranked = sorted(
        ((term, totals[0, column]) for term, column in vectorizer.vocabulary_.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:n]

### Bigram distribution

In [None]:
# Top 20 bigrams over the distinct notes, with English stop words removed
common_words = get_top_Nwords(
    patiens_note['pn_history'].drop_duplicates(),
    20,
    remove_stop_words=True,
    n_words=2,
)
for word, freq in common_words:
    print(word, freq)

In [None]:
# Put the (bigram, count) pairs in a DataFrame and draw them as a bar chart
train_temp = pd.DataFrame(common_words, columns=['text', 'count'])
fig = plt.figure(figsize=(10, 8))
ax1 = (
    train_temp
    .groupby('text')
    .sum()['count']
    .sort_values(ascending=False)
    .plot.bar(color='#120f7b')
)
ax1.set(title="Bigram_distributin", xlabel='Bigram', ylabel="frequency")
plt.show()

### Trigram distribution

In [None]:
# Top 20 trigrams over the distinct notes, with English stop words removed
common_words = get_top_Nwords(
    patiens_note['pn_history'].drop_duplicates(),
    20,
    remove_stop_words=True,
    n_words=3,
)
for word, freq in common_words:
    print(word, freq)

In [None]:
# Put the (trigram, count) pairs in a DataFrame and draw them as a bar chart
train_temp = pd.DataFrame(common_words, columns=['text', 'count'])
fig = plt.figure(figsize=(10, 8))
ax1 = (
    train_temp
    .groupby('text')
    .sum()['count']
    .sort_values(ascending=False)
    .plot.bar(color='#120f7b')
)
ax1.set(title="Tigram_distributin", xlabel='Tigram', ylabel="frequency")
plt.show()

Let's also look at a word cloud for a quick visual overview.
### WordCloud

In [None]:
# Use a dedicated name for the word-cloud stop words: rebinding `stopwords`
# would shadow the nltk.corpus `stopwords` import and break remove_stopword
# if its cell is run again.
wordcloud_stopwords = set(STOPWORDS)
# Join the notes with a space so the last word of one note is not fused with
# the first word of the next one.
wordclouds = WordCloud(width=800,
                       height=700,
                       background_color='white',
                       max_font_size=120,
                       min_font_size=10,
                       stopwords=wordcloud_stopwords).generate(' '.join(patiens_note['pn_history']))

# Plot the word-cloud image
plt.figure(figsize=(8,8), facecolor=None)
plt.imshow(wordclouds)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()

In [None]:
def pre_processing_text(text, flg_steam=False, flg_lemm=True):
    """Normalise a piece of text for counting and length statistics.

    The text is lower-cased, stripped of surrounding whitespace, cleared of
    every character that is neither a word character nor whitespace, split
    into tokens and optionally stemmed and/or lemmatised.

    Args:
        text: A string, or a list/tuple of tokens. Token sequences are joined
            with single spaces; any other object is converted with ``str()``.
        flg_steam: When true, apply NLTK's Porter stemmer to each token.
        flg_lemm: When true, apply NLTK's WordNet lemmatizer to each token.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    # Join token sequences explicitly instead of relying on str(list), whose
    # repr escapes (e.g. "\\x00") would otherwise leak into the cleaned text.
    if isinstance(text, (list, tuple)):
        text = ' '.join(str(token) for token in text)

    # Lower-case and drop punctuation
    text = re.sub(r'[^\w\s]', '', str(text).lower().strip())
    # Tokenise on whitespace
    words = text.split()

    # Stemming
    if flg_steam:
        ps = nltk.stem.porter.PorterStemmer()
        words = [ps.stem(word) for word in words]

    # Lemmatisation
    if flg_lemm:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        words = [lem.lemmatize(word) for word in words]

    # Back to a single string
    return ' '.join(words)
        

In [None]:
# Build a cleaned, lemmatised string for every note from its token list
patiens_note['clean_history'] = patiens_note['temp'].apply(
    pre_processing_text,
    flg_steam=False,
    flg_lemm=True,
)

In [None]:
# Length of each cleaned note in characters (column name kept as-is because
# the distribution plot reads 'clean_word')
patiens_note['clean_word'] = patiens_note['clean_history'].apply(len)
# Number of whitespace-separated tokens per cleaned note; split() without an
# argument yields 0 for an empty note, whereas split(" ") reported 1
patiens_note['word_count'] = patiens_note['clean_history'].apply(lambda x: len(str(x).split()))

In [None]:
def plot_distribution(x, title):
    """Show an 800x500 Plotly histogram of the values in ``x``.

    Args:
        x: Array-like of values to bin (for example a pandas Series).
        title: Title displayed above the figure.
    """
    # Only `x` is plotted; the raw note text is no longer passed as the
    # data_frame, so the function does not depend on the global patiens_note.
    fig = px.histogram(x=x,
                       width=800,
                       height=500,
                       title=title)
    fig.show()

In [None]:
# Histogram of the 'clean_word' column
plot_distribution(x=patiens_note['clean_word'], title='Patience Note length')

In [None]:
# Histogram of the 'word_count' column
plot_distribution(x=patiens_note['word_count'], title='Patience Note Word Count')

- Let's explore the features data.
## Features dataset and that view

In [None]:
# Preview the first five rows of the features table
features.head(5)

In [None]:
# Non-null counts of every column per case_num
features_note = features.groupby("case_num").count()
# Horizontal bars: with orientation='h' the bar length comes from x and the
# category from y, so the count goes on x and the case number on y.
fig = px.bar(data_frame=features_note,
             x='feature_num',
             y=features_note.index,
             color_discrete_sequence=['#D63230'],
             orientation='h',
             color_continuous_scale="Emrld")
fig.show()

### Value count in Features

In [None]:
# Number of non-null feature_text entries per case_num, largest first
temp = (
    features
    .groupby('case_num')
    .count()['feature_text']
    .reset_index()
    .sort_values(by='feature_text', ascending=False)
)
temp.style.background_gradient(cmap='Purples')

In [None]:
# Funnel-area chart built from the per-case table `temp`
# (segment text = case_num, segment size = feature_text count)
fig = go.Figure(
    data=go.Funnelarea(
        values=temp['feature_text'],
        text=temp['case_num'],
        title=dict(position="top center", text="Funnel-chart on Case Number Distribution"),
    )
)
fig.show()

### WordCloud

In [None]:
# Use a dedicated name for the word-cloud stop words: rebinding `stopwords`
# would shadow the nltk.corpus `stopwords` import and break remove_stopword
# if its cell is run again.
wordcloud_stopwords = set(STOPWORDS)
# Join the feature texts with a space so the last word of one feature is not
# fused with the first word of the next one.
wordclouds = WordCloud(width=800,
                       height=600,
                       max_font_size=100,
                       background_color='white',
                       min_font_size=10,
                       stopwords=wordcloud_stopwords).generate(' '.join(features['feature_text']))

# Plot the word-cloud image
plt.figure(figsize=(8,8), facecolor=None)
plt.imshow(wordclouds)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()

Thanks for checking out this notebook. If you have any questions or suggestions, please feel free to share them. Don't forget to support me. :-)