# INTRODUCTION
In this Kaggle competition, the goal is to identify specific clinical concepts in patient notes.
Specifically, you will build an automated method for mapping clinical concepts from an exam rubric to the various ways in which these concepts are expressed in clinical patient notes written by medical students.
Good solutions will be both accurate and reliable.

<br>CONTENT:
1. [Explanation of Features](#1)
2. [Loading Data](#2)



In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from plotly.offline import init_notebook_mode, iplot, plot
import plotly as py

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from wordcloud import WordCloud, STOPWORDS
import wordcloud
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<a id="1"></a> <br>
## Explanation of Features
* Description from hosts
  * featureNum -> A unique identifier for each feature.
  * caseNum -> A unique identifier for each case.
  * featureText -> A description of the feature.
  * pn_num -> The patient note annotated in this row.
  * pn_history -> The text of the encounter as recorded by the test taker.
  * pn_annotation -> The text(s) within a patient note indicating a feature. A feature may be indicated multiple times within a single note.
  * pn_loc -> Character spans indicating the location of each annotation within the note. Multiple spans may be needed to represent an annotation, in which case the spans are delimited by a semicolon.
  



<a id="2"></a> <br>
## Loading Data

In [None]:
# Read the four competition CSV files (in the same order as before) and bind
# each resulting DataFrame to its own name.
train, patient_notes, features, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/nbme-score-clinical-patient-notes/train.csv',
        '../input/nbme-score-clinical-patient-notes/patient_notes.csv',
        '../input/nbme-score-clinical-patient-notes/features.csv',
        '../input/nbme-score-clinical-patient-notes/test.csv',
    )
)

In [None]:
# Preview the first rows of `train` (loaded from train.csv).
train.head()

In [None]:
# Preview the first rows of `patient_notes` (loaded from patient_notes.csv).
patient_notes.head()

In [None]:
# Preview the first rows of `features` (loaded from features.csv).
features.head()

In [None]:
# Print a summary of `train`: column names, dtypes and non-null counts.
train.info()

In [None]:
# Frequency of each distinct full row of `train` (DataFrame.value_counts works row-wise).
# NOTE(review): if `id` is unique per row, every count here will be 1 — per-column
# counts may have been intended; confirm.
train.value_counts()

In [None]:
# (rows, columns) of the `patient_notes` table.
patient_notes.shape

In [None]:
# Cast the `id` column of `train` to float, mapping the "-" placeholder to 0.0 first.
# The result is assigned back to the column instead of calling
# `train.id.replace(..., inplace=True)`: an in-place call on a column accessor is
# chained assignment, which pandas warns about and which leaves `train`
# unchanged when copy-on-write is enabled.
# NOTE(review): the previous comments referred to NaN values and a "poverty rate"
# column, neither of which appears in this notebook — confirm that converting
# `id` to float is actually wanted.
train["id"] = train["id"].replace(["-"], 0.0).astype(float)

In [None]:
# Re-print the `train` summary to check dtypes after `id` was cast to float.
train.info()

In [None]:
# Count the non-null `pn_history` entries for each case, then list the cases
# from the highest case number down to the lowest.
notes_by_case = patient_notes.groupby('case_num')['pn_history'].count()
v_count = notes_by_case.reset_index().sort_values(by='case_num', ascending=False)
v_count

In [None]:
# Bar chart of how many patient notes exist for each case.
# `x=` is passed by keyword because recent seaborn versions reserve the first
# positional argument of `countplot` for `data`. The call is made once; the
# original repeated it, drawing the same bars a second time on the same axes.
sns.countplot(x=patient_notes["case_num"])

In [None]:
from collections import Counter


def _tokenize(note):
    """Split a note into whitespace-separated tokens (non-strings are stringified first)."""
    return str(note).split()


# Store the token list of every note in a helper column, then tally tokens
# across all notes and show the ten most frequent ones.
patient_notes['temp'] = patient_notes['pn_history'].apply(_tokenize)

top = Counter()
for tokens in patient_notes['temp']:
    top.update(tokens)

common = pd.DataFrame(top.most_common(10))
common.columns = ["Common_words", 'count']
common.style.background_gradient(cmap='Purples')

In [None]:
# Donut chart over the first seven notes of case 1: one slice per note,
# labelled with its pn_num. The slice values are the case_num column itself,
# which is 1 for every selected row, so all slices have the same size.
case1_head = patient_notes[patient_notes.case_num == 1].iloc[:7, :]
slice_values = case1_head.case_num
slice_labels = case1_head.pn_num

# Single pie trace occupying the left half of the figure.
pie_trace = {
    "values": slice_values,
    "labels": slice_labels,
    "domain": {"x": [0, .5]},
    "hoverinfo": "label+percent+name",
    "hole": .3,
    "type": "pie",
}

# Text annotation placed above the donut.
caption = {
    "font": {"size": 20},
    "showarrow": False,
    "text": "patiens_note",
    "x": 0.20,
    "y": 1,
}

fig = {
    "data": [pie_trace],
    "layout": {
        "title": "patiens_note",
        "annotations": [caption],
    },
}
iplot(fig)

In [None]:
# Bar chart of the number of patient notes per case.
# `count()` yields per-column non-null counts for each case; the `pn_num`
# column is the one plotted (and used for the colour scale).
patients_notes_count = patient_notes.groupby("case_num").count()

# Tick positions and labels are derived from the cases actually present in the
# data rather than a hard-coded 0-9 list, so the axis stays correct if the set
# of cases differs.
case_ids = patients_notes_count.index.tolist()

fig = px.bar(
    data_frame=patients_notes_count,
    x=patients_notes_count.index,
    y='pn_num',
    color="pn_num",
    color_continuous_scale="Purples",
)
fig.update_layout(
    title={
        'text': 'Numbers of patient notes for each case',
        'y': 0.95,
        'x': 0.48,
        'xanchor': 'center',
        'yanchor': 'top',
    },
    xaxis=dict(
        tickmode='array',
        tickvals=case_ids,
        ticktext=[f"Case {case_id}" for case_id in case_ids],
    ),
)
fig.show()

In [None]:
# Overlaid histograms comparing note lengths (in characters) for cases 1 and 2.
# The histograms are built on `pn_history.str.len()` rather than on the raw
# note text: binning free-text strings makes every distinct note its own
# category, which gives no usable distribution.
case1_lengths = patient_notes.pn_history[patient_notes.case_num == 1].str.len()
case2_lengths = patient_notes.pn_history[patient_notes.case_num == 2].str.len()

trace1 = go.Histogram(
    x=case1_lengths,
    opacity=0.75,
    name="case_1",
    marker=dict(color='rgba(171, 50, 96, 0.6)'))
trace2 = go.Histogram(
    x=case2_lengths,
    opacity=0.75,
    name="case_2",
    marker=dict(color='rgba(12, 50, 196, 0.6)'))

data = [trace1, trace2]
# The axis titles describe what is plotted; the previous titles labelled the
# x axis "case_num = 1" and the y axis "case_num = 2", which matched neither axis.
layout = go.Layout(barmode='overlay',
                   title=' patient_notes ',
                   xaxis=dict(title='note length (characters)'),
                   yaxis=dict(title='number of notes'),
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [None]:
# IPython `%ls` magic: list the contents of the ../input directory.
%ls "../input"

In [None]:
# Load the competition's sample_submission.csv into `df`.
df = pd.read_csv("../input/nbme-score-clinical-patient-notes/sample_submission.csv")

In [None]:
# Write `df` to submission.csv in the working directory, omitting the row index.
df.to_csv("submission.csv",index=False)