In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA, TruncatedSVD
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

###  Problem
> develop an automated method to map clinical concepts from an exam rubric (e.g., “diminished appetite”) to various ways in which these concepts are expressed in clinical patient notes written by medical students

Given a clinical patient note, map out the clinical concepts expressed in it.

Let's go through the given data files to build an intuitive understanding of the problem, and see whether we can map the problem's input and output at the end of the analysis.


* ### [Patient Notes](#section-one)
* ### [Features](#section-two)
* ### [Train](#section-three)
* ### [Conclusion](#section-four)


<a id="section-one"></a>
### Patient notes

These are the individual patient notes.

In [None]:
df_patient = pd.read_csv('/kaggle/input/nbme-score-clinical-patient-notes/patient_notes.csv')

In [None]:
df_patient.head()

In [None]:
# Number of distinct patient notes (pn_num), number of distinct cases (case_num),
# and the (rows, columns) shape of the patient notes table
df_patient.pn_num.nunique(),df_patient.case_num.nunique(), df_patient.shape

In [None]:
# Display the raw history text of the notes whose pn_num is 0 or 1
is_first_two_notes = df_patient['pn_num'].isin([1,0])
df_patient.loc[is_first_two_notes, 'pn_history'].values

#### We can see that specific fields are collected for every patient, such as age, symptoms, family history, previous treatment, its effect on the patient's life, and usual routine (diet, exercise, etc.). In later analysis we will analyse these patterns.

<a id="section-two"></a>
## Features

Let's understand what each case represents.

In [None]:
df_features = pd.read_csv("/kaggle/input/nbme-score-clinical-patient-notes/features.csv")

In [None]:
df_features.head()

In [None]:
df_features.shape, df_features.feature_num.nunique(), df_features.case_num.nunique()

#### As we can see, we have 10 cases and each case has a set of features associated with it

In [None]:
sns.countplot(data=df_features, x='case_num')

#### Let's look at the top words in each case to see what kind of symptoms we are talking about

We use TF-IDF because we already know the number of cases: build a TF-IDF vectorizer with one document per case, then sort the vocabulary terms by their weight for each case.

In [None]:
def clean(txt):
    """Normalise a feature description: lower-case it and turn hyphens into spaces."""
    lowered = txt.lower()
    return lowered.replace('-', ' ')
df_features['feature_text'] = df_features['feature_text'].apply(clean)
df_features['feature_text'][:10].values

In [None]:
# One row per case: all feature descriptions of that case joined into a single space-separated document
tmp_df = (
    df_features
    .groupby('case_num')['feature_text']
    .agg(' '.join)
    .reset_index()
)
tmp_df.head()

In [None]:
# Fit TF-IDF on the case-level documents (one row of tmp_df per case).
# scikit-learn accepts `stop_words` only as 'english', a list, or None; recent versions
# reject a set during parameter validation, so the NLTK set is converted to a sorted list.
tfidf = TfidfVectorizer(stop_words=sorted(stop_words))
tf_vector = tfidf.fit_transform(tmp_df['feature_text'])

In [None]:
def get_topk_words(tf_vector, top_k, vectorizer=None):
    """Print the ``top_k`` highest-weighted terms for every row of ``tf_vector``.

    Parameters
    ----------
    tf_vector : sparse matrix
        Row-indexable matrix whose rows expose ``.toarray()``; one row per document.
    top_k : int
        Number of terms to list per row.
    vectorizer : optional
        Fitted vectorizer that produced ``tf_vector``. When omitted, the notebook-level
        ``tfidf`` object is used; note that ``tfidf`` is re-fitted later in the notebook,
        so pass the vectorizer explicitly if this is called after that point.

    Returns
    -------
    None
        The result is printed, one line per row.
    """
    if vectorizer is None:
        vectorizer = tfidf
    # get_feature_names() was removed in scikit-learn 1.2; prefer its replacement and
    # fall back to the old name only on versions that do not have it yet.
    names_getter = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
    # The vocabulary is the same for every row, so fetch it once outside the loop.
    features = names_getter()
    for row_idx in range(tf_vector.shape[0]):
        # ravel() keeps a 1-D vector even when the vocabulary has a single term
        row_vector = tf_vector[row_idx].toarray().ravel()
        # argsort is ascending: reverse it and keep the first top_k positions
        top_positions = np.argsort(row_vector)[::-1][:top_k]
        top_words = [str(features[pos]) for pos in top_positions]
        print(f"cases number: {row_idx} and its top words : {top_words}")

In [None]:
get_topk_words(tf_vector, 10) ## the top words suggest that each case corresponds to a symptom, complaint or concern.

#### We can see the TF-IDF words associated with each case and figure out the symptom associated with it:
* The 1st one talks about heart rate.
* The 2nd is about abdominal symptoms and diarrhea.
* The 3rd is about menstrual-related symptoms.
* etc.

#### Cluster the features to see what kind of patterns we have

In [None]:
df_features.head()

In [None]:
# Initialise TF-IDF again, this time treating every individual feature_text as its own document.
# scikit-learn accepts `stop_words` only as 'english', a list, or None; recent versions
# reject a set during parameter validation, so the NLTK set is converted to a sorted list.
tfidf = TfidfVectorizer(stop_words=sorted(stop_words))
tfidf_vector = tfidf.fit_transform(df_features['feature_text'])
tfidf_vector.shape

In [None]:
# Average number of feature rows per case
df_features.groupby('case_num')['feature_num'].count().mean()

In [None]:
# Each case has about 14 features on average, so cluster the feature texts into 14 groups
# to look for patterns in the kinds of features collected for each case.
true_k = 14
# n_init is pinned explicitly: its default changed from 10 to 'auto' in scikit-learn 1.4,
# which would otherwise make the clusters (and the notes written about them) version-dependent.
kmeans = KMeans(n_clusters=true_k, n_init=10, random_state=42)
kmeans.fit(tfidf_vector)

In [None]:
# For every cluster centre, order the vocabulary indices by weight, largest first
order_centers = kmeans.cluster_centers_.argsort()[:,::-1]
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement and
# fall back to the old name only on versions that do not have it yet.
terms = (getattr(tfidf, "get_feature_names_out", None) or tfidf.get_feature_names)()
for i in range(true_k):
    # The ten heaviest terms of cluster i
    terms_out = [str(terms[ind]) for ind in order_centers[i, :10]]
    print(f"cluster {i}, and its words: {','.join(terms_out)}")

#### We could see the following clusters:
* 0, 1, 5, 3, 6, 7, 8: symptoms
* 5: family history
* 11, 10, 9: habits
* 1, 4: duration (how many days the symptom was present)
* 2: date info
* etc.

<a id="section-three"></a>
## Train

Only 1000 patient notes are annotated.

In [None]:
df_train = pd.read_csv("/kaggle/input/nbme-score-clinical-patient-notes/train.csv")

In [None]:
df_train.head()

In [None]:
df_train.shape, df_train.pn_num.nunique(), df_train.case_num.nunique()

#### Does each patient have only one case associated with them?

The code below confirms that each patient belongs to only one case.

In [None]:
df_groupby_pt = df_train.groupby(['pn_num'])['case_num'].nunique()

In [None]:
(df_groupby_pt != 1).sum() # number of patient notes whose count of distinct cases is not exactly 1; a result of 0 means every note belongs to a single case

#### Compare case number and their features

In [None]:
sns.countplot(data=df_train, x='case_num') ## number of annotation rows per case in train.csv; compare with the per-case feature counts from the features file

### Test file

In [None]:
df_test = pd.read_csv("/kaggle/input/nbme-score-clinical-patient-notes/test.csv")

In [None]:
df_test

### sample submission file

In [None]:
df_sub = pd.read_csv("/kaggle/input/nbme-score-clinical-patient-notes/sample_submission.csv")
df_sub

<a id="section-four"></a>

### Conclusion/Analysis


* Given a case number and a patient note number along with a feature number, we have to find where that feature is expressed in the patient note and retrieve its index.
* The analysis of cases and feature names helps us understand what kind of symptom data we are dealing with. Since we see a number of different kinds of features, we can use different mechanisms to extract different features.
* Since only 1000 patients have annotations, we can use a pseudo-labelling scheme to check whether labels can be generated for the other patient histories.


This is an ongoing document; as and when I find more analysis I will be updating it! Fork it and feel free to add your own analysis.

                                            


#                                                                             **Kindly upvote if you find it useful**