In [None]:
# Data handling
import pandas as pd
import numpy as np
# Plotting
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()  # switch matplotlib to seaborn's default theme
import json  # used later to serialise each feature's key list into one cell

import collections  # collections.Counter drives the word-frequency counts

**I've put a little effort into understanding the data and figuring out some methods. Any suggestions are highly appreciated, so** **please leave a comment. Upvote if you think it's worth an upvote.**
**Thank You** 

In [None]:
import os
# Walk the Kaggle input directory and print the full path of every file found.
for file_path in (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
):
    print(file_path)

In [None]:
# Folder that holds the competition CSV files.
f_path = '/kaggle/input/nbme-score-clinical-patient-notes/'

# Read the five competition tables in one pass; the unpacking order matches
# the order of the file names in the tuple.
features, sample_submission, patient_notes, train, test = (
    pd.read_csv(f_path + file_name)
    for file_name in (
        'features.csv',
        'sample_submission.csv',
        'patient_notes.csv',
        'train.csv',
        'test.csv',
    )
)

### Exploring features.csv

Each case number has its own set of features. Every feature comes with a short text description that explains what should be marked under that feature.

- There are no null values
- The number of features varies from case to case
- In total there are 143 features across all cases


|Case | Number of Features |
| --- | --- |
|0|    13|
|1|    13|
|2|    17|
|3|    16|
|4|    10|
|5|    18|
|6|    12|
|7|     9|
|8|    18|
|9|    17|

In [None]:
# Size of the features table as (rows, columns), followed by pandas' per-column summary.
print(features.shape)
features.info()


In [None]:
# Preview the first rows of the features table.
features.head()

In [None]:
# Number of features defined for each case.
features.groupby('case_num')['feature_num'].count()

### Exploring patient_notes.csv

These are the notes taken on each patient. 
- Patient has unique id : pn_num
- Complete Note is given in pn_history
- Case is identified by case_num, which is also mentioned in features
- No Patient is repeated in the list
- No null values
- Total 42146 records available

#### example of note : 
17-year-old male, has come to the student health clinic complaining of heart pounding. Mr. Cleveland's mother has given verbal consent for a history, physical examination, and treatment\
-began 2-3 months ago,sudden,intermittent for 2 days(lasting 3-4 min),worsening,non-allev/aggrav \
-associated with dispnea on exersion and rest,stressed out about school \
-reports fe feels like his heart is jumping out of his chest \
-ros:denies chest pain,dyaphoresis,wt loss,chills,fever,nausea,vomiting,pedal edeam \
-pmh:non,meds :aderol (from a friend),nkda \
-fh:father had MI recently,mother has thyroid dz \
-sh:non-smoker,mariguana 5-6 months ago,3 beers on the weekend, basketball at school \
-sh:no std 

In [None]:
# Size of the patient notes table, pandas' per-column summary, then a preview of the first rows.
print(patient_notes.shape)
patient_notes.info()
patient_notes.head()

In [None]:
# Count the distinct patient-note ids; dropna=False keeps the result equal to
# the length of .unique(), which would include a missing value if one existed.
patient_notes['pn_num'].nunique(dropna=False)

In [None]:
# Print the full history text of the note whose pn_num is 0.
# .iloc[0] picks the first matching row by position, so the lookup no longer
# relies on the row label happening to be equal to the pn_num.
print(patient_notes.loc[patient_notes['pn_num'] == 0, 'pn_history'].iloc[0])

### Exploring train.csv

Train CSV combines the features and the patient notes
- There's a unique ID for each row
- Case number, patient number and feature number link each row to the previous dataframes
- For each patient note (pn_history), the text that matches each feature of that case has been identified
- `annotation` holds the annotated key statement, and `location` gives its character-wise position in the note

In [None]:
# Size of the training table, pandas' per-column summary, then a preview of the first rows.
print(train.shape)
train.info()
train.head()

In [None]:
# Preview the first rows of the test table.
test.head()

In [None]:
# Preview the first rows of the sample submission file.
sample_submission.head()

### Let's Try to Figure Out the Requirement Manually

This works much like a search engine with tags: we can try to find the relevant statement in a note by looking for frequently used key words. Here I'm trying to find the most frequently used key words for each feature in each case, which we can later use for further analysis.

In [None]:
# Every training annotation recorded for patient note number 16
train.loc[train['pn_num'].eq(16)]

In [None]:
# Print the note the annotations above were picked from.
# The row is selected by its pn_num value and then taken by position (.iloc[0]),
# so this works even when the row label is not equal to the pn_num.
print(patient_notes.loc[patient_notes['pn_num'] == 16, 'pn_history'].iloc[0])

In [None]:
# All features defined for case 0, the case examined here for patient 16
features.loc[features['case_num'].eq(0)]

In [None]:
# Full description text of the case-0 feature stored under row label 0
features.loc[features['case_num'].eq(0), 'feature_text'].loc[0]

In [None]:
# Full description text of the case-0 feature stored under row label 5
features.loc[features['case_num'].eq(0), 'feature_text'].loc[5]

### Trying to Solve for One Feature

I'm building a key word list for one particular feature first. Once that works, we'll wrap it in a method, run it for every row in features and store the result in a dataframe.

In [None]:
# Work on a copy of the training data so that `train` itself is left unchanged
dummy_train = train.copy()

In [None]:
# Strip list punctuation from the annotation strings and lower-case them.
#
# A single literal translation table is used instead of chained str.replace
# calls: '[' on its own is not a valid regular expression, and whether
# str.replace treats its pattern as a regex by default depends on the pandas
# version. str.translate always works character by character.
#   - '[', ']', "'", '"' and ',' are deleted
#   - '-' and ':' are replaced by a space
annotation_cleanup = str.maketrans('-:', '  ', '[]\'",')

dummy_train['annotation'] = (
    dummy_train['annotation']
    .str.translate(annotation_cleanup)
    # lower-case everything so the same word is not counted twice
    .str.lower()
)
dummy_train

In [None]:
# Collect the distinct annotation strings for case 0 / feature 0 into an array.
# The mask is built from dummy_train itself (the frame being filtered), so the
# selection does not depend on `train` and `dummy_train` sharing an index.
case0_feature0 = (dummy_train['case_num'] == 0) & (dummy_train['feature_num'] == 0)
feature_list = dummy_train.loc[case0_feature0, 'annotation'].unique()
print(type(feature_list))
print(feature_list)

In [None]:
# Break every annotation into whitespace-separated words and flatten them into one list
feature_word_list = [word for sentence in feature_list for word in sentence.split()]
print(type(feature_word_list))
print(feature_word_list)

In [None]:
# Count how often each word occurs and keep the counts as a dict ordered from
# most to least frequent, so the dominant words of this feature come first.
counter = collections.Counter(feature_word_list)
feature_word_list = dict(counter.most_common())
print(feature_word_list)

In [None]:
# Discard very common English words such as "a" and "an".
# This is only a small starter list; it is extended later on.
drop_keys = ['had','with','in','a','an','of','his','for','has','the','at']
for stop_word in drop_keys:
    # pop with a default quietly skips words that are not present
    feature_word_list.pop(stop_word, None)

In [None]:
# Keep only the words that occur at least 5 times
feature_word_list = {
    word: count
    for word, count in feature_word_list.items()
    if count >= 5
}

In [None]:
# Show the remaining word -> frequency mapping
print(feature_word_list)

In [None]:
# feature_word_list is a dict of word -> frequency; keep just the words as a plain list
final_list = [*feature_word_list]
print(type(final_list))
print(final_list)

### Now let's create a method to build the key list for all features

In [None]:
# Work on a copy of the features table so that `features` itself is left unchanged
dummy_features = features.copy()


In [None]:
def GenerateKeys(string_list, freq):
    """Return the frequent, non-stop-word tokens found in ``string_list``.

    Parameters
    ----------
    string_list : iterable of str
        Text snippets to analyse. Entries that are not strings (for example
        NaN or None) are skipped instead of breaking the join.
    freq : int
        Exclusive lower bound: a word is kept only when it occurs strictly
        more than ``freq`` times.

    Returns
    -------
    list of str
        The kept words ordered from most to least frequent; words with equal
        counts stay in the order they were first seen.
    """
    # Common words that carry no meaning for a feature. A set gives fast
    # membership tests and makes duplicate entries harmless.
    # NOTE(review): 'mo"]' looks like a leftover from uncleaned annotations; it
    # is kept so the filtering stays identical to the previous behaviour.
    drop_keys = {
        'had', 'with', 'in', 'a', 'an', 'of', 'his', 'for', 'has', 'the', 'at', 'no', 'last',
        'ago', 'to', 'not', 'past', 'was', 'her', 'he', 'and', 'is', 'have', 'when', 'up', 'but', 'mo"]',
        'by', 'him',
    }

    # Flatten all snippets into one list of whitespace-separated words.
    words = ' '.join(text for text in string_list if isinstance(text, str)).split()
    counter = collections.Counter(words)

    # most_common() yields (word, count) pairs sorted by descending count.
    return [
        word
        for word, count in counter.most_common()
        if count > freq and word not in drop_keys
    ]

In [None]:
# For every feature row, gather the distinct annotation strings recorded for
# that (case_num, feature_num) pair in dummy_train, derive its key words and
# store both the keys and the number of distinct annotation strings.
#
# The earlier `dummy_features.reset_index()` call was dropped: its result was
# never assigned, so it had no effect. The new columns are built as plain lists
# and assigned once, which keeps `no_of_notes` an integer column instead of
# writing cell by cell into a column that starts out as NaN.
feature_keys = []
annotation_counts = []
for row in dummy_features.itertuples(index=False):
    same_feature = (
        (dummy_train['case_num'] == row.case_num)
        & (dummy_train['feature_num'] == row.feature_num)
    )
    string_list = dummy_train.loc[same_feature, 'annotation'].unique()
    # Keys are stored as a JSON string so the list fits into a single cell.
    # A word must occur more than 2 times to become a key.
    feature_keys.append(json.dumps(GenerateKeys(string_list, 2)))
    annotation_counts.append(len(string_list))

dummy_features['keys'] = feature_keys
dummy_features['no_of_notes'] = annotation_counts
    
    
    

In [None]:
# Inspect the rows of dummy_features that belong to case 4
dummy_features[dummy_features['case_num']==4]

In [None]:
# The keys still need some adjustment, as some features had only 2 notes.
# Apart from that, I hope this gives a head start.
# I will continue this notebook with methods that predict the sentence using the keys.
# The exported features_with_keys.csv can be found in the data section of this notebook.
# With default arguments, to_csv also writes the row index as the first column.
dummy_features.to_csv('features_with_keys.csv')