In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import json

import collections

In [2]:
# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

In [3]:
f_path = 'data/'
features = pd.read_csv(f_path+'features.csv')
sample_submission = pd.read_csv(f_path+'sample_submission.csv')
patient_notes = pd.read_csv(f_path+'patient_notes.csv')
train = pd.read_csv(f_path+'train.csv')
test = pd.read_csv(f_path+'test.csv')

### Exploring features.csv

For a particular Case number we will have number of features, there's a verbal explanation which describes what should be marked under a particular feature.

- No null value is there
- Number of Features varies to different Cases
- Total we have 143 features across all cases


|Case | Number of Frquencies |
| --- | --- |
|0|    13|
|1|    13|
|2|    17|
|3|    16|
|4|    10|
|5|    18|
|6|    12|
|7|     9|
|8|    18|
|9|    17|

In [4]:
print(features.shape)
features.info()


(143, 3)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 143 entries, 0 to 142
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   feature_num   143 non-null    int64 
 1   case_num      143 non-null    int64 
 2   feature_text  143 non-null    object
dtypes: int64(2), object(1)
memory usage: 3.5+ KB


In [5]:
features.head()

Unnamed: 0,feature_num,case_num,feature_text
0,0,0,Family-history-of-MI-OR-Family-history-of-myoc...
1,1,0,Family-history-of-thyroid-disorder
2,2,0,Chest-pressure
3,3,0,Intermittent-symptoms
4,4,0,Lightheaded


In [6]:
features.groupby(['case_num'])['feature_num'].agg('count')

case_num
0    13
1    13
2    17
3    16
4    10
5    18
6    12
7     9
8    18
9    17
Name: feature_num, dtype: int64

### Exploring patient_notes.csv

This the notes taken on each patient. 
- Patient has unique id : pn_num
- Complete Note is given in pn_history
- Case is identified by case_num, which is also mentioned in features
- No Patient is repeated in the list
- No null values
- Total 42146 records available

#### example of note : 
17-year-old male, has come to the student health clinic complaining of heart pounding. Mr. Cleveland's mother has given verbal consent for a history, physical examination, and treatment\
-began 2-3 months ago,sudden,intermittent for 2 days(lasting 3-4 min),worsening,non-allev/aggrav \
-associated with dispnea on exersion and rest,stressed out about school \
-reports fe feels like his heart is jumping out of his chest \
-ros:denies chest pain,dyaphoresis,wt loss,chills,fever,nausea,vomiting,pedal edeam \
-pmh:non,meds :aderol (from a friend),nkda \
-fh:father had MI recently,mother has thyroid dz \
-sh:non-smoker,mariguana 5-6 months ago,3 beers on the weekend, basketball at school \
-sh:no std 

In [7]:
print(patient_notes.shape)
patient_notes.info()
patient_notes.head()

(42146, 3)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42146 entries, 0 to 42145
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   pn_num      42146 non-null  int64 
 1   case_num    42146 non-null  int64 
 2   pn_history  42146 non-null  object
dtypes: int64(2), object(1)
memory usage: 987.9+ KB


Unnamed: 0,pn_num,case_num,pn_history
0,0,0,"17-year-old male, has come to the student heal..."
1,1,0,17 yo male with recurrent palpitations for the...
2,2,0,Dillon Cleveland is a 17 y.o. male patient wit...
3,3,0,a 17 yo m c/o palpitation started 3 mos ago; \...
4,4,0,17yo male with no pmh here for evaluation of p...


In [8]:
len(patient_notes['pn_num'].unique())

42146

In [9]:
print(patient_notes[patient_notes['pn_num']==0]['pn_history'][0])

17-year-old male, has come to the student health clinic complaining of heart pounding. Mr. Cleveland's mother has given verbal consent for a history, physical examination, and treatment
-began 2-3 months ago,sudden,intermittent for 2 days(lasting 3-4 min),worsening,non-allev/aggrav
-associated with dispnea on exersion and rest,stressed out about school
-reports fe feels like his heart is jumping out of his chest
-ros:denies chest pain,dyaphoresis,wt loss,chills,fever,nausea,vomiting,pedal edeam
-pmh:non,meds :aderol (from a friend),nkda
-fh:father had MI recently,mother has thyroid dz
-sh:non-smoker,mariguana 5-6 months ago,3 beers on the weekend, basketball at school
-sh:no std


### Exploring train.csv

Train CSV is a combination of Feature and Patient Note
- There's unique ID for each row
- Case Number, Patient Number and Feature Number is associating with previous dataframes
- From Patients note (pn_history). features matching with features table for particular case is identified
- in annotation the key statement is annoted and on location character wise location is given

In [10]:
print(train.shape)
train.info()
train.head()

(14300, 6)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14300 entries, 0 to 14299
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           14300 non-null  object
 1   case_num     14300 non-null  int64 
 2   pn_num       14300 non-null  int64 
 3   feature_num  14300 non-null  int64 
 4   annotation   14300 non-null  object
 5   location     14300 non-null  object
dtypes: int64(3), object(3)
memory usage: 670.4+ KB


Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location
0,00016_000,0,16,0,['dad with recent heart attcak'],['696 724']
1,00016_001,0,16,1,"['mom with ""thyroid disease']",['668 693']
2,00016_002,0,16,2,['chest pressure'],['203 217']
3,00016_003,0,16,3,"['intermittent episodes', 'episode']","['70 91', '176 183']"
4,00016_004,0,16,4,['felt as if he were going to pass out'],['222 258']


In [11]:
test.head()

Unnamed: 0,id,case_num,pn_num,feature_num
0,00016_000,0,16,0
1,00016_001,0,16,1
2,00016_002,0,16,2
3,00016_003,0,16,3
4,00016_004,0,16,4


In [12]:
sample_submission.head()

Unnamed: 0,id,location
0,00016_000,0 100
1,00016_001,
2,00016_002,200 250;300 400
3,00016_003,
4,00016_004,75 110


### Let's Try to Figure out the requirement Mannually

Just like search engine and tags. We can try to figure out the statement which valid using the frequently used Keys. Here I'm trying to find out most frequently used keys for each feature in each case, which we can later use for further analysis

In [13]:
# I'm listing out all annotations of patient no : 16 as per the training data
train[train['pn_num']==16]

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location
0,00016_000,0,16,0,['dad with recent heart attcak'],['696 724']
1,00016_001,0,16,1,"['mom with ""thyroid disease']",['668 693']
2,00016_002,0,16,2,['chest pressure'],['203 217']
3,00016_003,0,16,3,"['intermittent episodes', 'episode']","['70 91', '176 183']"
4,00016_004,0,16,4,['felt as if he were going to pass out'],['222 258']
5,00016_005,0,16,5,[],[]
6,00016_006,0,16,6,"['adderall', 'adderrall', 'adderrall']","['321 329', '404 413', '652 661']"
7,00016_007,0,16,7,[],[]
8,00016_008,0,16,8,[],[]
9,00016_009,0,16,9,"['palpitations', 'heart beating/pounding']","['26 38', '96 118']"


In [14]:
# Also I'm checking the note from which We were able to pick the annotations.
print(patient_notes[patient_notes['pn_num']==16]['pn_history'][16])

HPI: 17yo M presents with palpitations. Patient reports 3-4 months of intermittent episodes of "heart beating/pounding out of my chest." 2 days ago during a soccer game had an episode, but this time had chest pressure and felt as if he were going to pass out (did not lose conciousness). Of note patient endorses abusing adderall, primarily to study (1-3 times per week). Before recent soccer game, took adderrall night before and morning of game. Denies shortness of breath, diaphoresis, fevers, chills, headache, fatigue, changes in sleep, changes in vision/hearing, abdominal paun, changes in bowel or urinary habits. 
PMHx: none
Rx: uses friends adderrall
FHx: mom with "thyroid disease," dad with recent heart attcak
All: none
Immunizations: up to date
SHx: Freshmen in college. Endorses 3-4 drinks 3 nights / week (on weekends), denies tabacco, endorses trying marijuana. Sexually active with girlfriend x 1 year, uses condoms


In [15]:
# I'm listing out all the features of the particular case patient 16 is having
features[features['case_num']==0]

Unnamed: 0,feature_num,case_num,feature_text
0,0,0,Family-history-of-MI-OR-Family-history-of-myoc...
1,1,0,Family-history-of-thyroid-disorder
2,2,0,Chest-pressure
3,3,0,Intermittent-symptoms
4,4,0,Lightheaded
5,5,0,No-hair-changes-OR-no-nail-changes-OR-no-tempe...
6,6,0,Adderall-use
7,7,0,Shortness-of-breath
8,8,0,Caffeine-use
9,9,0,heart-pounding-OR-heart-racing


In [16]:
# expanding feature 0
features[features['case_num']==0]['feature_text'][0]

'Family-history-of-MI-OR-Family-history-of-myocardial-infarction'

In [17]:
# expanding feature 5
features[features['case_num']==0]['feature_text'][5]

'No-hair-changes-OR-no-nail-changes-OR-no-temperature-intolerance'

### Trying to Solve for one Feature 

I'm trying to implement a key word list for 1 particular feature. Once we finish we'll try to create a method and run for all rows in features and store it in a dataframe

In [18]:
# Duplicating train df
dummy_train = train.copy()

In [19]:
# removing all unnecessary characters from the data frame
dummy_train['annotation']  = dummy_train['annotation'].str.replace('[','')
dummy_train['annotation']  = dummy_train['annotation'].str.replace(']','')
dummy_train['annotation']  = dummy_train['annotation'].str.replace("'","")
dummy_train['annotation']  = dummy_train['annotation'].str.replace('"','')
dummy_train['annotation']  = dummy_train['annotation'].str.replace(',','')
dummy_train['annotation']  = dummy_train['annotation'].str.replace('-',' ')
dummy_train['annotation']  = dummy_train['annotation'].str.replace(':',' ')

# changing all words to lower so we can avoid repetition 
dummy_train['annotation']  = dummy_train['annotation'].str.lower()
dummy_train

  
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location
0,00016_000,0,16,0,dad with recent heart attcak,['696 724']
1,00016_001,0,16,1,mom with thyroid disease,['668 693']
2,00016_002,0,16,2,chest pressure,['203 217']
3,00016_003,0,16,3,intermittent episodes episode,"['70 91', '176 183']"
4,00016_004,0,16,4,felt as if he were going to pass out,['222 258']
...,...,...,...,...,...,...
14295,95333_912,9,95333,912,,[]
14296,95333_913,9,95333,913,,[]
14297,95333_914,9,95333,914,photobia,['274 282']
14298,95333_915,9,95333,915,no sick contacts,['421 437']


In [20]:
# Taking all unique values for a particular feature for a particular case and storing values to an array
feature_list = pd.Series(dummy_train[(train['case_num']==0) & (train['feature_num']==0)]['annotation']).unique()
print(type(feature_list))
print(feature_list)

<class 'numpy.ndarray'>
['dad with recent heart attcak' '' 'father  heart attack' 'father mi'
 'dad mi' 'father had acute mi' 'father heart attach' 'dad had recent mi'
 'dad had heart attack' 'father heart problem'
 'mi of his father father had mi' 'mi in the father'
 'father with heart attack recently father with heart attack'
 'father had an mi' 'dad had mi' 'father hx of mi'
 'father had a heart attack' 'father had mi' 'father with mi'
 'father had heart attack' 'heart attack 1 year ago   father'
 'ather heart attack' 'father heart attack' 'heart attack in father'
 'mi at 52 for father' 'dad   heart attack' 'father recently had an mi'
 'mi in his father' 'father had problems with heart'
 'dad with heart attack' 'dad  heart problem'
 'fh positive for a recent heart attack father  mi'
 'dad has cardiac issues' 'dad with recent heart attack' 'father  mi'
 'father   mi' 'father had possible mi' 'dad heart attack'
 'dad had a heart attack' 'father  myocardial infarction' 'cad in father'


In [21]:
# From sentence I'm splitting it into words,
feature_word_list = ' '.join(feature_list).split()
print(type(feature_word_list))
print(feature_word_list)

<class 'list'>
['dad', 'with', 'recent', 'heart', 'attcak', 'father', 'heart', 'attack', 'father', 'mi', 'dad', 'mi', 'father', 'had', 'acute', 'mi', 'father', 'heart', 'attach', 'dad', 'had', 'recent', 'mi', 'dad', 'had', 'heart', 'attack', 'father', 'heart', 'problem', 'mi', 'of', 'his', 'father', 'father', 'had', 'mi', 'mi', 'in', 'the', 'father', 'father', 'with', 'heart', 'attack', 'recently', 'father', 'with', 'heart', 'attack', 'father', 'had', 'an', 'mi', 'dad', 'had', 'mi', 'father', 'hx', 'of', 'mi', 'father', 'had', 'a', 'heart', 'attack', 'father', 'had', 'mi', 'father', 'with', 'mi', 'father', 'had', 'heart', 'attack', 'heart', 'attack', '1', 'year', 'ago', 'father', 'ather', 'heart', 'attack', 'father', 'heart', 'attack', 'heart', 'attack', 'in', 'father', 'mi', 'at', '52', 'for', 'father', 'dad', 'heart', 'attack', 'father', 'recently', 'had', 'an', 'mi', 'mi', 'in', 'his', 'father', 'father', 'had', 'problems', 'with', 'heart', 'dad', 'with', 'heart', 'attack', 'dad', '

In [22]:
# Creating a frequency list for each word, so we can find out most used word for this particular feature
counter=collections.Counter(feature_word_list)
feature_word_list = dict(sorted(counter.items(), key=lambda item: item[1],reverse=True))
print(feature_word_list)

{'father': 37, 'heart': 26, 'mi': 23, 'attack': 20, 'had': 15, 'dad': 14, 'with': 9, 'recent': 4, 'in': 4, 'a': 4, 'problem': 3, 'an': 3, 'of': 2, 'his': 2, 'recently': 2, 'hx': 2, 'for': 2, 'has': 2, 'attcak': 1, 'acute': 1, 'attach': 1, 'the': 1, '1': 1, 'year': 1, 'ago': 1, 'ather': 1, 'at': 1, '52': 1, 'problems': 1, 'fh': 1, 'positive': 1, 'cardiac': 1, 'issues': 1, 'possible': 1, 'myocardial': 1, 'infarction': 1, 'cad': 1, 'suffer': 1, 'herat': 1, 'disease': 1}


In [23]:
# let's remove common used english words like a and an. For now we've taken only small set of list, 
# later we will expand it
drop_keys = ['had','with','in','a','an','of','his','for','has','the','at']
for i in drop_keys:
    if i in feature_word_list:
        del feature_word_list[i]

In [24]:
# removing all words which has frequency less than 5
feature_word_list = {key:val for key, val in feature_word_list.items() if val > 4}

In [25]:
# let's see the final list
print(feature_word_list)

{'father': 37, 'heart': 26, 'mi': 23, 'attack': 20, 'dad': 14}


In [26]:
# Now I'll create a list from the word, currently feature_word_list is a dict with vale as freq
final_list = list(feature_word_list.keys())
print(type(final_list))
print(final_list)

<class 'list'>
['father', 'heart', 'mi', 'attack', 'dad']


###  Now let's create a Method to create Key list for all features

In [27]:
dummy_features = features.copy()


In [28]:
def GenerateKeys(string_list, freq):
    drop_keys = ['had','with','in','a','an','of','his','for','has','the','at','no','last',
                 'ago','to','not','past','was','her','he','his','and','is','have','when','up','but','mo"]',
                 'by','him']
    string_word_list = ' '.join(string_list).split()
    counter=collections.Counter(string_word_list)
    string_word_list = dict(sorted(counter.items(), key=lambda item: item[1],reverse=True))
    
    for drop_key in drop_keys:
        if drop_key in string_word_list:
            del string_word_list[drop_key]
        
    string_word_list = {key:val for key, val in string_word_list.items() if val > freq}
    return list(string_word_list.keys())

In [29]:
dummy_features.reset_index()

for idx,row in dummy_features.iterrows():
    string_list = pd.Series(dummy_train[(dummy_train['case_num']==row['case_num']) & (dummy_train['feature_num']==row['feature_num'])]['annotation']).unique()
    no = len(string_list)
    keys = GenerateKeys(string_list,2)
    dummy_features.at[idx,'keys'] = json.dumps(keys)
    dummy_features.at[idx,'no_of_notes'] = no
    
    
    

In [30]:
dummy_features[dummy_features['case_num']==4]

Unnamed: 0,feature_num,case_num,feature_text,keys,no_of_notes
59,400,4,Lack-of-other-thyroid-symptoms,"[""denies"", ""changes"", ""palpitations"", ""intoler...",83.0
60,401,4,anxious-OR-nervous,"[""nervousness"", ""anxiety"", ""nervous"", ""overwhe...",49.0
61,402,4,Stress-due-to-caring-for-elderly-parents,"[""care"", ""mother"", ""taking"", ""laws"", ""elderly""...",58.0
62,403,4,Heavy-caffeine-use,"[""5"", ""6"", ""coffee"", ""cups"", ""caffeine"", ""day""...",27.0
63,404,4,No-depressed-mood,"[""denies"", ""loss"", ""mood"", ""interest"", ""suicid...",50.0
64,405,4,Weight-stable,"[""weight"", ""denies"", ""loss"", ""changes"", ""chang...",30.0
65,406,4,Insomnia,"[""asleep"", ""falling"", ""difficulty"", ""sleep"", ""...",43.0
66,407,4,Female,"[""female""]",5.0
67,408,4,Decreased-appetite,"[""appetite"", ""decreased"", ""decrease"", ""apetite...",36.0
68,409,4,45-year,"[""45"", ""year""]",12.0


In [31]:
# The keys still need some adjustments as it some of them had only 2 notes per feature.
# Apart from that I hope this will give some headstart
# will continue with notebook and methods to predict the sentence using keys
# find the features_with_keys.csv in data section of this notebook
dummy_features.to_csv('features_with_keys.csv')

In [33]:
# larg_check= np.array(dummy_features['keys'])


In [34]:
string_list = pd.Series(dummy_train[(train['case_num']==0) & (train['feature_num']==0)]['annotation']).unique()

In [35]:
GenerateKeys(string_list, 4)

['father', 'heart', 'mi', 'attack', 'dad']

In [36]:
dummy_features.loc[(dummy_features['feature_num']==0) & (dummy_features['case_num']==0),'keys'] =json.dumps(final_list)

In [37]:
import json
string = json.dumps(final_list)
print(type(string))
string

<class 'str'>


'["father", "heart", "mi", "attack", "dad"]'

In [38]:
lst = json.loads(string)
print(type(lst))
lst

<class 'list'>


['father', 'heart', 'mi', 'attack', 'dad']

In [42]:
# print(note16)

In [41]:
# text = note16.replace('.',' ')
# pred = ''

# for pattern in final_list:
#     if text.find(pattern)>0:
#             pred += str(text.find(pattern))+' '+str(text.find(pattern)+len(pattern))+';'
# pred = pred[:-1]

In [None]:
# pred

In [None]:
# text[696:699]

In [None]:
# final_list

In [None]:
# sample_submission