In [3]:
import numpy as np
import pandas as  pd
import matplotlib.pyplot as plt
from collections import Counter
from spellchecker import SpellChecker

### The next cell imports the raw data and drops
* Null columns
* Rows with disease name as null

In [4]:
patients_df = pd.read_csv("AI DATA MODIFIED.csv",encoding = "ISO-8859-1")
patients_df.drop(columns=['HEART_RATE','HEAD_CIRCUMFERENCE','UPPER_ARM_CIRCUMFERENCE'],inplace = True)
patients_df = patients_df[pd.notnull(patients_df['DISEASE_NAME'])]

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
drop_df = pd.read_excel('Records to be excluded.xlsx')

In [30]:
drop_df

Unnamed: 0,PATIENT_VISIT_ID
0,PV000001055092
1,PV000000582830
2,PV000000614244
3,PV000000703296
4,PV000000850994
5,PV000000579016
6,PV000000563596
7,PV000000963176
8,PV000000557364
9,PV000000599078


In [10]:
patients_df.drop(patients_df[patients_df.PATIENT_VISIT_ID.isin(drop_df.PATIENT_VISIT_ID)].index,inplace=True)

In [11]:
patients_df.head()

Unnamed: 0,PATIENT_VISIT_ID,PATIENT_NAME,GENDER,AGE,STATE_NAME,DISTRICT_NAME,CENTER_SHORT_NAME,SYSTOLIC_BP,DIASTOLIC_BP,PULSE,RESPIRATION_RATE,BODY_TEMPERATURE,BODY_WEIGHT,HEIGHT,SPO2,SYMPTOM_ID,SYMPTOM_DESC,MEDICAL_TEST_NAME,DISEASE_NAME
0,PV000000554626,Harjas Singh,Male,4.0,Rajasthan,Bundi,Pandit Briz Sundar Sharma General Hospital Bundi,,,,,,18.0,,,,ECZIMA,,ACD (POST APPLICATION OF DETTOL)
1,PV000000554628,Hetal Vaishnav,Male,22.5,Rajasthan,Udaipur,Satelite Hospital Hiran Magri Sec 5 Udaipur,120.0,80.0,68.0,18.0,,53.0,,97.0,,AC URI WITH STOMATITIS,CBC~SGOT (AST),ORAL ULCER~URI
3,PV000000554634,Kesar Dev,Male,35.0,Rajasthan,Jhunjhunun,B D K Hospital Jhunjhunu,,,,,,,,,,RINGWORM,,FUC TINEA C/O ITCHING
4,PV000000554636,Paavan Swami,Male,15.2,Rajasthan,Churu,Government Hospital Ratangarh,110.0,70.0,72.0,24.0,98.6,45.0,154.0,99.0,,WHITE HAIR,,PREMATURE CANITIES
5,PV000000554638,Bhupundra Sharma,Male,18.2,Rajasthan,Churu,Government Hospital Ratangarh,120.0,80.0,72.0,24.0,98.6,58.0,178.0,99.0,SYMP00768,,,TINEA CRURIS


### The next cell drops all rows with wrongly entered vitals

In [12]:
patients_df.drop(patients_df[(patients_df.SYSTOLIC_BP>230) | (patients_df.DIASTOLIC_BP>150)].index, inplace = True)
patients_df.drop(patients_df[(patients_df.SYSTOLIC_BP<80) | (patients_df.DIASTOLIC_BP<50)].index, inplace=True)
patients_df.drop(patients_df[(patients_df.BODY_TEMPERATURE<90) | (patients_df.BODY_TEMPERATURE>110)].index, inplace=True)
patients_df.drop(patients_df[patients_df.AGE>100].index, inplace=True)
patients_df.drop(patients_df[(patients_df.SPO2>100) | (patients_df.SPO2<80)].index,inplace=True)
patients_df.drop(patients_df[(patients_df.PULSE<50) | (patients_df.PULSE>120)].index,inplace=True)
patients_df.drop(patients_df[(patients_df.RESPIRATION_RATE>30) | (patients_df.RESPIRATION_RATE<13)].index,inplace=True)
patients_df.drop(patients_df[patients_df.BODY_WEIGHT>200].index,inplace=True)
patients_df.drop(patients_df[(patients_df.HEIGHT>200) | (patients_df.HEIGHT<45)].index,inplace=True)

Vitals to be categorized:

- [X] Age
- [X] Sys and Dias BP
- [X] Pulse
- [X] RESPIRATION_RATE
- [ ] BODY_TEMPERATURE
- [ ] BODY_WEIGHT
- [ ] Height
- [X] SPO2

### Binning age values:

In [13]:
bins = [x for x in range(0,101,5)]
labels = [x for x in range(1,21)]

In [14]:
patients_df['age_binned'] = pd.cut(patients_df.AGE,bins,labels=labels,include_lowest=True)

#### As we see, most of the vitals consist of null values

In [15]:
patients_df.isnull().sum()/patients_df.shape[0]*100

PATIENT_VISIT_ID      0.000000
PATIENT_NAME          0.000000
GENDER                0.000000
AGE                   0.000000
STATE_NAME            0.000000
DISTRICT_NAME         0.000000
CENTER_SHORT_NAME     0.000000
SYSTOLIC_BP          45.633435
DIASTOLIC_BP         45.617895
PULSE                33.363928
RESPIRATION_RATE     53.811973
BODY_TEMPERATURE     42.393294
BODY_WEIGHT          31.910430
HEIGHT               57.842237
SPO2                 35.687680
SYMPTOM_ID           41.485647
SYMPTOM_DESC         58.510468
MEDICAL_TEST_NAME    89.268006
DISEASE_NAME          0.000000
age_binned            0.000000
dtype: float64

#### Replacing with default values if available, else median

<b>NOTE:</b> This might be risky since many patients with missing vitals may actually contain abnormal values, which the model may fail to consider. Thus, better to infer values depending on other vitals (to check later)

In [16]:
patients_df.SYSTOLIC_BP.fillna(120,inplace=True)
patients_df.DIASTOLIC_BP.fillna(80,inplace=True)
patients_df.PULSE.fillna(patients_df.PULSE.median(),inplace=True)
patients_df.RESPIRATION_RATE.fillna(patients_df.RESPIRATION_RATE.median(),inplace=True)
patients_df.BODY_TEMPERATURE.fillna(patients_df.BODY_TEMPERATURE.median(),inplace=True)
patients_df.BODY_WEIGHT.fillna(patients_df.BODY_WEIGHT.median(),inplace=True)
patients_df.HEIGHT.fillna(patients_df.HEIGHT.median(),inplace=True)
patients_df.SPO2.fillna(patients_df.SPO2.median(),inplace=True)

In [17]:
patients_df.describe()

Unnamed: 0,AGE,SYSTOLIC_BP,DIASTOLIC_BP,PULSE,RESPIRATION_RATE,BODY_TEMPERATURE,BODY_WEIGHT,HEIGHT,SPO2
count,205917.0,205917.0,205917.0,205917.0,205917.0,205917.0,205917.0,205917.0,205917.0
mean,33.73537,119.177499,78.586921,83.13149,18.712457,97.934234,44.311307,148.563957,98.386515
std,20.370633,8.578054,5.913708,8.675305,2.131331,0.9265,14.57248,15.163316,1.567289
min,0.0,80.0,50.0,50.0,13.0,90.0,0.1,45.0,80.0
25%,18.0,120.0,80.0,78.0,18.0,98.0,42.0,151.0,98.0
50%,31.0,120.0,80.0,82.0,18.0,98.0,45.0,151.0,99.0
75%,50.0,120.0,80.0,85.0,18.0,98.2,50.0,151.0,99.0
max,100.0,230.0,150.0,120.0,30.0,109.2,180.0,200.0,100.0


#### Categorizing BP

In [18]:
patients_df['blood_pressure']=''
patients_df['blood_pressure'][(patients_df.SYSTOLIC_BP<90) | (patients_df.DIASTOLIC_BP<60)] = 'low'
patients_df['blood_pressure'][((patients_df.SYSTOLIC_BP>=90) & (patients_df.SYSTOLIC_BP<=120)) & ((patients_df.DIASTOLIC_BP>=60) & (patients_df.DIASTOLIC_BP<=80))] = 'normal'
patients_df['blood_pressure'][((patients_df.SYSTOLIC_BP>120) & (patients_df.SYSTOLIC_BP<130)) & ((patients_df.DIASTOLIC_BP>=60) & (patients_df.DIASTOLIC_BP<=80))] = 'elevated'
patients_df['blood_pressure'][((patients_df.SYSTOLIC_BP>=130) & (patients_df.SYSTOLIC_BP<140)) | ((patients_df.DIASTOLIC_BP>80) & (patients_df.DIASTOLIC_BP<90))] = 'high1'
patients_df['blood_pressure'][((patients_df.SYSTOLIC_BP>=140) & (patients_df.SYSTOLIC_BP<180)) | ((patients_df.DIASTOLIC_BP>=90) & (patients_df.DIASTOLIC_BP<120))] = 'high2'
patients_df['blood_pressure'][(patients_df.SYSTOLIC_BP>=180) | (patients_df.DIASTOLIC_BP>=120)] = 'hypertensive'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://p

In [19]:
patients_df['blood_pressure'].value_counts(normalize=True)*100

normal          88.778003
high2            4.965593
high1            4.658188
elevated         1.141237
low              0.263213
hypertensive     0.193767
Name: blood_pressure, dtype: float64

#### Categorizing pulse rate

In [20]:
patients_df['pulse_rate_categorized']=''
patients_df['pulse_rate_categorized'][patients_df.PULSE<60]='low'
patients_df['pulse_rate_categorized'][(patients_df.PULSE>=60) & (patients_df.PULSE<=100)]='normal'
patients_df['pulse_rate_categorized'][patients_df.PULSE>100]='high'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [21]:
patients_df.pulse_rate_categorized.value_counts(normalize=True)*100

normal    94.693493
high       5.106426
low        0.200081
Name: pulse_rate_categorized, dtype: float64

#### Categorizing respiration rate

In [22]:
patients_df['respiration_rate_categorized']=''
patients_df['respiration_rate_categorized'][patients_df.RESPIRATION_RATE<16]='low'
patients_df['respiration_rate_categorized'][(patients_df.RESPIRATION_RATE>=16) & (patients_df.RESPIRATION_RATE<=20)]='normal'
patients_df['respiration_rate_categorized'][patients_df.RESPIRATION_RATE>20]='high'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [23]:
patients_df.respiration_rate_categorized.value_counts(normalize=True)*100

normal    88.840649
high      10.772301
low        0.387049
Name: respiration_rate_categorized, dtype: float64

#### Categorizing SPO2

In [24]:
patients_df['SPO2_categorized']=''
patients_df['SPO2_categorized'][patients_df.SPO2>=95]='normal'
patients_df['SPO2_categorized'][patients_df.SPO2<95]='low'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [25]:
patients_df.SPO2_categorized.value_counts(normalize=True)*100

normal    97.798628
low        2.201372
Name: SPO2_categorized, dtype: float64

In [26]:
patients_df.isnull().sum()

PATIENT_VISIT_ID                     0
PATIENT_NAME                         0
GENDER                               0
AGE                                  0
STATE_NAME                           0
DISTRICT_NAME                        0
CENTER_SHORT_NAME                    0
SYSTOLIC_BP                          0
DIASTOLIC_BP                         0
PULSE                                0
RESPIRATION_RATE                     0
BODY_TEMPERATURE                     0
BODY_WEIGHT                          0
HEIGHT                               0
SPO2                                 0
SYMPTOM_ID                       85426
SYMPTOM_DESC                    120483
MEDICAL_TEST_NAME               183818
DISEASE_NAME                         0
age_binned                           0
blood_pressure                       0
pulse_rate_categorized               0
respiration_rate_categorized         0
SPO2_categorized                     0
dtype: int64

For symptoms:
* Will AWS ground truth work?
* Spell correct
* Everything else

In [27]:
symptom_desc_list = list(patients_df.SYMPTOM_DESC[patients_df.SYMPTOM_DESC.notnull()])
symptom_word_list = []

for symptom in symptom_desc_list:
        symptom_word_list.extend([word for word in symptom.split()])
    
word_counter = Counter(symptom_word_list)
word_counter.most_common()

[('PAIN', 16730),
 ('COLD', 11623),
 ('SKIN', 8852),
 ('COUGH', 8370),
 ('ITCHING', 7670),
 ('FEVER', 7651),
 ('WEAKNESS', 6617),
 ('ALLERGY', 3952),
 ('BODY', 3865),
 ('ACIDITY', 3763),
 ('INFECTION', 3659),
 ('PROBLEM', 3341),
 ('BACK', 3166),
 ('HEADACHE', 2310),
 ('JOINT', 2216),
 ('AND', 2066),
 ('TC', 1775),
 ('BODYACHE', 1600),
 ('INJURY', 1446),
 ('DAYS', 1327),
 ('ON', 1228),
 ('SINCE', 1180),
 ('NESS', 1147),
 ('WEAK', 1136),
 ('MOTION', 1045),
 ('LEG', 996),
 ('LOOSE', 968),
 ('COMMON', 951),
 ('KNEE', 910),
 ('BACKPAIN', 907),
 ('HAND', 901),
 ('FACE', 870),
 ('TINEA', 863),
 ('DAAD', 820),
 ('DANDRUFF', 810),
 ('OF', 808),
 ('IN', 777),
 ('TWO', 735),
 ('WOUND', 732),
 ('LAST', 726),
 ('COULD', 715),
 ('ONE', 687),
 ('ABDOMINAL', 669),
 ('EYE', 655),
 ('BODYPAIN', 653),
 ('EAR', 638),
 ('FUC', 621),
 ('HEAD', 606),
 ('ITICHING', 603),
 ('WITH', 594),
 ('SCABIES', 592),
 ('FOOT', 587),
 ('INDIGESTION', 586),
 ('CHEST', 577),
 ('HEADACH', 575),
 ('WHITE', 536),
 ('ACID', 479

### Working on data from Orissa since it has the max no. of consultations 

In [137]:
patients_df.STATE_NAME.value_counts(normalize=True)*100

Orissa           56.037644
Rajasthan        41.498686
West Bengal       1.292719
Gujarat           0.593735
Chhattisgarh      0.229376
Jharkhand         0.155277
Bihar             0.154805
Uttar Pradesh     0.029262
Delhi             0.008495
Name: STATE_NAME, dtype: float64

In [138]:
orissa_df = patients_df[patients_df.STATE_NAME=='Orissa']

In [139]:
orissa_df.shape

(118732, 24)

In [140]:
orissa_df.describe()

Unnamed: 0,AGE,SYSTOLIC_BP,DIASTOLIC_BP,PULSE,RESPIRATION_RATE,BODY_TEMPERATURE,BODY_WEIGHT,HEIGHT,SPO2
count,118732.0,118732.0,118732.0,118732.0,118732.0,118732.0,118732.0,118732.0,118732.0
mean,33.962603,118.46279,78.010014,84.552412,18.912635,97.862317,42.56667,146.284889,98.070444
std,21.512701,9.279421,6.681305,10.311345,2.363849,1.145696,16.984812,18.758125,1.904339
min,0.0,80.0,50.0,50.0,13.0,90.0,0.1,45.0,80.0
25%,15.0,120.0,78.0,78.0,18.0,97.7,35.0,146.0,98.0
50%,32.95,120.0,80.0,82.0,18.0,98.0,45.0,151.0,99.0
75%,52.0,120.0,80.0,89.0,19.0,98.4,54.0,154.0,99.0
max,100.0,230.0,150.0,120.0,30.0,109.2,170.0,200.0,100.0


In [39]:
orissa_df[orissa_df.SYMPTOM_DESC.notnull()].shape

(49914, 20)

In [40]:
orissa_df[orissa_df.SYMPTOM_DESC.notnull()].to_csv('orissa_symptom_desc_notnull.csv')

In [168]:
symptom_desc_list = list(patients_df.SYMPTOM_DESC[patients_df.SYMPTOM_DESC.notnull()])
symptom_word_list = []

for symptom in symptom_desc_list:
        symptom_word_list.extend([word for word in symptom.split()])
    
word_counter = Counter(symptom_word_list)
word_counter.most_common()

['eczema']
['AC', 'URI', 'WITH', 'STOMATITIS']
['ITCHING']
['RINGWORM']
['WHITE', 'HAIR']
['ITCHING']
['ITCHING']
['COUGH', 'BODY', 'PAIN', 'GHABRAHAT']
['SKIN', 'PROBLEM']
['SKIN', 'INFECTION', 'ON', 'NECK']
['WHITE', 'SPOT']
['ITCHING']
['ALLERGY']
['ACNE']
['ITCHING']
['TC']
['URI']
['SKIN', 'PROBLEM']
['ITCHING']
['ITCHING', 'WITH', 'INFECTION']
['resale', 'ITCHING']
['URI']
['ITCHING']
['ITCHING']
['ITCHING']
['ITCHING']
['resale', 'ITCHING']
['RINGWOMNS', 'ITCHING']
['ringworm', 'ITCHING']
['BURN', 'SPOT']
['TC']
['ringworm', 'ITCHING']
['ITCHING']
['DRY', 'SPOTS', 'ON', 'FACE']
['and']
['ECZEMA']
['ECZEMA']
['LEG', 'PAIN']
['ruthless', 'NESS']
['resale', 'ITCHING']
['RINGWOMNS', 'ITCHING']
['ringworm', 'ITCHING']
['ITCHING']
['allow', 'PAIN']
['LESIONS', 'ITCHING']
['SKIN', 'INFECTION']
['hen', 'CDM']
['HAIR', 'LOSS']
['LT', 'SIDE', 'CHEST', 'PAIN']
['FOLLOW', 'UP', 'TC']
['CAD']
['NECK', 'INFECTION']
['ITCHING']
['RINGWOMNS', 'ITCHING']
['WHITE', 'SPOT']
['ORAL', 'STOMATITIS']


['RINGWOMNS', 'ITCHING']
['MOUTH', 'ULCER']
['COUGH', 'BODY', 'PAIN', 'GHABRAHAT']
['ALLERGY']
['ITCHING']
['COLD', 'FEVER', 'COUGH', 'ABDOMINALPAIN']
['COLD', 'COUGH', 'FEVER', 'headache']
['ITCHING', 'LESIONS', 'RASHES']
['melanoma']
['SKIN', 'INFECTION']
['ECZEMA']
['DM']
['WEAKNESS']
['COUGH', 'COLD', 'BODY', 'ACHE']
['ITCHING']
['HEAD', 'INJURY']
['fun', 'tina']
['ALLERGY']
['ITCHING']
['ACIDITY']
['ITCHING']
['COUGHING', 'WITH', 'SNEEZING']
['ALLERGY', 'SCALP', 'FACE']
['HAIR', 'full']
['KNEE', 'PAIN']
['co', 'annie']
['MI']
['COLD', 'FEVER', 'WEAK', 'NESS']
['FEVER', 'COLD', 'COUGH', 'HEADACHE']
['BLACK', 'MARK', 'ON', 'FACE']
['COLD', 'COUGH', 'FEVER']
['SKIN', 'ALLERGY', 'ITCHING']
['ANC']
['ITCHING']
['DM']
['URI']
['COLD', 'COUGH', 'INDIGESTION']
['ITCHING']
['ITCHING']
['fun', 'PHYTOPHOTODERMATITIS']
['GENITAL', 'HERPES']
['LOOSE', 'MOTIONS']
['ACNE']
['fun', 'IRRITANT', 'with', 'tina', 'edi']
['BACK', 'PAIN', 'diabetic']
['caught']
['LOOS', 'MOTION']
['ITCHING']
['chalker'

['JOINTPAIN']
['dandruff']
['hydroxyl', 'PAIN']
['COUGH', 'SINCE', 'THREE', 'DAYS']
['ATOPIC', 'DERMATITIS']
['BACK', 'PAIN']
['CHEST', 'PAIN']
['ITCHING']
['ACNE']
['FEVER', 'swearing', 'CHEST', 'PAIN', 'RT', 'SI']
['fun', 'tina']
['SINCE', 'ONE', 'DAY', 'SUGER']
['URI']
['COLD', 'FEVER', 'HEADACHE']
['ITCHING']
['pupils', 'ITCHING']
['tina']
['URI']
['COLD', 'headache', 'WEAKNESS']
['KNEE', 'PAIN']
['URI']
['MI', 'CHEST', 'PAIN']
['SKIN', 'INFECTION']
['DANDRUFF']
['COLD', 'AND', 'OTHER', 'PROBLEM']
['COLD', 'COUGH', 'BODYPAIN']
['RT', 'LEG', 'PAIN']
['URI']
['gastric']
['ACIDITY', 'INDIGESTION']
['ITCHING']
['SKIN', 'PROBLEM']
['FEVER', 'AT', 'NIGHT']
['ITCHING']
['RING', 'WORM']
['SKIN', 'ALLERGY']
['ITCHING']
['ACIDITY']
['DM', 'abdomen', 'PAIN']
['ALLERGY']
['WHITE', 'SPOTS', 'WHITE', 'HAIR']
['COLD', 'DRY', 'COUGH', 'HAND', 'PAIN']
['pimples']
['HAIR', 'DANDRUFF']
['RT', 'HAND', 'INJURY']
['pimples']
['PIMPLES']
['URI']
['ALLERGY']
['COLD', 'FEVER', 'DRY', 'COUGH']
['SKIN', 'INF

['GHABRAHAT']
['PAIN', 'LEFT', 'THUMB']
['PAIN', 'LEFT', 'LEG']
['ACIDITY', 'COLD']
['fun', 'tina']
['fun', 'tina']
['EPILEPSY']
['FEVER', 'COUGH']
['FEVER', 'CHEST', 'PAIN', 'BODY', 'PAIN']
['fun', 'versace', 'plan']
['ALLERGY']
['ALLERGY']
['ALLERGY']
['ALLERGY']
['ALLERGY']
['ALLERGY']
['ALLERGY']
['ALLERGY']
['JOINT', 'PAIN', 'HEADACHE']
['WEAKNESS']
['COLD', 'EAR', 'PAIN']
['LBA']
['ITCHING']
['ITCHING']
['ALLERGY']
['SKIN', 'INFECTION']
['FEVER', 'BODYACHE']
['LT', 'HAND', 'PAIN']
['SKIN', 'RASH', 'SINCE', 'TWENTY', 'DAYS']
['KNEE', 'PAIN', 'BACK', 'PAIN', 'weakness']
['ITCHING', 'WHITE', 'PATCHES', 'ON', 'HAND']
['COLD', 'FEVER', 'COUGH', 'FOOT', 'ITCHING']
['SKIN', 'RASH', 'SINCE', 'TWO', 'MONTHS']
['COLD', 'FEVER', 'COUGH']
['fun', 'HYPERKERATOTIC']
['MENORRHAGIA']
['INJURY', 'TODAY']
['ITCHING']
['ITCHING']
['fun', 'TV']
['COLD', 'BODY', 'PAIN', 'ACIDITY']
['EAR', 'ITCHING']
['first', 'JOINT', 'PAIN']
['MOUTH', 'ULCER']
['WEAKNESS']
['weakness']
['ITCHING']
['IRREGULAR', 'MEN

['HEMOPATISIS', 'BLEEDING', 'PEREACTAM']
['RIGHT', 'EYE', 'INCHING', 'ONE', 'WEEK']
['RIGHT', 'HAND', 'PAIN', 'COLD']
['SKIN', 'PROBLEM']
['GASTRIC', 'indexation']
['backpack', 'FOOT', 'PAIN']
['COLD']
['SUGAR']
['LOOSE', 'MOTION']
['SWELLING', 'OF', 'FOOT']
['LOOSE', 'MOTION']
['SWELLING', 'OF', 'FOOT', 'FINGER', 'FEVER']
['HYPERTENSION']
['WRIST', 'PAIN']
['BODY', 'PAIN']
['PAIN', 'abdomen']
['ABDOMINAL', 'PAIN']
['BACK', 'PAIN']
['COLD', 'HEADACHE', 'COUGH']
['LOOSE', 'MOTION']
['RUNNING', 'EAR']
['COLD', 'RUNNING', 'NOSE', 'SINCE', 'TEN', 'DA']
['COLD']
['SKIN', 'PROBLEM']
['LOOSE', 'motion', 'N', 'VOMITING']
['COLD', 'HEADACHE', 'ACIDITY']
['BACK', 'PAIN']
['COLD', 'ITCHING', 'COUGH']
['ACIDITY']
['menstrual', 'PROBLEM', 'MUSCLE', 'PAIN']
['BODY', 'ITCHING']
['FEVER']
['SKIN', 'PROBLEM']
['ITCHING']
['FEVER']
['ACIDITY']
['BODY', 'PAIN', 'OR', 'WEAKNESS', 'TWO', 'WEEK']
['ITCHING']
['ITCHING', 'WEAKNESS']
['golf']
['backpack', 'WEAKNESS']
['ITCHING', 'BODY', 'PAIN']
['SEVER', 'dan

['RINGEWOMNS', 'ITCHING']
['pupils', 'ITCHING']
['MUSCLE', 'PAIN', 'SKIN', 'PROBLEM']
['SKIN', 'ALLERGY']
['ITCHING']
['backpack']
['SUGAR']
['ACIDITY', 'WEAK', 'NESS', 'FEVER']
['ARI']
['RINGWOMNS', 'ITCHING']
['pupils', 'ITCHING']
['ACIDITY', 'INDIGESTION', 'MUSCLE', 'PAI']
['CHEST', 'PAIN']
['SKIN', 'ALLERGY']
['WEAK', 'NESS', 'ACIDITY']
['resale']
['WEAKNESS']
['BOTH', 'LEG', 'PAIN', 'OR', 'WEAK', 'NESS']
['SKIN', 'INFECTION']
['ACNE']
['ARI']
['SKIN', 'ALLERGY']
['COLD', 'FEVER', 'BODY', 'PAIN']
['URINE', 'INFECTION', 'LOOSE', 'MOTIONS']
['CUT', 'INJURY']
['INJURY']
['SKIN', 'INFECTION']
['COLD', 'FEVER', 'FOOT', 'SWELLING']
['SKIN', 'ALLERGY']
['VERTIGO']
['BACK', 'PAIN']
['WOUND']
['CHEST', 'SWELLING']
['ITCHING']
['DANDRUFF']
['ITCHING']
['ITCHING']
['ALLERGY']
['RECURRENT', 'art']
['URI']
['clergy']
['ITCHING']
['ALLERGY']
['BLOOD', 'SUGAR', 'MUSCLE', 'PAIN']
['HAND', 'PAIN']
['ABDOMINAL', 'PAIN']
['ITCHING']
['ITCHING']
['ringworm', 'ITCHING']
['ALLERGY']
['tania']
['ALLERGY'

['RINGWOMNS', 'ITCHING']
['WEAKNESS', 'SINCE', 'FOUR', 'DAYS']
['URI']
['tina']
['ALLERGY']
['hen']
['RINGWOMNS', 'ITCHING']
['ALLERGY']
['SKIN', 'ALLERGY']
['RT', 'things', 'PAIN']
['ALLERGY']
['SKIN', 'INFECTION']
['ALLERGY']
['ITCHING']
['INJURY']
['ITCHING']
['WEAKNESS']
['fun', 'EXOGENOUS', 'OCHRONOSIS']
['KNEE', 'JOINT', 'PAIN']
['SKIN', 'INFECTION']
['DRYNESS', 'LOSS', 'OF', 'epitome']
['common', 'COLD', 'RUNNING', 'NOSE']
['MENSTRUAL', 'PROBLEM']
['VERTIGO']
['ITCHING']
['SKIN', 'INFECTION', 'AT', 'LT', 'EYE', 'LID']
['LOWER', 'BACK', 'PAIN']
['WOUND', 'SINCE', 'FIVE', 'DAY', 'weakness']
['hen']
['BACK', 'PAIN']
['ITCHING']
['COLD', 'FEVER', 'COUGH', 'LOOSE', 'MOTION']
['MENTAL', 'RETARD']
['ITCHING']
['BODY', 'PAIN', 'AND', 'WEAKNESS']
['ITCHING']
['DANDRUFF']
['HYPERTHYRODISM']
['ITCHING']
['COLD', 'FEVER']
['headache']
['ITCHING']
['SKIN', 'INFECTION']
['SKIN', 'ITCHING']
['RIGHT', 'HAND', 'PAIN', 'TWO', 'MONTH']
['COLD', 'COUGH']
['HEAD', 'ACHE']
['SKIN', 'RASH', 'SINCE', '

KeyboardInterrupt: 

In [21]:
symptoms = pd.read_csv('SYMPTOM MASTER.csv',encoding = "ISO-8859-1")

In [108]:
symptoms

Unnamed: 0,SYMPTOM_ID,SYMPTOM_NAME,GENDER
0,SYMP04148,Increased Urinary Infection,C
1,SYMP04158,Increased Vocal Resonance,C
2,SYMP00998,indentation on either side of the nasal tip,C
3,SYMP00130,Excessive Burping/ Belching,C
4,SYMP00135,Indigestion,C
5,SYMP03062,Eructation,C
6,SYMP03093,Gas,C
7,SYMP00962,Indurated Lower Lip,C
8,SYMP00803,Induration,C
9,SYMP03045,Dependent Edema,C


In [50]:
symptoms[symptoms.SYMPTOM_ID == 'SYMP00756']

Unnamed: 0,SYMPTOM_ID,SYMPTOM_NAME,GENDER
1557,SYMP00756,Fever,C
