In [238]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler
import html
import string

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Ensure the necessary NLTK resources are available
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/pranjalrane/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/pranjalrane/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/pranjalrane/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Disease Symptom Dataset

In [239]:
data_path = '../Data/Raw Data/Disease_symptom_and_patient_profile_dataset.csv'
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,Disease,Fever,Cough,Fatigue,Difficulty Breathing,Age,Gender,Blood Pressure,Cholesterol Level,Outcome Variable
0,Influenza,Yes,No,Yes,Yes,19,Female,Low,Normal,Positive
1,Common Cold,No,Yes,Yes,No,25,Female,Normal,Normal,Negative
2,Eczema,No,Yes,Yes,No,25,Female,Normal,Normal,Negative
3,Asthma,Yes,Yes,No,Yes,25,Male,Normal,Normal,Positive
4,Asthma,Yes,Yes,No,Yes,25,Male,Normal,Normal,Positive


In [240]:
df.isnull().sum()

Disease                 0
Fever                   0
Cough                   0
Fatigue                 0
Difficulty Breathing    0
Age                     0
Gender                  0
Blood Pressure          0
Cholesterol Level       0
Outcome Variable        0
dtype: int64

In [241]:
df['Fever'] = df['Fever'].str.lower()
df['Cough'] = df['Cough'].str.lower()
df['Fatigue'] = df['Fatigue'].str.lower()
df['Difficulty Breathing'] = df['Difficulty Breathing'].str.lower()
df['Gender'] = df['Gender'].str.lower()
df['Blood Pressure'] = df['Blood Pressure'].str.lower()
df['Cholesterol Level'] = df['Cholesterol Level'].str.lower()
df['Outcome Variable'] = df['Outcome Variable'].str.lower()

In [242]:
binary_columns = ['Fever', 'Cough', 'Fatigue', 'Difficulty Breathing']
label_encoder = LabelEncoder()
for col in binary_columns:
    df[col] = label_encoder.fit_transform(df[col])
df.head()

Unnamed: 0,Disease,Fever,Cough,Fatigue,Difficulty Breathing,Age,Gender,Blood Pressure,Cholesterol Level,Outcome Variable
0,Influenza,1,0,1,1,19,female,low,normal,positive
1,Common Cold,0,1,1,0,25,female,normal,normal,negative
2,Eczema,0,1,1,0,25,female,normal,normal,negative
3,Asthma,1,1,0,1,25,male,normal,normal,positive
4,Asthma,1,1,0,1,25,male,normal,normal,positive


In [243]:
one_hot_columns = ['Gender', 'Blood Pressure', 'Cholesterol Level']
df = pd.get_dummies(df, columns=one_hot_columns)
df.head()

Unnamed: 0,Disease,Fever,Cough,Fatigue,Difficulty Breathing,Age,Outcome Variable,Gender_female,Gender_male,Blood Pressure_high,Blood Pressure_low,Blood Pressure_normal,Cholesterol Level_high,Cholesterol Level_low,Cholesterol Level_normal
0,Influenza,1,0,1,1,19,positive,True,False,False,True,False,False,False,True
1,Common Cold,0,1,1,0,25,negative,True,False,False,False,True,False,False,True
2,Eczema,0,1,1,0,25,negative,True,False,False,False,True,False,False,True
3,Asthma,1,1,0,1,25,positive,False,True,False,False,True,False,False,True
4,Asthma,1,1,0,1,25,positive,False,True,False,False,True,False,False,True


In [244]:
scaler = StandardScaler()
df[['Age']] = scaler.fit_transform(df[['Age']])
df.head()

Unnamed: 0,Disease,Fever,Cough,Fatigue,Difficulty Breathing,Age,Outcome Variable,Gender_female,Gender_male,Blood Pressure_high,Blood Pressure_low,Blood Pressure_normal,Cholesterol Level_high,Cholesterol Level_low,Cholesterol Level_normal
0,Influenza,1,0,1,1,-2.09116,positive,True,False,False,True,False,False,False,True
1,Common Cold,0,1,1,0,-1.631964,negative,True,False,False,False,True,False,False,True
2,Eczema,0,1,1,0,-1.631964,negative,True,False,False,False,True,False,False,True
3,Asthma,1,1,0,1,-1.631964,positive,False,True,False,False,True,False,False,True
4,Asthma,1,1,0,1,-1.631964,positive,False,True,False,False,True,False,False,True


In [245]:
min_max_scaler = MinMaxScaler()
df[['Age']] = min_max_scaler.fit_transform(df[['Age']])
df.head()

Unnamed: 0,Disease,Fever,Cough,Fatigue,Difficulty Breathing,Age,Outcome Variable,Gender_female,Gender_male,Blood Pressure_high,Blood Pressure_low,Blood Pressure_normal,Cholesterol Level_high,Cholesterol Level_low,Cholesterol Level_normal
0,Influenza,1,0,1,1,0.0,positive,True,False,False,True,False,False,False,True
1,Common Cold,0,1,1,0,0.084507,negative,True,False,False,False,True,False,False,True
2,Eczema,0,1,1,0,0.084507,negative,True,False,False,False,True,False,False,True
3,Asthma,1,1,0,1,0.084507,positive,False,True,False,False,True,False,False,True
4,Asthma,1,1,0,1,0.084507,positive,False,True,False,False,True,False,False,True


In [246]:
cleaned_data_path = '../Data/Processed Data/Cleaned_Disease_Symptom_and_Patient_Profile_Dataset.csv'
df.to_csv(cleaned_data_path, index=False)


In [247]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 349 entries, 0 to 348
Data columns (total 15 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Disease                   349 non-null    object 
 1   Fever                     349 non-null    int64  
 2   Cough                     349 non-null    int64  
 3   Fatigue                   349 non-null    int64  
 4   Difficulty Breathing      349 non-null    int64  
 5   Age                       349 non-null    float64
 6   Outcome Variable          349 non-null    object 
 7   Gender_female             349 non-null    bool   
 8   Gender_male               349 non-null    bool   
 9   Blood Pressure_high       349 non-null    bool   
 10  Blood Pressure_low        349 non-null    bool   
 11  Blood Pressure_normal     349 non-null    bool   
 12  Cholesterol Level_high    349 non-null    bool   
 13  Cholesterol Level_low     349 non-null    bool   
 14  Cholestero

### Drug Review Dataset

In [248]:
data_path = '../Data/Raw Data/drugsComTrain_raw.csv'
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9,20-May-12,27
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,27-Apr-10,192
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,14-Dec-09,17
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8,3-Nov-15,10
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9,27-Nov-16,37


In [249]:
df['review'] = df['review'].apply(html.unescape)
df.head()

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9,20-May-12,27
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,27-Apr-10,192
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,14-Dec-09,17
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8,3-Nov-15,10
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9,27-Nov-16,37


In [250]:
df['date'] = pd.to_datetime(df['date'], format='%d-%b-%y')
df.head()

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9,2012-05-20,27
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,2010-04-27,192
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,2009-12-14,17
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8,2015-11-03,10
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9,2016-11-27,37


In [251]:
df.isnull().sum()

uniqueID         0
drugName         0
condition      899
review           0
rating           0
date             0
usefulCount      0
dtype: int64

In [252]:
df['drugName'] = df['drugName'].str.lower()
df['condition'] = df['condition'].str.lower()
df.head()

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount
0,206461,valsartan,left ventricular dysfunction,"""It has no side effect, I take it in combinati...",9,2012-05-20,27
1,95260,guanfacine,adhd,"""My son is halfway through his fourth week of ...",8,2010-04-27,192
2,92703,lybrel,birth control,"""I used to take another oral contraceptive, wh...",5,2009-12-14,17
3,138000,ortho evra,birth control,"""This is my first time using any form of birth...",8,2015-11-03,10
4,35696,buprenorphine / naloxone,opiate dependence,"""Suboxone has completely turned my life around...",9,2016-11-27,37


In [253]:

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    words = word_tokenize(text)
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(words)

In [254]:
df['review'] = df['review'].apply(preprocess_text)


In [255]:
df = df.dropna(subset=['condition'])

In [256]:
df = df.drop(columns=['uniqueID', 'date'])
df['review'] = df['review'].astype(str)
df = df[df['review'] != '']
df

Unnamed: 0,drugName,condition,review,rating,usefulCount
0,valsartan,left ventricular dysfunction,side effect take combination bystolic 5 mg fis...,9,27
1,guanfacine,adhd,son halfway fourth week intuniv became concern...,8,192
2,lybrel,birth control,used take another oral contraceptive 21 pill c...,5,17
3,ortho evra,birth control,first time using form birth control im glad we...,8,10
4,buprenorphine / naloxone,opiate dependence,suboxone completely turned life around feel he...,9,37
...,...,...,...,...,...
161292,campral,alcohol dependence,wrote first report midoctober 2014 alcohol sin...,10,125
161293,metoclopramide,nausea/vomiting,given iv surgey immediately became anxious cou...,1,34
161294,orencia,rheumatoid arthritis,limited improvement 4 month developed bad rash...,2,35
161295,thyroid desiccated,underactive thyroid,ive thyroid medication 49 year spent first 38 ...,10,79


In [257]:
empty_strings_count = df['review'].apply(lambda x: x == '').sum()
print(f"Number of empty strings in 'review' column: {empty_strings_count}")

# Check for non-string types
non_string_count = df['review'].apply(lambda x: not isinstance(x, str)).sum()
print(f"Number of non-string types in 'review' column: {non_string_count}")

Number of empty strings in 'review' column: 0
Number of non-string types in 'review' column: 0


In [258]:
cleaned_data_path = '../Data/Processed Data/Cleaned_Drug_Review_Dataset_Train.csv'
df.to_csv(cleaned_data_path, index=False)

In [259]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 160392 entries, 0 to 161296
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   drugName     160392 non-null  object
 1   condition    160392 non-null  object
 2   review       160392 non-null  object
 3   rating       160392 non-null  int64 
 4   usefulCount  160392 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 7.3+ MB
