# Data Preparation and Explorative Analysis Workbook

## Import libraries and modules

In [1]:
import pandas as pd
import html
import json
import re
from sklearn.model_selection import train_test_split

## 1 - Data Cleaning and Preprocessing

In [2]:
# Read in the training data
train_df = pd.read_json('./data/lewtun-drug-reviews/train.jsonl', lines=True)
train_df.sample(5)

Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount
111681,204986,Toradol,Pain,"""Lower back was out to the point of 98% incapa...",10,2013-07-22,47
4203,115655,Cefprozil,Bronchitis,"""I used this to treat bronchitis and whooping ...",8,2010-07-21,16
62426,209544,Praluent,"High Cholesterol, Familial Heterozygous","""I have genetic heterozygous familial hyperlip...",9,2017-08-02,3
94936,66612,Toviaz,Overactive Bladde,"""My 10 year old daughter with spina bifida use...",8,2013-03-24,18
91100,230633,Depo-Provera,Endometriosis,"""I have taken Depo-Provera now for 4 years. It...",5,2009-09-26,12


In [3]:
# Check for missing values
train_df.describe()

Unnamed: 0.1,Unnamed: 0,rating,date,usefulCount
count,161297.0,161297.0,161297,161297.0
mean,115923.585305,6.994377,2014-06-11 08:51:43.779983360,28.004755
min,2.0,1.0,2008-02-24 00:00:00,0.0
25%,58063.0,5.0,2012-04-12 00:00:00,6.0
50%,115744.0,8.0,2015-06-09 00:00:00,16.0
75%,173776.0,10.0,2016-08-19 00:00:00,36.0
max,232291.0,10.0,2017-12-12 00:00:00,1291.0
std,67004.44517,3.272329,,36.403742


In [4]:
# Check for duplicated id
train_df.duplicated(subset=['Unnamed: 0']).sum()

0

In [5]:
train_df.loc[train_df.review.str.contains('&#039;')]

Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,2009-12-14,17
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8,2015-11-03,10
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9,2016-11-27,37
5,155963,Cialis,Benign Prostatic Hyperplasia,"""2nd day on 5mg started to work with rock hard...",2,2015-11-28,43
6,165907,Levonorgestrel,Emergency Contraception,"""He pulled out, but he cummed a bit in me. I t...",1,2017-03-07,5
...,...,...,...,...,...,...,...
161287,132177,Ativan,Anxiety,"""I was super against taking medication. I&#039...",9,2016-08-16,61
161289,105263,Carbamazepine,Trigeminal Neuralgia,"""Up to 800mg seems to work about once every 2n...",1,2016-01-31,10
161291,164345,Junel 1.5 / 30,Birth Control,"""This would be my second month on Junel. I&#03...",6,2015-05-27,0
161295,47128,Thyroid desiccated,Underactive Thyroid,"""I&#039;ve been on thyroid medication 49 years...",10,2015-09-19,79


In [6]:
# Decode HTML entities back to original characters
train_df['review'] = train_df['review'].apply(html.unescape)
print(train_df.iloc[161287].review)

train_df.iloc[1].review[659:670]

"I was super against taking medication. I've started dealing with anxiety (some depression) for awhile now. I got prescribed Ativan months ago, and never took it. I thought I could handle myself again but then my relationship started to fail because of my over thinking. I went back to the doctor, and decided to try it. She prescribed me .50 3 times a day. I took a full one at night after a bad panic attack, and within 30 minutes I was calm and slept through the entire night. I have only been taking a half morning and mid afternoon and full at night because it does make you feel tired. My brain is in a slight fog, but nothing crazy, I still know whats going on though :) Give it a try if you are suffering. You will know after 1."


'\r\nWe have t'

In [7]:
def correct_excape(row):
    row['review'] = re.sub(r'(?<!\\)([\r\n\t])', '\\1', row['review'])
    return row


train_df = train_df.apply(correct_excape, axis=1)

In [8]:

json.loads(train_df.iloc[161287].review)
#json.loads(train_df.iloc[1].review)
train_df.iloc[1].review[659:670]

'\r\nWe have t'