In [1]:
# imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# load data
dataset_path = '/kaggle/input/psychiatric-drug-webmd-reviews/'

reviews_file = dataset_path + 'psychiatric_drug_webmd_reviews.csv'
df = pd.read_csv(reviews_file, index_col=0)

col_descripts_file = dataset_path + 'column_descriptions.csv'
col_descripts = pd.read_csv(col_descripts_file)

In [3]:
# description of columns

from pandas import option_context
from IPython.display import display

with option_context('display.max_colwidth', None, 'display.colheader_justify', 'left'):
    to_display = col_descripts.style.set_properties(**{'text-align': 'left'})
    display(to_display)

Unnamed: 0,column_name,description
0,drug_name,Name of medication being reviewed
1,date,Date of review
2,age,"Age group of patient, selected from a dropdown list"
3,gender,"Gender of patient, selected from a dropdown list"
4,time_on_drug,"How long the medication has been taken, selected from a dropdown list"
5,reviewer_type,"The type of person reviewing, patient or caregiver"
6,condition,"The ""main reason for taking this medication,"" selected from a dropdown list"
7,rating_overall,"Overall rating for the drug, the average of rating_effectiveness, rating_ease_of_use, and rating_satisfaction, from 1 to 5"
8,rating_effectiveness,"Agreement with the statement ""This drug has worked for me,"" from 1 to 5 stars"
9,rating_ease_of_use,"Agreement with the statement ""This medication has been easy for me to use,"" from 1 to 5 stars"


In [4]:
# total number of reviews available
len(df)

61320

In [5]:
# example of some of the records in the dataset
df.head(5)

Unnamed: 0,drug_name,date,age,gender,time_on_drug,reviewer_type,condition,rating_overall,rating_effectiveness,rating_ease_of_use,rating_satisfaction,text
0,Sertraline Oral,5/12/2024,45-54,Female,1 to less than 2 years,Patient,Posttraumatic Stress Syndrome,5.0,5,5,5,It's almost two years now and it has been so e...
1,Sertraline Oral,4/21/2024,35-44,Female,less than 1 month,Patient,Depression,1.0,1,1,1,Iv been on this 4 weeks and iv never felt so a...
2,Sertraline Oral,4/16/2024,25-34,Female,2 to less than 5 years,Patient,Repeated Episodes of Anxiety,4.3,4,4,5,Been taking sertraline for past 3 years. Had n...
3,Sertraline Oral,4/11/2024,45-54,Male,less than 1 month,Patient,Panic Disorder,1.7,1,3,1,"Of course, take this with a pinch of salt beca..."
4,Sertraline Oral,4/8/2024,13-18,Female,,Patient,Major Depressive Disorder,3.0,2,4,3,used for a while saw no effects.


In [6]:
# medications included in the dataset and links to their reviews

with option_context('display.max_colwidth', None, 'display.colheader_justify', 'left'):
    to_display = drugs.style.set_properties(**{'text-align': 'left'})
    display(to_display)

NameError: name 'drugs' is not defined

In [None]:
# possible values for reviewer type: patient, caregiver, or empty
for x in pd.unique(df['reviewer_type']):
    print(x)

In [None]:
# count number of reviews for each drug - plot only the top n
fig, ax = plt.subplots(figsize=(15, 5))
n_top = 50
order = df['drug_name'].value_counts(ascending=False)[:50]
sns.countplot(df, x='drug_name', order=order.index)
plt.xticks(rotation=-45, ha='left')
ax.bar_label(container=ax.containers[0], labels=order, rotation=-90)
plt.title('Number of Reviews for Each Drug')
plt.ylim(0, 5500);

In [None]:
# number of drugs with at least n reviews
counts = df['drug_name'].value_counts()
print(f'{np.sum(counts >= 100)} drugs with at least 100 reviews')
print(f'{np.sum(counts >= 50)} drugs with at least 50 reviews')

In [None]:
bins = 10
bin_width = (5-1)/bins
ax = sns.displot(df, x='rating_overall', bins=bins)
plt.xticks(np.arange(1, 5+bin_width, bin_width));

In [None]:
sns.displot(df, x='rating_effectiveness', bins=5);

In [None]:
sns.displot(df, x='rating_ease_of_use', bins=5);

In [None]:
sns.displot(df, x='rating_satisfaction', bins=5)

In [None]:
vals = np.array([x for x in pd.unique(df['age']) if type(x)==str])
first_numbers = [int(x.split('-')[0]) for x in vals]
order_inds = np.argsort(first_numbers)
order = vals[order_inds]
sns.countplot(df, x='age', order=order)

In [None]:
fig, ax = plt.subplots(figsize=(15, 5))
order = df['condition'].value_counts(ascending=False)
top_n = 15
top_conds = order[:top_n].index
sns.countplot(df[[x in top_conds for x in df['condition']]], x='condition', order=top_conds)
plt.xticks(rotation=-45, ha='left');

## Suggestions for Data Cleaning

In [None]:
# identify entries with NAs and remove or interpolate values depending on the analysis that you are doing
df.isna().sum()

In [None]:
# inspect the records where patient is very young
# it is possible that some of these are user input errors
# but there are also entries where age, time on drug, and reviewer type are consistent
df[[x == '0-2' for x in df['age']]]

In [None]:
df[[x == '3-6' for x in df['age']]]

In [None]:
# drop NAs

In [None]:
# identify nonsense reviews - for example, discard reviews where > a certain percentage of words are misspelled