In [93]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display, HTML, Markdown, Latex
import plotly.express as px
import plotly.offline as pyo
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
cf.go_offline()
import warnings

## Library for more complex plots
import plotly.figure_factory as ff

## For subplots
from plotly import subplots 

import cufflinks as cf
%matplotlib inline

display(HTML("""
<style>
.output {
    display: flex;
    align-items: center;
    text-align: center;
    
}
div.output_subarea{
    max-width:1200px;
}
div.text_cell_render{
padding: 5em 5em 0.5em 0.5em
}

</style>
"""))

# [Drug Review Dataset](https://archive.ics.uci.edu/ml/datasets/Drug+Review+Dataset+%28Drugs.com%29)

##### Please click the link above for the original source of the dataset.

- The dataset consists of six columns 
    1. Drug Name
    2. Condition (purpose of drug)
    3. Review (text review)
    4. Date of review
    5. Useful Count: number of users that reported the review as "useful."
    6. text_len: <strong>I generated a new column that is simply the length of the review<strong> 
    
### Purpose: The dataset is quite massive so after basic exploration, I will focus on the condition of Acne and generate a model that predicts rating from a review. This model could be applied to a situation where a user has given a review but no rating. For example, in a pharmacy a customer could quickly tell a pharmacist or doctor their opinion of a drug, which then through some speech to text software could be passed into my model quickly generating a new and easily consumable insight for a firm. It could also be applied to another dataset where reviews already exist however, users have not provided a rating, which increases the value of said review. 

In [2]:
df = pd.read_csv('data/drugsComTrain_raw.tsv', sep='\t').iloc[:,1:]
df['text_len'] = df['review'].apply(len)
df['rating'] = df['rating'].apply(int)

### Below are some high level diagnostics of the dataset
- The data set consists of over 161,000 entries
- The mean rating is around 7 out of 10
- The mean length for a review is about 458 characters
- There are 3436 unique drugs and 884 unique conditions

In [3]:
df.head()

Unnamed: 0,drugName,condition,review,rating,date,usefulCount,text_len
0,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9,"May 20, 2012",27,79
1,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,"April 27, 2010",192,741
2,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,"December 14, 2009",17,755
3,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8,"November 3, 2015",10,448
4,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9,"November 27, 2016",37,719


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 161297 entries, 0 to 161296
Data columns (total 7 columns):
drugName       161297 non-null object
condition      160398 non-null object
review         161297 non-null object
rating         161297 non-null int64
date           161297 non-null object
usefulCount    161297 non-null int64
text_len       161297 non-null int64
dtypes: int64(3), object(4)
memory usage: 8.6+ MB


In [5]:
df.describe()

Unnamed: 0,rating,usefulCount,text_len
count,161297.0,161297.0,161297.0
mean,6.994377,28.004755,458.726238
std,3.272329,36.403742,242.130235
min,1.0,0.0,3.0
25%,5.0,6.0,262.0
50%,8.0,16.0,455.0
75%,10.0,36.0,691.0
max,10.0,1291.0,10787.0


In [6]:
pd.DataFrame(df[['drugName', 'condition']].nunique(), columns=['Unique Count'])

Unnamed: 0,Unique Count
drugName,3436
condition,884


### Below is a snippet of the longest review in the training set, which is over 10,000 characters. This review for Venlafaxine is clearly an outlier.

In [7]:
display(HTML(  (df[df['text_len']>10000]['review'].values[0][:500]) )) 

### I wanted to also explore if the drug in question (Venlafaxine) had higher than average review length. The filtered summary for Venlafaxine shows that the average reviews are about 500 characters, which is quite close to the average review length on the whole training set. 

In [8]:
df[df['drugName']=='Venlafaxine'].describe()

Unnamed: 0,rating,usefulCount,text_len
count,1016.0,1016.0,1016.0
mean,6.800197,33.494094,499.034449
std,3.239905,35.465378,406.551095
min,1.0,0.0,11.0
25%,4.0,9.0,295.0
50%,8.0,22.0,506.0
75%,10.0,45.25,731.25
max,10.0,274.0,10787.0


### Like the very long review above, there 168 reviews that are under 10 characters. Most of these reviews are just one word such as "Great," and so do not offer enough information for meaningful analysis. 

In [9]:
df[df['text_len']<10].describe()

Unnamed: 0,rating,usefulCount,text_len
count,168.0,168.0,168.0
mean,8.720238,13.833333,6.845238
std,2.196692,14.359652,1.353585
min,1.0,0.0,3.0
25%,8.0,4.0,6.0
50%,10.0,9.0,7.0
75%,10.0,18.0,8.0
max,10.0,71.0,9.0


In [10]:
df[df['text_len']<10].head()

Unnamed: 0,drugName,condition,review,rating,date,usefulCount,text_len
1090,Amphetamine / dextroamphetamine,ADHD,"""Great""",10,"October 20, 2009",3,7
1914,Alesse,Birth Control,"""Awesome""",10,"November 23, 2015",0,9
2193,Rivaroxaban,Deep Vein Thrombosis,"""Good""",9,"December 8, 2013",11,6
3090,Mysoline,Seizures,"""Great!""",10,"August 26, 2009",21,8
5155,Septra,Bronchitis,"""Great!""",8,"October 18, 2013",3,8


### Below are the top ten drugs with the most ratings. Contraceptive drugs, obesity drugs, and mental health drugs dominate this list

In [11]:
# pd.crosstab(df['drugName'], df['rating'])
Drugs_rating = pd.DataFrame(df.groupby('drugName')['rating'].count().sort_values(ascending=False))

Drugs_rating.head(10)

Unnamed: 0_level_0,rating
drugName,Unnamed: 1_level_1
Levonorgestrel,3657
Etonogestrel,3336
Ethinyl estradiol / norethindrone,2850
Nexplanon,2156
Ethinyl estradiol / norgestimate,2117
Ethinyl estradiol / levonorgestrel,1888
Phentermine,1543
Sertraline,1360
Escitalopram,1292
Mirena,1242


### Below are the top conditions based on number of ratings

In [12]:
Drugs_condition = pd.DataFrame(df.groupby('condition')['rating'].count().sort_values(ascending=False))
Drugs_condition.head(10)

Unnamed: 0_level_0,rating
condition,Unnamed: 1_level_1
Birth Control,28788
Depression,9069
Pain,6145
Anxiety,5904
Acne,5588
Bipolar Disorde,4224
Insomnia,3673
Weight Loss,3609
Obesity,3568
ADHD,3383


### This dataset is so massive that I will focus on a single condition, such as Acne, since it is one of the top ten that I have experienced. 

- The acne only dataset shows that the drug with the most 10 star ratings is Isotretinoin, which is the generic for the number 2 drug, Accutane. 
- One of the issues in this dataset is that generic and brand name counterparts appear as different drugs even though the active ingredient is the same. 

In [13]:
acne = df[df['condition']=='Acne']

In [14]:
pd.crosstab(acne['drugName'], df['rating']).sort_values(by=10, ascending=False).head(10)

rating,1,2,3,4,5,6,7,8,9,10
drugName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Isotretinoin,28,4,10,8,14,12,32,47,127,235
Accutane,20,3,6,4,10,7,18,18,75,160
Adapalene / benzoyl peroxide,28,14,14,10,23,13,23,54,118,100
Epiduo,34,14,18,10,20,16,23,53,108,98
Benzoyl peroxide / clindamycin,16,14,5,6,10,9,17,24,69,92
Doxycycline,17,13,18,12,16,19,35,59,91,81
Tretinoin,12,5,10,7,8,8,11,30,58,79
Adapalene,21,7,4,4,4,6,12,23,32,63
Spironolactone,13,5,9,1,6,10,12,16,43,63
Dapsone,20,7,4,2,3,2,0,13,26,50


### Visually, it does not appear as if there is much of a difference between rating and review length. 

In [20]:
acne.pivot_table(index=acne.index,values='text_len', columns='rating').iplot(kind='box',
                                                                             title='Rating vs. Review length',
                                                                              xTitle='Rating',
                                                                              yTitle='Review length')


### It is possible that as useful counts increase, so does rating. However, the relationship appears to be quite weak.

In [36]:
acne.pivot_table(index=acne.index,values='usefulCount', columns='rating').iplot(kind='box',
                                                                             title='Rating vs. Useful Count',
                                                                              xTitle='Rating',
                                                                              yTitle='Useful Count')


### The bar chart below shows that most ratings are at least 9 out of 10. 

In [38]:
acne.groupby('rating').count().iloc[:,0].iplot(kind='bar', yTitle='Count', xTitle='Rating')

In [46]:
def rating_convertion(rating):
    if rating < 5:
        return 0
    else: 
        return 1


acne['simple_rating'] = acne['rating'].apply(rating_convertion)

### Reviews less than five are classified as 0 and those 5 or over are classified as 1.

In [52]:
acne.groupby('simple_rating').describe().stack()

Unnamed: 0_level_0,Unnamed: 1_level_0,rating,usefulCount,text_len
simple_rating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,count,1129.0,1129.0,1129.0
0,mean,1.979628,7.994686,471.147919
0,std,1.089331,8.90991,244.997167
0,min,1.0,0.0,14.0
0,25%,1.0,2.0,279.0
0,50%,2.0,5.0,460.0
0,75%,3.0,10.0,694.0
0,max,4.0,66.0,3651.0
1,count,4459.0,4459.0,4459.0
1,mean,8.740076,17.837183,535.277416


### NLP classification
- Now I will try to see if I can use the text from the reviews to predict is a review will be greater than or equal to 5 stars or less than five stars. 
- First I will split the dataset 70-30 into a training and test set.
- Next I will clean the text by stripping all accents and removing all english stopwords (stopwords such as "and," "then," etc.)

In [53]:
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.corpus import stopwords
import string

In [77]:
def cleaner(mess):
    '''
    Takes in a string of text, then removes all punctuation, removes stopwords, and stems
    '''
    nopunc = mess.translate(str.maketrans("","",string.punctuation))
    
    stemmed = PorterStemmer().stem(nopunc)
        
    stops = stopwords.words('english')
    
    return [word for word in stemmed.split() if word.lower() not in stops]

In [68]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

In [62]:
X_train, X_test, y_train, y_test = train_test_split(acne['review'] , acne['simple_rating'], test_size=0.3, random_state=101)


In [86]:
pipeline = Pipeline([
    ('bow', CountVectorizer(strip_accents='ascii',stop_words='english')),
#     ('tfidf', TfidfTransformer()),
    ('classifier', RandomForestClassifier(n_estimators=200))
])

In [87]:
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('bow', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        s...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

### I used a Count Vectorizor method that essentially generates a giant sparse matrix based on each word count. The final modeling technique uses a Random Forest Classifier. The results are quite promising. On the test set, the model has a very high level of precision for both of the rating categories defined earlier.
- Areas of improvement: The recall score, which is basically a model's pickiness, is not very good for scores that are less than five stars. Ideally, I want high precision and high recall. Also, this True Positive, True Negative, False Positive, and False Negative tradeoff is further detailed in the lower data frame.

In [88]:
pipeline_preds = pipeline.predict(X_test)
print(classification_report(y_test, pipeline_preds))
pd.DataFrame(confusion_matrix(y_test, pipeline_preds), columns=['TN', 'TP'], index=['TN', 'TP'])

              precision    recall  f1-score   support

           0       0.99      0.45      0.62       366
           1       0.87      1.00      0.93      1311

   micro avg       0.88      0.88      0.88      1677
   macro avg       0.93      0.72      0.77      1677
weighted avg       0.89      0.88      0.86      1677



Unnamed: 0,TN,TP
TN,163,203
TP,1,1310


### I also tried to use a Naive-Bayes classifier technique, however, it does not improve my results from the RFC method. 

In [79]:
pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=cleaner)),
#     ('tfidf', TfidfTransformer()),
    ('classifier', MultinomialNB())
])

pipeline.fit(X_train, y_train)

pipeline_preds = pipeline.predict(X_test)
print(classification_report(y_test, pipeline_preds))
pd.DataFrame(confusion_matrix(y_test, pipeline_preds), columns=['TN', 'TP'], index=['TN', 'TP'])

              precision    recall  f1-score   support

           0       0.78      0.44      0.57       366
           1       0.86      0.97      0.91      1311

   micro avg       0.85      0.85      0.85      1677
   macro avg       0.82      0.70      0.74      1677
weighted avg       0.84      0.85      0.84      1677



Unnamed: 0,TN,TP
TN,162,204
TP,45,1266
