In [None]:
!pip install contractions
!pip install flair
!pip install autocorrect

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from bs4 import BeautifulSoup
import re
import nltk
import seaborn
import matplotlib
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
import unicodedata
import contractions
from flair.models import TextClassifier
from flair.data import Sentence
from autocorrect import Speller

In [None]:
# Load the tab-separated case-study file into a DataFrame.
drug_data = pd.read_csv(
    '../input/case-study-data/case_study_data.tsv',
    sep='\t',
)

In [None]:
# Report (rows, columns), then preview the first five rows.
n_rows, n_cols = drug_data.shape
print((n_rows, n_cols))
drug_data.head(5)

In [None]:
# Count the non-null entries in every column for each drug, to find a drug
# with enough reviews to analyze.
drug_data.groupby(by='drugName').count()

### Drug to Analyze: Sertraline
- Moving forward, the analysis focuses on Sertraline, an SSRI originally developed by Pfizer under the brand name 'Zoloft'.
- The generic form is sold as "Sertraline Hydrochloride" tablets.

In [None]:
# Keep only the Sertraline reviews.
is_sertraline = drug_data['drugName'] == 'Sertraline'
sertraline_data = drug_data[is_sertraline]

# The CSV is written before the index reset, so it keeps the row labels
# of the full dataset.
sertraline_data.to_csv('sertraline_data.csv')

# Renumber rows 0..n-1 so the positional-looking lookups below work.
sertraline_data = sertraline_data.reset_index(drop=True)
print(sertraline_data['review'][3])
sertraline_data.head()

### Data Preprocessing Notes
- Reviews contain contractions such as: I've, didn't, wasn't. These should be expanded.
- Idiosyncratic abbreviations such as: 30's, Dr (instead of doctor).
- Numbers with attached units such as '50mg'.
- British colloquialisms such as "3 stone" (a unit of body weight).

In [None]:
# Define preprocessing function
_STOP_WORD_CACHE = {}  # language name -> frozenset of NLTK stop words, filled on first use


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading the corpus only once."""
    if 'english' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORD_CACHE['english']


def clean_review(raw_review, remove_stopwords=False):
    """Normalize one free-text review into lowercase, space-separated words.

    Steps, in order: decode HTML entities, normalize apostrophes, strip
    non-ASCII characters, expand contractions, replace every non-letter with
    a space, lowercase, drop/repair a few dataset-specific tokens, and
    optionally remove English stop words.

    Parameters
    ----------
    raw_review : str
        The review text. Any non-string value (e.g. ``None`` or a NaN from a
        missing cell) is treated as an empty review.
    remove_stopwords : bool, default False
        When True, words found in NLTK's English stop-word list are removed.

    Returns
    -------
    str
        The cleaned words joined by single spaces ('' if nothing is left).
    """
    import html  # stdlib; imported locally so this cell does not depend on a new top-level import

    if not isinstance(raw_review, str):
        return ''

    # Decode entities such as &#039; and &quot; first; otherwise "I&#039;ve"
    # is split into "i", "ve" by the letter filter and never expanded.
    review_text = html.unescape(raw_review)
    # Map typographic apostrophes to ASCII before the ASCII filter drops them,
    # so the contraction expander can still recognise "I've", "didn't", ...
    review_text = review_text.replace('\u2019', "'").replace('\u2018', "'")
    review_text = unicodedata.normalize('NFKD', review_text).encode('ascii', 'ignore').decode('ascii')
    review_text = contractions.fix(review_text)
    review_text = re.sub(r'[^a-zA-Z]', ' ', review_text).lower()

    # Word-boundary patterns (instead of literal surrounding spaces) also
    # match at the start/end of the text and for adjacent repeats.
    review_text = re.sub(r'\bmg\b', ' ', review_text)        # dosage unit left over from e.g. "50mg"
    review_text = re.sub(r'\bolof\b', 'zoloft', review_text)  # repair the mangled brand name
    review_text = re.sub(r'\bquot\b', ' ', review_text)       # residue of any still-escaped &quot;

    words = review_text.split()  # Tokenize on whitespace
    if remove_stopwords:
        stop_words = _english_stop_words()
        words = [word for word in words if word not in stop_words]
    return ' '.join(words)  # Rejoin from list into passage/string

# Try the cleaner on one review: raw text first, cleaned text second.
sample_review = sertraline_data['review'][2]
print(sample_review + '\n\n')
print(clean_review(sample_review))

In [None]:
# Clean every review for this drug (stop words removed), reporting progress
# after each block of 100 reviews.
raw_reviews = sertraline_data['review']
num_reviews = raw_reviews.size
clean_reviews = []
for position, raw_text in enumerate(raw_reviews, start=1):
    if position % 100 == 0:
        print(f'Review {position} of {num_reviews}\n')
    clean_reviews.append(clean_review(raw_text, remove_stopwords=True))

# Applying Flair Pre-trained model

In [None]:
import flair

# Load the 'en-sentiment' text classifier; TextClassifier is the class
# imported from flair.models at the top of the notebook.
sentiment_model = TextClassifier.load('en-sentiment')

In [None]:
# Sanity check: run the classifier on a sentence with an obvious sentiment.
sentence = Sentence('This drug was very effective')
sentiment_model.predict(sentence)

# Show the labels the classifier attached to the sentence.
predicted_labels = sentence.labels
print('Sentence above is: ', predicted_labels)

In [None]:
# Classify one review from the 'review' column and show its labels.
review_text = sertraline_data['review'][3]
print(review_text)
original_sentence = Sentence(review_text)
sentiment_model.predict(original_sentence)
print('Sentence above is: ', original_sentence.labels)

In [None]:
# Classify the cleaned version of the same review and unpack its top label.
clean_sentence = Sentence(clean_reviews[3])
sentiment_model.predict(clean_sentence)
print('Sentence above is: ', clean_sentence.labels)

top_label = clean_sentence.labels[0]
print(top_label.value)  # predicted class name
print(top_label.score)  # score attached to that prediction

In [None]:
# Predict a sentiment label and score for every cleaned review.
# Both lists stay aligned with clean_reviews (one entry per review).
sentiment = []
confidence = []

for review in clean_reviews:
    sentence = Sentence(review)
    sentiment_model.predict(sentence)
    labels = sentence.labels
    if labels:
        top_label = labels[0]
        sentiment.append(top_label.value)
        confidence.append(top_label.score)
    else:
        # No label was attached, so labels[0] would raise IndexError.
        # NOTE(review): expected for reviews that are empty after cleaning
        # (e.g. only stop words) - confirm against the installed flair version.
        # Record a missing value so the lists keep one entry per review.
        sentiment.append(None)
        confidence.append(float('nan'))

In [None]:
# Attach the cleaned text and the Flair predictions to the Sertraline frame.
# NOTE: plain assignment does not copy - `df` and `sertraline_data` are the
# same object, so 'review' in sertraline_data is overwritten with the cleaned
# text as well.
df = sertraline_data
for column_name, column_values in (
    ('review', clean_reviews),
    ('sentiment', sentiment),
    ('confidence', confidence),
):
    df[column_name] = column_values

In [None]:
# Preview the first reviews that were classified as negative.
df.loc[df['sentiment'] == 'NEGATIVE'].head()

In [None]:
# Export with original reviews for inspection.
# `df` was created by plain assignment from `sertraline_data`, and its
# 'review' column has since been replaced by the cleaned text, so neither
# name can be trusted to still hold the raw text. Re-select the raw reviews
# from the untouched `drug_data` frame; the filter keeps the same row order
# that was used to build the Sertraline subset.
original_reviews = drug_data.loc[drug_data['drugName'] == 'Sertraline', 'review']

# Work on an independent copy so `df` keeps the cleaned text that the
# TF-IDF / word-cloud cells below rely on.
export_df = df.copy()
export_df['review'] = original_reviews.to_numpy()  # positional assignment; the indexes differ
export_df.to_csv('sertraline_flair_analysis.csv')

In [None]:
import matplotlib.pyplot as plt
# Making a pie chart of positive vs. negative sentiment
print(df.sentiment.value_counts())

# value_counts() orders classes by descending frequency, so pairing it with
# hard-coded labels mislabels the wedges whenever positives outnumber
# negatives. Pin the class order so wedge i always matches label/colour i.
sentiment_counts = df['sentiment'].value_counts().reindex(['NEGATIVE', 'POSITIVE'], fill_value=0)

# Build the two-column frame explicitly ('index' = class, 'sentiment' = count).
# A bare value_counts().reset_index() names its columns differently across
# pandas versions (from pandas 2.0 the count column is called 'count').
df_pie = sentiment_counts.rename_axis('index').reset_index(name='sentiment')

fig = plt.gcf()
fig.set_size_inches(7,7)
colors = ["Pink","Green"]
plt.pie(df_pie["sentiment"],labels=("Negative","Positive"),radius=1,autopct="%1.1f%%",
        shadow = True,startangle = 90,labeldistance = 1.1,colors=colors,explode =(0.1,0.1))
plt.axis('equal')
plt.title("Sentiment of Reviews (Flair)", fontsize=20)
plt.show();

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weights (un-normalized) for the reviews classified as positive.
# The reviews are already cleaned, so tokens are obtained by splitting on whitespace.
vectorizer_pos = TfidfVectorizer(tokenizer=lambda y: y.split(), norm=None)
vecs_pos = vectorizer_pos.fit_transform(df[df.sentiment=='POSITIVE'].review)

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back only on releases that predate get_feature_names_out().
if hasattr(vectorizer_pos, 'get_feature_names_out'):
    feature_names_pos = list(vectorizer_pos.get_feature_names_out())
else:
    feature_names_pos = vectorizer_pos.get_feature_names()

dense_pos = vecs_pos.todense()
lst_pos = dense_pos.tolist()
# One row per review, one column per term.
dense_df_pos = pd.DataFrame(lst_pos, columns=feature_names_pos)
# Total TF-IDF weight of each term across all positive reviews.
dense_df_pos.T.sum(axis=1)

In [None]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Size each word by its summed TF-IDF weight over the positive reviews,
# keeping at most 50 words.
term_weights_pos = dense_df_pos.T.sum(axis=1)
cloud_builder = WordCloud(background_color="white", max_words=50)
wordcloud = cloud_builder.generate_from_frequencies(term_weights_pos)

plt.figure(figsize=(12, 10))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.title('Positive Sentiment Word Cloud\n', fontdict={'fontsize': 40})
plt.show()

In [None]:
# Word cloud built from the positive reviews' text, without TF-IDF weighting
positive_no_weight = df[df.sentiment=='POSITIVE'].review

# Flatten the review column into one string of space-separated words.
# A single join avoids repeatedly re-allocating a growing string in a loop.
review_string = ''.join(
    ' '.join(str(review).split()) + ' '
    for review in positive_no_weight
)

wordcloud = WordCloud(background_color="white", max_words=50).generate(review_string)
plt.figure(figsize=(12, 10))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
# Title previously misspelled the acronym as "TFDIF".
plt.title('Positive Sentiment Word Cloud (No TF-IDF)\n', fontdict={'fontsize': 40})
plt.show()

In [None]:
# TF-IDF weights (un-normalized) for the reviews classified as negative.
# The reviews are already cleaned, so tokens are obtained by splitting on whitespace.
vectorizer_neg = TfidfVectorizer(tokenizer=lambda y: y.split(), norm=None)
vecs_neg = vectorizer_neg.fit_transform(df[df.sentiment=='NEGATIVE'].review)

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back only on releases that predate get_feature_names_out().
if hasattr(vectorizer_neg, 'get_feature_names_out'):
    feature_names_neg = list(vectorizer_neg.get_feature_names_out())
else:
    feature_names_neg = vectorizer_neg.get_feature_names()

dense_neg = vecs_neg.todense()
lst_neg = dense_neg.tolist()
# One row per review, one column per term.
dense_df_neg = pd.DataFrame(lst_neg, columns=feature_names_neg)
# Total TF-IDF weight of each term across all negative reviews.
dense_df_neg.T.sum(axis=1)

In [None]:
# Size each word by its summed TF-IDF weight over the negative reviews,
# keeping at most 50 words.
term_weights_neg = dense_df_neg.T.sum(axis=1)
cloud_builder = WordCloud(background_color="white", max_words=50)
wordcloud = cloud_builder.generate_from_frequencies(term_weights_neg)

plt.figure(figsize=(12, 10))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.title('Negative Sentiment Word Cloud\n', fontdict={'fontsize': 40})
plt.show()