In [None]:
!pip install contractions
!pip install flair
!pip install autocorrect

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from bs4 import BeautifulSoup
import re
import nltk
import seaborn
import matplotlib
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
import unicodedata
import contractions
from flair.models import TextClassifier
from flair.data import Sentence
from autocorrect import Speller

In [None]:
# Load the tab-separated case-study file into a DataFrame (path is relative to the notebook's working directory).
drug_data = pd.read_csv('../input/case-study-data/case_study_data.tsv', sep='\t')

In [None]:
# Print (rows, columns), then show the first rows as the cell's displayed output.
print(drug_data.shape)
drug_data.head()

In [None]:
# Test size of each drug to get one with a lot of entries
# groupby(...).count() reports, for every drug name, the number of non-null values in each remaining column.
drug_data.groupby('drugName').count()

### Drug to Analyze: Sertraline
- Going forward, the analysis focuses on Sertraline, an SSRI originally developed by Pfizer under the brand name 'Zoloft'.
- The generic form is sold as "Sertraline Hydrochloride" tablets.

In [None]:
# Keep only the rows whose drugName is exactly 'Sertraline'.
sertraline_data = drug_data[drug_data['drugName']=='Sertraline']
# The CSV is written before re-indexing, so the file carries the row labels of the full dataset.
sertraline_data.to_csv('sertraline_data.csv')
# Renumber rows from 0 so later cells can fetch reviews with integer labels (e.g. ['review'][3]).
sertraline_data = sertraline_data.reset_index(drop=True)
# Spot-check the raw text of one review, then preview the frame as the cell output.
print(sertraline_data['review'][3])
sertraline_data.head()

### Data Preprocessing Notes
- Reviews contain contractions such as: I've, didn't, wasn't. Should be expanded.
- Idiosyncratic abbreviations such as: 30's, Dr (instead of doctor)
- Numbers with units such as '50mg'
- British colloquialisms such as "3 stone"

In [None]:
# Define preprocessing function
def clean_review(raw_review, remove_stopwords=False):
    """Normalize one raw review into a lowercase, space-separated string of words.

    Processing order: drop non-ASCII characters, expand contractions, replace
    every non-letter with a space, lowercase, tokenize, then filter/repair
    individual tokens.

    HTML stripping (BeautifulSoup) and spell correction (autocorrect) are not
    applied here; they were disabled in the original implementation as well.

    Args:
        raw_review: Review text as a ``str``.
        remove_stopwords: When True, NLTK English stop words are removed.

    Returns:
        The cleaned review as one string with tokens separated by single spaces.
    """
    # NFKD-decompose, then drop anything that cannot be represented in ASCII.
    review_text = unicodedata.normalize('NFKD', raw_review).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    review_text = contractions.fix(review_text)
    # Keep letters only; digits and punctuation become token separators.
    review_text = re.sub('[^a-zA-Z]', ' ', review_text).lower()

    # Token-level filtering replaces the former ' mg ' / ' quot ' / ' olof '
    # substitutions, which required a space on both sides and therefore missed
    # tokens at the very start or end of the text and every second one of
    # consecutive occurrences (e.g. 'mg mg').
    dropped_tokens = {'mg', 'quot'}   # dosage unit; 'quot' is presumably residue of the HTML entity &quot;
    repaired_tokens = {'olof': 'zoloft'}
    words = [
        repaired_tokens.get(word, word)
        for word in review_text.split()
        if word not in dropped_tokens
    ]

    if remove_stopwords:
        stop_words = set(stopwords.words('english'))  # set for O(1) membership checks
        words = [word for word in words if word not in stop_words]
    return ' '.join(words)

# Test the method
# Print review 2 as-is, two blank lines, then its cleaned form for a visual before/after comparison.
print(sertraline_data['review'][2]+'\n\n')
print(clean_review(sertraline_data['review'][2]))

In [None]:
# Clean every review for this drug (stop words removed), collecting the results in clean_reviews.
# A progress line is printed after each block of 100 reviews.
num_reviews = sertraline_data['review'].size
clean_reviews = []
for i in range(num_reviews):
    processed_so_far = i + 1
    if processed_so_far % 100 == 0:
        print(f'Review {processed_so_far} of {num_reviews}\n')
    cleaned_text = clean_review(sertraline_data['review'][i], remove_stopwords=True)
    clean_reviews.append(cleaned_text)

## Model 1 - Use Word2Vec and K Means Clustering on Unlabeled Data
- Preprocess data using Regex and contractions
- Create word embeddings using word2vec
- Use clustering since data does not have labels

In [None]:
import nltk.data
from gensim.models import word2vec

**Alternative Approach: Word Vectors**
Use Google's Word2Vec
* Define a new preprocessing function which OPTIONALLY removes stop words
* Non-letter characters are replaced with spaces (regex `[^a-zA-Z]`), so only alphabetic characters are kept; digits, punctuation and underscores are dropped
* This function returns a list of words rather than a sentence string as we did for the bag-of-words approach

In [None]:
def review_to_wordlist(review, remove_stopwords = False):
    """Convert a piece of review text into a list of lowercase word tokens.

    Processing order: drop non-ASCII characters, expand contractions, replace
    every non-letter with a space, lowercase, tokenize, then filter/repair
    individual tokens.

    HTML stripping (BeautifulSoup) and spell correction (autocorrect) are not
    applied here; they were disabled in the original implementation as well.

    Args:
        review: Text to tokenize, as a ``str``.
        remove_stopwords: When True, NLTK English stop words are removed.

    Returns:
        A list of word strings (possibly empty).
    """
    # NFKD-decompose, then drop anything that cannot be represented in ASCII.
    review_text = unicodedata.normalize('NFKD', review).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    review_text = contractions.fix(review_text)
    # Keep letters only; digits and punctuation become token separators.
    review_text = re.sub('[^a-zA-Z]', ' ', review_text).lower()

    # Token-level filtering replaces the former ' mg ' / ' olof ' / ' quot '
    # substitutions, which required a space on both sides and therefore missed
    # tokens at the very start or end of the text and every second one of
    # consecutive occurrences.
    dropped_tokens = {'mg', 'quot'}   # dosage unit; 'quot' is presumably residue of the HTML entity &quot;
    repaired_tokens = {'olof': 'zoloft'}
    words = [
        repaired_tokens.get(word, word)
        for word in review_text.split()
        if word not in dropped_tokens
    ]

    if remove_stopwords:
        stops = set(stopwords.words('english'))  # set for O(1) membership checks
        words = [word for word in words if word not in stops]
    return words


In [None]:
# Load NLTK's pickled Punkt English sentence tokenizer; the 'punkt' resource must already be available to nltk.data.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
def review_to_sentences(review, tokenizer, remove_stopwords = False):
    """Split a review into sentences and tokenize each one into words.

    Args:
        review: Raw review text; surrounding whitespace is stripped before splitting.
        tokenizer: Object exposing ``tokenize(text)`` that returns sentence strings.
        remove_stopwords: Forwarded to ``review_to_wordlist``.

    Returns:
        A list with one word list per non-empty sentence.
    """
    return [
        review_to_wordlist(sentence_text, remove_stopwords)
        for sentence_text in tokenizer.tokenize(review.strip())
        if len(sentence_text) > 0
    ]

In [None]:
# Flatten every review into one corpus-wide list of tokenized sentences (stop words removed).
print('Parts in sentences from unlabled training set')
sentences = []
for review in sertraline_data['review']:
    sentences.extend(review_to_sentences(review, tokenizer, remove_stopwords=True))

In [None]:
# Quick check that sentences are created properly
# Compare the first review's text with the first tokenized sentence, then report the total sentence count.
print(sertraline_data['review'][0])
print(sentences[0])
print(len(sentences))

### Default Values for Word2Vec Initializer:
- `sentences=None, corpus_file=None, vector_size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=0.001, seed=1, workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=<built-in function hash>, epochs=5, null_word=0, trim_rule=None, sorted_vocab=1, batch_words=10000, compute_loss=False, callbacks=(), comment=None, max_final_vocab=None, shrink_windows=True`

In [None]:
from gensim.models import word2vec
print('Training model...')

the_sentences = sentences
# Default Values
# Create an untrained model with every constructor argument left at its default.
w2v_model = word2vec.Word2Vec()
# Build the vocabulary from the tokenized sentences before training.
w2v_model.build_vocab(the_sentences)
# Train for 30 epochs; total_examples comes from the count recorded by build_vocab.
w2v_model.train(the_sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1 )

In [None]:
# Test the model by looking at some basic associations
# Display the vocabulary entries the trained embeddings rank as most similar to 'depressed'.
w2v_model.wv.most_similar(positive=['depressed'])

### Making Feature Vectors
- Each customer review should be a list of vector embeddings that can be operated on mathematically
- We need to create a "feature set" out of each review that is the same length even though each has a different number of words

In [None]:
# Print the model summary, then display the embedding vector stored for the word 'depression'.
print(w2v_model)
w2v_model.wv['depression']

In [None]:
# Keep a handle on the trained keyed vectors; the clustering cells read word_vectors.vectors.
word_vectors = w2v_model.wv
# Confirm the container type of the embedding matrix after casting it to double precision.
print(type(word_vectors.vectors.astype('double')))

## Applying K Means Clustering
- Goal: Divide into 2 clusters: Positive, Negative

In [None]:
from sklearn.cluster import KMeans
# Two clusters (intended as positive / negative); random_state is fixed so repeated runs on the same vectors match.
model = KMeans(n_clusters=2, max_iter=1000, random_state=2, n_init=50)
# Cluster the word embeddings themselves (one row per vocabulary word), cast to double precision.
model.fit(X=word_vectors.vectors.astype('double'))

In [None]:
# Show the 50 vocabulary words most similar to the centroid of cluster 1, to judge what that cluster represents.
word_vectors.similar_by_vector(model.cluster_centers_[1], topn=50, restrict_vocab=None)

In [None]:
#neutral_cluster_center = model.cluster_centers_[1]
# NOTE(review): treating cluster 0 as positive and cluster 1 as negative looks like a manual
# decision made by inspecting the clusters; re-check it whenever the embeddings or KMeans are refit.
positive_cluster_center = model.cluster_centers_[0]
negative_cluster_center = model.cluster_centers_[1]

### Calculating the Sentiments
- Total Score / Number of words
- Positive gets +1 and Negative -1
- The closeness score is the inverse of the distance from a word's vector to its nearest centroid. `model.transform([a_vector])` returns the distance to every cluster centroid, so we take the minimum of the two distances and invert it; words that lie close to one of the centroids therefore get a high closeness score. For example, 'celexa' falls in cluster [0], the positive cluster: its smaller distance (0.86) is the one to centroid [0].
- Sentiment coefficient is the closeness score * cluster value (+/- 1) so we have positivity and negativity weighted by how close it is to a centroid.

In [None]:
# Build a per-word sentiment table from the Word2Vec vocabulary and the fitted KMeans model.
vocab_words = list(w2v_model.wv.index_to_key)
vocab_vectors = np.asarray([w2v_model.wv[word] for word in vocab_words])

words = pd.DataFrame({'word': vocab_words}) # One row per word in the embedding vocabulary
words['vector'] = list(vocab_vectors) # Embedding vector of each word
# Score the whole vocabulary with one predict/transform call each instead of
# one sklearn call per row, which repeats input validation for every word.
words['cluster'] = model.predict(vocab_vectors) # Cluster index assigned to each word
centroid_distances = model.transform(vocab_vectors) # One row per word, one column per centroid
words['cluster_value'] = np.where(words['cluster'] == 0, 1, -1) # +1 for cluster 0 (treated as positive), -1 otherwise
# Inverse of the distance to the nearest centroid: the closer a word is, the larger its score.
words['closeness_score'] = 1 / centroid_distances.min(axis=1)
words['sentiment_coeff'] = words.closeness_score * words.cluster_value
print(words.shape)
words.head()

In [None]:
# Only a cell's last expression is displayed, so this lookup is evaluated but its result is not shown.
words[words['word']=='feel']['vector']
# NOTE(review): row 766 is hard-coded — presumably the row of 'feel' found via the lookup above; confirm,
# because the row number changes whenever the vocabulary changes.
# Distances from that row's vector (column 1 is 'vector') to each cluster centroid.
model.transform([words.iloc[766,1]])

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Corpus-level TF-IDF weight per word: fit on the cleaned reviews, then sum each word's column.
# norm=None keeps raw tf*idf values; the tokenizer just splits on whitespace because the
# reviews were already cleaned.
vectorizer = TfidfVectorizer(tokenizer=lambda y: y.split(), norm=None)
vecs = vectorizer.fit_transform(clean_reviews)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when it exists and fall back only on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()
dense = vecs.todense()
lst1 = dense.tolist()
dense_df = pd.DataFrame(lst1, columns=feature_names)
# Transposing puts words on the index, so the row sum is each word's total weight over all reviews.
weight_df = pd.DataFrame(dense_df.T.sum(axis=1), index=None)
weight_df = weight_df.reset_index()
weight_df = weight_df.rename(columns={'index': 'word code', 0: 'weight'})
weight_df['word'] = sorted(vectorizer.vocabulary_.keys())
weight_df.head()

In [None]:
# Add the corpus-level TF-IDF weight of each word to the words table.
# The inner join on 'word' keeps only words present in both `words` and `weight_df`.
tfdif_weighted = words.merge(weight_df, how='inner', left_on='word', right_on='word')
tfdif_weighted.head()


In [None]:
# Per-word sentiment rate = signed closeness coefficient scaled by the word's corpus TF-IDF weight.
tfdif_weighted['sentiment_rate'] = tfdif_weighted['sentiment_coeff'] * tfdif_weighted['weight']
# Binary word-level prediction: 1 when the rate is positive, 0 otherwise.
is_positive_rate = tfdif_weighted['sentiment_rate'] > 0
tfdif_weighted['prediction'] = is_positive_rate.astype('int8')
tfdif_weighted.head()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# NOTE: cleaned_df is another name for sertraline_data (no copy is made), so the assignment
# below also replaces the raw text held in sertraline_data['review'] with the cleaned text.
cleaned_df = sertraline_data
cleaned_df['review'] = clean_reviews

# Fit TF-IDF on the cleaned reviews (whitespace tokenizer, no row normalisation).
tfidf = TfidfVectorizer(tokenizer=lambda y: y.split(), norm=None)
tfidf.fit(cleaned_df.review)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when it exists and fall back only on older releases.
if hasattr(tfidf, 'get_feature_names_out'):
    features = pd.DataFrame(tfidf.get_feature_names_out().tolist())
else:
    features = pd.DataFrame(tfidf.get_feature_names())
# Dense review-by-word TF-IDF matrix.
transformed = tfidf.transform(cleaned_df.review).toarray()

In [None]:
# Only a cell's last expression is displayed, so this first value is evaluated but not shown.
clean_reviews[0]
# Sentiment rate stored for the word 'depression'; .item() expects exactly one matching row.
tfdif_weighted[tfdif_weighted.word=='depression'].sentiment_rate.item()

In [None]:
def calc_sentiment(data, sent_dict, threshold=0.15):
    """Classify one cleaned review from the average sentiment rate of its words.

    Every whitespace-separated token counts towards the average; tokens that
    are missing from ``sent_dict`` contribute a rate of 0.

    Args:
        data: Cleaned review text (tokens separated by whitespace).
        sent_dict: DataFrame with a ``word`` column and a ``sentiment_rate`` column.
            If a word appears more than once, the last row's rate is used.
        threshold: Magnitude the average must exceed to be labelled.
            Defaults to 0.15.

    Returns:
        1 if the average is above ``threshold``, -1 if it is below
        ``-threshold``, and 0 otherwise (including for a review with no tokens).
    """
    tokens = data.split()
    if not tokens:
        # Nothing to average; previously this divided by zero.
        return 0

    # Build the word -> rate lookup once instead of scanning the frame for every token.
    rate_by_word = dict(zip(sent_dict['word'], sent_dict['sentiment_rate']))
    total = sum(rate_by_word.get(token, 0) for token in tokens)
    avg = total / len(tokens)

    if avg < -threshold:
        return -1
    if avg > threshold:
        return 1
    # Averages inside [-threshold, threshold] previously left the result unassigned
    # and raised UnboundLocalError; report them as neutral.
    return 0

In [None]:
# df_cluster is another name for sertraline_data (no copy), so both columns below land on that frame.
df_cluster = sertraline_data
df_cluster['review'] = clean_reviews
# Score each cleaned review against the TF-IDF-weighted word sentiment table.
df_cluster['sentiment'] = [
    calc_sentiment(review_text, tfdif_weighted)
    for review_text in df_cluster['review']
]

In [None]:
#df_cluster = df_cluster.drop(columns=['confidence'])
# Change +1 and -1 sentiment to positive and negative respectively
# Any sentiment value other than 1 is labelled 'NEGATIVE'.
df_cluster['sentiment_word'] = df_cluster['sentiment'].apply(lambda x: 'POSITIVE' if x == 1 else 'NEGATIVE')
df_cluster.head()
#print(df_cluster.sentiment_word.value_counts())

In [None]:
# df_export refers to the same frame as df_cluster (no copy); write it to CSV and preview it.
df_export = df_cluster
df_export.to_csv('sertraline_clustering_analysis.csv')
df_export.head()

In [None]:
import matplotlib.pyplot as plt

# Share of reviews per sentiment label. value_counts() orders labels by frequency,
# so colours are looked up by label rather than by position; otherwise the
# POSITIVE/NEGATIVE colours swap whenever NEGATIVE is the larger group.
df_pie_cluster = df_cluster.sentiment_word.value_counts()
print(df_pie_cluster)

label_colors = {'POSITIVE': 'lightgreen', 'NEGATIVE': 'Pink'}
pie_colors = [label_colors.get(label, 'lightgrey') for label in df_pie_cluster.index]
# One explode offset per wedge; a fixed pair fails when the number of labels is not two.
pie_explode = [0.1] * len(df_pie_cluster)

fig = plt.gcf()
fig.set_size_inches(7,7)
plt.pie(df_pie_cluster, radius=1, labels=df_pie_cluster.index, autopct="%1.1f%%",
        shadow=True, startangle=90, labeldistance=1.1, colors=pie_colors, explode=pie_explode)
plt.axis('equal')
plt.title("Sentiment of Reviews (Clustering)", fontsize=20)
plt.show()

In [None]:
# Making Word Cloud for Positive Sentiment Reviews
from wordcloud import WordCloud

# Fit TF-IDF on the reviews labelled POSITIVE only (whitespace tokenizer, no row normalisation).
vectorizer_pos = TfidfVectorizer(tokenizer=lambda y: y.split(), norm=None)
vecs_pos = vectorizer_pos.fit_transform(df_cluster[df_cluster.sentiment_word=='POSITIVE'].review)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when it exists and fall back only on older releases.
if hasattr(vectorizer_pos, 'get_feature_names_out'):
    feature_names = vectorizer_pos.get_feature_names_out().tolist()
else:
    feature_names = vectorizer_pos.get_feature_names()
dense_pos = vecs_pos.todense()
lst_pos = dense_pos.tolist()
dense_df_pos = pd.DataFrame(lst_pos, columns=feature_names)
# Total TF-IDF weight of each word across the positive reviews; used as the cloud's frequencies.
word_weights_pos = dense_df_pos.T.sum(axis=1)

wordcloud = WordCloud(background_color="white", max_words=50).generate_from_frequencies(word_weights_pos)
plt.figure(figsize=(12, 10))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.title('Positive Sentiment Word Cloud (Clustering) \n', fontdict={'fontsize': 40})
plt.show()

In [None]:
# Word cloud for the reviews labelled NEGATIVE.
# Fit TF-IDF on those reviews only (whitespace tokenizer, no row normalisation).
vectorizer_neg = TfidfVectorizer(tokenizer=lambda y: y.split(), norm=None)
vecs_neg = vectorizer_neg.fit_transform(df_cluster[df_cluster.sentiment_word=='NEGATIVE'].review)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when it exists and fall back only on older releases.
if hasattr(vectorizer_neg, 'get_feature_names_out'):
    feature_names_neg = vectorizer_neg.get_feature_names_out().tolist()
else:
    feature_names_neg = vectorizer_neg.get_feature_names()
dense_neg = vecs_neg.todense()
lst_neg = dense_neg.tolist()
dense_df_neg = pd.DataFrame(lst_neg, columns=feature_names_neg)
# Total TF-IDF weight of each word across the negative reviews; used as the cloud's frequencies.
word_weights_neg = dense_df_neg.T.sum(axis=1)

wordcloud = WordCloud(background_color="white", max_words=50).generate_from_frequencies(word_weights_neg)
plt.figure(figsize=(12, 10))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.title('Negative Sentiment Word Cloud (Clustering) \n', fontdict={'fontsize': 40})
plt.show()