In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from wordcloud import WordCloud
from wordcloud import STOPWORDS

In [2]:
import nltk
import regex as re
#nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

In [3]:

data = pd.read_csv('new_data_processed.csv')
data.head(3)

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount,review_sentiment,cleaned_review,sentiment_score,sentiment_score_clean
0,206461,valsartan,left ventricular dysfunction,"""It has no side effect, I take it in combinati...",9,20-May-12,27,1,no side effect take combin bystol mg fish oil,-0.296,-0.296
1,95260,guanfacine,adhd,"""My son is halfway through his fourth week of ...",8,27-Apr-10,192,1,son halfway fourth week intuniv becam concern ...,0.8603,0.6929
2,92703,lybrel,birth control,"""I used to take another oral contraceptive, wh...",5,14-Dec-09,17,0,use take anoth oral contracept pill cycl happi...,0.7962,0.2732


In [4]:
# checking for any nan values in cleaned_review feature. 

data['cleaned_review'].isna().sum()

8

In [5]:
# droping the rows containing nan values.
print('The data size before:',data.shape)
data = data.dropna(axis=0)
data.reset_index(inplace=True,drop=True)
print('The data size after dropping:',data.shape)

The data size before: (212106, 11)
The data size after dropping: (212098, 11)


### Feature Extraction 

In [6]:
# Adding the year as feature 

data['date'] = pd.to_datetime(data['date'])
data['year'] = data['date'].dt.year
data.head(2)

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount,review_sentiment,cleaned_review,sentiment_score,sentiment_score_clean,year
0,206461,valsartan,left ventricular dysfunction,"""It has no side effect, I take it in combinati...",9,2012-05-20,27,1,no side effect take combin bystol mg fish oil,-0.296,-0.296,2012
1,95260,guanfacine,adhd,"""My son is halfway through his fourth week of ...",8,2010-04-27,192,1,son halfway fourth week intuniv becam concern ...,0.8603,0.6929,2010


In [9]:
# Adding the word count, stopword count,char length, unique words count, mean word length, puncation count
import string
stop_words = set(stopwords.words('english'))

In [10]:
#reference from quora question pair case study

#Word count in each review
data['word_count']=data["cleaned_review"].apply(lambda x: len(str(x).split()))

#Unique word count 
data['unique_word_count']=data["cleaned_review"].apply(lambda x: len(set(str(x).split())))

#character count
data['char_length']=data["cleaned_review"].apply(lambda x: len(str(x)))

#punctuation count
data["count_punctuations"] = data["review"].apply(lambda x: len([c for c in str(x) if c in string.punctuation]))


#Number of stopwords
data["stopword_count"] = data["review"].apply(lambda x: len([w for w in str(x).lower().split() if w in stop_words]))

#Average length of the words
data["mean_word_len"] = data["cleaned_review"].apply(lambda x: np.mean([len(w) for w in str(x).split()]))

In [11]:
data.head(2)

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount,review_sentiment,cleaned_review,sentiment_score,sentiment_score_clean,year,word_count,unique_word_count,char_length,count_punctuations,stopword_count,mean_word_len
0,206461,valsartan,left ventricular dysfunction,"""It has no side effect, I take it in combinati...",9,2012-05-20,27,1,no side effect take combin bystol mg fish oil,-0.296,-0.296,2012,9,9,46,3,7,4.111111
1,95260,guanfacine,adhd,"""My son is halfway through his fourth week of ...",8,2010-04-27,192,1,son halfway fourth week intuniv becam concern ...,0.8603,0.6929,2010,65,54,372,23,69,4.723077


In [12]:
data.corr()

Unnamed: 0,uniqueID,rating,usefulCount,review_sentiment,sentiment_score,sentiment_score_clean,year,word_count,unique_word_count,char_length,count_punctuations,stopword_count,mean_word_len
uniqueID,1.0,0.016202,0.017484,0.014954,0.008353,0.003485,-0.006946,0.006946,0.006702,0.007508,0.004517,0.007805,0.004428
rating,0.016202,1.0,0.236272,0.916472,0.327504,0.200211,-0.194851,0.022936,0.020129,0.023432,0.002107,0.032672,0.002393
usefulCount,0.017484,0.236272,1.0,0.213866,0.05764,0.016842,-0.27379,0.030587,0.033853,0.037274,-0.003497,0.014023,0.050871
review_sentiment,0.014954,0.916472,0.213866,1.0,0.304231,0.184088,-0.188974,0.028739,0.026048,0.029466,0.006406,0.037407,0.004027
sentiment_score,0.008353,0.327504,0.05764,0.304231,1.0,0.699318,-0.091548,-0.065219,-0.061688,-0.067056,-0.022686,-0.035761,-0.01207
sentiment_score_clean,0.003485,0.200211,0.016842,0.184088,0.699318,1.0,-0.075665,-0.085405,-0.081571,-0.087271,-0.035117,-0.052401,-0.014395
year,-0.006946,-0.194851,-0.27379,-0.188974,-0.091548,-0.075665,1.0,0.236491,0.238791,0.226877,0.226172,0.200004,-0.087774
word_count,0.006946,0.022936,0.030587,0.028739,-0.065219,-0.085405,0.236491,1.0,0.983075,0.99415,0.762291,0.904555,-0.042058
unique_word_count,0.006702,0.020129,0.033853,0.026048,-0.061688,-0.081571,0.238791,0.983075,1.0,0.980232,0.7582,0.889597,-0.023822
char_length,0.007508,0.023432,0.037274,0.029466,-0.067056,-0.087271,0.226877,0.99415,0.980232,1.0,0.752559,0.895756,0.042597


<b> Extracing the subject and object count for each review</b>

In [13]:
#https://smartenglishnotes.com/2021/11/19/main-features-of-a-sentence/
#https://stackoverflow.com/questions/28618400/how-to-identify-the-subject-of-a-sentence#:~:text=To%20mark%20the%20subject%2C%20write,an%20elephant%20with%20a%20gun%20.


import spacy
nlp = spacy.load("en_core_web_sm")

def subj_obj_count(review):

    sent = review
    doc=nlp(sent)
    sub_words = set([str(word) for word in doc if (word.dep_ == "nsubj")])

    obj_words = set([str(word) for word in doc if (word.dep_ == "dobj")])

    return len(sub_words),len(obj_words)


2022-05-02 17:30:09.052778: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-05-02 17:30:09.052815: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [16]:
from tqdm import tqdm
count = []

for r in tqdm(data['review']):
    count.append(subj_obj_count(r))

100%|██████████| 212098/212098 [1:57:32<00:00, 30.07it/s]  


In [90]:
sub_obj = pd.DataFrame(count,columns=['subj_count','obj_count'])
sub_obj.head()

Unnamed: 0,subj_count,obj_count
0,2,2
1,8,6
2,10,10
3,8,5
4,10,11


In [13]:
#sub_obj.to_csv('sub_obj.csv',index=False)
sub_obj = pd.read_csv('sub_obj.csv')
sub_obj.shape

(212098, 2)

<b> Extracting Named Entity recognition features for each cleaned review</b>

In [44]:
#https://towardsdatascience.com/text-analysis-feature-engineering-with-nlp-502d6ea9225d


ner_lst = nlp.pipe_labels['ner']

def ner(review):

    sent = review
    doc=nlp(sent)
    dic = {}.fromkeys(ner_lst,0)
    for word in doc.ents:
        dic[word.label_]+=1
        
    return dic


In [84]:
entity = pd.DataFrame([ner(r) for r in tqdm(data['cleaned_review'])])

100%|██████████| 212098/212098 [57:24<00:00, 61.58it/s]  


In [14]:
#entity.to_csv('entities.csv',index=False)
entity = pd.read_csv('entities.csv')
print(entity.shape)
entity.head(3)


(212098, 18)


Unnamed: 0,CARDINAL,DATE,EVENT,FAC,GPE,LANGUAGE,LAW,LOC,MONEY,NORP,ORDINAL,ORG,PERCENT,PERSON,PRODUCT,QUANTITY,TIME,WORK_OF_ART
0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
1,0,5,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,0,3,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0


<b> Topic Modelling on cleaned reviews </b>

In [3]:
import gensim


In [69]:
corpus = data['cleaned_review']

## pre-process corpus for unigram words in a cleaned reviews
lst_corpus = []
for string in tqdm(corpus):
    lst_words = string.split()
    lst_grams = [" ".join(lst_words[i:i + 1]) for i in range(0, len(lst_words), 1)]
    lst_corpus.append(lst_grams)
    
## map words to an id
id2word = gensim.corpora.Dictionary(lst_corpus)

## create dictionary word:freq
dic_corpus = [id2word.doc2bow(word) for word in lst_corpus] 

## train LDA
lda_model = gensim.models.ldamodel.LdaModel(corpus=dic_corpus, id2word=id2word, num_topics=20, chunksize=100, passes=10, alpha='auto', per_word_topics=True)
 

100%|██████████| 212098/212098 [00:05<00:00, 41127.28it/s]


In [70]:
# storing the topic vectors for each review in a list
train_vecs = []
for i in range(len(corpus)):
    top_topics = (
        lda_model.get_document_topics(dic_corpus[i],
                                      minimum_probability=0.0)
    )
    topic_vec = [top_topics[i][1] for i in range(20)]

    train_vecs.append(topic_vec)

In [71]:
topics = pd.DataFrame(train_vecs)
print(topics.shape)
topics.head(3)

(212098, 20)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.011948,0.009549,0.015107,0.0483,0.008554,0.072174,0.0123,0.024713,0.100153,0.107563,0.007515,0.007186,0.01794,0.008827,0.00974,0.010562,0.006819,0.429381,0.013676,0.077992
1,0.005685,0.013905,0.007188,0.032345,0.00407,0.006259,0.005853,0.01194,0.188056,0.116717,0.003576,0.003419,0.008536,0.0042,0.004634,0.005026,0.003245,0.494278,0.006508,0.074561
2,0.005631,0.013771,0.007121,0.03204,0.004032,0.0062,0.005798,0.071161,0.14923,0.117662,0.022085,0.003387,0.008456,0.004161,0.11589,0.004978,0.003214,0.363426,0.006446,0.05531


In [15]:
#topics.to_csv('topics.csv',index=False)
topics = pd.read_csv('topics.csv')
topics.shape

(212098, 20)

Now combining the features extracted above - subject object count,named entity recognition,topic modelling vectors for each of the review. 

In [16]:
data = pd.concat([data,sub_obj,entity,topics],axis=1)
print(data.shape)
data.tail(3)

(212098, 58)


Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount,review_sentiment,cleaned_review,sentiment_score,...,10,11,12,13,14,15,16,17,18,19
212095,130945,levonorgestrel,birth control,"""I&#039;m married, 34 years old and I have no ...",8,2010-11-15,7,1,marri year old no kid take pill hassl decid ge...,-0.9589,...,0.01331,0.003517,0.008782,0.004321,0.0144,0.00517,0.003338,0.369418,0.006695,0.134499
212096,47656,tapentadol,pain,"""I was prescribed Nucynta for severe neck/shou...",1,2011-11-28,20,0,prescrib nucynta sever neck shoulder pain take...,-0.926,...,0.006605,0.006316,0.015768,0.007758,0.025856,0.009283,0.005994,0.39469,0.012021,0.068549
212097,113712,arthrotec,sciatica,"""It works!!!""",9,2009-09-13,46,1,work,0.0,...,0.008716,0.008333,0.020806,0.010237,0.011295,0.012249,0.007909,0.406685,0.015861,0.09045


In [17]:
data.corr()

Unnamed: 0,uniqueID,rating,usefulCount,review_sentiment,sentiment_score,sentiment_score_clean,year,word_count,unique_word_count,char_length,...,10,11,12,13,14,15,16,17,18,19
uniqueID,1.0,0.016202,0.017484,0.014954,0.008353,0.003485,-0.006946,0.006946,0.006702,0.007508,...,0.006675,-0.005742,-0.030863,-0.008921,-0.069508,-0.01333,0.017929,0.022729,0.062335,0.017354
rating,0.016202,1.0,0.236272,0.916472,0.327504,0.200211,-0.194851,0.022936,0.020129,0.023432,...,-0.019261,-0.035628,0.0202,-0.093961,-0.040518,-0.019251,0.038065,0.073189,0.019986,0.132905
usefulCount,0.017484,0.236272,1.0,0.213866,0.05764,0.016842,-0.27379,0.030587,0.033853,0.037274,...,-0.041124,0.001133,0.0084,-0.044054,-0.137872,-0.021784,0.059729,0.155219,0.172004,-0.017751
review_sentiment,0.014954,0.916472,0.213866,1.0,0.304231,0.184088,-0.188974,0.028739,0.026048,0.029466,...,-0.024252,-0.031909,0.018734,-0.088705,-0.042324,-0.013976,0.032914,0.084088,0.023902,0.115678
sentiment_score,0.008353,0.327504,0.05764,0.304231,1.0,0.699318,-0.091548,-0.065219,-0.061688,-0.067056,...,-0.039387,-0.051972,0.086386,-0.087053,0.014078,0.021728,0.004403,0.050031,-0.091916,0.148381
sentiment_score_clean,0.003485,0.200211,0.016842,0.184088,0.699318,1.0,-0.075665,-0.085405,-0.081571,-0.087271,...,-0.032564,-0.056391,0.0917,-0.01786,0.028761,0.009351,-0.019169,0.068322,-0.089059,0.128777
year,-0.006946,-0.194851,-0.27379,-0.188974,-0.091548,-0.075665,1.0,0.236491,0.238791,0.226877,...,0.054482,0.036727,-0.003533,0.025033,0.038688,0.021917,0.006281,-0.112532,-0.037116,-0.022441
word_count,0.006946,0.022936,0.030587,0.028739,-0.065219,-0.085405,0.236491,1.0,0.983075,0.99415,...,0.020941,0.048844,0.019241,-0.050067,0.021375,-0.023725,-0.013183,-0.122276,0.025354,-0.008378
unique_word_count,0.006702,0.020129,0.033853,0.026048,-0.061688,-0.081571,0.238791,0.983075,1.0,0.980232,...,0.020348,0.045598,0.023369,-0.043874,0.011276,-0.0204,-0.007574,-0.125349,0.030874,-0.006442
char_length,0.007508,0.023432,0.037274,0.029466,-0.067056,-0.087271,0.226877,0.99415,0.980232,1.0,...,0.012309,0.042661,0.015986,-0.048259,0.013413,-0.021687,-0.008892,-0.124358,0.047288,-0.00821


In [106]:
#data.to_csv('final_new_data_processed.csv',index=False)