In [38]:
# Mount Google Drive inside the Colab runtime at /content/drive so that later
# cells can read from and write to paths under that directory.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [39]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from wordcloud import WordCloud
from wordcloud import STOPWORDS

In [40]:
import nltk
import regex as re
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [41]:
# Load the reviews CSV from the Drive path into `data` and preview the first three rows.
data = pd.read_csv("/content/drive/MyDrive/Python Practice/Prathamesh/DRUG RECOMMENDATION SYSTEM/csv_data.csv")
data.head(3)

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount,review_sentiment,cleaned_review,sentiment_score,sentiment_score_clean
0,163740,mirtazapine,depression,"""I&#039;ve tried a few antidepressants over th...",10,2012-02-28,22,1,tri antidepress year citalopram fluoxetin amit...,-0.4596,0.847
1,206473,mesalamine,"crohn's disease, maintenance","""My son has Crohn&#039;s disease and has done ...",8,2009-05-17,17,1,son crohn diseas done well asacol no complaint...,0.0736,-0.5423
2,159672,bactrim,urinary tract infection,"""Quick reduction of symptoms""",9,2017-09-29,3,1,quick reduct symptom,0.0,0.0


In [42]:
# Count the missing (NaN) values in the `cleaned_review` column.

data['cleaned_review'].isna().sum()

4

In [43]:
# Drop every row that holds a NaN in any column, then renumber the index from 0.
# The shape is printed before and after so the number of removed rows is visible.
print('The data size before:', data.shape)
data = data.dropna(axis='index', how='any').reset_index(drop=True)
print('The data size after dropping:', data.shape)

The data size before: (106094, 11)
The data size after dropping: (106090, 11)


### Feature Extraction

In [44]:
# Derive a `year` feature from the review date.

# Parse the date column once and reuse the parsed series for both assignments.
review_dates = pd.to_datetime(data['date'])
data['date'] = review_dates
data['year'] = review_dates.dt.year
data.head(2)

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount,review_sentiment,cleaned_review,sentiment_score,sentiment_score_clean,year
0,163740,mirtazapine,depression,"""I&#039;ve tried a few antidepressants over th...",10,2012-02-28,22,1,tri antidepress year citalopram fluoxetin amit...,-0.4596,0.847,2012
1,206473,mesalamine,"crohn's disease, maintenance","""My son has Crohn&#039;s disease and has done ...",8,2009-05-17,17,1,son crohn diseas done well asacol no complaint...,0.0736,-0.5423,2009


In [45]:
# Set-up for the text statistics added in the following cell: word count, stopword count,
# character length, unique word count, mean word length and punctuation count.
import string
# English stopwords from NLTK, stored as a set.
stop_words = set(stopwords.words('english'))

In [46]:
# Text-statistics features.
# Features based on `cleaned_review` describe the pre-processed text; punctuation and
# stopword counts are taken from the raw `review` column.

# Tokenise each cleaned review once and reuse the tokens for every token-based feature.
cleaned_tokens = data["cleaned_review"].apply(lambda text: str(text).split())

# Build the punctuation lookup once instead of scanning the punctuation string per character.
punctuation_chars = set(string.punctuation)

# Word count in each review
data['word_count'] = cleaned_tokens.apply(len)

# Unique word count
data['unique_word_count'] = cleaned_tokens.apply(lambda tokens: len(set(tokens)))

# Character count
data['char_length'] = data["cleaned_review"].apply(lambda text: len(str(text)))

# Punctuation count (raw review)
data["count_punctuations"] = data["review"].apply(
    lambda text: sum(ch in punctuation_chars for ch in str(text))
)

# Number of stopwords (raw review, lower-cased, whitespace-tokenised)
data["stopword_count"] = data["review"].apply(
    lambda text: sum(tok in stop_words for tok in str(text).lower().split())
)

# Average length of the words.
# np.mean([]) returns NaN (its RuntimeWarning is silenced by the global warnings filter),
# so a review without tokens is given 0.0 instead of NaN.
data["mean_word_len"] = cleaned_tokens.apply(
    lambda tokens: np.mean([len(tok) for tok in tokens]) if tokens else 0.0
)

In [47]:
# Preview the first two rows of `data`, including the text-statistics columns.
data.head(2)

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount,review_sentiment,cleaned_review,sentiment_score,sentiment_score_clean,year,word_count,unique_word_count,char_length,count_punctuations,stopword_count,mean_word_len
0,163740,mirtazapine,depression,"""I&#039;ve tried a few antidepressants over th...",10,2012-02-28,22,1,tri antidepress year citalopram fluoxetin amit...,-0.4596,0.847,2012,38,35,260,22,27,5.842105
1,206473,mesalamine,"crohn's disease, maintenance","""My son has Crohn&#039;s disease and has done ...",8,2009-05-17,17,1,son crohn diseas done well asacol no complaint...,0.0736,-0.5423,2009,26,25,148,13,22,4.692308


In [48]:
# Pairwise correlation of the numeric features only.
# Since pandas 2.0, DataFrame.corr() no longer drops non-numeric columns silently and
# raises on the text columns, so the numeric columns are selected explicitly.
data.select_dtypes(include="number").corr()

Unnamed: 0,uniqueID,rating,usefulCount,review_sentiment,sentiment_score,sentiment_score_clean,year,word_count,unique_word_count,char_length,count_punctuations,stopword_count,mean_word_len
uniqueID,1.0,0.019414,0.020608,0.017797,0.007261,-0.00061,-0.003013,0.009764,0.009561,0.010861,0.006873,0.007313,0.008492
rating,0.019414,1.0,0.238638,0.916933,0.325417,0.197016,-0.198467,0.0253,0.020406,0.025999,0.001165,0.035578,0.007281
usefulCount,0.020608,0.238638,1.0,0.218401,0.051219,0.010272,-0.271909,0.034105,0.037448,0.041419,-0.002827,0.016458,0.052792
review_sentiment,0.017797,0.916933,0.218401,1.0,0.303313,0.181034,-0.190987,0.030023,0.025832,0.031058,0.005674,0.038383,0.008549
sentiment_score,0.007261,0.325417,0.051219,0.303313,1.0,0.696435,-0.089231,-0.062034,-0.058682,-0.064461,-0.024067,-0.032469,-0.014793
sentiment_score_clean,-0.00061,0.197016,0.010272,0.181034,0.696435,1.0,-0.075497,-0.087449,-0.083406,-0.090169,-0.036926,-0.053397,-0.017331
year,-0.003013,-0.198467,-0.271909,-0.190987,-0.089231,-0.075497,1.0,0.236344,0.238131,0.227546,0.225131,0.196968,-0.083012
word_count,0.009764,0.0253,0.034105,0.030023,-0.062034,-0.087449,0.236344,1.0,0.984005,0.993963,0.759289,0.901937,-0.04458
unique_word_count,0.009561,0.020406,0.037448,0.025832,-0.058682,-0.083406,0.238131,0.984005,1.0,0.981092,0.75531,0.888218,-0.025993
char_length,0.010861,0.025999,0.041419,0.031058,-0.064461,-0.090169,0.227546,0.993963,0.981092,1.0,0.749304,0.893226,0.04177


<b> Extracting the subject and object count for each review</b>

In [49]:
#https://smartenglishnotes.com/2021/11/19/main-features-of-a-sentence/
#https://stackoverflow.com/questions/28618400/how-to-identify-the-subject-of-a-sentence#:~:text=To%20mark%20the%20subject%2C%20write,an%20elephant%20with%20a%20gun%20.


import spacy
nlp = spacy.load("en_core_web_sm")

def subj_obj_count(review):
    """Count the distinct subject and direct-object tokens in a review.

    The review is parsed with the module-level spaCy pipeline `nlp`. Tokens whose
    dependency label is "nsubj" are collected as subjects and tokens labelled
    "dobj" as objects; duplicates (by token text) are counted once.

    Parameters
    ----------
    review : str
        Text to parse.

    Returns
    -------
    tuple of (int, int)
        (number of distinct subject tokens, number of distinct object tokens).
    """
    subjects = set()
    objects = set()
    for token in nlp(review):
        relation = token.dep_
        if relation == "nsubj":
            subjects.add(str(token))
        elif relation == "dobj":
            objects.add(str(token))

    return len(subjects), len(objects)


In [53]:
from tqdm import tqdm

# Collect one (subject count, object count) tuple per raw review, with a progress bar.
# list.extend consumes the generator item by item, so results are appended incrementally.
count = []
count.extend(subj_obj_count(review_text) for review_text in tqdm(data['review']))

 19%|█▊        | 19886/106090 [08:09<35:22, 40.62it/s]


KeyboardInterrupt: ignored

In [54]:
# Turn the list of (subject count, object count) tuples into a two-column DataFrame.
sub_obj = pd.DataFrame.from_records(count, columns=['subj_count', 'obj_count'])
sub_obj.head()

Unnamed: 0,subj_count,obj_count
0,4,6
1,3,5
2,0,0
3,10,10
4,6,11


In [55]:
# Persist the subject/object counts on Drive and reload them from that same file.
file_path = '/content/drive/MyDrive/Python Practice/Prathamesh/DRUG RECOMMENDATION SYSTEM/sub_obj.csv'

# Only overwrite the saved file when the in-memory result covers every review; an
# interrupted extraction loop leaves a partial `count`, which must not replace a
# complete file from an earlier run.
if len(sub_obj) == len(data):
    sub_obj.to_csv(file_path, index=False)

# Read back from the path that was written (previously a relative 'sub_obj.csv'
# in the working directory was read instead, i.e. a different file).
sub_obj = pd.read_csv(file_path)

# `csv_data` is retained for backward compatibility with any cell that reads it.
csv_data = sub_obj.to_csv(index=False)
sub_obj.shape

(106090, 2)

<b> Extracting Named Entity recognition features for each cleaned review</b>

In [56]:
#https://towardsdatascience.com/text-analysis-feature-engineering-with-nlp-502d6ea9225d


# Entity labels known to the NER component of the loaded spaCy pipeline.
ner_lst = nlp.pipe_labels['ner']

def ner(review):
    """Count the named entities of each label found in a review.

    Parameters
    ----------
    review : str
        Text to run through the module-level spaCy pipeline `nlp`.

    Returns
    -------
    dict
        One key per label in `ner_lst`, mapped to the number of entities with
        that label in the review (0 when the label does not occur).
    """
    label_counts = dict.fromkeys(ner_lst, 0)
    for ent in nlp(review).ents:
        label_counts[ent.label_] += 1

    return label_counts


In [57]:
# One row of entity-label counts per cleaned review, computed with a progress bar.
entity = pd.DataFrame(list(map(ner, tqdm(data['cleaned_review']))))

  1%|          | 669/106090 [00:10<25:41, 68.41it/s]

KeyboardInterrupt: ignored

In [58]:
# Save the named-entity counts to Drive.
# The frame is written directly to the target path; previously the CSV text of
# `entity` was discarded and the stale `csv_data` string from another cell was
# written to entity.csv instead.
file_path = '/content/drive/MyDrive/Python Practice/Prathamesh/DRUG RECOMMENDATION SYSTEM/entity.csv'
entity.to_csv(file_path, index=False)
print(entity.shape)
entity.head(3)


(106090, 18)


Unnamed: 0,CARDINAL,DATE,EVENT,FAC,GPE,LANGUAGE,LAW,LOC,MONEY,NORP,ORDINAL,ORG,PERCENT,PERSON,PRODUCT,QUANTITY,TIME,WORK_OF_ART
0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


<b> Topic Modelling on cleaned reviews </b>

In [59]:
import gensim


In [62]:
corpus = data['cleaned_review']

## pre-process corpus: split every cleaned review into its unigram tokens.
## The loop variable is intentionally not named `string`, so the `string` module
## used by the punctuation-count feature is not shadowed when cells are re-run.
lst_corpus = [review_text.split() for review_text in corpus]

## map words to an id
id2word = gensim.corpora.Dictionary(lst_corpus)

## bag-of-words representation: one list of (word id, count) pairs per review
dic_corpus = [id2word.doc2bow(tokens) for tokens in lst_corpus]

## train LDA (random_state fixed so the learned topics are reproducible between runs)
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=dic_corpus,
    id2word=id2word,
    num_topics=20,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
    random_state=42,
)


  1%|          | 671/106090 [03:30<9:12:26,  3.18it/s]


In [63]:
# storing the topic vectors for each review in a list
# Each vector has one slot per topic of the trained model and is filled by topic id.
# get_document_topics returns a sparse list of (topic id, probability) pairs and gensim
# never uses a threshold of exactly zero, so a topic can be missing from the list;
# indexing the list by position could then fail or put a probability in the wrong slot.
num_topics = lda_model.num_topics
train_vecs = []
for bow in dic_corpus:
    doc_topics = lda_model.get_document_topics(bow, minimum_probability=0.0)

    topic_vec = [0.0] * num_topics
    for topic_id, probability in doc_topics:
        topic_vec[topic_id] = float(probability)

    train_vecs.append(topic_vec)

In [64]:
# Build a DataFrame from the topic vectors (one row per entry of `train_vecs`),
# print its shape and preview the first three rows.
topics = pd.DataFrame(train_vecs)
print(topics.shape)
topics.head(3)

(106090, 20)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.008727,0.009068,0.007773,0.007274,0.008011,0.009936,0.104258,0.45258,0.015911,0.006854,0.065639,0.009914,0.004923,0.137026,0.008752,0.022762,0.0091,0.01364,0.089726,0.008125
1,0.010287,0.025587,0.024063,0.038371,0.009443,0.011712,0.050508,0.316037,0.018755,0.008079,0.077374,0.071292,0.005803,0.183236,0.010316,0.011928,0.010726,0.016077,0.03124,0.069166
2,0.015651,0.016263,0.01394,0.035716,0.014366,0.017819,0.071257,0.363266,0.028533,0.012292,0.049701,0.017779,0.008829,0.220508,0.015694,0.018148,0.016318,0.02446,0.024889,0.01457


In [65]:
# Save the topic vectors to Drive.
# The frame is written directly to the target path; previously the CSV text of
# `topics` was discarded and the stale `csv_data` string from another cell was
# written to topics.csv instead.
file_path = '/content/drive/MyDrive/Python Practice/Prathamesh/DRUG RECOMMENDATION SYSTEM/topics.csv'
topics.to_csv(file_path, index=False)
print(topics.shape)


(106090, 20)


Now we combine the features extracted above (subject/object counts, named-entity counts, and topic-model vectors) with the original data, one row per review.

In [66]:
# Append the extracted feature frames to `data` column-wise (rows are aligned on the index).
feature_frames = [data, sub_obj, entity, topics]
data = pd.concat(feature_frames, axis="columns")
print(data.shape)
data.tail(3)

(106090, 58)


Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount,review_sentiment,cleaned_review,sentiment_score,...,10,11,12,13,14,15,16,17,18,19
106087,130945,levonorgestrel,birth control,"""I&#039;m married, 34 years old and I have no ...",8,2010-11-15,7,1,marri year old no kid take pill hassl decid ge...,-0.9589,...,0.058912,0.007461,0.003705,0.187661,0.006586,0.007616,0.016359,0.057825,0.010445,0.006114
106088,47656,tapentadol,pain,"""I was prescribed Nucynta for severe neck/shou...",1,2011-11-28,20,0,prescrib nucynta sever neck shoulder pain take...,-0.926,...,0.03907,0.013976,0.006941,0.244634,0.012337,0.032087,0.012828,0.037048,0.019565,0.011454
106089,113712,arthrotec,sciatica,"""It works!!!""",9,2009-09-13,46,1,work,0.0,...,0.05206,0.018623,0.009248,0.230974,0.016439,0.019009,0.017093,0.025621,0.02607,0.015262


In [68]:
# Pairwise correlation of the numeric features only.
# Since pandas 2.0, DataFrame.corr() no longer drops non-numeric columns silently and
# raises on the text columns, so the numeric columns are selected explicitly.
data.select_dtypes(include="number").corr()

Unnamed: 0,uniqueID,rating,usefulCount,review_sentiment,sentiment_score,sentiment_score_clean,year,word_count,unique_word_count,char_length,...,10,11,12,13,14,15,16,17,18,19
uniqueID,1.0,0.019414,0.020608,0.017797,0.007261,-0.00061,-0.003013,0.009764,0.009561,0.010861,...,0.023749,-0.020463,-0.005758,0.020176,-0.026335,0.012597,0.01439,0.027531,-0.04787,0.005643
rating,0.019414,1.0,0.238638,0.916933,0.325417,0.197016,-0.198467,0.0253,0.020406,0.025999,...,0.118807,-0.005952,-0.092108,0.013849,0.010922,-0.008581,-0.01964,-0.099497,-0.027867,0.020334
usefulCount,0.020608,0.238638,1.0,0.218401,0.051219,0.010272,-0.271909,0.034105,0.037448,0.041419,...,0.080233,0.071567,-0.074305,0.094871,-0.076143,0.017446,-0.036351,-0.105496,-0.056592,-0.07176
review_sentiment,0.017797,0.916933,0.218401,1.0,0.303313,0.181034,-0.190987,0.030023,0.025832,0.031058,...,0.113479,0.003193,-0.086178,0.020258,0.010307,-0.004702,-0.016905,-0.098248,-0.029639,0.016434
sentiment_score,0.007261,0.325417,0.051219,0.303313,1.0,0.696435,-0.089231,-0.062034,-0.058682,-0.064461,...,0.082291,-0.035556,-0.018029,0.024243,0.060818,-0.066936,-0.016045,-0.084973,0.057666,-0.001219
sentiment_score_clean,-0.00061,0.197016,0.010272,0.181034,0.696435,1.0,-0.075497,-0.087449,-0.083406,-0.090169,...,0.023824,-0.04388,-0.005701,0.049945,0.097111,-0.086842,-0.009262,-0.090946,0.082312,-0.005191
year,-0.003013,-0.198467,-0.271909,-0.190987,-0.089231,-0.075497,1.0,0.236344,0.238131,0.227546,...,-0.067898,-0.059718,0.079286,-0.061237,0.035376,0.037893,0.012743,0.119904,0.009726,0.063442
word_count,0.009764,0.0253,0.034105,0.030023,-0.062034,-0.087449,0.236344,1.0,0.984005,0.993963,...,-0.058872,-0.0177,-0.015389,-0.075028,0.052377,0.05117,0.006748,0.063934,0.016011,0.031961
unique_word_count,0.009561,0.020406,0.037448,0.025832,-0.058682,-0.083406,0.238131,0.984005,1.0,0.981092,...,-0.051713,-0.014154,-0.010928,-0.083506,0.054324,0.050514,0.013902,0.073734,0.025041,0.030279
char_length,0.010861,0.025999,0.041419,0.031058,-0.064461,-0.090169,0.227546,0.993963,0.981092,1.0,...,-0.047698,-0.008122,-0.022619,-0.080095,0.048368,0.050741,0.00878,0.064307,0.013206,0.023357


In [69]:
# Specify the file path and name for saving the CSV file
file_path = '/content/drive/MyDrive/Python Practice/Prathamesh/DRUG RECOMMENDATION SYSTEM/data.csv'
# Let pandas write straight to the file: this avoids building the whole CSV as one
# in-memory string and avoids passing it through a text-mode handle, which can
# double-translate line endings on some platforms.
data.to_csv(file_path, index=False)
print("CSV file saved successfully at", file_path)

CSV file saved successfully at /content/drive/MyDrive/Python Practice/Prathamesh/DRUG RECOMMENDATION SYSTEM/data.csv


In [70]:
#data.to_csv('final_new_data_processed.csv',index=False)