In [3]:
import collections
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
%matplotlib inline

In [None]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer

import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [None]:
train_df = pd.read_csv('drugsComTrain_raw.tsv', delimiter='\t')
test_df = pd.read_csv('drugsComTest_raw.tsv', delimiter='\t')

In [None]:
comb_df = pd.concat([train_df, test_df])

In [None]:
sw = stopwords.words('english')
not_stop = ["aren't","couldn't","didn't","doesn't","don't","hadn't","hasn't",\
            "haven't","isn't","mightn't","mustn't","needn't","no","nor","not",\
            "shan't","shouldn't","wasn't","weren't","wouldn't"]
sw.extend(['year', 'old', 'im', 'mg', 'son', 'daughter', 'medicine', 'ive', \
           'get', 'medic'])

for i in not_stop:
    sw.remove(i)
ps = PorterStemmer()
ls = WordNetLemmatizer()
stemmer = SnowballStemmer('english')

def cleanReview(review: str):
  review = review.lower()
  nts = re.sub('[^a-z ]+', '', review).split()
  nts = [word for word in nts if word not in sw]
  nts = [stemmer.stem(w) for w in nts]
  return ' '.join(nts)


def cleanDataset(df: pd.DataFrame) -> pd.DataFrame:
  df = df.dropna(subset=['review', 'drugName'])
  df = df.drop_duplicates(ignore_index = True)
  df['review'] = df['review'].apply(cleanReview)
  return df

In [None]:
def filterReviewsForDrug(df: pd.DataFrame, drug: str) -> pd.DataFrame:
  return df[df.drugName == drug]

def filterPositiveReviews(df: pd.DataFrame) -> pd.DataFrame:
  return df[df.rating >= 5]

def filterNegativeReviews(df: pd.DataFrame) -> pd.DataFrame:
  return df[df.rating < 5]

def Tfidf_Vector(col):
    vectorizer = TfidfVectorizer(max_features=10)
    X = vectorizer.fit_transform(col)
    print(pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out()).head())

In [None]:
def createNgram(review, nGram=1):
  token = [token for token in review.lower().split(" ") if token != ""]
  ngrams = zip(*[token[i:] for i in range(nGram)])
  return [" ".join(ngram) for ngram in ngrams]


def createNgramFrequencyDf(df, nGram=1):
  freq_dict = collections.defaultdict(int)
  for review in df["review"]:
      for word in createNgram(review, nGram):
          freq_dict[word] += 1
  if len(freq_dict) > 0:
    fd_sorted = pd.DataFrame(sorted(freq_dict.items(), key=lambda x: x[1])[::-1])
    fd_sorted.columns = ["word", "wordcount"]
    return fd_sorted
  return None

In [None]:
#cleaned_df = cleanDataset(comb_df)
#testdf = filterReviewsForDrug(cleaned_df, 'Valsartan')

#posReviews = filterNegativeReviews(testdf)

#createNgramFrequencyDf(testdf, 3)

Unnamed: 0,word,wordcount
0,high blood pressur,8
1,no side effect,6
2,control blood pressur,5
3,blood pressur medic,5
4,lower blood pressur,4
...,...,...
2275,bystol fish oil,1
2276,combin bystol fish,1
2277,take combin bystol,1
2278,effect take combin,1


In [None]:
def createDataFrame(df: pd.DataFrame) -> pd.DataFrame:
  cleanedDf = cleanDataset(df)
  uniqueDrugList = cleanedDf['drugName'].unique()
  drugToTagDict = {}
  for drug in uniqueDrugList:
    drugDf = filterReviewsForDrug(cleanedDf, drug)
    reviewNGrams = createNgramFrequencyDf(drugDf, 3)
    if reviewNGrams is not None:
      drugToTagDict[drug] = reviewNGrams['word'].head(5).tolist()


  resultDf = pd.DataFrame(columns=['drugName', 'tag1', 'tag2', 'tag3', 'tag4', 'tag5'])
  for key, values in drugToTagDict.items():
    newRow = {}
    newRow['drugName'] = key
    for i in range(len(values)):
      newRow['tag'+str(i+1)] = values[i]
    resultDf = resultDf.append(newRow, ignore_index=True)

  return resultDf


In [None]:
resultDf = createDataFrame(comb_df)

In [None]:
resultDf.to_csv('result.csv')