In [None]:
import pandas as pd
import itertools
import string
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import matplotlib.pyplot as plt
%matplotlib inline
# Lift pandas' row-display cap so long outputs are shown in full instead of being truncated.
pd.set_option("display.max_rows", None)

In [None]:
import numpy as np

In [None]:
data = pd.read_csv(r"C:\Users\satish\Downloads\drug+review+dataset+drugs+com\drugsComTrain_raw.tsv", sep='\t')

In [None]:
data.head()

In [None]:
data.condition.value_counts()

In [None]:
data_train=data[(data['condition']=='Birth Control')|(data['condition']=='Diabetes, Type 2')|(data['condition']=='Depression')|(data['condition']=='High Blood Pressure')]

In [None]:
data.shape

In [None]:
data_train.shape

In [None]:
X=data_train.drop(['Unnamed: 0','drugName','rating','date','usefulCount'],axis=1)

In [None]:
X.condition.value_counts()

In [None]:
X.head()

In [None]:
X_birth=X[(X['condition']=='Birth Control')]
X_dep=X[(X['condition']=='Depression')]
X_bp=X[(X['condition']=='High Blood Pressure')]
X_diab=X[(X['condition']=='Diabetes, Type 2')]

In [None]:
from wordcloud import WordCloud
plt.figure(figsize=(20, 20)) #Text that is Fake News Headline
wc = WordCloud(max_words=500, width=1600, height=800).generate(" ".join(X_birth['review']))
plt.imshow(wc, interpolation='bilinear')
plt.title("word cloud for Birth Control",fontsize = 14)


In [None]:
from wordcloud import WordCloud
plt.figure(figsize=(20, 20)) #Text that is Fake News Headline
wc = WordCloud(max_words=500, width=1600, height=800).generate(" ".join(X_dep['review']))
plt.imshow(wc, interpolation='bilinear')
plt.title("word cloud for Depression",fontsize = 14)


In [None]:
from wordcloud import WordCloud
plt.figure(figsize=(20, 20)) #Text that is Fake News Headline
wc = WordCloud(max_words=500, width=1600, height=800).generate(" ".join(X_bp['review']))
plt.imshow(wc, interpolation='bilinear')
plt.title("word cloud for Blood Pressure",fontsize = 14)


In [None]:
from wordcloud import WordCloud
plt.figure(figsize=(20, 20)) #Text that is Fake News Headline
wc = WordCloud(max_words=500, width=1600, height=800).generate(" ".join(X_diab['review']))
plt.imshow(wc, interpolation='bilinear')
plt.title("word cloud for Diabetes Type 2",fontsize = 14)


# data preprocessing

In [None]:
X['review'][2]

In [None]:
X['review'][11]

In [None]:
for i,col in enumerate(X.columns):
    X.iloc[:, i] = X.iloc[:, i].str.replace('"','')

In [None]:
#to set the width of the column to maximum
pd.set_option('max_colwidth', None)

In [None]:
X.head()

In [None]:
import nltk
nltk.download('stopwords')

In [None]:
!pip install spacy

import spacy 

In [None]:

from nltk.corpus import stopwords
stop = stopwords.words("english")


In [None]:
stop

In [None]:
from nltk.stem import WordNetLemmatizer, PorterStemmer

# Create instances of lemmatizer and stemmer
lemmatizer = WordNetLemmatizer()
porter = PorterStemmer()

In [None]:
!pip install html5lib
!pip install lxml

In [None]:
from bs4 import BeautifulSoup
import re





In [None]:
import nltk
nltk.download('wordnet')


In [None]:
from bs4 import BeautifulSoup

# Example HTML content
html_content = "<html><body><h1>Hello, BeautifulSoup!</h1></body></html>"

# Parse with html.parser
soup = BeautifulSoup(html_content, 'html.parser')



In [None]:
def review_to_words(raw_review):
    """Normalise one raw review into a space-separated string of lemmas.

    Steps: decode HTML entities, replace every non-letter with a space,
    lower-case, drop stop words (module-level ``stop``) and lemmatize each
    remaining token with the module-level ``lemmatizer``.

    Parameters
    ----------
    raw_review : str
        Review text as read from the dataset.

    Returns
    -------
    str
        Cleaned tokens joined by single spaces; ``''`` when the input is not
        a string (e.g. a missing value) or no token survives.
    """
    import html  # local import keeps this cell self-contained

    # Missing values arrive as non-strings; treat them as empty text.
    if not isinstance(raw_review, str):
        return ''
    # 1 Decode entities such as &#039; / &amp; / &quot; so they do not leave
    #   junk tokens like "amp" or "quot" behind.
    decoded = html.unescape(raw_review)
    # 2 Replace punctuation, digits and every other non-letter with a space
    letters_only = re.sub(r'[^a-zA-Z]', ' ', decoded)
    # 3 lower letters
    words = letters_only.lower().split()
    # 4 stop words (a set gives constant-time membership tests)
    stop_set = set(stop)
    meaningful_words = [w for w in words if w not in stop_set]
    # 5 Lemmatization
    lemmatize_words = [lemmatizer.lemmatize(w) for w in meaningful_words]
    # 6 space join words
    return ' '.join(lemmatize_words)

In [None]:
X['review_clean'] = X['review'].apply(review_to_words)

In [None]:
X.head()

## Creating features and target variables


In [None]:
X_feat = X['review_clean']
y = X['condition']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_feat,y,stratify=y,test_size=0.2,random_state=0)

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Print a one-line banner and plot the confusion matrix as a heat-map.

    Parameters
    ----------
    cm : array-like of shape (n_classes, n_classes)
        Confusion matrix; rows are true labels, columns are predicted labels.
    classes : sequence of str
        Tick labels, in the same order as the rows/columns of ``cm``.
    normalize : bool, default False
        When True every row is divided by its sum, so each cell shows the
        fraction of that true class. Rows that sum to zero stay all-zero.
    title : str
        Figure title.
    cmap : matplotlib colormap
        Colormap used for the heat-map.
    """
    cm = np.asarray(cm)

    # Normalise BEFORE drawing so the colours, the cell text and the text-colour
    # threshold all refer to the same matrix.
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # Guard against 0/0 for classes that have no true samples.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Cells darker than half the maximum get white text for contrast.
    thresh = cm.max() / 2.
    cell_format = '.2f' if normalize else 'd'
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], cell_format),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

# Bag of Words

In [None]:
count_vectorizer = CountVectorizer(stop_words = 'english')
count_train = count_vectorizer.fit_transform(X_train)
count_test = count_vectorizer.transform(X_test)

In [None]:
count_train

# Machine Learning Models : Naive Bayes

In [8]:
# Multinomial Naive Bayes on the bag-of-words counts, scored on the hold-out split.
mnb = MultinomialNB().fit(count_train, y_train)
pred = mnb.predict(count_test)
score = metrics.accuracy_score(y_test, pred)
print("accuracy: %0.3f" % score)

# One label order shared by the matrix rows/columns and the tick labels.
condition_labels = ['Birth Control', 'Depression', 'Diabetes, Type 2', 'High Blood Pressure']
cm = metrics.confusion_matrix(y_test, pred, labels=condition_labels)
plot_confusion_matrix(cm, classes=condition_labels)
plt.show()

NameError: name 'count_train' is not defined

# Machine Learning Model : Passive Aggressive classifier

In [None]:
from sklearn.linear_model import PassiveAggressiveClassifier,LogisticRegression

# The classifier shuffles the training data between epochs; pin the seed so the
# reported accuracy is reproducible (same seed as the train/test split).
passive = PassiveAggressiveClassifier(random_state=0)
passive.fit(count_train, y_train)
pred = passive.predict(count_test)
score = metrics.accuracy_score(y_test,pred)
print("accuracy: %0.3f" % score)

# One label order shared by the matrix rows/columns and the tick labels.
condition_labels = ['Birth Control', 'Depression', 'Diabetes, Type 2', 'High Blood Pressure']
cm = metrics.confusion_matrix(y_test, pred, labels=condition_labels)
plot_confusion_matrix(cm, classes=condition_labels)
plt.show()

# TFIDF


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF with the default n-gram range; terms found in more than 80% of the
# training documents are dropped (max_df). Note: the names `tfidf_train_2` /
# `tfidf_test_2` are assigned again further down with (1, 2)-gram features.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.8)
tfidf_train_2 = tfidf_vectorizer.fit_transform(X_train)
tfidf_test_2 = tfidf_vectorizer.transform(X_test)

# Machine Learning Models : Naive Bayes

In [None]:
# Multinomial Naive Bayes on the TF-IDF features, scored on the hold-out split.
mnb_tf = MultinomialNB().fit(tfidf_train_2, y_train)
pred = mnb_tf.predict(tfidf_test_2)
score = metrics.accuracy_score(y_test, pred)
print("accuracy: %0.3f" % score)

# One label order shared by the matrix rows/columns and the tick labels.
condition_labels = ['Birth Control', 'Depression', 'Diabetes, Type 2', 'High Blood Pressure']
cm = metrics.confusion_matrix(y_test, pred, labels=condition_labels)
plot_confusion_matrix(cm, classes=condition_labels)
plt.show()

# TFIDF : Passive Aggressive classifier

In [None]:
# TF-IDF features with the default n-gram range, fitted on the training split only.
tfidf_vectorizer = TfidfVectorizer(stop_words ='english' ,max_df = 0.8)
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)

# The classifier shuffles the training data between epochs; pin the seed so the
# reported accuracy is reproducible (same seed as the train/test split).
pass_tf = PassiveAggressiveClassifier(random_state=0)
pass_tf.fit(tfidf_train, y_train)
pred = pass_tf.predict(tfidf_test )
score = metrics.accuracy_score(y_test,pred)
print("accuracy: %0.3f" % score)

# One label order shared by the matrix rows/columns and the tick labels.
condition_labels = ['Birth Control', 'Depression', 'Diabetes, Type 2', 'High Blood Pressure']
cm = metrics.confusion_matrix(y_test, pred, labels=condition_labels)
plot_confusion_matrix(cm, classes=condition_labels)
plt.show()

In [None]:
tfidf_vectorizer = TfidfVectorizer(stop_words ='english' ,max_df = 0.8,ngram_range = (1,2))
tfidf_train_2 = tfidf_vectorizer.fit_transform(X_train)
tfidf_test_2 = tfidf_vectorizer.transform(X_test)

In [None]:
pass_tf = PassiveAggressiveClassifier()
pass_tf.fit(tfidf_train_2, y_train)
pred = pass_tf.predict(tfidf_test_2)
score = metrics.accuracy_score(y_test,pred)
print("accuracy: %0.3f" % score)

cm = metrics.confusion_matrix(y_test, pred, labels=['Birth Control', 'Depression', 'Diabetes, Type 2', 'High Blood Pressure'])
plot_confusion_matrix(cm, classes=['Birth Control', 'Depression', 'Diabetes, Type 2', 'High Blood Pressure'])
plt.show()

In [None]:
tfidf_vectorizer = TfidfVectorizer(stop_words ='english' ,max_df = 0.8,ngram_range = (1,3))
tfidf_train_3 = tfidf_vectorizer.fit_transform(X_train)
tfidf_test_3 = tfidf_vectorizer.transform(X_test)

In [None]:
pass_tf = PassiveAggressiveClassifier()
pass_tf.fit(tfidf_train_3, y_train)
pred = pass_tf.predict(tfidf_test_3)
score = metrics.accuracy_score(y_test,pred)
print("accuracy: %0.3f" % score)

cm = metrics.confusion_matrix(y_test, pred, labels=['Birth Control', 'Depression', 'Diabetes, Type 2', 'High Blood Pressure'])
plot_confusion_matrix(cm, classes=['Birth Control', 'Depression', 'Diabetes, Type 2', 'High Blood Pressure'])
plt.show()

# Most informative feature 

In [None]:
def most_informative_feature_for_class(vectorizer,classifier,classlabel,n=10):
    """Print the ``n`` highest-weighted features of one class, weakest first.

    The row of ``classifier.coef_`` that belongs to ``classlabel`` is paired
    with the names from ``vectorizer.get_feature_names_out()``; the pairs are
    sorted ascending by weight (ties by name) and the last ``n`` are printed
    as ``classlabel feature weight``.
    """
    class_row = list(classifier.classes_).index(classlabel)
    vocabulary = vectorizer.get_feature_names_out()
    ranked = list(zip(classifier.coef_[class_row], vocabulary))
    ranked.sort()

    for weight, term in ranked[-n:]:
        print(classlabel, term, weight)

In [None]:
most_informative_feature_for_class(tfidf_vectorizer,pass_tf,'Birth Control')

In [None]:
most_informative_feature_for_class(tfidf_vectorizer,pass_tf,'Depression')

In [None]:
most_informative_feature_for_class(tfidf_vectorizer,pass_tf,'High Blood Pressure')

In [None]:
most_informative_feature_for_class(tfidf_vectorizer,pass_tf,'Diabetes, Type 2')

In [None]:
X.tail()

In [None]:
text=['I just got diagnosed with type 2. My doctor prescribed Invokana and metformin from the beginning. My sugars went down to normal by the second week. I am losing so much weight. No side effects yet. Miracle medicine for me']

test=tfidf_vectorizer.transform(text)
pred1=pass_tf.predict(test)[0]
pred1

In [None]:
text=['I have only been on Tekturna for 9 days. The effect was immediate. I am also on a calcium channel blocker (Tiazac) and hydrochlorothiazide. I was put on Tekturna because of palpitations experienced with Diovan (ugly drug in my opinion, same company produces both however). The palpitations were pretty bad on Diovan, 24 hour monitor by EKG etc. After a few days of substituting Tekturna for Diovan, there are no more palpitations.']
test=tfidf_vectorizer.transform(text)
pred1=pass_tf.predict(test)[0]
pred1

In [5]:
test_data = pd.read_csv(r"C:\Users\satish\Downloads\drug+review+dataset+drugs+com\drugsComTest_raw.tsv", sep='\t')

In [6]:
test_data.head()

Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount
0,163740,Mirtazapine,Depression,"""I&#039;ve tried a few antidepressants over th...",10.0,"February 28, 2012",22
1,206473,Mesalamine,"Crohn's Disease, Maintenance","""My son has Crohn&#039;s disease and has done ...",8.0,"May 17, 2009",17
2,159672,Bactrim,Urinary Tract Infection,"""Quick reduction of symptoms""",9.0,"September 29, 2017",3
3,39293,Contrave,Weight Loss,"""Contrave combines drugs that were used for al...",9.0,"March 5, 2017",35
4,97768,Cyclafem 1 / 35,Birth Control,"""I have been on this birth control for one cyc...",9.0,"October 22, 2015",4


In [7]:
import pickle

In [None]:
with open()