In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

import os
import string
color = sns.color_palette()

%matplotlib inline

from sklearn import model_selection, preprocessing, metrics, naive_bayes, linear_model
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.exceptions import NotFittedError

import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
import nltk

# Load data
df1 = pd.read_csv('drugs1.tsv', sep='\t')   
df2 = pd.read_csv('drugs2.tsv', sep='\t') 
df = pd.concat([df1, df2], axis=0)


dff=df
# Data Preprocessing
df = df.dropna(axis=0)
df.drop(['Unnamed: 0', 'rating', 'date', 'usefulCount'], axis=1, inplace=True)

df_train = df[(df['condition'] == 'Birth Control') | 
              (df['condition'] == 'Depression') | 
              (df['condition'] == 'Pain') | 
              (df['condition'] == 'Anxiety')]

# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')

stop = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

def review_to_words(raw_review):
    review_text = BeautifulSoup(raw_review, 'html.parser').get_text()
    letters_only = re.sub('[^a-zA-Z]', ' ', review_text)
    words = letters_only.lower().split()
    meaningful_words = [w for w in words if w not in stop]
    lemmatized_words = [lemmatizer.lemmatize(w) for w in meaningful_words]
    return ' '.join(lemmatized_words)

# Apply preprocessing
df_train['review_clean'] = df_train['review'].apply(review_to_words)

# Feature and target variables
X_feat = df_train['review_clean']
y = df_train['condition']

# Train-Test Split
X_train, X_test, y_train, y_test = model_selection.train_test_split(X_feat, y, test_size=0.2, random_state=42)

# TF-IDF Vectorization
tfidf_vectorizer2 = TfidfVectorizer(max_df=0.8, ngram_range=(1, 2))
tfidf_train_2 = tfidf_vectorizer2.fit_transform(X_train)
tfidf_test_2 = tfidf_vectorizer2.transform(X_test)

# Model Training
pass_tf = PassiveAggressiveClassifier()
pass_tf.fit(tfidf_train_2, y_train)
pred = pass_tf.predict(tfidf_test_2)
score = metrics.accuracy_score(y_test, pred)
print(f"Accuracy: {score:.3f}")
cm = metrics.confusion_matrix(y_test, pred, labels=df_train['condition'].unique())
print("Confusion Matrix:")
print(cm)






# Sample sentences for recommending drugs


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Windows\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Windows\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Accuracy: 0.974
Confusion Matrix:
[[7767    6    5    2]
 [  17 2236   17   73]
 [  10   33 1570   25]
 [  11  121   20 1419]]


In [2]:
# Model Training
pass_tf = PassiveAggressiveClassifier()
pass_tf.fit(tfidf_train_2, y_train)
pred = pass_tf.predict(tfidf_test_2)
score = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)
cm = metrics.confusion_matrix(y_test, pred, labels=df_train['condition'].unique())
print("Confusion Matrix:")
print(cm)

# Assuming df and other required variables (e.g., tfidf_vectorizer2, mnb_tf, review_to_words) are already defined
df = dff

def top_drugs_extractor(condition):
    df_top = df[(df['rating'] >= 9) & (df['usefulCount'] >= 100)].sort_values(by=['rating', 'usefulCount'], ascending=[False, False])
    drug_lst = df_top[df_top['condition'] == condition]['drugName'].head(3).tolist()
    return drug_lst if drug_lst else ['No top drugs available']  # Handling case when no drugs are found

def predict_text(lst_text):
    df_test = pd.DataFrame(lst_text, columns=['test_sent'])
    df_test["test_sent"] = df_test["test_sent"].apply(review_to_words)
    tfidf_bigram = tfidf_vectorizer2.transform(df_test["test_sent"])
    prediction = pass_tf.predict(tfidf_bigram)
    df_test['prediction'] = prediction
    return df_test

sentences = [
    "I have only been on Tekturna for 9 days. The effect was immediate. I am also on a calcium channel blocker (Tiazac) and hydrochlorothiazide. I was put on Tekturna because of palpitations experienced with Diovan (ugly drug in my opinion, same company produces both however). The palpitations were pretty bad on Diovan, 24 hour monitor by EKG etc. After a few days of substituting Tekturna for Diovan, there are no more palpitations.",
    "This is the third med I've tried for anxiety and mild depression. Been on it for a week and I hate it so much. I am so dizzy, I have major diarrhea and feel worse than I started. Contacting my doc in the am and changing asap.",
    "I just got diagnosed with type 2. My doctor prescribed Invokana and metformin from the beginning. My sugars went down to normal by the second week. I am losing so much weight. No side effects yet. Miracle medicine for me"
]

# Combine sentences into one string
combined_text = ' '.join(sentences)

# Predict using the combined text
df_predictions = predict_text([combined_text])  # Pass combined_text as a list

# Extract and print results
text = combined_text
label = df_predictions['prediction'][0]
    
# Map the label to the condition
if label == "High Blood Pressure":
    target = "High Blood Pressure"
elif label == "Depression":
    target = "Depression"
elif label == "Diabetes, Type 2":
    target = "Diabetes, Type 2"
else:
    target = "Birth Control"

top_drugs = top_drugs_extractor(target)

# Print results
print("Condition:", target)
print("Top 3 Suggested Drugs:")
for drug in top_drugs:
    print(drug)
print()


accuracy:   0.974
Confusion Matrix:
[[7763    8    7    2]
 [  18 2234   19   72]
 [  10   34 1577   17]
 [  11  122   25 1413]]
Condition: Depression
Top 3 Suggested Drugs:
Sertraline
Zoloft
Viibryd



In [8]:
import pickle

# Save the TF-IDF Vectorizer
with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf_vectorizer2, f)

# Save the MultinomialNB model
with open('PassiveAggressiveClassifier_model.pkl', 'wb') as f:
    pickle.dump(pass_tf, f)


