In [4]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load the data
# Category is forced to string dtype; only the first 133 rows of the file are read.
dtype_dict = {'Category': str}
data = pd.read_csv("book2.csv", dtype=dtype_dict, nrows=133)

# Map categories to numerical labels
# The Category values keep their literal double quotes (see the printed frame),
# which is why the keys below contain escaped quotes.
category_map = {"\"Adverse Effects\"": 0, "\"Product Quality\"": 1, "\"Medical Information\"": 2}
# Spellings that occur in the data but are not keys of category_map.
# The file writes the first category as "Adversal Effects"; without this alias
# every such row would silently receive a NaN label.
category_aliases = {"\"Adversal Effects\"": "\"Adverse Effects\""}
category_explanation = {
    0: "Side effects related complaints.",
    1: "Complaints related to medicine being broken, wrong color, expired, etc.",
    2: "Non-event related complaints."
}
# Lookup used for the actual mapping: canonical names plus their aliases.
category_lookup = dict(category_map)
category_lookup.update(
    {alias: category_map[canonical] for alias, canonical in category_aliases.items()}
)
# Strip stray surrounding whitespace before the lookup; missing categories stay NaN.
data['label'] = data['Category'].str.strip().map(category_lookup)

# Surface any category that is present but still has no label, instead of
# letting it turn into NaN unnoticed.
unmapped_categories = data.loc[data['label'].isna() & data['Category'].notna(), 'Category'].unique()
if len(unmapped_categories) > 0:
    print("Warning: unmapped Category values:", list(unmapped_categories))

# Handle missing values
# Rows without review text get a fixed placeholder string.
data['review'] = data['review'].fillna("No information available")

# Define the prompts for each category
# One sentence per category; the position in `prompts` is the numeric label.
adverse_effects_prompt = "This complaint is about adverse effects such as side effects related to the medicine."
product_quality_prompt = "This complaint is about product quality such as broken medicine, wrong color, or expired product."
medical_information_prompt = "This complaint is about medical information and non-event related issues."

prompts = [adverse_effects_prompt, product_quality_prompt, medical_information_prompt]

# Load pre-trained BioBERT model and tokenizer
# Tokenizer and encoder come from the same checkpoint.
biobert_checkpoint = 'dmis-lab/biobert-base-cased-v1.1'
tokenizer = BertTokenizer.from_pretrained(biobert_checkpoint)
model = BertModel.from_pretrained(biobert_checkpoint)

# Use the GPU when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Inference only: switch to eval mode and move the weights to the chosen device.
model.eval()
model.to(device)

# Function to encode text
def encode_text(text):
    """Embed text with BioBERT by mean-pooling the final hidden states.

    Args:
        text: A string, or a list of strings, handed to the tokenizer.
            Inputs longer than 128 tokens are truncated.

    Returns:
        numpy.ndarray: 2-D array on the CPU with one row per input text
        (a single string yields a single row).
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128).to(device)
    with torch.no_grad():
        outputs = model(**inputs)

    hidden_states = outputs.last_hidden_state
    # Average over real tokens only. With padding=True a batch of texts is padded
    # to a common length, and a plain mean over dim=1 would mix padding positions
    # into the embedding. For a single string the mask is all ones, so this equals
    # the plain mean.
    token_mask = inputs["attention_mask"].unsqueeze(-1).to(hidden_states.dtype)
    summed = (hidden_states * token_mask).sum(dim=1)
    token_counts = token_mask.sum(dim=1).clamp(min=1)  # guard against division by zero
    return (summed / token_counts).cpu().numpy()

# Function to classify review using prompts
# Prompt embeddings do not depend on the review, so they are computed once per
# prompt text and reused. Re-running this cell empties the cache; clear it by
# hand if `model` or `tokenizer` is replaced without re-running this cell.
_prompt_embedding_cache = {}


def classify_review_with_prompts(review_text):
    """Assign a review to the category whose prompt it is most similar to.

    The review and every entry of `prompts` are embedded with `encode_text`;
    the category is the index of the prompt with the highest cosine similarity.

    Args:
        review_text: Review text to classify.

    Returns:
        tuple: (label, explanation) where label is the int index into `prompts`
        and explanation is `category_explanation[label]`.
    """
    review_embedding = encode_text(review_text)

    # Encode each prompt only the first time it is seen.
    for prompt in prompts:
        if prompt not in _prompt_embedding_cache:
            _prompt_embedding_cache[prompt] = encode_text(prompt)
    prompt_matrix = np.vstack([_prompt_embedding_cache[prompt] for prompt in prompts])

    # One call gives the similarity of the review to every prompt (row 0).
    similarities = cosine_similarity(review_embedding, prompt_matrix)[0]
    best_match = int(np.argmax(similarities))  # plain int rather than numpy.int64

    return best_match, category_explanation[best_match]

# Classify each review in the dataset
# Run the classifier once per review and split the (label, explanation) pairs
# into the two columns; classifying separately for each column would double
# the number of model forward passes.
predictions = [classify_review_with_prompts(review) for review in data['review']]
data['predicted_label'] = [label for label, _ in predictions]
data['predicted_explanation'] = [explanation for _, explanation in predictions]

# Print the results
print(data[['review', 'Category', 'predicted_label', 'predicted_explanation']])


                                                review            Category  \
0    "It has no side effect, I take it in combinati...  "Adversal Effects"   
1    "My son is halfway through his fourth week of ...  "Adversal Effects"   
2    "I used to take another oral contraceptive, wh...  "Adversal Effects"   
3    "This is my first time using any form of birth...  "Adversal Effects"   
4    "Suboxone has completely turned my life around...  "Adversal Effects"   
..                                                 ...                 ...   
128                           No information available                 NaN   
129                           No information available                 NaN   
130                           No information available                 NaN   
131                           No information available                 NaN   
132                           No information available                 NaN   

     predicted_label                              predicted_exp

In [9]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load the data
# Read the first 133 rows, forcing the review column to string dtype.
dtype_dict = {'review': str}
data = pd.read_csv("book2.csv", nrows=133, dtype=dtype_dict)

# Handle missing values
# Reviews that are missing are replaced by a fixed placeholder text.
missing_review = data['review'].isna()
data['review'] = data['review'].mask(missing_review, "No information available")

# Define the prompts for each category
# Position in the list is the numeric label of the category.
prompts = list((
    "This complaint is about adverse effects such as side effects related to the medicine.",
    "This complaint is about product quality such as broken medicine, wrong color, or expired product.",
    "This complaint is about medical information and non-event related issues.",
))

# Human-readable explanation for each numeric label (keys 0, 1, 2).
category_explanation = dict(enumerate((
    "Adverse Effects: Side effects related complaints.",
    "Product Quality: Complaints related to medicine being broken, wrong color, expired, etc.",
    "Non-event:If complaints dont belong to adverse effects or product quality",
)))

# Load pre-trained BioBERT model and tokenizer
# Resolve the target device first: GPU when available, CPU otherwise.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Both pieces are loaded from the same BioBERT checkpoint.
checkpoint_name = 'dmis-lab/biobert-base-cased-v1.1'
tokenizer = BertTokenizer.from_pretrained(checkpoint_name)
model = BertModel.from_pretrained(checkpoint_name)

# Put the encoder in eval mode and place it on the selected device.
model.eval()
model.to(device)

# Function to encode text
def encode_text(text):
    """Embed text with BioBERT by mean-pooling the final hidden states.

    Args:
        text: A string, or a list of strings, handed to the tokenizer.
            Inputs longer than 128 tokens are truncated.

    Returns:
        numpy.ndarray: 2-D array on the CPU with one row per input text
        (a single string yields a single row).
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128).to(device)
    with torch.no_grad():
        outputs = model(**inputs)

    hidden_states = outputs.last_hidden_state
    # Average over real tokens only. With padding=True a batch of texts is padded
    # to a common length, and a plain mean over dim=1 would mix padding positions
    # into the embedding. For a single string the mask is all ones, so this equals
    # the plain mean.
    token_mask = inputs["attention_mask"].unsqueeze(-1).to(hidden_states.dtype)
    summed = (hidden_states * token_mask).sum(dim=1)
    token_counts = token_mask.sum(dim=1).clamp(min=1)  # guard against division by zero
    return (summed / token_counts).cpu().numpy()

# Function to classify review using prompts
# Prompt embeddings do not depend on the review, so they are computed once per
# prompt text and reused. Re-running this cell empties the cache; clear it by
# hand if `model` or `tokenizer` is replaced without re-running this cell.
_prompt_embedding_cache = {}


def classify_review_with_prompts(review_text):
    """Assign a review to the category whose prompt it is most similar to.

    The review and every entry of `prompts` are embedded with `encode_text`;
    the category is the index of the prompt with the highest cosine similarity.

    Args:
        review_text: Review text to classify.

    Returns:
        tuple: (label, explanation) where label is the int index into `prompts`
        and explanation is `category_explanation[label]`.
    """
    review_embedding = encode_text(review_text)

    # Encode each prompt only the first time it is seen.
    for prompt in prompts:
        if prompt not in _prompt_embedding_cache:
            _prompt_embedding_cache[prompt] = encode_text(prompt)
    prompt_matrix = np.vstack([_prompt_embedding_cache[prompt] for prompt in prompts])

    # One call gives the similarity of the review to every prompt (row 0).
    similarities = cosine_similarity(review_embedding, prompt_matrix)[0]
    best_match = int(np.argmax(similarities))  # plain int rather than numpy.int64

    return best_match, category_explanation[best_match]

# Classify each review in the dataset
# Each review is classified exactly once. The columns are built from a list of
# (label, explanation) pairs rather than by unpacking zip(*...), which raises
# ValueError when `data` has no rows.
predictions = [classify_review_with_prompts(review) for review in data['review']]
data['predicted_label'] = [label for label, _ in predictions]
data['predicted_explanation'] = [explanation for _, explanation in predictions]

# Print the results
print(data[['review', 'predicted_label', 'predicted_explanation']])

# Function to classify user query
def classify_user_query(query):
    """Return only the category explanation predicted for a free-text query.

    Args:
        query: Text to classify; passed unchanged to `classify_review_with_prompts`.

    Returns:
        The explanation string (second element of the classifier's result).
    """
    prediction = classify_review_with_prompts(query)
    return prediction[1]


                                                review  predicted_label  \
0    "It has no side effect, I take it in combinati...                0   
1    "My son is halfway through his fourth week of ...                1   
2    "I used to take another oral contraceptive, wh...                1   
3    "This is my first time using any form of birth...                0   
4    "Suboxone has completely turned my life around...                1   
..                                                 ...              ...   
128                           No information available                1   
129                           No information available                1   
130                           No information available                1   
131                           No information available                1   
132                           No information available                1   

                                 predicted_explanation  
0    Adverse Effects: Side effects related

In [10]:
# Example user query
# A free-text query is classified with the same prompt-similarity approach
# used for the reviews, and its category explanation is printed.
user_query = "Discovered a new drug approval for the treatment of osteoporosis. Excited about potential advancements in bone health management."
user_query_classification = classify_user_query(query=user_query)
print("User Query Classification:", user_query_classification)

User Query Classification: Adverse Effects: Side effects related complaints.
