<a href="https://colab.research.google.com/github/samarthya04/IIIT-A-internship/blob/main/DrugLib_Reviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

import warnings
warnings.filterwarnings('ignore')

In [3]:
train_path = '/content/drive/MyDrive/IIIT-A-Internship/Drug Review/Drug Reviews (Druglib.com)/drugLibTest_raw.csv'
test_path = '/content/drive/MyDrive/IIIT-A-Internship/Drug Review/Drug Reviews (Druglib.com)/drugLibTest_raw.csv'

In [4]:
cols = ['reviewID', 'drugName', 'rating', 'effectiveness', 'sideEffects', 'condition', 'benefitsReview', 'sideEffectsReview', 'commentsReview']

train_data = pd.read_csv(train_path, names=cols).drop(0)
test_data = pd.read_csv(test_path, names=cols).drop(0)

df = pd.concat([train_data, test_data],ignore_index=True,axis=0)

display(df)

Unnamed: 0,reviewID,drugName,rating,effectiveness,sideEffects,condition,benefitsReview,sideEffectsReview,commentsReview
0,1366.0,biaxin,9,Considerably Effective,Mild Side Effects,sinus infection,The antibiotic may have destroyed bacteria cau...,"Some back pain, some nauseau.",Took the antibiotics for 14 days. Sinus infect...
1,3724.0,lamictal,9,Highly Effective,Mild Side Effects,bipolar disorder,Lamictal stabilized my serious mood swings. On...,"Drowsiness, a bit of mental numbness. If you t...",Severe mood swings between hypomania and depre...
2,3824.0,depakene,4,Moderately Effective,Severe Side Effects,bipolar disorder,Initial benefits were comparable to the brand ...,"Depakene has a very thin coating, which caused...",Depakote was prescribed to me by a Kaiser psyc...
3,969.0,sarafem,10,Highly Effective,No Side Effects,bi-polar / anxiety,It controlls my mood swings. It helps me think...,I didnt really notice any side effects.,This drug may not be for everyone but its wond...
4,696.0,accutane,10,Highly Effective,Mild Side Effects,nodular acne,Within one week of treatment superficial acne ...,Side effects included moderate to severe dry s...,Drug was taken in gelatin tablet at 0.5 mg per...
...,...,...,...,...,...,...,...,...,...
2067,690.0,accutane,7,Considerably Effective,Severe Side Effects,acne vulgaris,Detoxing effect by pushing out the system thro...,"Hairloss, extreme dry skin, itchiness, raises ...",Treatment period is 3 months/12 weeks. Dosage ...
2068,1071.0,proair-hfa,10,Highly Effective,No Side Effects,asthma,"The albuterol relieved the constriction, irrit...",I have experienced no side effects.,I use the albuterol as needed because of aller...
2069,681.0,accutane,8,Considerably Effective,Moderate Side Effects,serve acne,Serve Acne has turned to middle,"Painfull muscles, problems with seeing at night","This drug is highly teratogenic ,females must ..."
2070,2709.0,divigel,10,Highly Effective,No Side Effects,menopause,"My overall mood, sense of well being, energy l...",No side effects of any kind were noted or appa...,Divigel is a topically applied Bio-Identical H...


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2072 entries, 0 to 2071
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   reviewID           2072 non-null   float64
 1   drugName           2072 non-null   object 
 2   rating             2072 non-null   object 
 3   effectiveness      2072 non-null   object 
 4   sideEffects        2072 non-null   object 
 5   condition          2072 non-null   object 
 6   benefitsReview     2062 non-null   object 
 7   sideEffectsReview  2026 non-null   object 
 8   commentsReview     2070 non-null   object 
dtypes: float64(1), object(8)
memory usage: 145.8+ KB


In [6]:
# Convert rating to numeric
df['rating'] = df['rating'].apply(pd.to_numeric, errors='coerce', downcast='signed')

In [7]:
# Convert categorical columns to category dtype
categorical_columns = ['drugName', 'effectiveness', 'sideEffects', 'condition']
for col in categorical_columns:
    df[col] = df[col].astype('category')

In [8]:
# Drop missing values and duplicates
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

In [9]:
# Check data types
print(df.dtypes)

reviewID              float64
drugName             category
rating                   int8
effectiveness        category
sideEffects          category
condition            category
benefitsReview         object
sideEffectsReview      object
commentsReview         object
dtype: object


In [10]:
# Exploratory Data Analysis (EDA)
# Distribution of Ratings
fig_rating = px.histogram(df, x='rating', title='Rating Distribution in Drug Lib', labels={'rating': 'Rating', 'count': 'Count'})
fig_rating.show()

In [11]:
# Effectiveness Analysis
fig_effectiveness = px.histogram(df, x='effectiveness', title='Effectiveness Distribution', labels={'effectiveness': 'Effectiveness', 'count': 'Count'})
fig_effectiveness.show()

In [12]:
# Side Effects Analysis
fig_side_effects = px.histogram(df, x='sideEffects', title='Side Effects Distribution', labels={'sideEffects': 'Side Effects', 'count': 'Count'})
fig_side_effects.update_xaxes(tickangle=-90)
fig_side_effects.show()

In [13]:
# Function to clean text
def clean_text(text):
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = re.sub(r'\W+', ' ', text)  # Remove special characters
    text = text.lower()  # Convert to lowercase
    return text

In [14]:
# Apply text cleaning function to review columns
df['benefitsReview'] = df['benefitsReview'].apply(clean_text)
df['sideEffectsReview'] = df['sideEffectsReview'].apply(clean_text)
df['commentsReview'] = df['commentsReview'].apply(clean_text)

In [15]:
# Combine the reviews into a single column
df['review'] = df['benefitsReview'] + ' ' + df['sideEffectsReview'] + ' ' + df['commentsReview']

In [16]:
# Drop the original review columns
df.drop(['benefitsReview', 'sideEffectsReview', 'commentsReview'], inplace=True, axis=1)

In [17]:
# Check for empty reviews and drop them
df['review'].replace('', np.nan, inplace=True)
df.dropna(subset=['review'], inplace=True)

In [18]:
# Label the data for sentiment analysis (assuming a 10-point scale where ratings 1-4 are negative, 5-6 are neutral, and 7-10 are positive)

In [19]:
def label_sentiment(rating):
    if rating <= 4:
        return 'negative'
    elif rating <= 6:
        return 'neutral'
    else:
        return 'positive'

In [20]:
df['sentiment'] = df['rating'].apply(label_sentiment)

In [21]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(df['review'], df['sentiment'], test_size=0.2, random_state=42)

In [22]:
# Vectorizing the text data
count_vectorizer = CountVectorizer(stop_words='english')
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

X_train_counts = count_vectorizer.fit_transform(X_train)
X_test_counts = count_vectorizer.transform(X_test)

X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [23]:
# Train and evaluate classifiers
classifiers = {
    'MultinomialNB (Count Vectorizer)': MultinomialNB(),
    'MultinomialNB (TF-IDF Vectorizer)': MultinomialNB(),
    'BernoulliNB (Count Vectorizer)': BernoulliNB(),
    'BernoulliNB (TF-IDF Vectorizer)': BernoulliNB(),
    'Linear SVM (Count Vectorizer)': SVC(kernel='linear'),
    'Linear SVM (TF-IDF Vectorizer)': SVC(kernel='linear'),
    'Polynomial SVM (Count Vectorizer)': SVC(kernel='poly'),
    'Polynomial SVM (TF-IDF Vectorizer)': SVC(kernel='poly'),
    'RBF SVM (Count Vectorizer)': SVC(kernel='rbf'),
    'RBF SVM (TF-IDF Vectorizer)': SVC(kernel='rbf'),
    'Sigmoid SVM (Count Vectorizer)': SVC(kernel='sigmoid'),
    'Sigmoid SVM (TF-IDF Vectorizer)': SVC(kernel='sigmoid'),
    'Decision Tree (Count Vectorizer)': DecisionTreeClassifier(),
    'Decision Tree (TF-IDF Vectorizer)': DecisionTreeClassifier(),
    'AdaBoost (Count Vectorizer)': AdaBoostClassifier(algorithm='SAMME'),
    'AdaBoost (TF-IDF Vectorizer)': AdaBoostClassifier(algorithm='SAMME')
}

In [24]:
# Train and evaluate each classifier
results = {}
for name, clf in classifiers.items():
    if 'Count' in name:
        X_train_vectorized = X_train_counts
        X_test_vectorized = X_test_counts
    else:
        X_train_vectorized = X_train_tfidf
        X_test_vectorized = X_test_tfidf

    clf.fit(X_train_vectorized, y_train)
    y_pred = clf.predict(X_test_vectorized)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    results[name] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    }

    print(f"{name} - Accuracy: {accuracy:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")

MultinomialNB (Count Vectorizer) - Accuracy: 0.73, Precision: 0.66, Recall: 0.73, F1 Score: 0.67
MultinomialNB (TF-IDF Vectorizer) - Accuracy: 0.67, Precision: 0.45, Recall: 0.67, F1 Score: 0.54
BernoulliNB (Count Vectorizer) - Accuracy: 0.67, Precision: 0.61, Recall: 0.67, F1 Score: 0.56
BernoulliNB (TF-IDF Vectorizer) - Accuracy: 0.67, Precision: 0.61, Recall: 0.67, F1 Score: 0.56
Linear SVM (Count Vectorizer) - Accuracy: 0.70, Precision: 0.69, Recall: 0.70, F1 Score: 0.69
Linear SVM (TF-IDF Vectorizer) - Accuracy: 0.74, Precision: 0.66, Recall: 0.74, F1 Score: 0.68
Polynomial SVM (Count Vectorizer) - Accuracy: 0.66, Precision: 0.45, Recall: 0.66, F1 Score: 0.54
Polynomial SVM (TF-IDF Vectorizer) - Accuracy: 0.67, Precision: 0.45, Recall: 0.67, F1 Score: 0.54
RBF SVM (Count Vectorizer) - Accuracy: 0.67, Precision: 0.45, Recall: 0.67, F1 Score: 0.54
RBF SVM (TF-IDF Vectorizer) - Accuracy: 0.67, Precision: 0.45, Recall: 0.67, F1 Score: 0.54
Sigmoid SVM (Count Vectorizer) - Accuracy: 0.

In [25]:
# Convert results to DataFrame for easier comparison
results_df = pd.DataFrame(results).T
print("\nComparative Performance Analysis:\n")
display(results_df)


Comparative Performance Analysis:



Unnamed: 0,Accuracy,Precision,Recall,F1 Score
MultinomialNB (Count Vectorizer),0.732673,0.658347,0.732673,0.670627
MultinomialNB (TF-IDF Vectorizer),0.673267,0.453289,0.673267,0.541801
BernoulliNB (Count Vectorizer),0.673267,0.610761,0.673267,0.55884
BernoulliNB (TF-IDF Vectorizer),0.673267,0.610761,0.673267,0.55884
Linear SVM (Count Vectorizer),0.70297,0.687767,0.70297,0.694156
Linear SVM (TF-IDF Vectorizer),0.737624,0.664087,0.737624,0.67748
Polynomial SVM (Count Vectorizer),0.663366,0.451089,0.663366,0.537011
Polynomial SVM (TF-IDF Vectorizer),0.673267,0.453289,0.673267,0.541801
RBF SVM (Count Vectorizer),0.668317,0.452194,0.668317,0.539413
RBF SVM (TF-IDF Vectorizer),0.673267,0.453289,0.673267,0.541801


In [26]:
# Plot the performance metrics using seaborn barplot
results_df_melted = results_df.reset_index().melt(id_vars='index', var_name='Metric', value_name='Score')
fig = px.bar(results_df_melted, x='index', y='Score', color='Metric',
             title='Comparative Performance Analysis',
             labels={'index': 'Classifiers', 'Score': 'Scores'},
             barmode='group', # Use 'group' for side-by-side bars
             width=800, height=600)
fig.update_xaxes(tickangle=-90) # Rotate x-axis labels for better readability

fig.show()