In [1]:
import os
import re
import sys
import json
from collections import Counter, defaultdict
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx

from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud

In [2]:
try:
    from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
    vader_available = True
except Exception:
    vader_available = False

In [3]:
DATA_PATH = "/content/drugs_side_effects_drugs_com.csv"
OUT_DIR = "outputs"
os.makedirs(OUT_DIR, exist_ok=True)

sns.set(style="whitegrid")
plt.rcParams["figure.figsize"] = (10, 6)

In [4]:
def normalize_colname(c):
    return c.strip().lower().replace(" ", "_")

In [5]:
def extract_list_from_string(s):
    """Convert comma- or pipe-separated text to a clean list."""
    if pd.isna(s):
        return []
    s2 = re.sub(r"[\[\]\(\)]", "", str(s))
    parts = re.split(r"\s*[,\|;/]\s*", s2)
    clean_parts = [p.strip() for p in parts if p.strip() and p.strip().lower() not in ("na", "none", "no info")]
    return clean_parts

In [6]:
def clean_text(s):
    """Clean review text: remove punctuation, numbers, and extra spaces."""
    if pd.isna(s):
        return ""
    s = str(s)
    s = re.sub(r"http\S+", "", s)
    s = re.sub(r"[^0-9A-Za-z' ]+", " ", s)
    s = re.sub(r"\s+", " ", s)
    return s.lower().strip()

In [7]:
print(" Loading data...")
df = pd.read_csv(DATA_PATH, low_memory=False)
df.columns = [normalize_colname(c) for c in df.columns]
print("Columns:", df.columns.tolist())
print("Dataset size:", df.shape)

 Loading data...
Columns: ['drug_name', 'medical_condition', 'side_effects', 'generic_name', 'drug_classes', 'brand_names', 'activity', 'rx_otc', 'pregnancy_category', 'csa', 'alcohol', 'related_drugs', 'medical_condition_description', 'rating', 'no_of_reviews', 'drug_link', 'medical_condition_url']
Dataset size: (2931, 17)


In [8]:
drug_col = next((c for c in df.columns if "drug" in c), None)
side_col = next((c for c in df.columns if "side" in c), None)
cond_col = next((c for c in df.columns if "condition" in c or "disease" in c or "treat" in c), None)
rating_col = next((c for c in df.columns if "rating" in c or "score" in c), None)
review_col = next((c for c in df.columns if "review" in c or "comment" in c or "text" in c), None)


In [9]:
print(f"Detected columns:\n drug={drug_col}, side={side_col}, cond={cond_col}, rating={rating_col}, review={review_col}")


Detected columns:
 drug=drug_name, side=side_effects, cond=medical_condition, rating=rating, review=no_of_reviews


In [10]:
df["drug"] = df[drug_col] if drug_col else np.nan
df["side_effects"] = df[side_col] if side_col else ""
df["condition"] = df[cond_col] if cond_col else ""
df["rating"] = pd.to_numeric(df[rating_col], errors="coerce") if rating_col else np.nan
df["review"] = df[review_col].astype(str) if review_col else ""

df["side_effects_list"] = df["side_effects"].apply(extract_list_from_string)
df["condition_list"] = df["condition"].apply(extract_list_from_string)
df["clean_review"] = df["review"].apply(clean_text)

In [11]:
print("Unique drugs:", df["drug"].nunique())
print("Unique conditions:", df["condition"].nunique())

top_drugs = df["drug"].value_counts().head(10)
print("\nTop Drugs:\n", top_drugs)

Unique drugs: 2912
Unique conditions: 47

Top Drugs:
 drug
triamcinolone     3
minoxidil         2
beclomethasone    2
cromolyn          2
acyclovir         2
erythromycin      2
mometasone        2
hydrocortisone    2
diclofenac        2
minocycline       2
Name: count, dtype: int64


In [12]:
if df["rating"].notna().any():
    plt.figure()
    sns.histplot(df["rating"].dropna(), bins=20, kde=True, color="skyblue")
    plt.title("Distribution of Drug Ratings")
    plt.xlabel("Rating")
    plt.ylabel("Count")
    plt.savefig(os.path.join(OUT_DIR, "ratings_distribution.png"))
    plt.close()

In [13]:
side_counts = Counter()
for lst in df["side_effects_list"]:
    side_counts.update(lst)

top_sides = pd.DataFrame(side_counts.most_common(20), columns=["Side Effect", "Count"])
top_sides.to_csv(os.path.join(OUT_DIR, "top_side_effects.csv"), index=False)

plt.figure(figsize=(8, 5))
sns.barplot(y="Side Effect", x="Count", data=top_sides, palette="magma")
plt.title("Top 20 Most Common Side Effects")
plt.tight_layout()
plt.savefig(os.path.join(OUT_DIR, "top_side_effects.png"))
plt.close()



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(y="Side Effect", x="Count", data=top_sides, palette="magma")


In [14]:
all_text = " ".join(df["clean_review"].tolist())
if len(all_text) > 10:
    wc = WordCloud(width=1000, height=600, background_color="white", max_words=200).generate(all_text)
    plt.figure(figsize=(10, 6))
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.title("WordCloud of Drug Reviews")
    plt.tight_layout()
    plt.savefig(os.path.join(OUT_DIR, "reviews_wordcloud.png"))
    plt.close()

In [15]:
if vader_available:
    analyzer = SentimentIntensityAnalyzer()
    df["sentiment"] = df["clean_review"].apply(lambda t: analyzer.polarity_scores(t)["compound"])
    avg_sent = df.groupby("drug")["sentiment"].mean().sort_values(ascending=False).head(10)
    print("\nTop 10 Drugs by Positive Sentiment:\n", avg_sent)
    avg_sent.to_csv(os.path.join(OUT_DIR, "avg_sentiment_by_drug.csv"))
else:
    print(" VADER sentiment not installed. Skipping sentiment analysis.")

 VADER sentiment not installed. Skipping sentiment analysis.


In [16]:
print("\nCalculating TF-IDF keywords...")
vectorizer = TfidfVectorizer(stop_words="english", max_features=1500)
X = vectorizer.fit_transform(df["clean_review"])
feature_names = np.array(vectorizer.get_feature_names_out())
mean_tfidf = np.asarray(X.mean(axis=0)).ravel()
top_terms = feature_names[np.argsort(mean_tfidf)[::-1][:30]]

pd.DataFrame({"term": top_terms}).to_csv(os.path.join(OUT_DIR, "top_tfidf_terms.csv"), index=False)
print("Saved TF-IDF keywords.")


Calculating TF-IDF keywords...
Saved TF-IDF keywords.


In [17]:
print("Building Drug–Side Effect network...")
G = nx.Graph()

Building Drug–Side Effect network...


In [18]:
top_drug_list = df["drug"].value_counts().head(30).index
top_side_list = [s for s, _ in side_counts.most_common(50)]

for _, row in df.iterrows():
    drug = row["drug"]
    for se in row["side_effects_list"]:
        if drug in top_drug_list and se in top_side_list:
            G.add_edge(drug, f"SE::{se}")

plt.figure(figsize=(12, 10))
pos = nx.spring_layout(G, seed=42, k=0.5)
nx.draw_networkx_nodes(G, pos, node_color="skyblue", node_size=300)
nx.draw_networkx_edges(G, pos, alpha=0.4)
nx.draw_networkx_labels(G, pos, font_size=7)
plt.title("Drug ↔ Side Effect Co-occurrence Network")
plt.axis("off")
plt.tight_layout()
plt.savefig(os.path.join(OUT_DIR, "drug_sideeffect_network.png"))
plt.close()


In [19]:
print("\n Analysis complete! Outputs saved in:", OUT_DIR)
print("Files generated:")
for f in os.listdir(OUT_DIR):
    print(" -", f)


 Analysis complete! Outputs saved in: outputs
Files generated:
 - top_side_effects.csv
 - drug_sideeffect_network.png
 - top_tfidf_terms.csv
 - ratings_distribution.png
 - reviews_wordcloud.png
 - top_side_effects.png
