In [1]:
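# Optional: uncomment to install the third-party packages this notebook uses.
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
#%pip install nltk
#%pip install textblob
#%pip install wordcloud matplotlib seaborn ucimlrepo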


In [2]:
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from ucimlrepo import fetch_ucirepo
from wordcloud import WordCloud


In [3]:
# Fetch both drug-review datasets from the UCI repository by id
drug_reviews_drugs_com = fetch_ucirepo(id=462)
drug_reviews_druglib_com = fetch_ucirepo(id=461)

# Keep only the feature table of each fetched dataset
df1, df2 = (
    drug_reviews_drugs_com.data.features,
    drug_reviews_druglib_com.data.features,
)


In [6]:
# Stack the two feature tables row-wise; ignore_index=True renumbers the rows 0..n-1
combined_df = pd.concat((df1, df2), ignore_index=True)


In [None]:
## Exploratory Data Analysis (EDA)

In [None]:
# Preview the first rows of the combined table
print(combined_df.head())

# DataFrame.info() writes its report itself and returns None, so it is called
# bare; wrapping it in print() appended a stray "None" line to the output.
combined_df.info()

# Cardinality of the drug-name and condition columns
print("Unique Drug Names:", combined_df['urlDrugName'].nunique())
print("Unique Conditions:", combined_df['condition'].nunique())


In [None]:
# Summary statistics for the combined table
summary_stats = combined_df.describe()
print(summary_stats)

# How many rows carry each rating value
rating_counts = combined_df['rating'].value_counts()
print(rating_counts)


In [None]:
# Character length of each free-text review field, stored in new *Length columns.
# Missing reviews stay missing (NaN): the previous astype(str) turned NaN into
# the literal string "nan", so every absent review was counted as length 3 and
# skewed the statistics below. describe() skips NaN, so the summary now reflects
# only reviews that actually exist.
combined_df['benefitsLength'] = combined_df['benefitsReview'].map(len, na_action='ignore')
combined_df['sideEffectsLength'] = combined_df['sideEffectsReview'].map(len, na_action='ignore')
combined_df['commentsLength'] = combined_df['commentsReview'].map(len, na_action='ignore')

# Display descriptive statistics (lengths)
print(combined_df[['benefitsLength', 'sideEffectsLength', 'commentsLength']].describe())


In [None]:
# Summary statistics for the columns describe() picks up
numeric_summary = combined_df.describe()
print(numeric_summary)

# Frequency of each distinct value in the rating column
rating_distribution = combined_df['rating'].value_counts()
print(rating_distribution)


In [None]:
# Bar chart of how many rows fall on each rating value
fig, ax = plt.subplots(figsize=(11, 8))
sns.countplot(data=combined_df, x='rating', palette='viridis', ax=ax)
ax.set_title('Distribution of Ratings')
ax.set_xlabel('Rating')
ax.set_ylabel('Count')
plt.show()


In [None]:
# Bar chart of how many rows carry each effectiveness label
fig, ax = plt.subplots(figsize=(9, 5))
sns.countplot(data=combined_df, x='effectiveness', palette='plasma', ax=ax)
ax.set_title('Effectiveness Distribution')
ax.set_xlabel('Effectiveness')
ax.set_ylabel('Count')
# Tilt the category labels so they do not overlap
plt.xticks(rotation=45)
plt.show()


In [None]:
# Pairwise correlation is only defined for numeric data. The combined table also
# holds text columns (drug names, reviews), and DataFrame.corr() raises on those
# in pandas >= 2.0, so restrict the frame to numeric columns first.
numeric_columns = combined_df.select_dtypes(include='number')
correlation_matrix = numeric_columns.corr()

# Annotated heatmap of the correlation coefficients
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix')
plt.show()


In [None]:

# Number of distinct usefulCount values to show (adjust as needed)
top_n = 10

# The top_n usefulCount values that occur most often, with their frequencies
top_useful_counts = combined_df['usefulCount'].value_counts().nlargest(top_n)

# Rows whose usefulCount is one of those most frequent values
rows_in_top = combined_df[combined_df['usefulCount'].isin(top_useful_counts.index)]

fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(
    data=rows_in_top,
    x='usefulCount',
    palette='Set2',
    order=top_useful_counts.index,
    ax=ax,
)

# Titles, axis labels and a light horizontal grid for readability
ax.set_title('Distribution of Top Useful Reviews', fontsize=16)
ax.set_xlabel('Useful Reviews Count', fontsize=14)
ax.set_ylabel('Frequency', fontsize=14)
plt.xticks(rotation=45)
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()

plt.show()



In [None]:
def rename_columns(df):
    """Return ``df`` with the camelCase review columns renamed to snake_case.

    Columns named in the mapping are renamed; columns that are absent from
    ``df`` are ignored and any other columns are passed through unchanged.
    The input frame is not modified; a renamed frame is returned.
    """
    snake_case_names = {
        'urlDrugName': 'drug_name',
        'rating': 'rating',
        'effectiveness': 'effectiveness',
        'sideEffects': 'side_effects',
        'condition': 'condition',
        'benefitsReview': 'benefits_review',
        'sideEffectsReview': 'side_effects_review',
        'commentsReview': 'comments_review',
    }
    return df.rename(columns=snake_case_names)

# Apply the same column renaming to both datasets
df1, df2 = (rename_columns(frame) for frame in (df1, df2))

# Show the resulting column names of each dataset
print("Columns in Dataset 1:", df1.columns)
print("Columns in Dataset 2:", df2.columns)


In [None]:
def plot_drug_ratings(df, dataset_name, top_n=30):
    """Plot how many rows each of the ``top_n`` most frequent drugs has.

    Parameters
    ----------
    df : DataFrame expected to contain a ``drug_name`` column.
    dataset_name : label used in the plot title and in the fallback message.
    top_n : number of most frequent drug names to include (default 30).

    If ``drug_name`` is missing, a message is printed and nothing is plotted.
    """
    # Check the column before touching matplotlib: creating the figure first
    # left an empty figure behind whenever the column was missing.
    if 'drug_name' not in df.columns:
        print(f"'drug_name' column not found in {dataset_name}.")
        return

    # The top_n drug names by row count, in descending order of frequency
    top_drugs = df['drug_name'].value_counts().nlargest(top_n).index

    fig, ax = plt.subplots(figsize=(12, 6))
    sns.countplot(data=df[df['drug_name'].isin(top_drugs)], x='drug_name', order=top_drugs, ax=ax)
    ax.set_title(f'Top {top_n} Drug Ratings Count in {dataset_name}')
    # Drug names are long; vertical labels keep them legible
    plt.xticks(rotation=90)
    plt.show()

# Draw the top-30 drug chart for each dataset in turn
for frame, label in ((df1, "Dataset 1"), (df2, "Dataset 2")):
    plot_drug_ratings(frame, label, top_n=30)


In [None]:
def plot_word_cloud(df, column_name, dataset_name):
    """Render a word cloud of all text in ``df[column_name]``.

    Parameters
    ----------
    df : DataFrame holding the text column.
    column_name : name of the column whose non-missing values are combined.
    dataset_name : label used in the plot title and in the fallback messages.

    Prints a message and plots nothing when the column is missing or holds no
    text.
    """
    # Check the column before creating a figure, so a missing column no longer
    # leaves an empty figure behind.
    if column_name not in df.columns:
        print(f"'{column_name}' column not found in {dataset_name}.")
        return

    # Combine all reviews into a single string; astype(str) guards against
    # non-string cells, which str.join would reject.
    all_reviews = ' '.join(df[column_name].dropna().astype(str))

    # WordCloud.generate raises when it is given no words at all
    if not all_reviews.strip():
        print(f"No text available in '{column_name}' for {dataset_name}.")
        return

    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_reviews)

    plt.figure(figsize=(10, 8))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')  # Hide axes
    plt.title(f'Word Cloud for {column_name} in {dataset_name}')
    plt.show()

# Benefits-review word cloud for each dataset
for frame, label in ((df1, "Dataset 1"), (df2, "Dataset 2")):
    plot_word_cloud(frame, 'benefits_review', label)


In [None]:
def plot_drug_word_cloud(df, dataset_name):
    """Render a word cloud built from the values of ``df['drug_name']``.

    Parameters
    ----------
    df : DataFrame expected to contain a ``drug_name`` column.
    dataset_name : label used in the plot title and in the fallback messages.

    Prints a message and plots nothing when the column is missing or holds no
    names.
    """
    # Check the column before creating a figure, so a missing column no longer
    # leaves an empty figure behind.
    if 'drug_name' not in df.columns:
        print(f"'drug_name' column not found in {dataset_name}.")
        return

    # Combine all drug names into a single string; astype(str) guards against
    # non-string cells, which str.join would reject.
    all_drugs = ' '.join(df['drug_name'].dropna().astype(str))

    # WordCloud.generate raises when it is given no words at all
    if not all_drugs.strip():
        print(f"No drug names available in {dataset_name}.")
        return

    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_drugs)

    plt.figure(figsize=(10, 8))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')  # Hide axes
    plt.title(f'Word Cloud of Drug Names in {dataset_name}')
    plt.show()

# Drug-name word cloud for each dataset
for frame, label in ((df1, "Dataset 1"), (df2, "Dataset 2")):
    plot_drug_word_cloud(frame, label)


In [None]:
def plot_word_frequency(df, column_name):
    """Bar-plot the 20 most frequent whitespace-separated tokens of a text column.

    Non-missing values of ``df[column_name]`` are joined with spaces and split
    on whitespace; tokens are counted exactly as written (no lower-casing or
    punctuation stripping). Prints a message instead when the column is absent.
    """
    if column_name not in df.columns:
        print(f"'{column_name}' column not found in the DataFrame.")
        return

    tokens = ' '.join(df[column_name].dropna()).split()
    top_words = pd.Series(tokens).value_counts().head(20)  # Top 20 words

    top_words.plot(kind='bar', figsize=(12, 6))
    plt.title(f'Top 20 Words in {column_name}')
    plt.show()

# Top-20 word chart of the benefits reviews, one per dataset
for frame in (df1, df2):
    plot_word_frequency(frame, 'benefits_review')


In [None]:
# SentimentIntensityAnalyzer is already imported in the import cell at the top
# of the notebook, so it is not imported again here.

# Download the lexicon the VADER analyzer needs
nltk.download('vader_lexicon')

# Step 2: Load Data
# Reuse the dataset object fetched at the start of the notebook instead of
# downloading it a second time, and work on a copy so the columns added in this
# section do not leak into the fetched feature table.
X1 = drug_reviews_drugs_com.data.features.copy()

# Step 3: Inspect Available Columns
print("Available columns in the dataset:")
print(X1.columns)

# Columns the rest of the analysis was written against
review_text_columns = ['benefitsReview', 'sideEffectsReview', 'commentsReview']
expected_columns = review_text_columns + ['urlDrugName', 'rating']
missing_columns = [col for col in expected_columns if col not in X1.columns]

if missing_columns:
    print(f"Warning: The following expected columns are missing: {missing_columns}")

# Build the combined text from whichever review columns exist. Previously a
# missing column skipped this step entirely, so the scoring cell that follows
# failed with a KeyError on 'combined_reviews'.
available_text_columns = [col for col in review_text_columns if col in X1.columns]

# NOTE(review): this dataset appears to ship a single free-text `review` column
# instead of the three fields above - confirm against the column list printed
# by this cell.
if not available_text_columns and 'review' in X1.columns:
    available_text_columns = ['review']

if not available_text_columns:
    raise KeyError("No review text columns found; cannot build 'combined_reviews'.")

# Join the available fields with single spaces, treating missing text as empty
combined_reviews = X1[available_text_columns[0]].fillna("").astype(str)
for text_column in available_text_columns[1:]:
    combined_reviews = combined_reviews + " " + X1[text_column].fillna("").astype(str)
X1['combined_reviews'] = combined_reviews


In [None]:
 # Initialize the Sentiment Intensity Analyzer
# One analyzer instance is shared by every review
sia = SentimentIntensityAnalyzer()

# Keep only the 'compound' entry of the scores returned for each combined review
compound_scores = X1['combined_reviews'].map(
    lambda review: sia.polarity_scores(review)['compound']
)
X1['sentiment'] = compound_scores


In [None]:
# Categorize the sentiment scores into negative, neutral, and positive.
# The statement sits at top level: the previous leading indentation made this
# cell fail with an IndentationError.
# include_lowest=True closes the first bin on the left, so a score of exactly
# -1 is labelled 'negative' instead of falling outside every bin (NaN).
X1['sentiment_category'] = pd.cut(
    X1['sentiment'],
    bins=[-1, -0.05, 0.05, 1],
    labels=['negative', 'neutral', 'positive'],
    include_lowest=True,
)


In [None]:
# Check that 'sentiment_category' was created successfully.
# The original `if` had no body, which is a SyntaxError; an assertion states the
# same check and stops the notebook with a clear message if the column is absent.
assert 'sentiment_category' in X1.columns, "'sentiment_category' column was not created"


In [None]:
# Tally the reviews per sentiment category and print the tally under a heading
sentiment_counts = X1['sentiment_category'].value_counts()
print("Sentiment Counts:", sentiment_counts, sep="\n")


In [None]:
# Bar chart of the number of reviews in each sentiment category
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=X1, x='sentiment_category', palette='viridis', ax=ax)
ax.set_title('Sentiment Distribution of Drug Reviews')
ax.set_xlabel('Sentiment Category')
ax.set_ylabel('Count')
plt.show()


In [None]:
# Plotting sentiment distribution.
# `df` was never defined in this notebook; the scores live in X1['sentiment'].
# That column is a continuous score, so a histogram is used: a countplot would
# draw one bar per distinct float value.
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(data=X1, x='sentiment', bins=40, ax=ax)
ax.set_title('Sentiment Distribution of Drug Reviews')
ax.set_xlabel('Compound Sentiment Score')
ax.set_ylabel('Count')
plt.show()


In [None]:
# WordCloud is already imported in the import cell at the top of the notebook.

# Generate word cloud for side effects.
# `df` was never defined in this notebook; combined_df is the frame that holds
# the 'sideEffectsReview' column. astype(str) guards against non-string cells,
# which str.join would reject.
side_effects_text = ' '.join(combined_df['sideEffectsReview'].dropna().astype(str))
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(side_effects_text)

plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')  # Hide axes
plt.title('Common Side Effects in Reviews')
plt.show()
