In [25]:
#!pip install nltk
#!pip install textblob
#pip install wordcloud matplotlib
#!pip install spacy
#!python -m spacy download en_core_web_sm
#!pip install ucimlrepo
#!pip install scikit-learn
#pip install seaborn





In [26]:
import pandas as pd
import numpy as np
from ucimlrepo import fetch_ucirepo
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report




In [23]:
# Download both drug-review collections from the UCI ML repository
# (id 462 is bound to the Drugs.com name, id 461 to the Druglib.com name).
drug_reviews_drugs_com, drug_reviews_druglib_com = (
    fetch_ucirepo(id=dataset_id) for dataset_id in (462, 461)
)

# Keep only the feature tables: df1 <- Drugs.com, df2 <- Druglib.com
df1 = drug_reviews_drugs_com.data.features
df2 = drug_reviews_druglib_com.data.features


In [24]:
# Stack the two review tables row-wise under a fresh 0..n-1 index
combined_df = pd.concat(objs=(df1, df2), axis=0, ignore_index=True)


In [None]:
## Exploratory Data Analysis (EDA)

In [None]:
# Quick structural overview of the combined table
print(combined_df.head())
# DataFrame.info() writes its report itself and returns None, so it is called
# bare; wrapping it in print() appended a stray "None" line to the output.
combined_df.info()
print("Unique Drug Names:", combined_df['urlDrugName'].nunique())
print("Unique Conditions:", combined_df['condition'].nunique())


In [None]:
# Summary statistics, followed by how often each rating value occurs
print(
    combined_df.describe(),
    combined_df['rating'].value_counts(),
    sep='\n',
)


In [None]:
# Character length of each free-text review field, stored in new columns.
# Series.str.len() leaves a missing review as NaN. The previous
# astype(str).apply(len) first turned NaN into the string 'nan' and therefore
# reported a length of 3 for every row that lacks the field, which skewed the
# statistics printed below.
combined_df['benefitsLength'] = combined_df['benefitsReview'].str.len()
combined_df['sideEffectsLength'] = combined_df['sideEffectsReview'].str.len()
combined_df['commentsLength'] = combined_df['commentsReview'].str.len()

# Descriptive statistics of the review lengths (missing reviews are excluded)
print(combined_df[['benefitsLength', 'sideEffectsLength', 'commentsLength']].describe())


In [None]:
# Numeric summary of the combined table (now including the length columns)
print(combined_df.describe())

# Frequency of every distinct value in the rating column
print(combined_df.loc[:, 'rating'].value_counts())


In [None]:
# Bar chart: number of reviews per rating value
plt.figure(figsize=(11, 8))
sns.countplot(x='rating', data=combined_df, palette='viridis')
plt.gca().set(title='Distribution of Ratings', xlabel='Rating', ylabel='Count')
plt.show()


In [None]:
# Bar chart: number of reviews per effectiveness category
plt.figure(figsize=(9, 5))
sns.countplot(x='effectiveness', data=combined_df, palette='plasma')
plt.gca().set(title='Effectiveness Distribution', xlabel='Effectiveness', ylabel='Count')
plt.xticks(rotation=45)  # tilt the category labels
plt.show()


In [None]:
# Pairwise correlations between the numeric columns only. The combined table
# also holds text columns (drug names, conditions, reviews); pandas >= 2.0
# raises a ValueError when DataFrame.corr() meets them unless
# numeric_only=True is passed.
correlation_matrix = combined_df.corr(numeric_only=True)
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix')
plt.show()


In [None]:

# The `top_n` usefulCount values that occur most often in the table
top_n = 10
top_useful_counts = combined_df['usefulCount'].value_counts().nlargest(top_n)

# Keep only the rows that carry one of those values
is_top_value = combined_df['usefulCount'].isin(top_useful_counts.index)

plt.figure(figsize=(12, 6))
sns.countplot(
    x='usefulCount',
    data=combined_df[is_top_value],
    order=top_useful_counts.index,  # bars sorted from most to least frequent
    palette='Set2',
)

# Titles, labels and a light horizontal grid
plt.title('Distribution of Top Useful Reviews', fontsize=16)
plt.xlabel('Useful Reviews Count', fontsize=14)
plt.ylabel('Frequency', fontsize=14)
plt.xticks(rotation=45)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()

plt.show()



In [None]:
def rename_columns(df):
    """Return a copy of `df` with the drug-review columns in snake_case.

    Columns that are absent from `df` are ignored, so the same mapping can be
    applied to both review tables.

    Args:
        df: A pandas DataFrame holding one of the review tables.

    Returns:
        A new DataFrame with the renamed columns; `df` itself is not modified.
    """
    column_map = {
        'urlDrugName': 'drug_name',
        # NOTE(review): the Drugs.com table is assumed to call its drug column
        # 'drugName' (per the UCI dataset description). Without this entry that
        # table never gets a 'drug_name' column and every drug plot falls
        # through to its "column not found" message.
        'drugName': 'drug_name',
        'sideEffects': 'side_effects',
        'benefitsReview': 'benefits_review',
        'sideEffectsReview': 'side_effects_review',
        # 'commentsReview' is deliberately left alone: later cells index it by
        # its original name.
    }
    return df.rename(columns=column_map)

# Normalise the column names of both tables in one step
df1, df2 = rename_columns(df1), rename_columns(df2)

# List what each table holds after the rename
print("Columns in Dataset 1:", df1.columns)
print("Columns in Dataset 2:", df2.columns)


In [None]:
def plot_drug_ratings(df, dataset_name, top_n=30):
    """Bar-chart how many reviews each of the `top_n` most-reviewed drugs has.

    Args:
        df: Table expected to contain a 'drug_name' column.
        dataset_name: Label used in the plot title and in the fallback message.
        top_n: Number of most frequent drug names to show.

    Returns:
        None. Shows a matplotlib figure, or prints a message when the
        'drug_name' column is absent.
    """
    # Guard first: the figure used to be created before this check, which left
    # an empty figure behind whenever the column was missing.
    if 'drug_name' not in df.columns:
        print(f"'drug_name' column not found in {dataset_name}.")
        return

    # Most frequent drug names, in descending order of review count
    top_drugs = df['drug_name'].value_counts().nlargest(top_n).index

    plt.figure(figsize=(12, 6))
    sns.countplot(data=df[df['drug_name'].isin(top_drugs)], x='drug_name', order=top_drugs)
    plt.title(f'Top {top_n} Drug Ratings Count in {dataset_name}')
    plt.xticks(rotation=90)
    plt.show()

# One chart per dataset, 30 most-reviewed drugs each
plot_drug_ratings(df=df1, dataset_name="Dataset 1", top_n=30)
plot_drug_ratings(df=df2, dataset_name="Dataset 2", top_n=30)


In [None]:
def plot_word_cloud(df, column_name, dataset_name):
    """Render a word cloud built from every non-missing entry of one text column.

    Args:
        df: Table holding the text column.
        column_name: Name of the column whose text is pooled.
        dataset_name: Label used in the plot title and in messages.

    Returns:
        None. Shows a matplotlib figure, or prints a message when the column is
        absent or holds no text.
    """
    if column_name not in df.columns:
        print(f"'{column_name}' column not found in {dataset_name}.")
        return

    # astype(str) keeps str.join from failing on stray non-string entries
    all_reviews = ' '.join(df[column_name].dropna().astype(str))
    if not all_reviews.strip():
        # WordCloud.generate() raises ValueError when it is given no words
        print(f"'{column_name}' column in {dataset_name} contains no text.")
        return

    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_reviews)

    # The figure is created only once there is something to draw; it used to be
    # opened before the column check and was left empty on the fallback path.
    plt.figure(figsize=(10, 8))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')  # Hide axes
    plt.title(f'Word Cloud for {column_name} in {dataset_name}')
    plt.show()

# Benefits-review word cloud for each dataset
plot_word_cloud(df=df1, column_name='benefits_review', dataset_name="Dataset 1")
plot_word_cloud(df=df2, column_name='benefits_review', dataset_name="Dataset 2")


In [None]:
def plot_drug_word_cloud(df, dataset_name):
    """Render a word cloud of the values in the 'drug_name' column.

    Args:
        df: Table expected to contain a 'drug_name' column.
        dataset_name: Label used in the plot title and in messages.

    Returns:
        None. Shows a matplotlib figure, or prints a message when the column is
        absent or holds no names.
    """
    if 'drug_name' not in df.columns:
        print(f"'drug_name' column not found in {dataset_name}.")
        return

    # astype(str) keeps str.join from failing on stray non-string entries
    all_drugs = ' '.join(df['drug_name'].dropna().astype(str))
    if not all_drugs.strip():
        # WordCloud.generate() raises ValueError when it is given no words
        print(f"'drug_name' column in {dataset_name} contains no text.")
        return

    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_drugs)

    # The figure is created only once there is something to draw; it used to be
    # opened before the column check and was left empty on the fallback path.
    plt.figure(figsize=(10, 8))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')  # Hide axes
    plt.title(f'Word Cloud of Drug Names in {dataset_name}')
    plt.show()

# Drug-name word cloud for each dataset
plot_drug_word_cloud(df=df1, dataset_name="Dataset 1")
plot_drug_word_cloud(df=df2, dataset_name="Dataset 2")


In [None]:
def plot_word_frequency(df, column_name):
    """Bar-chart the 20 most frequent whitespace-separated words of a text column.

    Prints a message instead when `column_name` is not a column of `df`.
    """
    if column_name not in df.columns:
        print(f"'{column_name}' column not found in the DataFrame.")
        return

    # Pool every non-missing entry, then split on whitespace
    tokens = ' '.join(df[column_name].dropna()).split()
    top_words = pd.Series(tokens).value_counts().head(20)  # Top 20 words

    top_words.plot(kind='bar', figsize=(12, 6))
    plt.title(f'Top 20 Words in {column_name}')
    plt.show()

# Word-frequency chart of the benefits reviews, once per dataset
plot_word_frequency(df=df1, column_name='benefits_review')
plot_word_frequency(df=df2, column_name='benefits_review')


In [None]:
# Word cloud of the condition names recorded in df1
condition_text = ' '.join(df1['condition'].dropna())
wordcloud = WordCloud(background_color='white', width=800, height=400)
wordcloud.generate(condition_text)  # fills the cloud in place

plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.gca().set_title('Common Conditions in Reviews (Dataset 1)')
plt.show()


In [None]:
# Word cloud of the side-effect review text in df2
side_effects_text = ' '.join(df2['side_effects_review'].dropna())
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
).generate(side_effects_text)

plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.title('Common Side Effects in Reviews (Dataset 2)')
plt.axis('off')
plt.show()


In [None]:
# Word cloud of the condition names recorded in df2
condition_text = ' '.join(df2['condition'].dropna())
cloud_options = dict(width=800, height=400, background_color='white')
wordcloud = WordCloud(**cloud_options).generate(condition_text)

plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Common Conditions in Reviews (Dataset 2)')
plt.show()


In [None]:
def clean_text(text):
    """Lower-case `text` and delete digits and punctuation.

    Anything that is not a `str` (e.g. a missing value) is returned unchanged.
    """
    if not isinstance(text, str):
        return text
    lowered = text.lower()
    without_digits = re.sub(r'\d+', '', lowered)
    # Drops every character that is neither a word character nor whitespace
    return re.sub(r'[^\w\s]', '', without_digits)

# Store a cleaned copy of every df1 review next to the original text
df1['cleaned_review'] = df1['review'].map(clean_text)

# Peek at the first cleaned reviews
print("Cleaned review:", df1['cleaned_review'].head())


In [None]:
# Fetch the NLTK resources used for tokenizing and stop-word filtering
for _resource in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(_resource)

# English stop words, held as a set
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Return `text` lower-cased with digits and punctuation removed; non-strings pass through."""
    if isinstance(text, str):
        # Inner call strips digit runs, outer call strips punctuation
        return re.sub(r'[^\w\s]', '', re.sub(r'\d+', '', text.lower()))
    return text

def tokenize_and_remove_stopwords(text):
    """Tokenize `text` and keep the tokens that are not in `stop_words`.

    Returns an empty list when `text` is not a string.
    """
    if not isinstance(text, str):
        return []
    return [token for token in word_tokenize(text) if token not in stop_words]

# Step 1: cleaned text of each df1 review
df1['cleaned_review'] = df1['review'].map(clean_text)

# Step 2: token list per review with the stop words filtered out
df1['tokens_review'] = df1['cleaned_review'].map(tokenize_and_remove_stopwords)

# Peek at the first token lists
print("Tokenized review:", df1['tokens_review'].head())


In [None]:
from nltk.stem import WordNetLemmatizer

# Fetch the WordNet corpus, then create the lemmatizer shared by the cells below
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

def lemmatize_words(tokens):
    """Return a list holding the lemma of every token, in the original order."""
    return list(map(lemmatizer.lemmatize, tokens))

# Lemmatize every token list of df1
df1['lemmatized_review'] = df1['tokens_review'].map(lemmatize_words)

# Peek at the first lemmatized reviews
print("Lemmatized review:", df1['lemmatized_review'].head())


In [None]:
# TF-IDF matrix (at most 500 terms) over the lemmatized reviews of df1
if 'lemmatized_review' not in df1.columns:
    print("'lemmatized_review' is missing.")
else:
    # Token lists are stringified so the vectorizer can tokenize them again
    text_column = df1['lemmatized_review'].astype(str)

    tfidf_vectorizer = TfidfVectorizer(max_features=500, stop_words='english')
    X_tfidf = tfidf_vectorizer.fit_transform(text_column)

    # Rows are reviews, columns are vocabulary terms
    print("TF-IDF matrix shape:", X_tfidf.shape)


In [None]:
from sklearn.decomposition import PCA

# TF-IDF term ranking plus a 2-D PCA view of the lemmatized reviews of df1
if 'lemmatized_review' in df1.columns:
    # Token lists are stringified so the vectorizer can tokenize them again
    text_column = df1['lemmatized_review'].astype(str)

    # Vocabulary capped at 500 terms
    tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=500)
    X_tfidf = tfidf_vectorizer.fit_transform(text_column)

    # Display the shape of the TF-IDF matrix
    print("TF-IDF matrix shape:", X_tfidf.shape)

    # Densify the sparse matrix once and share it between the DataFrame and
    # PCA. The cell used to call toarray() twice, materialising two full dense
    # copies of a (reviews x 500) float matrix.
    X_dense = X_tfidf.toarray()
    tfidf_df = pd.DataFrame(X_dense, columns=tfidf_vectorizer.get_feature_names_out())

    # Rank the terms by their mean TF-IDF score across all reviews
    mean_tfidf_scores = tfidf_df.mean().sort_values(ascending=False)
    top_terms = mean_tfidf_scores.head(10)
    print("Top TF-IDF terms:\n", top_terms)

    # Visualize the top TF-IDF terms
    plt.figure(figsize=(10, 6))
    sns.barplot(x=top_terms.values, y=top_terms.index)
    plt.title('Top 10 TF-IDF Terms')
    plt.xlabel('Mean TF-IDF Score')
    plt.ylabel('Terms')
    plt.show()

    # Project the reviews onto their first two principal components.
    # random_state pins the result whenever scikit-learn selects its
    # randomized solver, matching the fixed seeds used elsewhere in the notebook.
    pca = PCA(n_components=2, random_state=42)
    reduced_data = pca.fit_transform(X_dense)

    # Create a DataFrame for the PCA result
    pca_df = pd.DataFrame(reduced_data, columns=['PC1', 'PC2'])

    # Visualize PCA results
    plt.figure(figsize=(10, 8))
    sns.scatterplot(x='PC1', y='PC2', data=pca_df, alpha=0.5)
    plt.title('PCA of TF-IDF Features')
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.show()

else:
    print("'lemmatized_review' is missing.")


In [None]:

# VADER reads its lexicon from the NLTK data directory. The notebook downloads
# punkt, stopwords and wordnet but never this resource, so on a fresh
# environment the constructor below failed with a LookupError.
nltk.download('vader_lexicon')

# Initialize VADER sentiment analyzer
sia = SentimentIntensityAnalyzer()

def analyze_sentiment(text):
    """Return the VADER polarity scores of `text`.

    Non-string input yields a dict with 'neg', 'neu', 'pos' and 'compound' all
    set to 0.
    """
    neutral = {'neg': 0, 'neu': 0, 'pos': 0, 'compound': 0}
    return sia.polarity_scores(text) if isinstance(text, str) else neutral

# Score the raw df1 reviews with VADER (requires a 'review' column)
if 'review' not in df1.columns:
    print("'review' column is missing.")
else:
    # Full score dict per review, then its compound value as a separate column
    df1['comments_review_sentiment'] = df1['review'].apply(analyze_sentiment)
    df1['comments_sentiment_score'] = df1['comments_review_sentiment'].apply(
        lambda scores: scores['compound']
    )

    # Distribution of the compound scores
    print("Sentiment score distribution:")
    print(df1['comments_sentiment_score'].describe())

    # First 30 reviews with their score
    print("Sentiment scores for comments reviews:")
    print(df1[['review', 'comments_sentiment_score']].head(30))

    # Reviews whose compound score is more than 0.1 away from zero
    is_significant = df1['comments_sentiment_score'].abs() > 0.1  # Adjust the threshold as needed
    significant_sentiment = df1[is_significant]
    print("Reviews with significant sentiment scores:")
    print(significant_sentiment[['review', 'comments_sentiment_score']].head(30))


In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Fresh VADER analyzer for the lemmatized-review pass below
sia = SentimentIntensityAnalyzer()

def analyze_sentiment(text):
    """Return the VADER polarity scores of a string or of a list of tokens.

    Args:
        text: A string, or a list/tuple of tokens that is joined with spaces
            before scoring.

    Returns:
        The dict produced by `sia.polarity_scores`, or a dict with 'neg',
        'neu', 'pos' and 'compound' all 0 for any other input.
    """
    if isinstance(text, (list, tuple)):
        # The 'lemmatized_review' column holds token lists. They used to fail
        # the isinstance(str) check below, so every row was scored as all zeros.
        text = ' '.join(str(token) for token in text)
    if isinstance(text, str):
        return sia.polarity_scores(text)
    return {'neg': 0, 'neu': 0, 'pos': 0, 'compound': 0}

# Score the lemmatized df1 reviews; keep the full score dict and its compound value
df1['comments_review_sentiment'] = df1['lemmatized_review'].apply(analyze_sentiment)
df1['comments_sentiment_score'] = df1['comments_review_sentiment'].apply(
    lambda scores: scores['compound']
)

# Distribution of the compound scores
print("Sentiment score distribution:")
print(df1['comments_sentiment_score'].describe())

# First 15 reviews with their score
print("Sentiment scores for comments reviews:")
print(df1[['lemmatized_review', 'comments_sentiment_score']].head(15))

# Reviews whose compound score is more than 0.1 away from zero
is_significant = df1['comments_sentiment_score'].abs() > 0.1  # Adjust the threshold as needed
significant_sentiment = df1[is_significant]
print("Reviews with significant sentiment scores:")
print(significant_sentiment[['lemmatized_review', 'comments_sentiment_score']].head(15))



In [None]:
from sklearn.cluster import KMeans

# Cluster the raw df1 reviews with K-means on TF-IDF features
if 'review' in df1.columns:
    review_texts = df1['review'].astype(str)

    # Full-vocabulary TF-IDF features (English stop words removed)
    tfidf_vectorizer = TfidfVectorizer(stop_words='english')
    X_tfidf = tfidf_vectorizer.fit_transform(review_texts)

    # Seeded K-means; the cluster id of each review is stored on df1
    n_clusters = 5  # Adjust the number of clusters as needed
    kmeans = KMeans(random_state=42, n_clusters=n_clusters)
    df1['cluster'] = kmeans.fit_predict(X_tfidf)

    # First 30 reviews with their cluster id
    print("Cluster assignments for reviews:")
    print(df1[['review', 'cluster']].head(30))


In [None]:
# Cluster the lemmatized df1 reviews with K-means on TF-IDF features
if 'lemmatized_review' in df1.columns:
    # Token lists are stringified so the vectorizer can tokenize them again
    lemmatized_texts = df1['lemmatized_review'].astype(str)

    tfidf_vectorizer = TfidfVectorizer(stop_words='english')
    X_tfidf = tfidf_vectorizer.fit_transform(lemmatized_texts)

    # Seeded K-means; overwrites the 'cluster' column of df1
    n_clusters = 5  # Adjust the number of clusters as needed
    kmeans = KMeans(random_state=42, n_clusters=n_clusters)
    df1['cluster'] = kmeans.fit_predict(X_tfidf)

    # First 30 reviews with their cluster id
    print("Cluster assignments for reviews:")
    print(df1[['lemmatized_review', 'cluster']].head(30))


In [None]:
def plot_most_frequent_words(text_column, top_n=20):
    """Bar-chart the `top_n` most frequent non-stop-words of `text_column`."""
    # A vocabulary limited to `top_n` terms is exactly the set to plot
    vectorizer = CountVectorizer(max_features=top_n, stop_words='english')
    word_matrix = vectorizer.fit_transform(text_column)

    # Column sums give the corpus-wide count of each term
    word_freq = np.asarray(word_matrix.sum(axis=0)).ravel()
    words = vectorizer.get_feature_names_out()

    # (word, count) pairs ordered from most to least frequent
    ranked_pairs = sorted(zip(words, word_freq), key=lambda pair: pair[1], reverse=True)
    word_df = pd.DataFrame(ranked_pairs, columns=['Word', 'Frequency'])

    sns.barplot(x='Frequency', y='Word', data=word_df)
    plt.title(f'Top {top_n} Most Frequent Words')
    plt.show()

# Top 25 words across all lemmatized df1 reviews
if 'lemmatized_review' not in df1.columns:
    print("'lemmatized_review' is missing.")
else:
    plot_most_frequent_words(df1['lemmatized_review'].astype(str), top_n=25)

In [None]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

# Topic modelling (LDA) on the lemmatized reviews of df1
if 'lemmatized_review' not in df1.columns:
    print("'lemmatized_review' is missing.")
else:
    # Only the first `max_rows` reviews are used, to bound the run time
    max_rows = 100000  # Adjust this as needed
    text_column = df1['lemmatized_review'].astype(str).head(max_rows)

    # Bag-of-words counts over a vocabulary of at most 300 terms
    vectorizer = CountVectorizer(max_features=300, stop_words='english')
    X = vectorizer.fit_transform(text_column)

    # Three topics, seeded, fitted on every available core
    lda = LatentDirichletAllocation(n_components=3, n_jobs=-1, random_state=42)
    lda.fit(X)

    def print_lda_topics(lda, vectorizer, top_n=10):
        """Print, per topic, its `top_n` highest-weighted terms in ascending weight order."""
        words = vectorizer.get_feature_names_out()
        for i, topic in enumerate(lda.components_):
            strongest = topic.argsort()[-top_n:]
            print(f"Topic {i + 1}:")
            print([words[term_index] for term_index in strongest])

    # Print the top words for each topic
    print_lda_topics(lda, vectorizer)


In [None]:
def clean_text(text):
    """Lower-case a string and remove digit runs, then punctuation; other values pass through."""
    if not isinstance(text, str):
        return text
    cleaned = text.lower()
    # Applied in this order: digits first, then non-word / non-space characters
    for pattern in (r'\d+', r'[^\w\s]'):
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned

# Cleaned copies of the two df2 text columns: (new column, source column)
for _target, _source in (
    ('cleaned_benefits_review', 'benefits_review'),
    ('cleaned_comments_review', 'commentsReview'),
):
    df2[_target] = df2[_source].apply(clean_text)

# Peek at the cleaned columns
print(df2[['cleaned_benefits_review', 'cleaned_comments_review']].head())


In [None]:
def tokenize_and_remove_stopwords(text):
    """Split a string into tokens and drop those found in `stop_words`.

    Non-string input gives an empty list.
    """
    kept = []
    if isinstance(text, str):
        for word in word_tokenize(text):
            if word in stop_words:
                continue
            kept.append(word)
    return kept

# Token lists (stop words removed) for both cleaned df2 columns
for _target, _source in (
    ('tokens_benefits_review', 'cleaned_benefits_review'),
    ('tokens_comments_review', 'cleaned_comments_review'),
):
    df2[_target] = df2[_source].apply(tokenize_and_remove_stopwords)

# Peek at the tokenized columns
print(df2[['tokens_benefits_review', 'tokens_comments_review']].head())


In [None]:

def lemmatize_words(tokens):
    """Lemmatize each token and return the results as a new list."""
    lemmas = []
    for token in tokens:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

# Lemmatized token lists for both df2 text columns
for _target, _source in (
    ('lemmatized_benefits_review', 'tokens_benefits_review'),
    ('lemmatized_comments_review', 'tokens_comments_review'),
):
    df2[_target] = df2[_source].apply(lemmatize_words)

# Peek at the lemmatized columns
print(df2[['lemmatized_benefits_review', 'lemmatized_comments_review']].head())


In [None]:
# VADER analyzer used for the df2 comment reviews
sia = SentimentIntensityAnalyzer()

def analyze_sentiment(tokens):
    """Return the VADER polarity scores of a token sequence.

    Args:
        tokens: An iterable of string tokens, joined with spaces before
            scoring. A plain string is scored as-is.

    Returns:
        The dict produced by `sia.polarity_scores`, or a dict with 'neg',
        'neu', 'pos' and 'compound' all 0 when `tokens` cannot be joined
        (e.g. a missing value).
    """
    if isinstance(tokens, str):
        # ' '.join() on a string would space out its characters and leave
        # VADER with no recognisable words, so strings are scored directly.
        text = tokens
    else:
        try:
            text = ' '.join(tokens)
        except TypeError:
            # Not an iterable of strings; same neutral result the other
            # sentiment helpers in this notebook give for non-text input.
            return {'neg': 0, 'neu': 0, 'pos': 0, 'compound': 0}
    return sia.polarity_scores(text)

# Score the lemmatized df2 comment reviews; keep the full dict and its compound value
df2['comments_review_sentiment'] = df2['lemmatized_comments_review'].apply(analyze_sentiment)
df2['comments_sentiment_score'] = df2['comments_review_sentiment'].apply(
    lambda scores: scores['compound']
)

# Peek at the sentiment results
print(df2[['comments_sentiment_score', 'comments_review_sentiment']].head())


In [None]:
def plot_most_frequent_words(text_column, top_n=15):
    """Plot a horizontal bar chart of the `top_n` most frequent non-stop-words."""
    vectorizer = CountVectorizer(stop_words='english', max_features=top_n)
    word_matrix = vectorizer.fit_transform(text_column)

    # Per-term totals over all documents, paired with the term itself
    word_freq = np.asarray(word_matrix.sum(axis=0)).flatten()
    pairs = list(zip(vectorizer.get_feature_names_out(), word_freq))

    # Highest count first
    pairs.sort(key=lambda pair: pair[1], reverse=True)
    word_df = pd.DataFrame(pairs, columns=['Word', 'Frequency'])

    sns.barplot(data=word_df, x='Frequency', y='Word')
    plt.title(f'Top {top_n} Most Frequent Words')
    plt.show()

# Top 20 words across all cleaned df2 benefits reviews
if 'cleaned_benefits_review' not in df2.columns:
    print("'cleaned_benefits_review' is missing.")
else:
    plot_most_frequent_words(df2['cleaned_benefits_review'].astype(str), top_n=20)

In [None]:
# Topic modelling (LDA) on the lemmatized benefits reviews of df2
if 'lemmatized_benefits_review' not in df2.columns:
    print("The required column 'lemmatized_benefits_review' is missing.")
else:
    # Every row is used; token lists are stringified for the vectorizer
    text_column = df2['lemmatized_benefits_review'].astype(str)

    # Bag-of-words counts over a vocabulary of at most 500 terms
    vectorizer = CountVectorizer(max_features=500, stop_words='english')
    X = vectorizer.fit_transform(text_column)

    # Three seeded topics
    lda = LatentDirichletAllocation(random_state=42, n_components=3)
    lda.fit(X)

    def print_lda_topics(lda, vectorizer, top_n=10):
        """Print each topic's `top_n` highest-weighted terms, weakest of them first."""
        words = vectorizer.get_feature_names_out()
        for i, topic in enumerate(lda.components_):
            print(f"Topic {i + 1}:")
            top_term_indices = topic.argsort()[-top_n:]
            print([words[position] for position in top_term_indices])

    # Display the topics
    print_lda_topics(lda, vectorizer)


In [None]:
# spaCy is not among the notebook's imports, so it is imported here; without
# it the spacy.load() call below raised NameError.
import spacy

# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

def extract_entities_batch(texts):
    """Run the spaCy pipeline over `texts` and collect the entities of each text.

    Returns one list per input text, holding an (entity text, entity label)
    tuple for every entity found in it.
    """
    entities_per_text = []
    # nlp.pipe processes the texts in batches of 50
    for doc in nlp.pipe(texts, batch_size=50):
        entities_per_text.append([(ent.text, ent.label_) for ent in doc.ents])
    return entities_per_text

# Apply NER to the raw review text of df1
df1['entities'] = extract_entities_batch(df1['review'].astype(str).tolist())

# df2 is only ever read through 'benefits_review', 'side_effects_review' and
# 'commentsReview' in this notebook; it has no 'review' column, so
# df2['review'] raised KeyError. The free-text comments field is used instead.
# NOTE(review): switch to another df2 text column if that is the intended input.
df2['entities'] = extract_entities_batch(df2['commentsReview'].astype(str).tolist())

# Display the entities found in the top 30 reviews of df1
print("Entities extracted from df1 reviews:")
print(df1[['review', 'entities']].head(30))

# Display the entities found in the top 30 reviews of df2
print("Entities extracted from df2 reviews:")
print(df2[['commentsReview', 'entities']].head(30))

# Count the frequency of key entities
from collections import Counter

def count_entities(df):
    """Count how often each entity text occurs across the 'entities' column of `df`.

    Each cell of that column is iterated as a sequence of entities whose first
    element is the entity text. Returns a `Counter` keyed by that text.
    """
    tally = Counter()
    for entities in df['entities']:
        tally.update(ent[0] for ent in entities)
    return tally

# Entity frequency tables for both DataFrames
df1_entities_count, df2_entities_count = count_entities(df1), count_entities(df2)

# Ten most common entities in df1
print("Most common entities in df1:", df1_entities_count.most_common(10), sep='\n')

# Ten most common entities in df2
print("Most common entities in df2:", df2_entities_count.most_common(10), sep='\n')
