In [None]:
#IMPORT ALL NECESSARY REQUIREMENTS
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import nltk
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
import pandas as pd

# Hard-coded absolute Windows path of the reviews CSV.
REVIEWS_CSV = r"C:\narative nexus\Amazon_Reviews.csv"

# Parser settings: the pure-Python engine together with on_bad_lines="skip",
# so rows the parser cannot handle are dropped instead of aborting the load.
read_options = {"engine": "python", "on_bad_lines": "skip"}

df = pd.read_csv(REVIEWS_CSV, **read_options)

# Show the first rows as the cell output.
df.head()



In [None]:
# Narrow the frame to the three columns used downstream, then discard every
# row that has a missing value in any of them.
required_columns = ['Review Text', 'Rating','Review Title']
df = df.loc[:, required_columns].dropna()

# Dimensions after the filter.
print(df.shape)

In [None]:
# Basic sanity check: report the frame's dimensions first, then the per-column
# dtype / non-null summary that DataFrame.info() writes to stdout.
frame_shape = df.shape
print("SHAPE:\n",frame_shape)

print("\nINFO:\n")
df.info()

In [None]:
# Take an independent, null-free copy of the frame and report the size on
# either side of the operation. `df` itself is left untouched.
shape_before = df.shape
df_cleaned = df.dropna().copy()
shape_after = df_cleaned.shape

print(f"Shape of DataFrame before dropping nulls: {shape_before}")
print(f"Shape of DataFrame after dropping nulls: {shape_after}")

# **Data preprocessing**

In [None]:
#Natural Language Toolkit SETUP
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Every NLTK data package this notebook asks for, listed once. Previously
# 'stopwords' and 'wordnet' were requested twice (once quietly, once verbosely).
NLTK_RESOURCES = (
    'stopwords',
    'wordnet',
    'punkt',
    'punkt_tab',  # Added to resolve LookupError
    'averaged_perceptron_tagger_eng',
)

# Download resources (run once). nltk.download() returns False on failure, so
# report it here instead of hitting a LookupError later in the pipeline.
for resource in NLTK_RESOURCES:
    if not nltk.download(resource, quiet=True):
        print(f"WARNING: NLTK resource '{resource}' could not be downloaded")

# Shared, build-once objects for the preprocessing functions. They must be
# created after the downloads above because they read the downloaded corpora.
STOPWORDS = set(stopwords.words('english'))
LEMMATIZER = WordNetLemmatizer()

# **Text Cleaning (Lowercasing, Special Character Removal, Stopword Removal)**

In [None]:
import re
def clean_text(text):
    """Normalise one raw text value before tokenisation.

    Processing order: strip URLs, e-mail addresses, hashtags/mentions and the
    copyright / registered / trademark symbols, lowercase the text, collapse
    every whitespace run to a single space and trim both ends.

    Args:
        text: The raw value. ``None`` and float NaN yield ``""``; any other
            non-string value is converted with ``str()`` first.

    Returns:
        str: The cleaned, lowercased, single-spaced text.
    """
    if text is None: # Handle None values
        return ""
    if not isinstance(text, str):
        # NaN is the only float that compares unequal to itself; treat it as
        # a missing value, the same way None is treated.
        if isinstance(text, float) and text != text:
            return ""
        text = str(text)

    # Remove URLs (http, https, www)
    text = re.sub(r'https?://\S+|www\.\S+', '', text, flags=re.IGNORECASE)

    # Remove email addresses (must run before mention removal, otherwise the
    # "@domain" part would be stripped and the local part left behind)
    text = re.sub(r'\S+@\S+', '', text)

    # Remove hashtags and mentions
    text = re.sub(r'#\w+|@\w+', '', text)

    # Remove copyright, registered and trademark symbols. Written as escapes
    # so the pattern survives a wrongly decoded source file; the previous
    # literal was mojibake and never matched the real symbols.
    text = re.sub('[\u00a9\u00ae\u2122]', '', text)

    # Convert to lowercase
    text = text.lower()

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text)

    # The unreachable statement that used to follow this return (it indexed a
    # non-existent 'Text' column) was removed.
    return text.strip()

# **Text Lemmatization and Tokenization**

In [None]:
from nltk.tag import pos_tag

def tokenize_and_lemmatize(text):
    """Split cleaned text into lemmatised content tokens.

    A token is kept only if it is purely alphabetic, longer than two
    characters and not an English stopword; each kept token is lemmatised.

    Uses the module-level ``STOPWORDS`` set and ``LEMMATIZER`` instance built
    in the NLTK setup cell, instead of re-reading the stopword corpus and
    constructing a new lemmatizer on every call (this function is applied to
    every row of the frame).

    Args:
        text: The text to tokenise; passed to ``word_tokenize``.

    Returns:
        list[str]: The lemmatised tokens, in their original order.
    """
    return [
        LEMMATIZER.lemmatize(token)
        for token in word_tokenize(text)
        # Keep only alphabetic tokens with length > 2 that are not stopwords
        if token.isalpha() and len(token) > 2 and token not in STOPWORDS
    ]

In [None]:
TEXT_COL = 'Review Text'   # ← change ONLY this if needed

# Three derived columns, each built from the previous stage:
#   cleaned_text    - normalised string produced by clean_text
#   tokens          - list of lemmatised tokens
#   lemmatized_text - the tokens re-joined with single spaces
raw_reviews = df[TEXT_COL].astype(str)
df['cleaned_text'] = raw_reviews.apply(clean_text)
df['tokens'] = df['cleaned_text'].apply(tokenize_and_lemmatize)
df['lemmatized_text'] = df['tokens'].apply(' '.join)


In [None]:
# Preview the raw text next to each preprocessing stage for the first rows.
df[[TEXT_COL, 'cleaned_text', 'tokens', 'lemmatized_text']].head()


# **TOPIC MODELLING**

# **LATENT DIRICHLET ALLOCATION**

In [None]:
pip install gensim

In [None]:
import matplotlib.pyplot as plt
import re
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary
import gensim
from gensim.models import LdaModel

In [None]:
# Tokenised reviews as a plain list of token lists.
data_words = df['tokens'].tolist()

# CREATE DICTIONARY & CORPUS ---
# Token -> integer-id mapping built from every review.
id2word = Dictionary(documents=data_words)

# Filter extremes: Remove words in <15 docs or >50% of docs
id2word.filter_extremes(no_below=15, no_above=0.5)

# Bag-of-words corpus (Term Document Frequency), one entry per review.
corpus = list(map(id2word.doc2bow, data_words))

print(f"   - Dictionary size: {len(id2word)} unique tokens")
print(f"   - Corpus size: {len(corpus)} documents")

In [None]:
def evaluate_lda_models(dictionary, corpus, texts, start, limit, step):
    """Train one LDA model per topic count and score each of them.

    Topic counts are taken from ``range(start, limit, step)``, so ``limit``
    itself is NOT included.

    Args:
        dictionary: gensim ``Dictionary`` used as ``id2word`` and for coherence.
        corpus: Bag-of-words corpus the models are trained and evaluated on.
        texts: Tokenised documents passed to ``CoherenceModel``.
        start: First number of topics to try.
        limit: Exclusive upper bound for the number of topics.
        step: Increment between candidate topic counts.

    Returns:
        tuple: ``(model_list, coherence_values, perplexity_values)`` — three
        lists of equal length, index-aligned with the topic counts tried.
    """
    coherence_values = []
    perplexity_values = []
    model_list = []

    for num_topics in range(start, limit, step):
        print(f"Training model with {num_topics} topics...")
        # Build LDA model
        model = gensim.models.ldamodel.LdaModel(
            corpus=corpus,
            num_topics=num_topics,
            id2word=dictionary,
            random_state=100,
            passes=10,
            alpha=0.01
        )
        model_list.append(model)

        # Compute Coherence Score
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

        # Compute Perplexity score.
        # log_perplexity() returns the per-word likelihood bound, and gensim
        # defines perplexity as 2 ** (-bound); the previous np.exp (base e)
        # under-reported the value.
        per_word_bound = model.log_perplexity(corpus)
        perplexity_values.append(np.exp2(-per_word_bound))

    return model_list, coherence_values, perplexity_values

In [None]:
# Topic counts for the LDA sweep. evaluate_lda_models iterates
# range(start, limit, step), so `limit` itself is not trained.
start = 5
limit = 30
step = 5

lda_sweep = evaluate_lda_models(id2word, corpus, data_words, start, limit, step)
model_list, coherence_values, perplexity_values = lda_sweep

In [None]:
# One k per trained model. The sweep iterates range(start, limit, step) and
# therefore excludes `limit`; the previous range(start, limit + 1, step)
# produced one k more than there are scores. Deriving the length from
# coherence_values matches how the plotting cell builds the same list.
k_values = list(range(start, start + step * len(coherence_values), step))

print("Topic No | Coherence Score | Perplexity Score")
print("-" * 45)

for k, coh, perp in zip(k_values, coherence_values, perplexity_values):
    print(f"{k:^8} | {coh:^15.4f} | {perp:^17.4f}")


In [None]:
import matplotlib.pyplot as plt

# Build k_values safely from coherence_values length
k_values = list(range(start, start + step * len(coherence_values), step))

fig, ax1 = plt.subplots(figsize=(12, 6))

# Plot Coherence on the left y-axis.
# Colours are set explicitly: a twinned axes restarts the default colour
# cycle, so without them both curves are drawn in the same colour.
ax1.plot(k_values, coherence_values, marker='o', color='tab:blue', label='Coherence Score')

# Create second y-axis for Perplexity (shares the x-axis with ax1)
ax2 = ax1.twinx()
ax2.plot(k_values, perplexity_values, marker='s', linestyle='--', color='tab:orange', label='Perplexity Score')

ax1.set_xlabel("Number of Topics (k)")
ax1.set_ylabel("Coherence Score")
ax2.set_ylabel("Perplexity Score")

ax1.set_title("LDA Coherence & Perplexity vs Number of Topics")
ax1.grid(alpha=0.3)

# Combine legends: each axes only knows its own line, so merge the handles.
lines1, labels1 = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax1.legend(lines1 + lines2, labels1 + labels2, loc='best')

plt.show()


In [None]:
# Highest coherence score of the sweep and the topic count that produced it
# (first occurrence wins when several scores tie).
best_coh = max(coherence_values)
best_k = k_values[coherence_values.index(best_coh)]

print(f"Best Topic Count: {best_k}")
print(f"Best Coherence Score: {best_coh:.4f}")


In [None]:
# Position of the highest coherence score in the sweep results.
top_score = max(coherence_values)
best_index = coherence_values.index(top_score)

# The sweep tried start, start + step, ... so position maps back to k.
best_k = start + best_index * step

# Model trained with that topic count.
best_lda_model = model_list[best_index]

print(f"Best Topic Count: {best_k}")
print(f"Best Coherence Score: {coherence_values[best_index]:.4f}")

# Ten highest-weighted terms of every topic, as (term, weight) pairs.
topics = best_lda_model.show_topics(num_topics=best_k, num_words=10, formatted=False)

for topic_id, weighted_terms in topics:
    terms = [term for term, _weight in weighted_terms]
    print(f"Topic {topic_id}: {terms}")

    


# **Non-Negative Matrix Factorization**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

from gensim.corpora import Dictionary
from gensim.models import CoherenceModel


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectoriser settings: ignore terms in more than 95% of documents or in fewer
# than 5 documents, use unigrams and bigrams, drop English stop words.
tfidf_options = {
    "max_df": 0.95,
    "min_df": 5,
    "ngram_range": (1, 2),
    "stop_words": 'english',
}
tfidf_vectorizer = TfidfVectorizer(**tfidf_options)

# Document-term matrix of the lemmatised reviews and its column vocabulary.
tfidf = tfidf_vectorizer.fit_transform(df['lemmatized_text'])
feature_names = tfidf_vectorizer.get_feature_names_out()

print("TF-IDF shape:", tfidf.shape)


In [None]:
from gensim.corpora import Dictionary

# Tokenised reviews used to build the gensim dictionary for NMF coherence.
texts = df['tokens']

# Keep only tokens that occur in at least 20 documents and in no more than
# 40% of all documents.
dictionary = Dictionary(documents=texts)
dictionary.filter_extremes(no_below=20, no_above=0.4)

print("Dictionary size:", len(dictionary))


In [None]:
def compute_nmf_coherence_values(tfidf,feature_names,texts,dictionary,start=5,limit=30,step=5,n_top_words=10):
    """Fit one NMF model per topic count and score each with c_v coherence.

    Topic counts are taken from ``range(start, limit + 1, step)``, so ``limit``
    is included when it is reachable from ``start``.

    Args:
        tfidf: Document-term matrix the NMF models are fitted on.
        feature_names: Indexable sequence mapping a column index of ``tfidf``
            to its term.
        texts: Tokenised documents passed to ``CoherenceModel``.
        dictionary: gensim ``Dictionary`` passed to ``CoherenceModel``.
        start: Smallest number of topics to try.
        limit: Largest number of topics to try.
        step: Increment between candidate topic counts.
        n_top_words: How many of the highest-weighted terms represent each
            topic in the coherence computation (default 10, the value that
            was previously hard-coded).

    Returns:
        tuple: ``(model_list, coherence_values)`` — two index-aligned lists,
        one entry per topic count tried.
    """
    coherence_values = []
    model_list = []

    for num_topics in range(start, limit + 1, step):
        nmf_model = NMF(n_components=num_topics,random_state=42,init='nndsvd',max_iter=500)
        # Only the topic-term matrix (components_) is needed for coherence, so
        # fit() replaces fit_transform(), whose result was never used.
        nmf_model.fit(tfidf)

        model_list.append(nmf_model)

        # Extract topics: terms ordered by descending weight, cut to n_top_words.
        # NOTE(review): feature_names may contain terms that are absent from
        # `dictionary` - confirm how CoherenceModel treats such terms.
        topics = [
            [feature_names[i] for i in topic_weights.argsort()[::-1][:n_top_words]]
            for topic_weights in nmf_model.components_
        ]

        coherence_model = CoherenceModel(topics=topics,texts=texts,dictionary=dictionary,coherence='c_v')

        coherence = coherence_model.get_coherence()
        coherence_values.append(coherence)

        print(f"Topics: {num_topics} | Coherence: {coherence:.4f}")

    return model_list, coherence_values


In [None]:
# Topic counts for the NMF sweep. This rebinds the start/limit/step names that
# the LDA sweep also uses. compute_nmf_coherence_values includes `limit`.
start = 5
limit = 25
step = 5

nmf_sweep_args = {
    "tfidf": tfidf,
    "feature_names": feature_names,
    "texts": df['tokens'],
    "dictionary": dictionary,
    "start": start,
    "limit": limit,
    "step": step,
}
nmf_models, nmf_coherence_values = compute_nmf_coherence_values(**nmf_sweep_args)


In [None]:
import matplotlib.pyplot as plt

# Topic counts tried by the NMF sweep (limit included).
topic_range = list(range(start, limit + 1, step))

# Coherence score as a function of the number of topics.
nmf_fig, nmf_ax = plt.subplots(figsize=(12, 6))
nmf_ax.plot(topic_range, nmf_coherence_values, marker='o')
nmf_ax.set_xlabel("Number of Topics")
nmf_ax.set_ylabel("Coherence Score (c_v)")
nmf_ax.set_title("NMF Topic Coherence")
nmf_ax.grid(True)
plt.show()


In [None]:
# Topic count with the highest NMF coherence score.
optimal_topics = topic_range[np.argmax(nmf_coherence_values)]
# Report the NMF result; this line used to print `best_k`, which holds the
# LDA optimum from an earlier section.
print("Optimal number of topics:", optimal_topics)


In [None]:
import numpy as np

# Topic counts tried by the NMF sweep, index-aligned with nmf_models.
topic_range = list(range(start, limit + 1, step))

# Pick the sweep entry with the highest coherence score.
best_index = np.argmax(nmf_coherence_values)
optimal_topics = topic_range[best_index]
best_nmf_model = nmf_models[best_index]

print(f"\n Optimal number of topics: {optimal_topics}\n")

# ----- DISPLAY NMF TOPICS -----
n_top_words = 10

for topic_idx, topic in enumerate(best_nmf_model.components_):
    # Column indices ordered by descending weight, cut to the top n.
    ranked_columns = topic.argsort()[::-1][:n_top_words]
    top_words = [feature_names[column] for column in ranked_columns]
    print(f"Topic {topic_idx}: {top_words}")


In [None]:
# Hand-written labels keyed by NMF topic index (0-19).
# NOTE(review): these are fixed literals, while the topics come from whichever
# model best_nmf_model holds - confirm the labels still match the topics after
# any re-run that changes the model or its number of topics.
topic_labels = {
    0: "Negative Reviews",
    1: "Wrong Items",
    2: "Customer Service",
    3: "Positive Service",
    4: "Shopping Experience",
    5: "Prime Membership",
    6: "Delivery Issues",
    7: "Payment & Gift Cards",
    8: "Positive Experience",
    9: "Product Quality",
    10: "Account Issues",
    11: "Order Management",
    12: "Delivery Speed",
    13: "Online Shopping Satisfaction",
    14: "Ease of Use",
    15: "Shipping Delays",
    16: "Fast Shipping",
    17: "Time Wastage",
    18: "Company Reputation",
    19: "Refunds & Returns"
}
# One table row per topic: [topic id, label, comma-separated top keywords].
table_data = []
for topic_idx, topic in enumerate(best_nmf_model.components_):
    # The ten highest-weighted terms of this topic, heaviest first.
    top_words = [
        feature_names[i]
        for i in topic.argsort()[:-11:-1]
    ]
    table_data.append([
        f"Topic {topic_idx}",
        # Topics without an entry in topic_labels are shown as "Unlabeled".
        topic_labels.get(topic_idx, "Unlabeled"),
        ", ".join(top_words)
    ])

# Figure height grows with the number of rows, with a minimum of 6.
row_count = len(table_data)
fig_height = max(6, row_count * 0.45)

fig, ax = plt.subplots(figsize=(16, fig_height))
# Hide the axes so only the table is visible.
ax.axis("off")

table = ax.table(
    cellText=table_data,
    colLabels=["Topic", "Label", "Top Keywords"],
    cellLoc="left",
    loc="center"
)

# Use a fixed font size instead of automatic sizing, then fit the three
# column widths to their content and stretch the rows vertically.
table.auto_set_font_size(False)
table.set_fontsize(10)
table.auto_set_column_width(col=list(range(3)))
table.scale(1, 1.2)

plt.title("NMF Topics with Human-Readable Labels", fontsize=16, pad=20)
plt.show()

# **COMPARISON BETWEEN THE COHERENCE SCORES OF LDA AND NMF**

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# x positions for both curves: 5, 10, 15, ... - one per available score.
k_values_lda = [5 * position for position in range(1, len(coherence_values) + 1)]
k_values_nmf = [5 * position for position in range(1, len(nmf_coherence_values) + 1)]

cmp_fig, cmp_ax = plt.subplots(figsize=(12, 7))

# One curve per model family, distinguished by marker shape.
cmp_ax.plot(k_values_lda, coherence_values, marker='o', linestyle='-', label='LDA Coherence (c_v)')
cmp_ax.plot(k_values_nmf, nmf_coherence_values, marker='s', linestyle='-', label='NMF Coherence (c_v)')

cmp_ax.set_xlabel('Number of Topics (k)')
cmp_ax.set_ylabel('Coherence Score (c_v)')
cmp_ax.set_title('Comparison of LDA and NMF Coherence Scores')

# Tick at every k that occurs in either sweep.
cmp_ax.set_xticks(sorted(set(k_values_lda + k_values_nmf)))
cmp_ax.grid(True, alpha=0.3)
cmp_ax.legend()
plt.show()


# **SENTIMENT ANALYSIS**

In [None]:
!pip install nltk vaderSentiment scikit-learn matplotlib seaborn


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Single shared VADER analyzer; vader_sentiment() reads this module-level name
# on every call, so it must be created before that function is used.
analyzer = SentimentIntensityAnalyzer()

def vader_sentiment(text):
    """Classify a text as Positive, Negative or Neutral.

    The decision is based on the ``compound`` value returned by the
    module-level ``analyzer``: at or above 0.1 is "Positive", at or below
    -0.1 is "Negative", everything in between is "Neutral".

    Args:
        text: The text handed to ``analyzer.polarity_scores``.

    Returns:
        str: One of "Positive", "Negative", "Neutral".
    """
    compound = analyzer.polarity_scores(text)["compound"]

    if compound >= 0.1:
        return "Positive"
    if compound <= -0.1:
        return "Negative"
    return "Neutral"

# Label every review with its VADER sentiment class, then show how many
# reviews fall into each class.
review_sentiments = df["Review Text"].apply(vader_sentiment)
df["Sentiment"] = review_sentiments
df["Sentiment"].value_counts()


In [None]:
# Integer codes for the sentiment classes, assigned by position:
# Negative -> 0, Neutral -> 1, Positive -> 2.
label_map = {
    name: code
    for code, name in enumerate(("Negative", "Neutral", "Positive"))
}

# Numeric target column derived from the text labels.
df["sentiment_label"] = df["Sentiment"].map(label_map)


In [None]:
from sklearn.model_selection import train_test_split

# Features: the TF-IDF matrix built for the NMF section.
# Target: the integer sentiment code of each review.
X = tfidf
y = df["sentiment_label"]

# 80/20 split with a fixed seed, stratified on the target.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression classifier: up to 1000 solver iterations, with class
# weights set to "balanced".
classifier_options = {"max_iter": 1000, "class_weight": "balanced"}
log_reg_model = LogisticRegression(**classifier_options)

# Train on the training split; the fitted estimator is the cell output.
log_reg_model.fit(X_train, y_train)


In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Predict the held-out split, then report overall accuracy and the per-class
# classification report.
y_pred = log_reg_model.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

class_report = classification_report(y_test, y_pred)
print(class_report)


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Integer codes and display names of the three classes, in the order used by
# label_map (Negative=0, Neutral=1, Positive=2).
class_ids = [0, 1, 2]
class_names = ["Negative", "Neutral", "Positive"]

# Pin the label order so the matrix is always 3x3 and its rows/columns line up
# with the tick labels, even when a class is missing from y_test or y_pred.
cm = confusion_matrix(y_test, y_pred, labels=class_ids)

plt.figure(figsize=(6,5))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names
)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Sentiment Classification Confusion Matrix (With Neutral)")
plt.show()


In [None]:
#Integrate Sentiment with Topics
# Document-topic weights from the selected NMF model; the dominant topic of a
# review is the column holding its largest weight.
nmf_doc_topics = best_nmf_model.transform(tfidf)
df["dominant_topic"] = nmf_doc_topics.argmax(axis=1)

preview_columns = [TEXT_COL, "dominant_topic", "Sentiment"]
df[preview_columns].head()


In [None]:
#Topic-wise Sentiment Distribution
# Rows: dominant topic, columns: sentiment class, cells: number of reviews.
topic_sentiment = pd.crosstab(df["dominant_topic"], df["Sentiment"])

# Stacked bar per topic, one segment per sentiment class.
stacked_ax = topic_sentiment.plot(kind="bar", stacked=True, figsize=(10,5))
stacked_ax.set_title("Topic-wise Sentiment Distribution")
stacked_ax.set_xlabel("Topic")
stacked_ax.set_ylabel("Number of Documents")
plt.show()


#  **INSIGHT GENERATION & TEXT SUMMARIZATION**

In [None]:
# Working frame for summarisation: review text with its topic and sentiment,
# restricted to rows where all three values are present.
summary_columns = ["Review Text", "dominant_topic", "Sentiment"]
summary_df = df.loc[:, summary_columns].dropna()
summary_df.head()


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
#Extractive Summarization Function
def extractive_summary_short(texts, top_n=4, max_chars=50):
    """Pick the ``top_n`` highest-scoring texts and return them shortened.

    Each text is scored by the sum of its TF-IDF weights (vectoriser fitted on
    ``texts`` itself, English stop words removed). The best-scoring texts are
    returned in descending score order, with newlines replaced by spaces,
    surrounding whitespace stripped and anything longer than ``max_chars``
    cut to ``max_chars`` characters plus one trailing space.

    Args:
        texts: Iterable of strings. It is materialised into a list first, so
            lists, tuples, generators and pandas Series with any index work.
        top_n: Maximum number of texts to return; values <= 0 return ``[]``.
        max_chars: Length limit applied to every returned text.

    Returns:
        list[str]: At most ``top_n`` shortened texts; ``[]`` for empty input.
    """
    # Positional list: `texts[i]` on a Series with a non-default index would
    # look up by label, and an iterator could only be consumed once.
    docs = list(texts)
    if not docs or top_n <= 0:
        return []

    try:
        doc_term = TfidfVectorizer(stop_words="english").fit_transform(docs)
    except ValueError:
        # Raised when no usable term is left (e.g. the texts contain only stop
        # words or punctuation). Fall back to the leading texts instead of
        # aborting the caller's loop.
        ranked = range(min(top_n, len(docs)))
    else:
        scores = np.asarray(doc_term.sum(axis=1)).ravel()
        # argsort is ascending: take the last top_n, then reverse so the
        # best-scoring text comes first.
        ranked = scores.argsort()[-top_n:][::-1]

    summary = []
    for position in ranked:
        line = docs[position].replace("\n", " ").strip()
        if len(line) > max_chars:
            line = line[:max_chars] + " "
        summary.append(line)

    return summary


In [None]:
# Short extractive summary per dominant topic, keyed by topic id.
topic_summaries = {}

topic_ids = sorted(summary_df["dominant_topic"].unique())
for topic in topic_ids:
    in_topic = summary_df["dominant_topic"] == topic
    topic_texts = summary_df.loc[in_topic, "Review Text"].tolist()

    # Topics with fewer than four reviews are kept as they are; larger ones
    # are reduced to four shortened reviews.
    topic_summaries[topic] = (
        extractive_summary_short(topic_texts, top_n=4, max_chars=50)
        if len(topic_texts) >= 4
        else topic_texts
    )


In [None]:
# Print every topic's summary lines under a header. The header's bullet is
# U+1F539 (small blue diamond), written as an escape because the previous
# literal was a wrongly decoded byte sequence that printed as garbage.
for topic, summaries in topic_summaries.items():
    print(f"\n\U0001F539 Topic {topic} Summary:")
    for s in summaries:
        print("-", s)


In [None]:
#Sentiment-wise Summarization
# Short extractive summary per sentiment class, keyed by class name.
sentiment_summaries = {}

for sentiment in ("Negative", "Neutral", "Positive"):
    has_sentiment = summary_df["Sentiment"] == sentiment
    texts = summary_df.loc[has_sentiment, "Review Text"].tolist()

    # Classes with fewer than three reviews are kept as they are.
    sentiment_summaries[sentiment] = (
        extractive_summary_short(texts, top_n=3) if len(texts) >= 3 else texts
    )


In [None]:
# Print every sentiment class's summary lines under a header. The header's
# bullet is U+1F538 (small orange diamond), written as an escape because the
# previous literal was a wrongly decoded byte sequence.
for sentiment, summaries in sentiment_summaries.items():
    print(f"\n\U0001F538 {sentiment} Sentiment Summary:")
    for s in summaries:
        print("-", s)


In [None]:
#Topic + Sentiment Combined Insights
# For every topic, print up to two representative reviews per sentiment class.
# The header marker is U+1F4CC (pushpin), written as an escape because the
# previous literal was a wrongly decoded byte sequence.
for topic in sorted(summary_df["dominant_topic"].unique()):
    print(f"\n\U0001F4CC Topic {topic} Insights:")

    for sentiment in ["Negative", "Neutral", "Positive"]:
        texts = summary_df[
            (summary_df["dominant_topic"] == topic) &
            (summary_df["Sentiment"] == sentiment)
        ]["Review Text"].tolist()

        # Combinations with fewer than two reviews are skipped entirely.
        if len(texts) >= 2:
            print(f"\n{sentiment} feedback:")
            for s in extractive_summary_short(texts, top_n=2):
                print("-", s)


In [None]:
import pickle

# Persist both summary dictionaries in one pickle file in the working directory.
summarization_results = {
    "topic_summaries": topic_summaries,
    "sentiment_summaries": sentiment_summaries,
}

with open("summarization_results.pkl", "wb") as results_file:
    pickle.dump(summarization_results, results_file)


#  **VISUALIZATION**

In [None]:
import matplotlib.pyplot as plt

# Number of reviews per sentiment class.
sentiment_counts = df["Sentiment"].value_counts()

# One bar per class.
sentiment_fig, sentiment_ax = plt.subplots(figsize=(6,4))
sentiment_counts.plot(kind="bar", ax=sentiment_ax)
sentiment_ax.set_title("Overall Sentiment Distribution")
sentiment_ax.set_xlabel("Sentiment")
sentiment_ax.set_ylabel("Number of Reviews")
sentiment_fig.tight_layout()
plt.show()


# Topic Distribution

In [None]:
# Number of reviews per dominant topic, ordered by topic id.
topic_counts = df["dominant_topic"].value_counts().sort_index()

# One bar per topic.
topic_fig, topic_ax = plt.subplots(figsize=(6,4))
topic_counts.plot(kind="bar", ax=topic_ax)
topic_ax.set_title("Topic Distribution")
topic_ax.set_xlabel("Topic")
topic_ax.set_ylabel("Number of Documents")
topic_fig.tight_layout()
plt.show()


# Topic-wise Sentiment Distribution

In [None]:
import pandas as pd

# Rows: dominant topic, columns: sentiment class, cells: number of reviews.
topic_sentiment = pd.crosstab(df["dominant_topic"], df["Sentiment"])

# Stacked bar per topic, one segment per sentiment class.
plot_options = {"kind": "bar", "stacked": True, "figsize": (10,5)}
distribution_ax = topic_sentiment.plot(**plot_options)

distribution_ax.set_title("Topic-wise Sentiment Distribution")
distribution_ax.set_xlabel("Topic")
distribution_ax.set_ylabel("Number of Documents")
distribution_ax.legend(title="Sentiment")
distribution_ax.figure.tight_layout()
plt.show()


# WORD CLOUD

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import numpy as np

# Total weight of every vocabulary term across all NMF topics. components_
# has one row per topic and one column per term, so a single column sum
# replaces the former nested Python loop over topics x vocabulary.
component_totals = best_nmf_model.components_.sum(axis=0)
word_weights = dict(zip(feature_names, component_totals.tolist()))

# Render the 200 heaviest terms; font size follows the summed weight.
wordcloud = WordCloud(
    width=1400,
    height=700,
    background_color="white",
    max_words=200,
    collocations=False
).generate_from_frequencies(word_weights)


plt.figure(figsize=(16, 8))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.title(" Topics Word Cloud ", fontsize=18)
plt.show()
