In [1]:

!pip install nltk scikit-learn


import nltk
nltk.download('punkt')
nltk.download('stopwords')

import re
import string
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import ngrams
from google.colab import files


uploaded = files.upload()





[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Saving chat.txt to chat.txt


In [4]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [5]:

# main class
class ChatSummarizer:
    def __init__(self, filepath):
        self.filepath = filepath
        self.user_msgs = []
        self.ai_msgs = []
        self.stop_words = set(stopwords.words('english')).union({'hello', 'hi', 'can', 'please', 'explain', 'what', 'is'})

# Reads the chat line by line.
    def parse_chat(self):
        with open(self.filepath, 'r', encoding='utf-8') as file:
            lines = file.readlines()
            for line in lines:
                if line.startswith("User:"):
                    self.user_msgs.append(line.replace("User:", "").strip())
                elif line.startswith("AI:"):
                    self.ai_msgs.append(line.replace("AI:", "").strip())

# Lowers the text to ensure uniformity. Removes punctuation using str.maketrans. Tokenizes the text (splits into words). Removes stop words to leave only meaningful terms.

    def preprocess_text(self, text):
        text = text.lower()
        text = text.translate(str.maketrans('', '', string.punctuation))
        tokens = word_tokenize(text)
        return [t for t in tokens if t not in self.stop_words]


# This method extracts the most frequent unigrams (single words) and bigrams (phrases of 2 words).
    def keyword_frequency(self, top_n=5):
        all_text = " ".join(self.user_msgs + self.ai_msgs)
        tokens = self.preprocess_text(all_text)
        # Add bigrams for phrases like "machine learning"
        bigram_list = list(ngrams(tokens, 2))
        bigram_text = ['_'.join(gram) for gram in bigram_list]
        all_terms = bigram_text + tokens
        return Counter(all_terms).most_common(top_n)


# TF-IDF (Term Frequency-Inverse Document Frequency) gives importance scores to words. Uses user messages to find keywords.
    def tfidf_keywords(self, top_n=5):
        docs = self.user_msgs if self.user_msgs else self.ai_msgs  # Focus on user messages
        if not docs:
            return []
        tfidf = TfidfVectorizer(stop_words=list(self.stop_words), ngram_range=(1, 2))
        try:
            tfidf_matrix = tfidf.fit_transform(docs)
            scores = zip(tfidf.get_feature_names_out(), tfidf_matrix.toarray()[0])
            return sorted(scores, key=lambda x: x[1], reverse=True)[:top_n]
        except ValueError:
            return []

# Parses chat.Calculates the total number of exchanges (messages from both sides).
    def generate_summary(self, use_tfidf=True):
        self.parse_chat()
        total = len(self.user_msgs) + len(self.ai_msgs)

# Extract keywords
        if use_tfidf:
            keywords = self.tfidf_keywords(top_n=5)
            keywords_list = [kw[0].replace('_', ' ') for kw in keywords]
        else:
            keywords = self.keyword_frequency(top_n=5)
            keywords_list = [kw[0].replace('_', ' ') for kw in keywords]

# for main user query  not greetings
        main_query = next((msg for msg in self.user_msgs if msg.lower() not in ['hello!', 'hi!']), None)
        if main_query:

            main_topic = next((kw for kw in keywords_list if ' ' in kw), keywords_list[0] if keywords_list else 'unknown topic')

  # Use 'its definition' if query asks 'what'
            related_aspect = 'its definition' if 'what' in main_query.lower() else 'its details'
            topic_sentence = f"The user asked mainly about {main_topic} and {related_aspect}."
        else:
            topic_sentence = "The user did not ask any specific questions."

        print("\n Summary")
        print(f" -The conversation had exchanges. Total exchanges: {total}\n")
        print(f" -User messages ({len(self.user_msgs)}):")
        for msg in self.user_msgs:
            print(f"- {msg}")
        print(f"\n -AI messages ({len(self.ai_msgs)}):")
        for msg in self.ai_msgs:
            print(f"- {msg}")
        print(f"\n -Most common keywords: {', '.join(keywords_list)}")
        print(f"\n-Topic Summary: {topic_sentence}")

# summarizer variable
summarizer = ChatSummarizer("chat.txt")
summarizer.generate_summary(use_tfidf=True)

#  5 most frequent words
top_keywords = summarizer.keyword_frequency(top_n=5)
print("\nTop 5 Most Frequent Words/Phrases:")
for i, (word, freq) in enumerate(top_keywords, 1):
    print(f"{i}. {word.replace('_', ' ')} (Frequency: {freq})")


 Summary
 -The conversation had exchanges. Total exchanges: 4

 -User messages (2):
- Hello!
- Can you explain what machine learning is?

 -AI messages (2):
- Hi! How can I assist you today?
- Certainly! Machine learning is a field of AI that allows systems to learn from data.

 -Most common keywords: learning, machine, machine learning

-Topic Summary: The user asked mainly about machine learning and its definition.

Top 5 Most Frequent Words/Phrases:
1. machine learning (Frequency: 2)
2. machine (Frequency: 2)
3. learning (Frequency: 2)
4. learning assist (Frequency: 1)
5. assist today (Frequency: 1)
