In [1]:
import re
import pandas as pd

In [2]:
# Open the chat transcript for reading as UTF-8 text; the handle is consumed by the next cell.
f = open('chat.txt', mode='r', encoding='utf-8')

In [3]:
# Read the whole transcript into memory, then close the handle opened in the
# previous cell so the file descriptor is not held for the kernel's lifetime.
# Re-running this cell without re-opening the file now fails loudly instead of
# silently yielding an empty string from an exhausted handle.
try:
    data = f.read()
finally:
    f.close()

In [4]:
# Show the raw transcript string exactly as read from the file.
data

'User: Hi, can you tell me about Python?,\nAI: Sure! Python is a popular programming language known for its readability,\nUser: What can I use it for?,\nAI: You can use Python for web development, data analysis, AI, and more,'

In [5]:
# One entry per transcript line. splitlines() handles '\r\n' endings and does
# not produce a trailing empty string when the file ends with a newline.
# Whitespace-only lines are dropped so they never become rows.
# NOTE: splitlines() also breaks on other Unicode line boundaries
# (e.g. '\u2028'), not just '\n'.
lines = [line for line in data.splitlines() if line.strip()]
df = pd.DataFrame({'user_message': lines})

In [6]:
# Preview the first rows of the single-column frame of raw transcript lines.
df.head()

Unnamed: 0,user_message
0,"User: Hi, can you tell me about Python?,"
1,AI: Sure! Python is a popular programming lang...
2,"User: What can I use it for?,"
3,"AI: You can use Python for web development, da..."


In [7]:
# (rows, columns) of the raw frame: one row per transcript line.
df.shape

(4, 1)

In [8]:
# Split every raw line into (speaker, message) records.
#
# A line may hold several tab-separated fragments. A fragment of the form
# "<word>: <text>" yields the word as the speaker and the text as the message.
# A fragment without such a prefix gets a positional placeholder speaker
# ("user1", "user2", ...) based on its index within the line.
new_rows = []

for row in df['user_message']:
    # Enumerate before filtering so placeholder numbering still reflects the
    # fragment's original position within the line.
    for i, part in enumerate(row.split('\t')):
        part = part.strip()
        if not part:
            # Blank lines / empty tab fields carry no message; skipping them
            # avoids bogus rows with a placeholder speaker and empty text.
            continue
        # Try to extract "Speaker: message"
        match = re.match(r'^(\w+):\s*(.+)', part)
        if match:
            user = match.group(1)
            message = match.group(2)
        else:
            # If no speaker prefix, assign default user1/user2
            user = f"user{i+1}"
            message = part
        new_rows.append({'user': user, 'message': message})

# Pin the columns so an empty transcript still yields a frame that has
# 'user' and 'message' (later cells index df_cleaned['message']).
df_cleaned = pd.DataFrame(new_rows, columns=['user', 'message'])
print(df_cleaned.head())

   user                                            message
0  User                 Hi, can you tell me about Python?,
1    AI  Sure! Python is a popular programming language...
2  User                            What can I use it for?,
3    AI  You can use Python for web development, data a...


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import nltk

# How many of the highest-scoring terms to report.
TOP_N_KEYWORDS = 5

documents = df_cleaned['message'].tolist()

# Load NLTK English stopwords. On a fresh environment the corpus is not
# installed and the lookup raises LookupError, so fetch it once and retry.
try:
    default_stopwords = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords', quiet=True)
    default_stopwords = set(stopwords.words('english'))

# Add custom generic/common words
custom_words = {
    'hi', 'popular', 'known', 'can', 'use', 'tell', 'sure', 'also', 'get', 'like',
    'one', 'many', 'used', 'based', 'etc', 'really', 'know', 'would'
}

final_stopwords = list(default_stopwords.union(custom_words))

# Initialize TF-IDF vectorizer with custom stopwords list
vectorizer = TfidfVectorizer(stop_words=final_stopwords)
X = vectorizer.fit_transform(documents)

# Score each term by summing its TF-IDF weight over all messages
feature_names = vectorizer.get_feature_names_out()
scores = X.toarray().sum(axis=0)

# The sort is stable, so equal scores keep the order of feature_names.
tfidf_scores = sorted(zip(feature_names, scores), key=lambda x: x[1], reverse=True)
top_scored = tfidf_scores[:TOP_N_KEYWORDS]
top_keywords = [word for word, _ in top_scored]

print("Top keywords after removing stopwords and generic terms:")
for word, score in top_scored:
    print(f"{word}: {score:.4f}")

print(top_keywords)

Top keywords after removing stopwords and generic terms:
python: 1.6203
language: 0.5417
programming: 0.5417
readability: 0.5417
ai: 0.4300
['python', 'language', 'programming', 'readability', 'ai']


In [10]:
import spacy
from collections import Counter

nlp = spacy.load("en_core_web_sm")

# Combine all messages into a single text block.
# NOTE(review): this joins every row of df_cleaned, so AI replies also feed
# the topics reported below as what "the user asked" -- confirm this is intended.
full_text = " ".join(df_cleaned['message'].tolist())

# Process text with spaCy
doc = nlp(full_text)

# Multi-word noun chunks, lower-cased so repeated phrases are counted together.
noun_chunks = [chunk.text.lower() for chunk in doc.noun_chunks if len(chunk.text.split()) > 1]

# Keep the five most frequent noun chunks as keyword phrases
keywords = Counter(noun_chunks).most_common(5)
keyword_list = [kw[0] for kw in keywords]

# Named entities of the selected types, used alongside the phrases to infer the topic
entities = [ent.text for ent in doc.ents if ent.label_ in ["ORG", "PERSON", "PRODUCT", "LANGUAGE", "WORK_OF_ART"]]

# De-duplicate while keeping first-seen order (entities first, then phrases by
# frequency). A plain set() would reorder the topics between kernel restarts
# because string hashing is randomised per process.
topics = list(dict.fromkeys(entities + keyword_list))

summary = f"""
The user asked mainly about {', '.join(topics)}.
"""
print(summary)


The user asked mainly about Python, web development, a popular programming language, data analysis, its readability.



In [11]:
# Number of rows in df_cleaned, i.e. one count per parsed message.
total_exchanges = len(df_cleaned)

In [12]:
# Display the row count computed in the previous cell.
total_exchanges

4

In [13]:
# Final report combining the message count, the inferred topics and the TF-IDF keywords.
# dict.fromkeys de-duplicates the topics while keeping first-seen order. A set()
# here would print them in a different order on every kernel restart.
topic_names = list(dict.fromkeys(entities + keyword_list))

summary = f"""
Summary:
- The conversation had {total_exchanges} exchanges.
- The user asked mainly about {', '.join(topic_names)}.
- Most common keywords: {', '.join(top_keywords)}.
"""
print(summary.strip())

Summary:
- The conversation had 4 exchanges.
- The user asked mainly about Python, web development, a popular programming language, data analysis, its readability.
- Most common keywords: python, language, programming, readability, ai.
