In [1]:
import re
import pandas as pd

In [2]:
# Read the whole chat transcript into memory. The `with` block closes the
# file handle as soon as the read finishes (even if reading raises), instead
# of leaving it open for the lifetime of the kernel.
with open('chat.txt', 'r', encoding='utf-8') as f:
    data = f.read()

In [4]:
# Display the raw text as read from the file (one "Speaker: message" entry per line).
data

'User: Hi, can you tell me about Python?,\nAI: Sure! Python is a popular programming language known for its readability,\nUser: What can I use it for?,\nAI: You can use Python for web development, data analysis, AI, and more,'

In [5]:
# One row per non-blank line of the transcript.
# Blank entries (for example the empty string that split() produces when the
# text ends with a newline) are dropped here: they hold no "Speaker: message"
# content and would otherwise become empty rows in the frame.
# split('\n') is kept rather than str.splitlines() so that a message is never
# broken on other Unicode line separators that may appear inside chat text.
lines = [line for line in data.split('\n') if line.strip()]
df = pd.DataFrame({'user_message': lines})

In [6]:
# Preview the first rows of the one-column frame of raw transcript lines.
df.head()

Unnamed: 0,user_message
0,"User: Hi, can you tell me about Python?,"
1,AI: Sure! Python is a popular programming lang...
2,"User: What can I use it for?,"
3,"AI: You can use Python for web development, da..."


In [7]:
# (rows, columns) of the raw-lines frame.
df.shape

(4, 1)

In [8]:
# Split each raw line into its speaker ("user") and message parts.
#
# Both lists are filled together and the frame is built from them in one go,
# so a line that does not look like "Speaker: message" is simply skipped.
# Previously such a line left the lists shorter than the frame and the column
# assignment raised a length-mismatch ValueError.
#
# The frame is rebuilt from `lines` and re-bound to `df` (no in-place drop of
# 'user_message'), so re-running this cell gives the same result instead of
# failing on the already-removed column.
#
# NOTE(review): each message keeps the trailing ',' that ends every line of
# the transcript; strip it here if it is not wanted downstream.
line_pattern = re.compile(r'^(\w+):\s(.+)')

users = []
messages = []

for raw_line in lines:
    match = line_pattern.match(raw_line.strip())
    if match:
        users.append(match.group(1))     # speaker label before the colon
        messages.append(match.group(2))  # everything after "Speaker: "

df = pd.DataFrame({'user': users, 'message': messages})

df.head()

Unnamed: 0,user,message
0,User,"Hi, can you tell me about Python?,"
1,AI,Sure! Python is a popular programming language...
2,User,"What can I use it for?,"
3,AI,"You can use Python for web development, data a..."


In [23]:
# NOTE(review): these imports would normally live in the notebook's first cell.
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import nltk

# Number of highest-scoring terms to report.
TOP_N = 5

documents = df['message'].tolist()

# Load NLTK English stopwords. On an environment where the corpus has never
# been downloaded, stopwords.words() raises LookupError, so fetch it once and
# retry rather than failing on a fresh kernel/machine.
try:
    default_stopwords = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords', quiet=True)
    default_stopwords = set(stopwords.words('english'))

# Add custom generic/common words
custom_words = {
    'hi', 'popular', 'known', 'can', 'use', 'tell', 'sure', 'also', 'get', 'like',
    'one', 'many', 'used', 'based', 'etc', 'really', 'know', 'would'
}

# Sorted so the list handed to the vectorizer is identical on every run
# (iteration order of a set of strings is not stable across sessions).
final_stopwords = sorted(default_stopwords | custom_words)

# Initialize TF-IDF vectorizer with custom stopwords list
vectorizer = TfidfVectorizer(stop_words=final_stopwords)
X = vectorizer.fit_transform(documents)

# Score each term by summing its TF-IDF weight over all messages
feature_names = vectorizer.get_feature_names_out()
scores = X.toarray().sum(axis=0)

# Highest score first; the sort is stable, so tied terms keep the
# vectorizer's feature order.
tfidf_scores = sorted(zip(feature_names, scores), key=lambda x: x[1], reverse=True)

# Single slice shared by the keyword list and the printed report, so the two
# can never disagree about how many terms are shown.
top_scores = tfidf_scores[:TOP_N]
top_keywords = [word for word, _ in top_scores]

print("Top keywords after removing stopwords and generic terms:")
for word, score in top_scores:
    print(f"{word}: {score:.4f}")

print(top_keywords)

Top keywords after removing stopwords and generic terms:
python: 1.6203
language: 0.5417
programming: 0.5417
readability: 0.5417
ai: 0.4300
['python', 'language', 'programming', 'readability', 'ai']
