In [1]:
import re
import pandas as pd

In [2]:
f = open('dialogs.txt','r', encoding='utf-8')

In [3]:
data = f.read()

In [4]:
data

'hi, how are you doing?\ti\'m fine. how about yourself?\ni\'m fine. how about yourself?\ti\'m pretty good. thanks for asking.\ni\'m pretty good. thanks for asking.\tno problem. so how have you been?\nno problem. so how have you been?\ti\'ve been great. what about you?\ni\'ve been great. what about you?\ti\'ve been good. i\'m in school right now.\ni\'ve been good. i\'m in school right now.\twhat school do you go to?\nwhat school do you go to?\ti go to pcc.\ni go to pcc.\tdo you like it there?\ndo you like it there?\tit\'s okay. it\'s a really big campus.\nit\'s okay. it\'s a really big campus.\tgood luck with school.\ngood luck with school.\tthank you very much.\nhow\'s it going?\ti\'m doing well. how about you?\ni\'m doing well. how about you?\tnever better, thanks.\nnever better, thanks.\tso how have you been lately?\nso how have you been lately?\ti\'ve actually been pretty good. you?\ni\'ve actually been pretty good. you?\ti\'m actually in school right now.\ni\'m actually in school r

In [5]:
lines = data.split('\n')
df = pd.DataFrame({'user_message': lines})

In [6]:
df.head()

Unnamed: 0,user_message
0,"hi, how are you doing?\ti'm fine. how about yo..."
1,i'm fine. how about yourself?\ti'm pretty good...
2,i'm pretty good. thanks for asking.\tno proble...
3,no problem. so how have you been?\ti've been g...
4,i've been great. what about you?\ti've been go...


In [7]:
df.shape

(3725, 1)

In [8]:
new_rows = []

for row in df['user_message']:
    parts = row.split('\t')
    for i, part in enumerate(parts):
        part = part.strip()
        # Try to extract "User: message"
        match = re.match(r'^(\w+):\s*(.+)', part)
        if match:
            user = match.group(1)
            message = match.group(2)
        else:
            # If no user prefix, assign default user1/user2
            user = f"user{i+1}"
            message = part
        new_rows.append({'user': user, 'message': message})

df_cleaned = pd.DataFrame(new_rows)
print(df_cleaned.head())

    user                              message
0  user1               hi, how are you doing?
1  user2        i'm fine. how about yourself?
2  user1        i'm fine. how about yourself?
3  user2  i'm pretty good. thanks for asking.
4  user1  i'm pretty good. thanks for asking.


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import nltk

documents = df_cleaned['message'].tolist()

# Load NLTK English stopwords
default_stopwords = set(stopwords.words('english'))

# Add custom generic/common words
custom_words = {
    'hi', 'popular', 'known', 'can', 'use', 'tell', 'sure', 'also', 'get', 'like',
    'one', 'many', 'used', 'based', 'etc', 'really', 'know', 'would'
}

final_stopwords = list(default_stopwords.union(custom_words))

# Initialize TF-IDF vectorizer with custom stopwords list
vectorizer = TfidfVectorizer(stop_words=final_stopwords)
X = vectorizer.fit_transform(documents)

# Extract keywords and scores
feature_names = vectorizer.get_feature_names_out()
scores = X.toarray().sum(axis=0)

tfidf_scores = sorted(zip(feature_names, scores), key=lambda x: x[1], reverse=True)
top_keywords = [word for word, _ in tfidf_scores[:5]]

print("Top keywords after removing stopwords and generic terms:")
for word, score in tfidf_scores[:5]:
    print(f"{word}: {score:.4f}")

print(top_keywords)

Top keywords after removing stopwords and generic terms:
yes: 151.2807
go: 118.7254
think: 112.7003
going: 102.9135
good: 97.3695
['yes', 'go', 'think', 'going', 'good']


In [10]:
!python -m spacy download en_core_web_sm

Defaulting to user installation because normal site-packages is not writeable


[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip



Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------- 0.1/12.8 MB 204.8 kB/s eta 0:01:03
     --------------------------------------- 0.1/12.8 MB 217.9 kB/s eta 0:00:59
     --------------------------------------- 0.1/12.8 MB 273.1 kB/s eta 0:00:47
     --------------------------------------- 0.1/12.8 MB 327.1 kB/s eta 0:00:39
      -------------------------------------- 0.2/12.8 MB 360.9 kB/s eta 0:00:36
      -------------------------------------- 0.2/12.8 MB 357.2 

In [11]:
import spacy
from collections import Counter

nlp = spacy.load("en_core_web_sm")

# Combine all messages into a single text block
full_text = " ".join(df_cleaned['message'].tolist())

# Process text with spaCy
doc = nlp(full_text)
noun_chunks = [chunk.text.lower() for chunk in doc.noun_chunks if len(chunk.text.split()) > 1]

# Generate keyword list
keywords = Counter(noun_chunks).most_common(5)
keyword_list = [kw[0] for kw in keywords]

# Extract named entities and noun chunks to infer topic
entities = [ent.text for ent in doc.ents if ent.label_ in ["ORG", "PERSON", "PRODUCT", "LANGUAGE", "WORK_OF_ART"]]
noun_chunks = [chunk.text.lower() for chunk in doc.noun_chunks if len(chunk.text.split()) > 1]

summary = f"""
The user asked mainly about {', '.join(set(entities + keyword_list))}.
"""
print(summary)


The user asked mainly about r&b, pimples suck, ahora, pbs, beer, god, cadillac, mcdonald's, google, what kind, andy warhol, a movie, shakespeare, am i. so am i. i, ralph nader, ruth, yuck, a lot, pasadena, the matter, chuck, the white house, debrah, spanish, lunch, i., obama, the weather, zzz, bush, jessica, am i., judy, the democratic party, pasta, english.



In [12]:
total_exchanges = df_cleaned.shape[0]

In [13]:
total_exchanges

7450

In [14]:
summary = f"""
Summary:
- The conversation had {total_exchanges} exchanges.
- The user asked mainly about {', '.join(set(entities + keyword_list))}.
- Most common keywords: {', '.join(top_keywords)}.
"""
print(summary.strip())

Summary:
- The conversation had 7450 exchanges.
- The user asked mainly about r&b, pimples suck, ahora, pbs, beer, god, cadillac, mcdonald's, google, what kind, andy warhol, a movie, shakespeare, am i. so am i. i, ralph nader, ruth, yuck, a lot, pasadena, the matter, chuck, the white house, debrah, spanish, lunch, i., obama, the weather, zzz, bush, jessica, am i., judy, the democratic party, pasta, english.
- Most common keywords: yes, go, think, going, good.
