In [1]:
# !pip install pandas bertopic tqdm sentence_transformers tqdm

In [1]:
import pandas as pd
import numpy as np
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
import os

os.environ["TOKENIZERS_PARALLELISM"] = "true"

  from .autonotebook import tqdm as notebook_tqdm


In [24]:
df = pd.read_csv('parenting_smaller_set.csv', low_memory=False)
def convert_utc_to_day_level(utc):
    return pd.to_datetime(utc, unit='s').strftime('%Y-%m-%d')
df['created_utc'] = pd.to_datetime(df['created_utc'], unit='s').apply(convert_utc_to_day_level)
df = df[['title', 'selftext', 'created_utc']]
df

Unnamed: 0,title,selftext,created_utc
0,Parenting and Identity,One of the difficult things for me to navigate...,2024-07-08
1,26 MO doesn’t listen when told not to do somet...,My 26 MO will frequently do something when tol...,2024-07-08
2,Advice for teaching preteens how to safely nav...,I’m looking for advice on how best to teach ki...,2024-07-08
3,Is anyone unsatisfied with parenthood?,I’m probably going to get a lot of hate from t...,2024-07-08
4,Best Play Mat for Babies?,I’m in need of some advice! I’m looking for a ...,2024-07-08
...,...,...,...
17930,Is it a bad idea to let my toddler place in he...,edit to title: let my toddler \*play\* in her ...,2024-03-01
17931,Siblings sharing mom with each other and new b...,Help! I’m an early childhood educator and I’m ...,2024-03-01
17932,Morning drop off,"So every morning, without fail (actually not e...",2024-03-01
17933,Help,My kid is 5 and he was hard since a baby. He w...,2024-03-01


In [25]:
import pandas as pd
from bertopic.representation import KeyBERTInspired

df = df.sample(1000)
docs = df['selftext'].tolist()


embedder = SentenceTransformer('paraphrase-MiniLM-L6-v2')
embeddings = embedder.encode(docs, show_progress_bar=True)

# Create your representation model
representation_model = KeyBERTInspired()

# Use the representation model in BERTopic on top of the default pipeline
model = BERTopic(representation_model=representation_model, embedding_model=embedder)
topic_model = model.fit(docs)
print("fitting complete")
topics, probs = model.transform(docs)

Batches: 100%|██████████| 32/32 [01:03<00:00,  1.99s/it]


fitting complete


In [64]:
# print topics and keywords
df['topic'] = topics
df['probability'] = probs
df['representative'] = df['topic'].apply(lambda x: '-'.join([y[0] for y in model.get_topic(x)]))
df

Unnamed: 0,title,selftext,created_utc,topic,probability,representative
16747,Appropriate response to disrespectful older ki...,What's the appropriate response to an older ki...,2024-03-10,9,1.000000,toys-age-kids-children-child-kid-playground-to...
13837,Ready to throw in the towel,Hello all. \n\nI am a mom of 3. My oldest is 1...,2024-03-31,-1,0.000000,parents-daughter-son-husband-kids-home-old-fee...
17711,18 month old spinning wheels and other things,"My LO looks at the wheels of everithing, cars,...",2024-03-03,1,1.000000,autism-son-child-kid-kindergarten-kids-teacher...
7349,When do you leave them alone?,Moving into a two bedroom townhouse from a one...,2024-05-18,6,0.884363,baby-toddler-bedroom-crib-room-bed-daughter-na...
10438,My teen took an entire 400mg edible. How was y...,"Ugh. For the record, I have very good kids. My...",2024-04-25,-1,0.000000,parents-daughter-son-husband-kids-home-old-fee...
...,...,...,...,...,...,...
10579,Is there a way to track down hospital baby pic...,I was really broke when I had my middle child ...,2024-04-24,-1,0.000000,parents-daughter-son-husband-kids-home-old-fee...
13448,What's a go-to meal you make for your kids way...,Maybe your easy go-to will inspire someone els...,2024-04-03,2,0.474197,feeding-baby-hungry-bites-feed-eating-eat-milk...
12944,Parenting with a partner who has previously ha...,My husband has 3 kids from a previous marriage...,2024-04-07,0,0.862417,husband-family-mom-being-parents-her-feel-been...
14755,Is a coat needed to move our sleeping toddler ...,"Sorry, bad English. By coat, I meant jacket\n\...",2024-03-24,-1,0.000000,parents-daughter-son-husband-kids-home-old-fee...


In [65]:
# print 3 representative documents for each topic
for i in range(len(df['topic'].unique())):
    print(f"Topic {i}")
    repr = df[df['topic'] == i].sort_values('probability', ascending=False).head(3)[['title', 'selftext', 'representative']]
    for j, row in repr.iterrows():
        print(f"Words: {row['representative']}")
        print(f"Title: {row['title']}")
        print(f"Text: {row['selftext']}")
        print()
    print("****"*40)

Topic 0
Words: husband-family-mom-being-parents-her-feel-been-she-son
Title: I am officially this parent now 😅
Text: I always said i wont let my daughter watch tv until shes 3. here we are, shes 2 now and i put her in front of the TV to get things done. I always hated when my partner let her watch tv in the morning when i‘m working but now i don’t know how i can get anything in the house done when he’s not around 😂 i feel kinda bad because of it but i feel like sometimes you just have to go with it.. 🤷‍♀️

Words: husband-family-mom-being-parents-her-feel-been-she-son
Title: Do you encourage siblings to hug/hold hands/be lovey or just let it happen naturally?
Text: I absolutely love when my kids show affection towards each other but kinda feel weird telling them to do it. Maybe I'm over thinking it but I really want them to be close and love each other. I'd love to know other thoughts and opinions. 

Words: husband-family-mom-being-parents-her-feel-been-she-son
Title: Dad workouts to ca

In [4]:
df = df.sample(1000)
docs = (df['title'] + ' ' + df['selftext']).tolist()
# docs = df['title'].tolist()
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = sentence_model.encode(docs, show_progress_bar=True)

Batches: 100%|██████████| 32/32 [00:07<00:00,  4.41it/s]


In [7]:
from bertopic.representation import KeyBERTInspired
import openai
from bertopic.representation import OpenAI

def generate_topic_model(docs, embeddings, representation_model=None):
    # client = openai.OpenAI(
    #     api_key="key",
    #     organization="key"
    # )



    # Create topic model
    max_topics = 30
    if representation_model:
        summarization_prompt = """
            I have a topic that contains the following documents: [DOCUMENTS]
            The topic is described by the following keywords: [KEYWORDS]

            Based on the information above, extract a short topic label in the following format:
            topic: <topic label>
            """
        representation_model = OpenAI(client, model="gpt-3.5-turbo", chat=True, prompt=summarization_prompt, nr_docs=5, delay_in_seconds=3)
        model = BERTopic(nr_topics=max_topics, representation_model=representation_model)
    else:
        model = BERTopic(nr_topics=max_topics)
    topic_model = model.fit(docs, embeddings)
    # topic_model.reduce_topics(docs, nr_topics=max_topics)

    topics, probs = model.transform(docs, embeddings)
    return topic_model, topics, probs

def get_topic_representations(topic_model, topics):
    topic_representations = []
    for topic_id in topics:
        if topic_id != -1:  # Exclude outliers
            topic_words = topic_model.get_topic(topic_id)
            representation = ', '.join([word[0] for word in topic_words])
        else:
            representation = None
        topic_representations.append(representation)
    return topic_representations

In [8]:
topic_model, topics, probs = generate_topic_model(docs, embeddings)
df['topic'] = topics
df['topic_prob'] = probs
df['topic_representations'] = get_topic_representations(topic_model, topics)
df

Unnamed: 0,title,selftext,created_utc,topic,topic_prob,topic_representations
4139,Is TRAP music safe for 10 year old boy,My son has started listening to TRAP music esp...,2024-06-11,1,0.808810,"to, and, the, my, of, is, he, that, for, her"
3953,Desperately Need a Break,Has anyone paid someone to babysit their infan...,2024-06-12,1,1.000000,"to, and, the, my, of, is, he, that, for, her"
16702,Traveling with a baby under 1,"Hi all, my husband and I plan on traveling to ...",2024-03-10,1,1.000000,"to, and, the, my, of, is, he, that, for, her"
15535,I hate the notion that if kids are picky eater...,I did BLW with my oldest. Highly suspect she h...,2024-03-19,3,0.817157,"eat, and, to, food, it, the, she, is, of, but"
3757,Milk weaning toddler,I have a 21 month old toddler who's going thro...,2024-06-14,2,1.000000,"bottle, to, the, and, milk, her, is, she, my, ..."
...,...,...,...,...,...,...
1730,After a year of trying my husband doesn’t want...,After our daughter was born my husband and I w...,2024-06-27,1,1.000000,"to, and, the, my, of, is, he, that, for, her"
2397,"Tell me the truth, should we have a third child?",We have two kids ages 4 and 2. I feel so torn ...,2024-06-23,1,1.000000,"to, and, the, my, of, is, he, that, for, her"
875,Thought he was a typical 26 month old,Just got absolutely obliterated on his Early I...,2024-07-02,1,1.000000,"to, and, the, my, of, is, he, that, for, her"
6688,Ac is broken scared baby will overheat,My baby is 6 months and currently it is 80 deg...,2024-05-23,-1,0.000000,


### Visualizing Topic Distances

In [26]:
topic_model.visualize_topics()

In [28]:
from umap import UMAP
# Run the visualization with the original embeddings
# topic_model.visualize_documents(docs, embeddings=embeddings)

# Reduce dimensionality of embeddings, this step is optional but much faster to perform iteratively:
reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)
topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)

## Hierarchical Topical Modelling

In [29]:
from scipy.cluster import hierarchy as sch


# Hierarchical topics
linkage_function = lambda x: sch.linkage(x, 'single', optimal_ordering=True)
hierarchical_topics = topic_model.hierarchical_topics(docs, linkage_function=linkage_function)

  0%|          | 0/15 [00:00<?, ?it/s]

100%|██████████| 15/15 [00:57<00:00,  3.86s/it]


In [30]:
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

## Looking at the titles and documents to learn more about the themes of the topics

The top n representative titles of the topics

In [32]:
# Get topic representations
topic_info = topic_model.get_topic_info()

# Get representative documents for each topic
representative_docs = topic_model.get_representative_docs(docs)

representative_indices = {}
for topic_num, topic_docs in representative_docs.items():
    indices = [docs.index(doc) for doc in topic_docs]
    representative_indices[topic_num] = indices

The texts of the top 3 representative documents 

In [33]:
def remove_spaces_between_new_lines(text):
    lines = text.split('\n')
    lines = [line.strip() for line in lines if line.strip()]
    processed_text = ' '.join(lines)
    return processed_text


for topic_num, indices in representative_indices.items():
    print(f"Topic {topic_num}:")
    for index in indices[:3]:
        print(f"Title - {df.iloc[index]['title']}")
    for index in indices[:3]:  # Print the first 10 indices for each topic
        print(f"Text - {remove_spaces_between_new_lines(df.iloc[index]['selftext'])}")
    print()

Topic -1:
Title - Frustrated that my partner continues to criticize my parenting in front of my child. 
Title - Ex husband won’t pay for childcare so I can move out of his house.
Title - Divorcing with a child…So many emotions
Text - I am the first to admit that I’m far from a perfect parent,  there have been times where I’ve dealt with a parenting situation and in hindsight thought I probably could have handled that better. My wife is a trained social worker who specializes in social emotional learning. She works in the school system and has years of training dealing with children with every need under the son.  She is a great mom and does handle herself very well with our son. The thing that I’m growing more and more frustrated with that she is very quick to criticize my parenting in front of our son.  I know I make mistakes but can’t she save the coaching until later when my son isn’t around?  My son sees this continually and I believe it’s affecting his opinion of my “authority” fo

## Interaction Analysis
Which type of posts get the highest interaction (Views/up- and down-votes).
- Sentiment, Toxicity, LLM emotion checker

In [None]:
df.columns

Index(['id', 'url', 'title', 'selftext', 'created_utc', 'num_comments',
       'comment_1', 'comment_2', 'comment_3', 'comment_4', 'comment_5',
       'comment_6', 'comment_7', 'comment_8', 'comment_9', 'comment_10',
       'comment_author_1', 'comment_author_2', 'comment_author_3',
       'comment_author_4', 'comment_author_5', 'comment_author_6',
       'comment_author_7', 'comment_author_8', 'comment_author_9',
       'comment_author_10', 'comment_created_utc_1', 'comment_created_utc_2',
       'comment_created_utc_3', 'comment_created_utc_4',
       'comment_created_utc_5', 'comment_created_utc_6',
       'comment_created_utc_7', 'comment_created_utc_8',
       'comment_created_utc_9', 'comment_created_utc_10', 'comment_score_1',
       'comment_score_2', 'comment_score_3', 'comment_score_4',
       'comment_score_5', 'comment_score_6', 'comment_score_7',
       'comment_score_8', 'comment_score_9', 'comment_score_10',
       'comment_ups_1', 'comment_ups_2', 'comment_ups_3', 'comm

In [38]:
new_df = df.sort_values(by='num_comments', ascending=False)[['title', 'selftext', 'topic', 'topic_prob', 'num_comments', 'created_utc', 'ups', 'downs', 'score']]
new_df

Unnamed: 0,title,selftext,topic,topic_prob,num_comments,created_utc,ups,downs,score
7322,What do you spend on groceries? Upset my wife ...,Last week we went to Costco and spent $350 on ...,-1,0.000000,1831,2024-05-18,977,0,977
13836,Husband leaves loaded gun on bed,Husband is a military vet and boasts about nee...,-1,0.000000,1753,2024-03-31,1729,0,1729
245,Do you sleep in the same bed as your infant?,I live in the US and been repeatedly told not ...,0,0.187162,1501,2024-07-07,399,0,399
14472,Do you judge people who use phones or ipads ou...,"EDIT 3 *Thank you for sharing your thoughts, g...",2,0.896729,1464,2024-03-26,491,0,491
12221,My husband dislikes our 5yo son,My husband (37M) has never liked our son and h...,2,1.000000,1295,2024-04-12,931,0,931
...,...,...,...,...,...,...,...,...,...
1131,I'm unsure what to do with my daughter,"I 39 F have 3 kids 12 F, 15 F, and 18 F now ev...",-1,0.000000,0,2024-07-01,1,0,1
5418,"Highly reactive 4,5y old boy","Hi, everyone. First, please excuse me for my e...",-1,0.000000,0,2024-06-01,1,0,1
5434,How to deal with another parent,Throwaway account to protect the identities of...,6,0.917844,0,2024-06-01,1,0,1
5437,My 17 month old won't stop touching my butt.,I am I single mom of one 17 month old boy (Let...,2,1.000000,0,2024-06-01,1,0,1


In [39]:
from tqdm import tqdm
tqdm.pandas()

# sentiment analysis on cleaned selftext
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# Download stopwords and punkt tokenizer from NLTK
nltk.download('stopwords')
nltk.download('punkt')

def clean_text(text):
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word not in stop_words]
    
    return ' '.join(filtered_text)

new_df['cleaned_selftext'] = new_df['selftext'].progress_apply(clean_text)

[nltk_data] Downloading package stopwords to /home/sarmad/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/sarmad/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
100%|██████████| 17935/17935 [00:12<00:00, 1399.89it/s]


In [40]:
new_df

Unnamed: 0,title,selftext,topic,topic_prob,num_comments,created_utc,ups,downs,score,cleaned_selftext
7322,What do you spend on groceries? Upset my wife ...,Last week we went to Costco and spent $350 on ...,-1,0.000000,1831,2024-05-18,977,0,977,last week went costco spent 350 ton groceries ...
13836,Husband leaves loaded gun on bed,Husband is a military vet and boasts about nee...,-1,0.000000,1753,2024-03-31,1729,0,1729,husband military vet boasts needing carry gun ...
245,Do you sleep in the same bed as your infant?,I live in the US and been repeatedly told not ...,0,0.187162,1501,2024-07-07,399,0,399,live us repeatedly told sleep bed infant child...
14472,Do you judge people who use phones or ipads ou...,"EDIT 3 *Thank you for sharing your thoughts, g...",2,0.896729,1464,2024-03-26,491,0,491,edit 3 thank sharing thoughts giving informati...
12221,My husband dislikes our 5yo son,My husband (37M) has never liked our son and h...,2,1.000000,1295,2024-04-12,931,0,931,husband 37m never liked son told many times ne...
...,...,...,...,...,...,...,...,...,...,...
1131,I'm unsure what to do with my daughter,"I 39 F have 3 kids 12 F, 15 F, and 18 F now ev...",-1,0.000000,0,2024-07-01,1,0,1,39 f 3 kids 12 f 15 f 18 f ever since 15 yr ol...
5418,"Highly reactive 4,5y old boy","Hi, everyone. First, please excuse me for my e...",-1,0.000000,0,2024-06-01,1,0,1,hi everyone first please excuse english skills...
5434,How to deal with another parent,Throwaway account to protect the identities of...,6,0.917844,0,2024-06-01,1,0,1,throwaway account protect identities kids two ...
5437,My 17 month old won't stop touching my butt.,I am I single mom of one 17 month old boy (Let...,2,1.000000,0,2024-06-01,1,0,1,single mom one 17 month old boy lets call ive ...


In [41]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
vader_analyzer = SentimentIntensityAnalyzer()

def get_vader_sentiment(text):
    analysis = vader_analyzer.polarity_scores(text)
    return analysis['compound']

new_df['sentiment'] = new_df['cleaned_selftext'].progress_apply(get_vader_sentiment)

  0%|          | 63/17935 [00:00<00:30, 580.13it/s]

100%|██████████| 17935/17935 [00:27<00:00, 658.67it/s]


In [42]:
new_df

Unnamed: 0,title,selftext,topic,topic_prob,num_comments,created_utc,ups,downs,score,cleaned_selftext,sentiment
7322,What do you spend on groceries? Upset my wife ...,Last week we went to Costco and spent $350 on ...,-1,0.000000,1831,2024-05-18,977,0,977,last week went costco spent 350 ton groceries ...,0.9213
13836,Husband leaves loaded gun on bed,Husband is a military vet and boasts about nee...,-1,0.000000,1753,2024-03-31,1729,0,1729,husband military vet boasts needing carry gun ...,-0.4703
245,Do you sleep in the same bed as your infant?,I live in the US and been repeatedly told not ...,0,0.187162,1501,2024-07-07,399,0,399,live us repeatedly told sleep bed infant child...,0.8990
14472,Do you judge people who use phones or ipads ou...,"EDIT 3 *Thank you for sharing your thoughts, g...",2,0.896729,1464,2024-03-26,491,0,491,edit 3 thank sharing thoughts giving informati...,0.9953
12221,My husband dislikes our 5yo son,My husband (37M) has never liked our son and h...,2,1.000000,1295,2024-04-12,931,0,931,husband 37m never liked son told many times ne...,-0.9938
...,...,...,...,...,...,...,...,...,...,...,...
1131,I'm unsure what to do with my daughter,"I 39 F have 3 kids 12 F, 15 F, and 18 F now ev...",-1,0.000000,0,2024-07-01,1,0,1,39 f 3 kids 12 f 15 f 18 f ever since 15 yr ol...,-0.8131
5418,"Highly reactive 4,5y old boy","Hi, everyone. First, please excuse me for my e...",-1,0.000000,0,2024-06-01,1,0,1,hi everyone first please excuse english skills...,-0.4340
5434,How to deal with another parent,Throwaway account to protect the identities of...,6,0.917844,0,2024-06-01,1,0,1,throwaway account protect identities kids two ...,-0.8523
5437,My 17 month old won't stop touching my butt.,I am I single mom of one 17 month old boy (Let...,2,1.000000,0,2024-06-01,1,0,1,single mom one 17 month old boy lets call ive ...,0.8717


In [43]:
# rearrange new_df columns for better readability
new_df = new_df[['title', 'selftext', 'cleaned_selftext', 'topic', 'topic_prob', 'num_comments', 'created_utc', 'ups', 'downs', 'score', 'sentiment']]
new_df

Unnamed: 0,title,selftext,cleaned_selftext,topic,topic_prob,num_comments,created_utc,ups,downs,score,sentiment
7322,What do you spend on groceries? Upset my wife ...,Last week we went to Costco and spent $350 on ...,last week went costco spent 350 ton groceries ...,-1,0.000000,1831,2024-05-18,977,0,977,0.9213
13836,Husband leaves loaded gun on bed,Husband is a military vet and boasts about nee...,husband military vet boasts needing carry gun ...,-1,0.000000,1753,2024-03-31,1729,0,1729,-0.4703
245,Do you sleep in the same bed as your infant?,I live in the US and been repeatedly told not ...,live us repeatedly told sleep bed infant child...,0,0.187162,1501,2024-07-07,399,0,399,0.8990
14472,Do you judge people who use phones or ipads ou...,"EDIT 3 *Thank you for sharing your thoughts, g...",edit 3 thank sharing thoughts giving informati...,2,0.896729,1464,2024-03-26,491,0,491,0.9953
12221,My husband dislikes our 5yo son,My husband (37M) has never liked our son and h...,husband 37m never liked son told many times ne...,2,1.000000,1295,2024-04-12,931,0,931,-0.9938
...,...,...,...,...,...,...,...,...,...,...,...
1131,I'm unsure what to do with my daughter,"I 39 F have 3 kids 12 F, 15 F, and 18 F now ev...",39 f 3 kids 12 f 15 f 18 f ever since 15 yr ol...,-1,0.000000,0,2024-07-01,1,0,1,-0.8131
5418,"Highly reactive 4,5y old boy","Hi, everyone. First, please excuse me for my e...",hi everyone first please excuse english skills...,-1,0.000000,0,2024-06-01,1,0,1,-0.4340
5434,How to deal with another parent,Throwaway account to protect the identities of...,throwaway account protect identities kids two ...,6,0.917844,0,2024-06-01,1,0,1,-0.8523
5437,My 17 month old won't stop touching my butt.,I am I single mom of one 17 month old boy (Let...,single mom one 17 month old boy lets call ive ...,2,1.000000,0,2024-06-01,1,0,1,0.8717


In [44]:
new_df.to_csv('parenting_smaller_set_with_topics.csv', index=False)