In [1]:
import os
import json
from datetime import datetime, UTC
import pandas as pd
import numpy as np
from transformers import AutoTokenizer
from scipy.stats import pearsonr
from sklearn.model_selection import train_test_split
import plotly.express as px
from tqdm.notebook import tqdm
import time
import pickle
import re
from get_params import DATA_FOLDER, POSTS_FPATH, COMMENTS_FPATH, CURATED_DATASET_FPATH, TRAIN_DATASET_FPATH, VAL_DATASET_FPATH
from get_params import MAX_TOKENS
# Show up to 500 rows whenever a DataFrame is displayed
pd.options.display.max_rows = 500

# Pickle holding the posts-with-comments join, and the destination of the curated dataset
INTERIM_OUTPUT_PATH = os.path.join(DATA_FOLDER, "posts_comments_df_filtered.pkl")
OUTPUT_PATH = CURATED_DATASET_FPATH

## Get HF tokenizer
# The Hugging Face access token is read from a file under DATA_FOLDER rather than
# being written into the notebook.
hf_token_path = os.path.join(DATA_FOLDER, "hftoken.txt")
with open(hf_token_path) as f:
    HF_TOKEN = f.read().strip()

TOKENIZER = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3", token=HF_TOKEN)



In [2]:
def get_posts(fpath):
    cnt = 0 
    data_out = []
    with open(fpath) as f:
        for line in f:
            pline = json.loads(line)
            if (
                (pline["is_self"]) and (pline["selftext"].strip() not in ["[deleted]", "[removed]", ""])
                # exclude pinned posts and posts by mods
                and (not pline.get('stickied', False)) and (pline["distinguished"] != "moderator")
            ):
                cnt += 1
                ## Build parsed dataset
                # try:
                title_contains_q = True if ("?" in pline['title']) else False
                selftext_contains_q = True if ("?" in pline['selftext']) else False
                title_ends_q = True if (pline['title'][-1] == "?") else False
                selftext_ends_q = True if (pline['selftext'][-1] == "?") else False
                pline_out = {
                    'post_id': pline['id'],
                    ## important content
                    'title': pline['title'],
                    'selftext': pline['selftext'],
                    'link_flair_text': pline['link_flair_text'], # post flair
                    'title_len': len(pline['title'].strip()), # length of title
                    'selftext_len': len(pline['selftext'].strip()), # length of selftext
                    ## potential measures of quality of posts and comments
                    'score': pline['score'], # net upvotes
                    'ups': pline.get('ups', np.nan), # raw upvotes
                    'num_comments': pline['num_comments'], # number of responses/comments
                    "title_contains_q": title_contains_q,
                    "selftext_contains_q": selftext_contains_q,
                    "contains_q": title_contains_q + selftext_contains_q,
                    "title_ends_q": title_ends_q,
                    "selftext_ends_q": selftext_ends_q,
                    "ends_q": title_ends_q + selftext_ends_q,
                    ## when was the post made?
                    'created_utc': int(pline['created_utc']),
                    'created_utc_dt': datetime.fromtimestamp(int(pline['created_utc']), UTC),
                }
                data_out.append(pline_out)
    posts_df = pd.DataFrame(data_out)
    print (f"Found {len(posts_df)} posts")
    ## Filter posts to ensure they contain a question mark and must have at least two comments
    posts_df = posts_df[((posts_df["contains_q"] > 0) & (posts_df["num_comments"] > 2))]
    print (f"Keeping only interrogative posts and posts with at least 2 comments, we have {len(posts_df)} posts")
    return posts_df


In [3]:
def joining_posts_with_comments(fpath, posts_df):
    st = time.time()
    post_ids = posts_df["post_id"].to_list()
    cnt = 0 
    data_out = []
    filesize = os.path.getsize(fpath)
    with open(fpath) as f, tqdm(total=filesize, unit="B", unit_scale=True, desc="Parsing") as pbar:
        for line in f:
            pline = json.loads(line)
            print (pline)
            break
            pbar.update(len(line))
            tlevel, post_id = pline["parent_id"].split("_")
            if (tlevel == "t3") and (post_id in post_ids):
                cnt += 1
                pline_out = {
                    "post_id": post_id,
                    "comment_id": pline["parent_id"],
                    "comment_score": pline.get("score", np.nan), 
                    "comment_ups": pline.get("ups", np.nan), 
                    "comment_downs": pline.get("downs", np.nan), 
                    "comment_controversiality": pline.get("controversiality", np.nan),
                    "comment_body": pline.get("body", np.nan),
                    "comment_distinguished": pline.get("distinguished", np.nan),
                    ## when was the post made?
                    'comment_created_utc': int(pline['created_utc']),
                    'comment_created_utc_dt': datetime.fromtimestamp(int(pline['created_utc']), UTC)
                }
                data_out.append(pline_out)
    comments_df = pd.DataFrame(data_out)
    print (f"Found {len(comments_df)} comments")
    ## Joining posts with comments
    posts_df = posts_df.merge(comments_df, on="post_id", how="inner")
    print (f"After joining posts and comments, df len is {len(posts_df)}")
    print (f"Time taken to parse posts and comments {(time.time()-st)/60.0} min")
    return posts_df


In [4]:
## Load the posts: get_posts keeps self posts containing a question mark that pass its comment-count filter
posts_df = get_posts(POSTS_FPATH)


Found 186947 posts
Keeping only interrogative posts and posts with at least 2 comments, we have 132942 posts


In [5]:
## Get comments per post - done once and saved
# data_df = joining_posts_with_comments(COMMENTS_FPATH, posts_df)


In [6]:
# ## Save filtered posts-comments to file
# st = time.time()
# data_df.to_pickle("posts_comments_df.pkl")
# print (f"Time taken to write file = {(time.time()-st)/60.0}")


In [7]:
## Open filtered posts-comments from file
# NOTE: unpickling can execute arbitrary code, so only load a pickle this pipeline wrote itself.
# NOTE(review): the commented-out save cell writes "posts_comments_df.pkl" while this reads
# INTERIM_OUTPUT_PATH ("posts_comments_df_filtered.pkl") - confirm they are the same artefact.
st = time.time()
data_df = pd.read_pickle(INTERIM_OUTPUT_PATH)
print (f"Number of posts-comments samples = {len(data_df)}")
# elapsed seconds / 60, so the printed value is in minutes
print (f"Time taken to read file = {(time.time()-st)/60.0}")

## Remove empty/ deleted comments
# .copy() makes the filtered frame independent of the loaded one, so the column
# assignments below do not trigger SettingWithCopyWarning.
data_df = data_df[~(data_df["comment_body"].str.strip().isin(["[deleted]", "[removed]", "", np.nan]))].copy()
print (f"Number of posts-comments samples after removing empty ones = {len(data_df)}")
print (f"Number of posts for which we have comments after removing empty ones = {data_df['post_id'].nunique()}")

## Keep the raw comment, then strip URLs and normalise whitespace in the working copy
data_df["comment_body_original"] = data_df["comment_body"]
data_df["comment_body"] = data_df["comment_body"].str.replace(
    r"http\S+|www\.\S+", "", regex=True, flags=re.IGNORECASE
)
data_df["comment_body"] = (
    data_df["comment_body"]
    .str.replace(r"\s+", " ", regex=True)  # collapse runs of whitespace into one space
    .str.strip()                           # trim edges
)

## Add comment body length (measured on the cleaned text)
data_df["comment_body_len"] = data_df["comment_body"].str.len()


Number of posts-comments samples = 1825072
Time taken to read file = 0.012884684403737386
Number of posts-comments samples after removing empty ones = 1736315
Number of posts for which we have comments after removing empty ones = 132907


In [8]:
## Looking at some stats on posts -- commented out

# display(data_df[["score", "comment_body_len"]].describe())

# display(data_df[["score", "comment_body_len"]].min())

# ## The one letter/ short comments are likely nonsense.
# display(data_df[["score", "comment_body_len"]].quantile(.05))

# res = pearsonr(data_df["score"], data_df["comment_body_len"])
# print (res)

## Outputting scatter plot
# fig = px.scatter(data_df, x="score", y="comment_body_len")
# fig.add_hline(y=5, line_color="red", line_dash="dash")
# fig.write_image("scatter.png")

## We see a small negative correlation that is statistically significant


In [9]:
## Compute stats for comments and fetch top comment per post
def get_comment_stats_per_post(df_grp):
    """Summarise the comments belonging to one post.

    Parameters
    ----------
    df_grp : pandas.DataFrame
        Rows for a single post; must have ``comment_score`` and
        ``comment_body_len`` columns.

    Returns
    -------
    pandas.Series
        The comment count plus min / max / median / mean / range / standard
        deviation of the comment scores and of the comment lengths.
    """
    scores = df_grp["comment_score"]
    lengths = df_grp["comment_body_len"]

    # Extremes are computed once and reused for the range entries.
    score_lo, score_hi = scores.min(), scores.max()
    len_lo, len_hi = lengths.min(), lengths.max()

    stats = {
        ## count
        "num_comments_filtered": len(df_grp),
        ## score stats
        "min_comments_score": score_lo,
        "max_comments_score": score_hi,
        "median_comments_score": scores.median(),
        "mean_comments_score": scores.mean(),
        "range_comments_score": score_hi - score_lo,
        "std_comments_score": scores.std(),
        ## len stats
        "shortest_comments_len": len_lo,
        "longest_comments_len": len_hi,
        "median_comments_len": lengths.median(),
        "mean_comments_len": lengths.mean(),
        "range_comment_len": len_hi - len_lo,
        "std_comments_len": lengths.std(),
    }
    return pd.Series(stats)

## Get per comment stats per post
# Only the two columns the stats helper reads are selected. This keeps the grouping
# column out of the frames passed to apply(), which is what the deprecated
# include_groups=True behaviour warned about; the computed stats are unchanged.
posts_data_df = (
    data_df.groupby("post_id")[["comment_score", "comment_body_len"]]
    .apply(get_comment_stats_per_post)
    .reset_index()
)
print (f"Number of posts after grouping by on comments = {len(posts_data_df)}")

## Fetch top comment per post
# idxmax gives the row label of the highest-scoring comment in each post
# (the first such row when scores tie); .loc then pulls those full rows.
top_comment_df = data_df.loc[data_df.groupby("post_id")["comment_score"].idxmax()]

## Merge all together
# One stats row and one top-comment row per post, matched on post_id
posts_data_df = posts_data_df.merge(top_comment_df, how="inner", on="post_id")
print (f"Number of posts after concatting everything = {len(posts_data_df)}")


  posts_data_df = data_df.groupby("post_id").apply(get_comment_stats_per_post, include_groups=True).reset_index()


Number of posts after grouping by on comments = 132907
Number of posts after concatting everything = 132907


In [10]:
# List the columns available after merging the per-post comment stats with each post's top comment
display(posts_data_df.columns)


Index(['post_id', 'num_comments_filtered', 'min_comments_score',
       'max_comments_score', 'median_comments_score', 'mean_comments_score',
       'range_comments_score', 'std_comments_score', 'shortest_comments_len',
       'longest_comments_len', 'median_comments_len', 'mean_comments_len',
       'range_comment_len', 'std_comments_len', 'title', 'selftext',
       'link_flair_text', 'title_len', 'selftext_len', 'score', 'ups',
       'num_comments', 'title_contains_q', 'selftext_contains_q', 'contains_q',
       'title_ends_q', 'selftext_ends_q', 'ends_q', 'created_utc',
       'created_utc_dt', 'comment_id', 'comment_score', 'comment_ups',
       'comment_downs', 'comment_controversiality', 'comment_body',
       'comment_distinguished', 'comment_created_utc',
       'comment_created_utc_dt', 'comment_body_original', 'comment_body_len'],
      dtype='object')

In [11]:
## Looking at some stats of comments -- commented out

# display(posts_data_df[[
#                 "score",
#                 "num_comments",
#                 "num_comments_filtered",
#                 "min_comments_score",
#                 "max_comments_score",
#                 "range_comments_score",
#                 "std_comments_score",
#                 "comment_body_len",
#                 "comment_score"
# ]].describe())



In [12]:
## Format output for mistral and count tokens
def get_chat_template(row):
    """Add the chat-formatted training example and its token count to a row.

    Builds a two-turn conversation: a user turn made of a fixed persona
    instruction followed by the post title and body, and an assistant turn
    holding the comment body. The conversation is rendered with the module-level
    ``TOKENIZER``'s chat template.

    Parameters
    ----------
    row : mapping
        Must provide ``title``, ``selftext`` and ``comment_body``; it is
        modified in place.

    Returns
    -------
    The same ``row`` with three added entries: ``chat`` (list of role/content
    dicts), ``text`` (rendered template string) and ``num_tokens`` (length of
    the tokenized template).
    """
    persona = "You are a friendly parenting companion who gives helpful advice like a fellow parent would. You sound warm and practical — not robotic or formal."
    question = f"User's Prompt: {row['title']} - {row['selftext']}"

    # The persona text is placed inside the user turn; no separate system turn is used.
    messages = [
        {"role": "user", "content": f"{persona} {question}"},
        {"role": "assistant", "content": row['comment_body']},
    ]

    row["chat"] = messages
    row["text"] = TOKENIZER.apply_chat_template(messages, tokenize=False)
    token_ids = TOKENIZER.apply_chat_template(messages, tokenize=True)
    row["num_tokens"] = len(token_ids)
    return row


In [13]:
## Filter posts to get the final post-comment pairs
final_df = posts_data_df.copy(deep=True)

EXCLUDED_FLAIRS = [
    # We don't want to focus on adults
    'Adult Children 18+ Years',
    'Adult Children',
    # These posts are not typically questions
    'Update',
    'short, happy rant ',
    # We don't want admin posts
    'Meta',
    'meta',
    # AMAs typically flip the conversational Q-and-A format, i.e. the comments are in fact questions
    'confirmed AMA',
    'AMA - mod approved',
    # Weekly threads also flip the conversational Q-and-A format
    'Weekly',
    # Very few posts on these topics and they don't seem informative
    'LOCKED',
    'Locked',
    'Trigger Warning: death and loss'
]

#### Filter by top comment length and score and exclude some post categories
flair_is_allowed = ~final_df["link_flair_text"].isin(EXCLUDED_FLAIRS)
top_comment_score_ok = final_df["comment_score"] >= 10
top_comment_length_ok = final_df["comment_body_len"] >= 55
final_df = final_df[flair_is_allowed & top_comment_score_ok & top_comment_length_ok]

print (f"Final number of posts before mistral formatting = {len(final_df)}")


# ## Count categories/ flairs -- commented out
# flair_vcs = pd.DataFrame({
#     "count": final_df["link_flair_text"].value_counts(),
#     "percent": final_df["link_flair_text"].value_counts(normalize=True) * 100
# }).reset_index()
# flair_vcs.columns = ["flair", "count", "percent"]
# flair_vcs["percent"] = flair_vcs["percent"].round(2)  # optional
# display(flair_vcs)


# Adds the "chat", "text" and "num_tokens" columns row by row
final_df = final_df.apply(get_chat_template, axis=1)

summary_columns = ["score", "comment_body_len", "comment_score", "num_tokens"]
display(final_df[summary_columns].describe())

token_counts = final_df["num_tokens"]
print (f"95% percentile of num_tokens = {token_counts.quantile(0.95)}")
print (f"99% percentile of num_tokens = {token_counts.quantile(0.99)}")

## Drop examples with MAX_TOKENS tokens or more because longer ones might be harder to train on a single GPU
final_df = final_df.loc[token_counts < MAX_TOKENS]
print (f"Final number of posts after removing posts with more than {MAX_TOKENS} tokens = {len(final_df)}")

## Output to file
final_df.to_json(OUTPUT_PATH)


Final number of posts before mistral formatting = 61487


Unnamed: 0,score,comment_body_len,comment_score,num_tokens
count,61487.0,61487.0,61487.0,61487.0
mean,65.513881,511.209573,75.816238,502.434547
std,233.503825,561.638058,220.156871,339.874821
min,0.0,55.0,10.0,75.0
25%,4.0,184.0,14.0,281.0
50%,11.0,342.0,22.0,414.0
75%,32.0,624.0,47.0,618.0
max,6206.0,9974.0,7741.0,7175.0


95% percentile of num_tokens = 1133.0
99% percentile of num_tokens = 1764.2799999999988
Final number of posts after removing posts with more than 620 tokens = 46181


In [14]:
# Summary statistics of the final dataset for the post score, top-comment length/score and token count
final_summary = final_df[["score", "comment_body_len", "comment_score", "num_tokens"]].describe()
display(final_summary)


Unnamed: 0,score,comment_body_len,comment_score,num_tokens
count,46181.0,46181.0,46181.0,46181.0
mean,59.209913,370.818887,69.794786,353.770793
std,220.940857,288.962163,206.922421,128.884239
min,0.0,55.0,10.0,75.0
25%,4.0,161.0,13.0,249.0
50%,10.0,287.0,21.0,344.0
75%,28.0,490.0,43.0,454.0
max,6206.0,2409.0,7741.0,619.0


In [15]:
# Peek at the first two rows of the final dataset
final_df.head(2)

Unnamed: 0,post_id,num_comments_filtered,min_comments_score,max_comments_score,median_comments_score,mean_comments_score,range_comments_score,std_comments_score,shortest_comments_len,longest_comments_len,...,comment_controversiality,comment_body,comment_distinguished,comment_created_utc,comment_created_utc_dt,comment_body_original,comment_body_len,chat,text,num_tokens
3,10027tn,4.0,2.0,16.0,4.0,6.5,14.0,6.454972,91.0,670.0,...,0,In general the advice is to tell them early an...,,1672518619,2022-12-31 20:30:19+00:00,In general the advice is to tell them early an...,670,"[{'role': 'user', 'content': 'You are a friend...",<s>[INST] You are a friendly parenting compani...,362
4,1002kbu,12.0,2.0,24.0,7.0,9.833333,22.0,7.183736,2.0,250.0,...,0,Turning red by Pixar. Thought I'd be annoyed b...,,1672517781,2022-12-31 20:16:21+00:00,Turning red by Pixar. Thought I'd be annoyed b...,124,"[{'role': 'user', 'content': 'You are a friend...",<s>[INST] You are a friendly parenting compani...,140


In [16]:
# Printing examples of short comments
# print (final_df["comment_body_len"].min())
# tmp_df = final_df[(final_df["comment_body_len"] == final_df["comment_body_len"].min())]

# for i in range(0, 3):
#     print ("\n\n-----")
#     print (f"TITLE: {tmp_df['title'].iloc[i]}")
#     print (f"BODY: {tmp_df['selftext'].iloc[i]}")
#     print (f"\nANSWER (len: {tmp_df['comment_body_len'].iloc[i]}, sc: {tmp_df['comment_score'].iloc[i]}):\n {tmp_df['comment_body'].iloc[i]}")


In [17]:
# ## Spot checking categories
# sel_cats = [
#     # 'Rant',
#     # 'misc',
#     # 'media',
#     # 'SUPPORT',
#     # 'Rave',
#     # 'Support',
#     # 'Weekly',
#     'weird phobia'
# ]

# for s in sel_cats:
#     tmp_df = final_df[(final_df["link_flair_text"] == s)]
#     for i in range(0, 3):
#         print (f"\n\n-----\n{s}: {i+1}")
#         print (f"TITLE: {tmp_df['title'].iloc[i]}")
#         print (f"BODY: {tmp_df['selftext'].iloc[i]}")
#         print (f"\nANSWER ({tmp_df['comment_body_len'].iloc[i]}): {tmp_df['comment_body'].iloc[i]}")



In [32]:
## Training-set size: 85% of the data, rounded down to a whole multiple of 2400
# NOTE(review): the reason for the 2400 multiple is not shown here - presumably tied to the training batch setup; confirm.
train_fraction = 0.85
train_size_multiple = 2400
train_size_sel = int((len(final_df)*train_fraction)/train_size_multiple)*train_size_multiple
print(train_size_sel)

## Split into Training and Validation Datasets
# Only these columns are written to the dataset files
cols = ["post_id", "comment_id", "text", "num_tokens"]

# Fixed random_state so the split is reproducible; everything not in train goes to validation
train_df, val_df = train_test_split(final_df, train_size=train_size_sel, random_state=42)

# One JSON record per line for each split
for split_df, split_path in ((train_df, TRAIN_DATASET_FPATH), (val_df, VAL_DATASET_FPATH)):
    split_df[cols].to_json(split_path, orient='records', lines=True)





38400


In [33]:
# Report how many examples ended up in each split
for split_name, split_frame in (("training", train_df), ("validation", val_df)):
    print (f"Number of {split_name} examples: {len(split_frame)}")

Number of training examples: 38400
Number of validation examples: 7781


In [34]:
# Inspect the chat-formatted text of the first training example
train_df["text"].iloc[0]

"<s>[INST] You are a friendly parenting companion who gives helpful advice like a fellow parent would. You sound warm and practical — not robotic or formal. User's Prompt: The million dollar question - Apparently, my 6yrs old just found out about the word 'sex' and came asking. First I tried to wrapped my head around a suitable response while trying to find out where she heard it from, she quickly screamed 'school' cutting short my thinking time. Personally, I thought this was not a topic you say 'let's talk about this later, so I posed more questions to buy time; who mentioned it in school and how did they talk about it, what was the explanation you got? She tried mumbling some stuff, right then I had to say 'give me a moment we will talk about this later. What's the best way to approach this? [/INST] There is absolutely nothing wrong with saying “I’m not sure how to answer that, let me think about it and get back to you.” The key is you HAVE to follow up. Basic simple explanation. Do

In [36]:
## Pick 100 random validation examples (or all of them if fewer exist) and create another dataset for quick eval
VAL_DATASET_100_FPATH = os.path.join(DATA_FOLDER, "reddit_dataset_val_100.jsonl")
# Sampling without replacement raises ValueError when n exceeds the number of rows,
# so n is capped at the size of the validation set. Fixed random_state keeps it reproducible.
val_df_100 = val_df.sample(n=min(100, len(val_df)), random_state=42)
print (f"Number of validation examples to quick eval on: {len(val_df_100)}")
val_df_100[cols].to_json(VAL_DATASET_100_FPATH, orient='records', lines=True)



Number of validation examples to quick eval on: 100
