In [None]:
!pip install langid
!pip install --upgrade praw
!pip install alphabetic
!pip install hvplot
!pip install alphabetic

In [None]:
import os
import re
import praw
from omegaconf import OmegaConf
import time
from prawcore import TooManyRequests
import pandas as pd
from tqdm import tqdm
from dotenv import load_dotenv
import hvplot.pandas

In [None]:
# Register `.progress_apply` on pandas objects (used by later cells).
tqdm.pandas()

# Scraping configuration and API credentials are read from project-level files.
parameters = OmegaConf.load("../../../parameters/reddit.yaml")
load_dotenv("../../../.env")

# Authenticated Reddit client; every credential comes from the environment.
reddit = praw.Reddit(
    client_id=os.getenv("REDDIT_CLIENT_ID"),
    client_secret=os.getenv("REDDIT_CLIENT_SECRET"),
    username=os.getenv("REDDIT_USERNAME"),
    password=os.getenv("REDDIT_PASSWORD"),
    user_agent=os.getenv("REDDIT_USER_AGENT"),
)

In [None]:
# Languages to scrape: the keys of the configured language -> subreddits mapping.
languages = [*parameters.data_collection.subreddits.keys()]

# Authors whose comments are skipped while scraping.
bots = ["AutoModerator"]

# One dict per scraped submission / comment / reply is appended here.
data = list()

# Passed as `limit` when requesting each subreddit's hot listing.
n_subreddits = 400

def safe_request(func, *args, **kwargs):
    """Call ``func(*args, **kwargs)`` and retry for as long as Reddit rate-limits it.

    Args:
        func: Callable performing the Reddit API request.
        *args: Positional arguments forwarded to ``func``.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        Whatever ``func`` returns on the first call that is not rate limited.

    Raises:
        Any exception raised by ``func`` other than ``TooManyRequests``.

    Notes:
        The wait is taken from the response's ``Retry-After`` header. If the
        header is missing or is not a plain number of seconds (e.g. an
        HTTP-date), a default of 5 minutes is used instead of crashing.
        There is no retry cap: the loop only ends on success or on a
        different exception.
    """
    import math  # local import so the function does not rely on another cell

    default_wait = 60 * 5  # seconds (5 minutes)

    while True:
        try:
            return func(*args, **kwargs)
        except TooManyRequests as e:
            response = getattr(e, "response", None)
            headers = getattr(response, "headers", None) or {}
            raw_wait = headers.get('Retry-After', default_wait)
            try:
                # Accept "5" as well as "4.7"; round up so we never retry early.
                wait_time = max(0, math.ceil(float(raw_wait)))
            except (TypeError, ValueError, OverflowError):
                wait_time = default_wait
            print(f"Rate limit hit. Retrying after {wait_time} seconds...")
            time.sleep(wait_time)

def _make_row(language, subreddit, title, author, text, *,
              is_selftext=False, is_submission=False,
              is_comment=False, is_reply=False):
    """Build one dataset record with the column layout used by this notebook."""
    return {
        'language': language,
        'subreddit': subreddit,
        'is_selftext': is_selftext,
        'is_submission': is_submission,
        'is_comment': is_comment,
        'is_reply': is_reply,
        'author': str(author),
        'title': title,
        'text': text,
    }


for language in tqdm(languages, desc="Language progress"):
    subreddits = parameters.data_collection.subreddits.get(language)

    for subreddit in tqdm(subreddits, desc="Subreddits progress", leave=False):
        # `.hot()` only builds a lazy listing; the HTTP requests happen while it
        # is iterated. Materialise it inside safe_request so that rate-limit
        # errors raised during iteration are actually retried.
        submissions = safe_request(
            lambda: list(reddit.subreddit(subreddit).hot(limit=n_subreddits))
        )

        for submission in tqdm(submissions, desc=f"Processing submissions in {subreddit}", leave=True):
            if submission.over_18:
                continue

            # Record the submission itself.
            # NOTE(review): only the title is stored as `text`; the selftext is
            # not collected. Confirm this is intended.
            data.append(_make_row(
                language, subreddit, submission.title, submission.author, submission.title,
                is_selftext=submission.is_self,
                is_submission=True,
            ))

            # Expand every "load more comments" placeholder.
            comments = safe_request(getattr, submission, "comments")
            safe_request(comments.replace_more, limit=None)

            # `comments.list()` is the flattened tree: every comment at every
            # depth appears once. One pass is therefore enough; walking
            # `comment.replies` for each entry would re-add every nested reply
            # once per ancestor.
            for comment in comments.list():
                if comment.author in bots:
                    continue

                is_top_level = comment.is_root
                data.append(_make_row(
                    language, subreddit, submission.title, comment.author, comment.body,
                    is_comment=is_top_level,
                    is_reply=not is_top_level,
                ))


# Turn the scraped records into a DataFrame.
df = pd.DataFrame(data)

# De-duplicate on 'text' (first occurrence wins) and track how many rows go away.
before_dedup = df.shape[0]
df = df.drop_duplicates(subset='text', keep='first')
after_dedup = df.shape[0]
num_duplicates = before_dedup - after_dedup

# Report the number of dropped rows.
print(f"Number of duplicate rows removed: {num_duplicates}")

# Persist the raw, de-duplicated scrape.
df.to_csv('../../../datasets/reddit_multigec/raw_reddit_multigec.csv', index=False)

In [None]:
# Letters (plus the apostrophe) counted as "alphabetical". Matching is
# case-insensitive; upper-case ranges are kept explicitly as well.
_ALPHABETICAL_PATTERN = re.compile(
    "["
    "a-zA-Z"                # basic Latin
    "áčďéěíňóřšťúůýž"       # Czech (also covers Icelandic/Slovene accents)
    "ÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ"
    "äöüß"                  # German
    "õ"                     # Estonian
    "þæðå"                  # Icelandic (incl. eth) / Swedish
    "àèìòù"                 # Italian grave accents
    "āēģīķļņū"              # Latvian
    "α-ωΑ-Ω"                # Greek
    "άέήίόύώϊϋΐΰ"           # Greek vowels with tonos / dialytika
    "а-яА-ЯґєіїҐЄІЇ"        # Cyrillic incl. Ukrainian-specific letters
    "'"
    "]+",
    flags=re.IGNORECASE,
)


def extract_alphabetical(text):
    """Return the alphabetical tokens of ``text`` joined by single spaces.

    Digits, punctuation (except ``'``), whitespace and any other character
    outside the supported alphabets act as token separators and are dropped.

    Args:
        text: The raw text. Anything that is not a ``str`` is rejected.

    Returns:
        The space-joined runs of letters, or ``None`` if ``text`` is not a
        string. An input without any letters yields ``""``.
    """
    if not isinstance(text, str):
        return None
    return ' '.join(_ALPHABETICAL_PATTERN.findall(text))

# Derive the letters-only version of every text. The result depends only on
# `text`, so a single pass (with a progress bar) is sufficient.
df["alphabetical_text"] = df["text"].progress_apply(extract_alphabetical)

In [None]:
# Character counts of the raw text and of its letters-only version.
df["len_text"] = df["text"].str.len()
df["len_alphabetical_text"] = df["alphabetical_text"].str.len()

# Share of the raw text that is alphabetical. Column-wise division replaces a
# Python-level row loop and yields NaN/inf for empty texts instead of raising.
df["alphabetical_ratio"] = df["len_alphabetical_text"] / df["len_text"]

In [None]:
# Average alphabetical ratio per configured language.
df.groupby("language")["alphabetical_ratio"].agg("mean")

In [None]:
# Density of the alphabetical ratio, one curve per language.
df[["language", "alphabetical_ratio"]].hvplot.kde(
    by="language",
    width=1200,
    height=600,
    grid=True,
)

In [None]:
# Box plot of the alphabetical ratio, grouped by language.
df[["language", "alphabetical_ratio"]].hvplot.box(
    by="language",
    invert=True,
    width=1200,
    height=600,
    grid=True,
    tools=["zoom_in", "zoom_out", "hover"],
)

In [None]:
# Box plot of the letters-only text length, grouped by language.
df[["language", "len_alphabetical_text"]].hvplot.box(
    by="language",
    title="len alphabetical by languages",
    invert=True,
    width=1200,
    height=600,
    grid=True,
    tools=["zoom_in", "zoom_out", "hover"],
)

In [None]:
# Density of the letters-only text length, one curve per language.
df[["language", "len_alphabetical_text"]].hvplot.kde(
    by="language",
    title="len alphabetical by languages",
    invert=False,
    width=1200,
    height=600,
    grid=True,
    tools=["zoom_in", "zoom_out", "hover"],
)

In [None]:
# Minimum number of alphabetical characters a text must have to be kept.
MIN_ALPHABETICAL_CHARS = 20

# One mask drives both the report and the filter so the two cannot disagree:
# texts with fewer than 20 alphabetical characters are dropped, texts with
# exactly 20 are kept.
long_enough = df["len_alphabetical_text"] >= MIN_ALPHABETICAL_CHARS

print("filtering texts with number of alphabetical characters less than 20:")
print(int((~long_enough).sum()))

# .copy() so that later cells can add columns without SettingWithCopyWarning.
df = df.loc[long_enough].copy()

In [None]:
# Same length box plot as before, now on the length-filtered frame.
df[["language", "len_alphabetical_text"]].hvplot.box(
    by="language",
    title="len alphabetical by languages (after filtering)",
    invert=True,
    width=1200,
    height=600,
    grid=True,
    tools=["zoom_in", "zoom_out", "hover"],
)

In [None]:
# Number of rows per configured language (non-null `subreddit` values are counted).
number_of_text_by_language = df.groupby("language")["subreddit"].agg("count")
number_of_text_by_language

In [None]:
# Bar chart of the per-language sample counts, sorted by count.
(
    number_of_text_by_language
    .reset_index()
    .sort_values("subreddit")
    .hvplot.bar(
        x="language",
        title="Number of samples by language",
        width=1200,
        height=600,
        grid=True,
    )
)

In [None]:
# `langid` is installed by the first cell but is not imported in the notebook's
# import cell; import it here so this cell runs on a fresh kernel.
import langid

# Detect the language of each letters-only text; `classify` returns a
# (language_code, score) pair, of which only the code is kept.
df.loc[:, "langid_language"] = df.loc[:, "alphabetical_text"].progress_apply(lambda x: langid.classify(x)[0])

In [None]:
# langid language code -> language name used in this dataset.
lang_map = {
    "cs": "czech",
    "de": "german",
    "el": "greek",
    "en": "english",
    "et": "estonian",
    "is": "icelandic",
    "it": "italian",
    "lv": "latvian",
    "sl": "slovene",
    "sv": "swedish",
    "uk": "ukrainian",
}

# Translate the detected codes; anything without a mapping becomes "other".
translated_codes = df["langid_language"].map(lang_map)
df["translated_langid_language"] = translated_codes.fillna("other")
df

In [None]:
# Cross-tabulate the configured language against the langid-detected language.
number_of_text_by_language_and_langid = (
    df.groupby(["language", "translated_langid_language"])["subreddit"].agg("count")
)
number_of_text_by_language_and_langid

In [None]:
# Grouped bar chart: configured language, split by detected language.
(
    number_of_text_by_language_and_langid
    .reset_index()
    .sort_values("subreddit")
    .hvplot.bar(
        x="language",
        by="translated_langid_language",
        title="Number of samples by language",
        invert=True,
        width=1400,
        height=2000,
        grid=True,
    )
)

In [None]:
# Number of rows per detected language, as a Series named "count".
number_of_text_by_langid = (
    df.groupby(["translated_langid_language"])["subreddit"]
    .agg("count")
    .rename("count")
)
number_of_text_by_langid

In [None]:
# Bar chart of the per-detected-language sample counts, sorted by count.
(
    number_of_text_by_langid
    .reset_index()
    .sort_values("count")
    .hvplot.bar(
        x="translated_langid_language",
        title="Number of samples by langid",
        invert=False,
        width=1400,
        height=700,
        grid=True,
    )
)

In [None]:
# Discard rows whose detected language is not one of the mapped languages.
df = df.loc[df["translated_langid_language"].ne("other")]
df

In [None]:
# Upper bound on the number of English rows kept in the dataset.
ENGLISH_SAMPLE_SIZE = 60000

is_english = df["translated_langid_language"] == "english"
english_df = df[is_english]

# Randomly down-sample the English subset (seeded for reproducibility). The
# sample size is capped at the subset size because DataFrame.sample raises
# ValueError when asked for more rows than exist without replacement.
sampled_english_df = english_df.sample(
    n=min(ENGLISH_SAMPLE_SIZE, len(english_df)),
    random_state=42,
)
# Rows of every other language are kept in full.
non_english_df = df[~is_english]
# Combine the sampled English rows with the non-English rows.
df = pd.concat([sampled_english_df, non_english_df], ignore_index=True)

df

In [None]:
# Recompute the per-detected-language counts on the current frame and show them.
number_of_text_by_langid = (
    df.groupby(["translated_langid_language"])["subreddit"]
    .agg("count")
    .rename("count")
)
display(number_of_text_by_langid)

(
    number_of_text_by_langid
    .reset_index()
    .sort_values("count")
    .hvplot.bar(
        x="translated_langid_language",
        title="Number of samples by langid",
        invert=False,
        width=1400,
        height=700,
        grid=True,
    )
)

In [None]:
# Box plot of the alphabetical ratio, grouped by detected language.
df[["translated_langid_language", "alphabetical_ratio"]].hvplot.box(
    by="translated_langid_language",
    invert=True,
    width=900,
    height=500,
    grid=True,
    tools=["zoom_in", "zoom_out", "hover"],
)

In [None]:
# Export title, text and the detected language (written out as `language`).
(
    df[["title", "text", "translated_langid_language"]]
    .rename(columns={"translated_langid_language": "language"})
    .to_csv("../../../datasets/reddit_multigec/pre_moderation_reddit_multigec.csv", index=False)
)