In [67]:
import random
import numpy as np
import pandas as pd
from faker import Faker
fake = Faker()
from ide.utils.utils import get_project_root

# Seed every random source the generator draws from, so that a fresh
# "Restart Kernel -> Run All" reproduces the same dataset.
seed = 42
np.random.seed(seed)      # NumPy global RNG: the np.random.rand() label draws
fake.seed_instance(seed)  # Faker text/user names; `fake.seed = seed` only set an attribute and seeded nothing
random.seed(seed)         # stdlib RNG behind random.choice; this must be *called* --
                          # assigning to random.seed replaced the function with an int.
                          # (If the old cell already ran in this kernel, restart it first.)

In [68]:
# Neutral Bengali snippets mixed into tweets of non-anhedonia users whose lang is "bengali".
_BENGALI_FILLERS = [" বাংলা ডেমো টেক্সট. ", " আমরা বাংলায় ওয়েব. ", " ডেডলপমেন্ট নিয়ে. "]


def _fixed_paragraph(sentence_count):
    """Return a Faker paragraph with exactly ``sentence_count`` sentences."""
    return fake.paragraph(nb_sentences=sentence_count, variable_nb_sentences=False)


def _draw_profile():
    """Draw ``(anhedonia, discloses_clearly, lang, country)`` for one synthetic user.

    All four draws always happen, in this order, even when a value ends up
    unused, so the random streams advance identically for every record.
    """
    anhedonia = np.random.rand() > 0.5
    discloses_clearly = np.random.rand() > 0.5
    # Repeated entries weight the choice: "en"/"NL" are twice as likely as "bengali"/"IN".
    lang = random.choice(["en", "en", "bengali"])
    country = random.choice(["NL", "NL", "IN"])
    return anhedonia, discloses_clearly, lang, country


def _unlabeled_tweet(anhedonia, discloses_clearly, lang):
    """Build the tweet text for one record of the unlabeled pool.

    Only non-anhedonia tweets depend on ``lang`` here; anhedonia tweets are
    always English, whatever the user's language.
    """
    # Truthiness instead of `is False` / `is True`: identity checks silently
    # fall through to the last branch if the flag is a NumPy bool.
    if not anhedonia:
        if lang != "bengali":
            return _fixed_paragraph(3)
        return _fixed_paragraph(1) + random.choice(_BENGALI_FILLERS) + _fixed_paragraph(1)

    if discloses_clearly:
        phrases = [" I have zero motivation ", " I have no motivation ", " I lost interest "]
    else:
        phrases = [" I was diagnosed with anhedonia ", " I no longer enjoy anything "]
    return _fixed_paragraph(1) + random.choice(phrases) + _fixed_paragraph(1)


def _labeled_tweet(anhedonia, discloses_clearly, lang):
    """Build the tweet text for one record of the expert-labeled set.

    Unlike the unlabeled pool, anhedonia tweets use Bengali phrases when
    ``lang`` is "bengali".
    """
    is_bengali = lang == "bengali"

    if not anhedonia:
        if is_bengali:
            return random.choice(_BENGALI_FILLERS) + _fixed_paragraph(1)
        return _fixed_paragraph(2)

    if discloses_clearly:
        english_phrases = [" I have zero motivation. ", " I have no motivation. ", " I lost interest. "]
        bengali_phrases = [" আমার শূন্য প্রেরণা আছে. ", " আমার কোন অনুপ্রেরণা নেই. ", " আমি আগ্রহ হারিয়ে ফেলেছি. "]
    else:
        english_phrases = [" I was diagnosed with anhedonia. ", " I no longer enjoy anything. "]
        bengali_phrases = [" আমার অ্যানহেডোনিয়া ধরা পড়ে. ", " আমি আর কিছুই উপভোগ করি না. "]

    # English: filler sentence, then the phrase. Bengali: phrase, then filler sentence.
    if is_bengali:
        opening = random.choice(bengali_phrases) + _fixed_paragraph(1)
    else:
        opening = _fixed_paragraph(1) + random.choice(english_phrases)
    return opening + _fixed_paragraph(1)


def generate_data(num_unlabeled_records: int, num_labeled_records: int):
    """Generate the synthetic anhedonia tweet dataset.

    Randomness comes from ``np.random`` (labels), the stdlib ``random`` module
    (language, country, phrase choice) and the module-level ``fake`` Faker
    instance (filler sentences, user names); seed all three for reproducible
    output.

    Args:
        num_unlabeled_records: Number of rows in the ``users`` and ``tweets`` frames.
        num_labeled_records: Number of rows in the labeled frame.

    Returns:
        A ``(users, tweets, labeled_data)`` tuple of DataFrames:

        * ``users``: columns ``user_id``, ``lang``, ``country``
        * ``tweets``: columns ``user_id``, ``tweet`` (same ``user_id`` order as ``users``)
        * ``labeled_data``: columns ``user_id``, ``tweet``, ``anhedonia``, ``lang``, ``country``
    """
    # See https://github.com/stefan-grafberger/mlwhatif/blob/90bd5003c1e1ef0a51545455383d89e7e26a6d01/demo/feature_overview/data_generation.ipynb#L12 for more intricate generation
    user_rows = []
    tweet_rows = []
    labeled_rows = []

    for _ in range(num_unlabeled_records):
        anhedonia, discloses_clearly, lang, country = _draw_profile()
        tweet = _unlabeled_tweet(anhedonia, discloses_clearly, lang)
        # NOTE(review): nothing here enforces unique user names; presumably
        # Faker can repeat one, which would fan out a users<->tweets join -- confirm.
        user_id = fake.user_name()
        user_rows.append((user_id, lang, country))
        tweet_rows.append((user_id, tweet))

    for _ in range(num_labeled_records):
        anhedonia, discloses_clearly, lang, country = _draw_profile()
        tweet = _labeled_tweet(anhedonia, discloses_clearly, lang)
        user_id = fake.user_name()
        labeled_rows.append((user_id, tweet, anhedonia, lang, country))

    users = pd.DataFrame.from_records(user_rows, columns=['user_id', 'lang', 'country'])
    tweets = pd.DataFrame.from_records(tweet_rows, columns=['user_id', 'tweet'])
    labeled_data = pd.DataFrame.from_records(
        labeled_rows, columns=['user_id', 'tweet', 'anhedonia', 'lang', 'country']
    )

    return users, tweets, labeled_data

In [69]:
# Build the full dataset: 900 unlabeled records plus 100 expert-labeled ones.
# (For a quick smoke run, generate_data(200, 10) is enough.)
users, tweets, labeled_data = generate_data(900, 100)

# Persist each frame as parquet under the project's anhedonia dataset folder.
project_root = str(get_project_root())
users.to_parquet(project_root + '/ide/experiments/datasets/anhedonia/users.pqt')
tweets.to_parquet(project_root + '/ide/experiments/datasets/anhedonia/tweets.pqt')
labeled_data.to_parquet(project_root + '/ide/experiments/datasets/anhedonia/expert_labeled.pqt')