In [4]:
import os
import pandas as pd
import json
import re
import nltk
nltk.download('punkt_tab')
from datetime import datetime
from credibility import get_credibility_subfeatures
from defection import get_defection_score

from coalition import get_coalition_score
from onesidedness import get_onesidedness_score

from credibility import get_comment_readability

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/juanp.lievanok./nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
def gather_meta_data(directory):
    """
    Collects metadata for every ``.json`` file found below ``directory``.

    Files are expected in a ``<directory>/<subreddit>/<date folder>/<post_id>.json``
    layout, where the date folder is named like ``date_05-01-2025_time_21-32``.

    Args:
        directory: Root folder to walk. It is treated as a plain path, so it may
            contain regex metacharacters or end with a trailing separator.

    Returns:
        A list with one dict per JSON file, holding the keys ``path``,
        ``subreddit``, ``download_date`` (a ``datetime``) and ``post_id``
        (the file name without its ``.json`` suffix).

    Raises:
        ValueError: If a JSON file is not nested at least two folders below
            ``directory``, or if its parent folder name does not match the
            ``date_%m-%d-%Y_time_%H-%M`` format.
    """
    json_suffix = ".json"
    dictionaries = []
    for root, _dirs, files in os.walk(directory):
        json_files = [name for name in files if name.endswith(json_suffix)]
        if not json_files:
            continue

        # Split the path relative to `directory` instead of building a regex from it:
        # the regex broke on metacharacters, trailing separators and non-'/' separators.
        relative_root = os.path.relpath(root, directory)
        subreddit, date_string = os.path.split(relative_root)
        if not subreddit:
            raise ValueError(
                f"JSON file(s) found in {root!r}, which is not a "
                f"'<subreddit>/<date folder>' sub-folder of {directory!r}"
            )

        # The folder name is shared by every file in it, so parse it once per folder.
        date = datetime.strptime(date_string, 'date_%m-%d-%Y_time_%H-%M')

        for file in json_files:
            dictionaries.append({
                'path': os.path.join(root, file),
                'subreddit': subreddit,
                'download_date': date,
                'post_id': file[:-len(json_suffix)],
            })
    return dictionaries

def calculate_feature_scores(comment_forest):
    """
    Computes the feature values for a single comment forest.

    Returns one dictionary that holds the credibility subfeatures together with
    the onesidedness and defection scores. The credibility subfeatures are
    returned as-is: turning them into a credibility score requires a
    normalization that needs the subfeatures of all the data, so it has to
    happen after every post has been processed.
    """
    # TODO: also compute coalition = get_coalition_score(comment_forest)
    #       and expose it under the "coalition" key.
    onesidedness_score = get_onesidedness_score(comment_forest)
    defection_score = get_defection_score(comment_forest)
    credibility_subfeatures = get_credibility_subfeatures(comment_forest)

    # On a key clash the simple scores on the right-hand side win.
    return credibility_subfeatures | {
        "onesidedness": onesidedness_score,
        "defection": defection_score,
    }



In [6]:
def mass_calculate_feature_scores(directory_path, target_directory_name = '../misc_dataframes_with_test_results'):
    """
    Calculates the feature scores for every JSON file below ``directory_path``
    and writes them, together with each file's metadata, to one CSV file.

    Files whose loading or scoring raises are skipped; a running count, the
    failing path and the error are printed so the run can continue.

    Args:
        directory_path: Root folder handed to ``gather_meta_data``.
        target_directory_name: Folder the CSV is written to. It is created if
            it does not exist yet.

    Returns:
        The path of the CSV file that was written.
    """
    calculation_start_time = datetime.now()

    # Create the output folder up front: discovering that it is missing only when
    # writing the CSV would throw away the whole (potentially long) calculation.
    if target_directory_name:
        os.makedirs(target_directory_name, exist_ok=True)

    dicts = gather_meta_data(directory_path)
    exceptions_count = 0

    df_rows = []
    for json_dictionary in dicts:
        try:
            # Read explicitly as UTF-8 (the JSON default) rather than the platform
            # encoding, and close the file before the feature calculation starts.
            with open(json_dictionary['path'], encoding='utf-8') as file:
                comment_forest = json.load(file)
            feature_scores_dict = calculate_feature_scores(comment_forest)
            df_rows.append({**json_dictionary, **feature_scores_dict})
        except Exception as e:
            # Deliberately best-effort: one bad post must not abort the batch.
            exceptions_count += 1
            print(f'Exceptions count = {exceptions_count}')
            print(f"Failed on {json_dictionary['path']}")
            print(e)

    timestamp_str = calculation_start_time.strftime('%Y-%m-%d_%H-%M-%S')

    # Sanitize the directory_path so its slashes cannot create sub-folders in the file name
    safe_dir_name = re.sub(r'[\\/]', '_', directory_path)

    filename = f'credibility_subfeatures_and_simple_feature_scores_for_jsons_in_{safe_dir_name}_calculated_at_{timestamp_str}.csv'

    target_path = os.path.join(target_directory_name, filename)
    df = pd.DataFrame(df_rows)
    df.to_csv(target_path, index=False)
    return target_path

In [7]:
# Run the feature calculation over every JSON file below this scraping folder;
# the resulting CSV goes to mass_calculate_feature_scores' default target directory.
directory_path = '../scraping/representative_subreddits_for_varied_percentiles'
mass_calculate_feature_scores(directory_path)

Exceptions count = 1
You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.
Exceptions count = 2
You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.
Exceptions count = 3
You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.
Exceptions count = 4
You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.
Exceptions count = 5
You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs:

In [4]:
# Reload a previously saved results CSV. The file name follows the pattern built in
# mass_calculate_feature_scores and is pinned to one specific run's timestamp.
import pandas as pd
path = '../misc_dataframes_with_test_results/credibility_subfeatures_and_simple_feature_scores_for_jsons_in_.._scraping_representative_subreddits_for_varied_percentiles_calculated_at_2025-05-01_21-32-52.csv'
df = pd.read_csv(path)

In [6]:
# Add up the total_word_count column over all rows of the loaded CSV.
sum(df['total_word_count'])

625263