In [1]:
import os
import pandas as pd
import json
import re
import nltk
nltk.download('punkt_tab')
from datetime import datetime

from tqdm import tqdm

# from credibility import get_credibility_subfeatures
# from credibility import get_comment_readability

from defection import get_defection_score
from coalition import get_coalition_score
from onesidedness import get_onesidedness_score
from resilience import get_resilience_score

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/juanp.lievanok./nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def gather_meta_data(directory):
    """
    Collect metadata for every ``.json`` file stored under ``directory``.

    Expected layout::

        <directory>/<subreddit>/date_MM-DD-YYYY_time_HH-MM/<post_id>.json

    The subreddit and date components are taken from the path of the folder
    that contains the file, relative to ``directory``. The path is split with
    ``os.path`` helpers rather than a regex built from ``directory``, so
    directories containing regex metacharacters (``.``, ``(``, ``+`` ...),
    a trailing separator, or platform-specific separators are handled.

    Parameters
    ----------
    directory : str
        Root folder to walk recursively.

    Returns
    -------
    list[dict]
        One dictionary per JSON file with the keys ``'path'`` (str),
        ``'subreddit'`` (str), ``'download_date'`` (datetime) and
        ``'post_id'`` (file name without the ``.json`` suffix).

    Raises
    ------
    ValueError
        If a folder holding JSON files at the expected depth has a name that
        does not follow the ``date_%m-%d-%Y_time_%H-%M`` format.

    Notes
    -----
    JSON files located fewer than two levels below ``directory`` have no
    subreddit/date folders to read and are skipped.
    """
    json_suffix = ".json"
    dictionaries = []
    for root, dirs, files in os.walk(directory):
        # Ignore a file literally named ".json": it carries no post id.
        json_files = [
            file for file in files
            if file.endswith(json_suffix) and len(file) > len(json_suffix)
        ]
        if not json_files:
            continue

        # Split on the LAST separator so that, as before, the final folder is
        # the date folder and everything above it is the subreddit component.
        relative_root = os.path.relpath(root, directory)
        subreddit, date_string = os.path.split(relative_root)
        if not subreddit:
            # Not nested as <subreddit>/<date>: nothing sensible to record.
            continue

        date = datetime.strptime(date_string, 'date_%m-%d-%Y_time_%H-%M')
        for file in json_files:
            dictionaries.append({
                'path': os.path.join(root, file),
                'subreddit': subreddit,
                'download_date': date,
                'post_id': file[:-len(json_suffix)],
            })
    return dictionaries

In [3]:
# legacy now that we're separating the credibility loop from the others. 

# def calculate_feature_scores(comment_forest):
#     """
#     Calculates coalition, onesidedness and defection scores,
#     and calculates the credibility subfeatures (which have to be normalized and made into a credibility score
#     but this normalization requires the credibility subfeatures for all the data to be calculated first.)
#     """
    
#     # Compute values
#     # TODO coalition = get_coalition_score(comment_forest) 
#     onesidedness = get_onesidedness_score(comment_forest)
#     defection = get_defection_score(comment_forest)

#     simple_features_dictionary = {
#         # TODO "coalition": coalition,
#         "onesidedness": onesidedness,
#         "defection": defection,
#     }

#     credibility_dictionary = get_credibility_subfeatures(comment_forest)


#     return credibility_dictionary | simple_features_dictionary


# def mass_calculate_feature_scores(directory_path, target_directory_name = '../misc_dataframes_with_test_results'):
    
#     calculation_start_time = datetime.now()

#     dicts = gather_meta_data(directory_path)
#     exceptions_count = 0 

#     df_rows = []
#     for json_dictionary in dicts:
#         with open(json_dictionary['path']) as file:
#             try:
#                 comment_forest = json.load(file)
#                 feature_scores_dict = calculate_feature_scores(comment_forest)
#                 new_row_dict = {**json_dictionary, **feature_scores_dict}
#                 df_rows.append(new_row_dict)
#             except Exception as e:
#                 exceptions_count += 1
#                 print(f'Exceptions count = {exceptions_count}')
#                 print(e)
                
#     timestamp_str = calculation_start_time.strftime('%Y-%m-%d_%H-%M-%S')

#     safe_dir_name = re.sub(r'[\\/]', '_', directory_path) # Sanitize the directory_path to remove slashes or other problematic characters

#     filename = f'credibility_subfeatures_and_simple_feature_scores_for_jsons_in_{safe_dir_name}_calculated_at_{timestamp_str}.csv'
    
#     target_path = os.path.join(target_directory_name, filename)
#     df = pd.DataFrame(df_rows)
#     df.to_csv(target_path, index=False)

# simple features (coalition, defection, onesidedness, and resilience)

In [4]:
def calculate_simple_feature_scores(comment_forest):
    '''
    Score a single comment forest on the four "simple" features.

    "Simple" means the score can be computed from one comment forest alone:
    coalition, onesidedness, defection and resilience. Credibility is excluded
    because its subfeatures must be calculated for all data before the score
    itself can be computed.

    Returns a dictionary mapping each feature name to its score.
    '''
    # The scorers are called in the order the keys are listed.
    return {
        "coalition": get_coalition_score(comment_forest),
        "onesidedness": get_onesidedness_score(comment_forest),
        "defection": get_defection_score(comment_forest),
        "resilience": get_resilience_score(comment_forest),
    }

def mass_calculate_simple_feature_scores(directory_path, target_directory_name = '../misc_dataframes_with_test_results'):
    '''
    Compute the simple feature scores for every JSON file under ``directory_path``
    and save them, together with each file's metadata, as one CSV file.

    Simple feature scores are coalition, defection, onesidedness, and resilience.
    Credibility is not simple because it requires calculating subfeatures for all data before being computed.

    Parameters
    ----------
    directory_path : str
        Root folder handed to ``gather_meta_data`` to locate the JSON files.
    target_directory_name : str
        Folder the CSV is written into. It is created if it does not exist.

    Notes
    -----
    Processing is best-effort: a file that cannot be opened, parsed or scored
    is reported on stdout (running count, path and error) and left out of the
    CSV; the remaining files are still processed.
    The CSV name embeds the sanitized ``directory_path`` and the time at which
    the calculation started.
    '''
    calculation_start_time = datetime.now()

    # Create the output folder BEFORE the long computation, so a missing folder
    # cannot make the final to_csv fail and throw away minutes of work.
    # (An empty name means "current directory"; there is nothing to create.)
    if target_directory_name:
        os.makedirs(target_directory_name, exist_ok=True)

    dicts = gather_meta_data(directory_path)
    exceptions_count = 0

    df_rows = []
    for json_dictionary in tqdm(dicts):
        try:
            # open() is inside the try so an unreadable file is skipped like any
            # other failure instead of aborting the whole run. JSON text is read
            # as UTF-8 rather than with the platform's default encoding.
            with open(json_dictionary['path'], encoding='utf-8') as file:
                comment_forest = json.load(file)
            feature_scores_dict = calculate_simple_feature_scores(comment_forest)
            df_rows.append({**json_dictionary, **feature_scores_dict})
        except Exception as e:
            exceptions_count += 1
            print(f'Exceptions count = {exceptions_count}')
            # Name the offending file so the failure can be investigated.
            print(f"{json_dictionary['path']}: {type(e).__name__}: {e}")

    timestamp_str = calculation_start_time.strftime('%Y-%m-%d_%H-%M-%S')

    safe_dir_name = re.sub(r'[\\/]', '_', directory_path) # Sanitize the directory_path to remove slashes or other problematic characters

    filename = f'simple_feature_scores_for_jsons_in_{safe_dir_name}_calculated_at_{timestamp_str}.csv'

    target_path = os.path.join(target_directory_name, filename)
    df = pd.DataFrame(df_rows)
    df.to_csv(target_path, index=False)

In [None]:
# Root folder whose JSON files will be scored (relative path).
directory_path = '../scraping/representative_subreddits_for_varied_percentiles'
# Scores every JSON file found under directory_path and writes the results CSV into
# the function's default target directory ('../misc_dataframes_with_test_results').
mass_calculate_simple_feature_scores(directory_path)

  0%|          | 0/189 [00:00<?, ?it/s]

entered first if case, comments < 10.
entered first if case, comments < 10.


  2%|▏         | 4/189 [00:03<03:11,  1.03s/it]

entered first if case, comments < 10.
entered first if case, comments < 10.


  4%|▎         | 7/189 [00:07<03:09,  1.04s/it]

entered first if case, comments < 10.


  5%|▍         | 9/189 [00:08<02:53,  1.04it/s]

entered first if case, comments < 10.


  6%|▌         | 11/189 [00:10<02:34,  1.15it/s]

entered first if case, comments < 10.


  7%|▋         | 13/189 [00:11<02:24,  1.22it/s]

entered first if case, comments < 10.
entered first if case, comments < 10.
entered first if case, comments < 10.
entered first if case, comments < 10.
entered first if case, comments < 10.
entered first if case, comments < 10.
entered first if case, comments < 10.


 23%|██▎       | 43/189 [01:06<04:47,  1.97s/it]

entered first if case, comments < 10.


 26%|██▋       | 50/189 [01:18<04:28,  1.93s/it]

entered first if case, comments < 10.


 28%|██▊       | 52/189 [01:20<03:17,  1.44s/it]

entered first if case, comments < 10.


 31%|███       | 58/189 [01:30<04:05,  1.87s/it]

entered first if case, comments < 10.


 42%|████▏     | 79/189 [02:16<04:02,  2.21s/it]

entered first if case, comments < 10.


 43%|████▎     | 82/189 [02:20<03:16,  1.84s/it]

entered first if case, comments < 10.


 44%|████▍     | 84/189 [02:22<02:26,  1.39s/it]

entered first if case, comments < 10.
entered first if case, comments < 10.


 46%|████▌     | 87/189 [02:23<01:44,  1.02s/it]

entered first if case, comments < 10.


 48%|████▊     | 90/189 [02:27<01:46,  1.08s/it]

entered first if case, comments < 10.


 49%|████▊     | 92/189 [02:28<01:38,  1.01s/it]

entered first if case, comments < 10.


 50%|████▉     | 94/189 [02:30<01:31,  1.04it/s]

entered first if case, comments < 10.


 51%|█████▏    | 97/189 [02:34<01:43,  1.12s/it]

entered first if case, comments < 10.


 62%|██████▏   | 118/189 [03:46<04:53,  4.14s/it]

In [4]:
import pandas as pd
# Load a results CSV saved by an earlier run. Its file name follows the pattern built
# by the commented-out legacy mass_calculate_feature_scores (credibility subfeatures
# plus simple feature scores), not the one built by mass_calculate_simple_feature_scores.
path = '../misc_dataframes_with_test_results/credibility_subfeatures_and_simple_feature_scores_for_jsons_in_.._scraping_representative_subreddits_for_varied_percentiles_calculated_at_2025-05-01_21-32-52.csv'
df = pd.read_csv(path)

In [6]:
# Total of the total_word_count column over all rows of df.
# NOTE(review): presumably a credibility subfeature holding per-post word counts - confirm.
sum(df['total_word_count'])

625263