In [1]:
import os
import pandas as pd
import json
import re
import nltk
nltk.download('punkt_tab')
from datetime import datetime
from feature_scripts.credibility import get_credibility_score
from feature_scripts.defection import get_defection_score

from feature_scripts.coalition import get_coalition_score
from feature_scripts.onesidedness import get_onesidedness_score

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def gather_meta_data(directory):
    """
    Collects metadata for every ``.json`` file found beneath ``directory``.

    The tree is expected to be laid out as
    ``<directory>/<subreddit>/date_<MM-DD-YYYY>_time_<HH-MM>/<post_id>.json``.

    Args:
        directory: Root folder to walk. It is treated as a plain path, so a
            trailing separator or characters that are special in regular
            expressions are handled correctly.

    Returns:
        A list with one dict per JSON file, holding the keys ``'path'``,
        ``'subreddit'``, ``'download_date'`` (a ``datetime``) and
        ``'post_id'`` (the file name without its ``.json`` suffix). Folders
        and files are visited in sorted order so the result is reproducible.

    Raises:
        ValueError: If a JSON file sits less than two folder levels below
            ``directory``, or if its folder name does not match the
            ``date_%m-%d-%Y_time_%H-%M`` pattern.
    """
    json_suffix = '.json'
    dictionaries = []
    for root, dirs, files in os.walk(directory):
        # Sorting in place makes os.walk descend in a deterministic order.
        dirs.sort()
        json_files = sorted(file for file in files if file.endswith(json_suffix))
        if not json_files:
            continue

        # Work on the path relative to `directory` instead of building a regex
        # out of it: that is immune to regex metacharacters, trailing
        # separators and platform-specific path separators.
        relative_parts = os.path.relpath(root, directory).split(os.sep)
        if len(relative_parts) < 2:
            raise ValueError(
                f"JSON files found in {root!r}, which is not of the form "
                f"'<directory>/<subreddit>/<date folder>'"
            )
        # Everything above the innermost folder is the subreddit part; the
        # innermost folder carries the download timestamp.
        subreddit = '/'.join(relative_parts[:-1])
        date_string = relative_parts[-1]
        date = datetime.strptime(date_string, 'date_%m-%d-%Y_time_%H-%M')

        for file in json_files:
            dictionaries.append({
                'path': os.path.join(root, file),
                'subreddit': subreddit,
                'download_date': date,
                'post_id': file[:-len(json_suffix)],
            })
    return dictionaries

def calculate_feature_scores(comment_forest):
    """
    Computes all four feature scores for a single comment forest.

    Args:
        comment_forest: The object handed unchanged to each scoring function.

    Returns:
        A dict mapping "coalition", "onesidedness", "credibility" and
        "defection" (in that order) to the value returned by the
        corresponding ``get_*_score`` function.
    """
    # Insertion order fixes both the evaluation order and the key order.
    scorers = {
        "coalition": get_coalition_score,
        "onesidedness": get_onesidedness_score,
        "credibility": get_credibility_score,
        "defection": get_defection_score,
    }
    return {
        feature_name: scorer(comment_forest)
        for feature_name, scorer in scorers.items()
    }

In [3]:
def mass_calculate_feature_scores(directory_path):
    """
    Scores every JSON file beneath ``directory_path`` and saves the results.

    For each file reported by ``gather_meta_data`` the JSON content is loaded
    and passed to ``calculate_feature_scores``; the file's metadata and its
    scores are merged into one row. All rows are written to a CSV file named
    ``feature_scores_for_jsons_in_<directory>_calculated_at_<timestamp>.csv``
    inside the ``feature_scores_dataframes`` folder (relative to the current
    working directory), which is created if it does not exist yet.

    Args:
        directory_path: Root folder to scan for JSON files.

    Returns:
        None. The only result is the CSV file written to disk.
    """
    # Taken before any work starts so the file name reflects the start time.
    calculation_start_time = datetime.now()

    df_rows = []
    for json_dictionary in gather_meta_data(directory_path):
        # Read as UTF-8 explicitly rather than with the platform default
        # encoding (assumes the scraped files are UTF-8 — TODO confirm).
        with open(json_dictionary['path'], encoding='utf-8') as file:
            comment_forest = json.load(file)
        # Scoring happens after the file is closed; no handle is held open
        # while the feature scripts run.
        feature_scores_dict = calculate_feature_scores(comment_forest)
        df_rows.append({**json_dictionary, **feature_scores_dict})

    timestamp_str = calculation_start_time.strftime('%Y-%m-%d_%H-%M-%S')

    # Replace path separators so the directory can be embedded in a file name.
    safe_dir_name = re.sub(r'[\\/]', '_', directory_path)

    filename = f'feature_scores_for_jsons_in_{safe_dir_name}_calculated_at_{timestamp_str}.csv'
    target_directory_name = 'feature_scores_dataframes'
    # to_csv does not create missing parent folders, so make sure it exists.
    os.makedirs(target_directory_name, exist_ok=True)
    target_path = os.path.join(target_directory_name, filename)
    df = pd.DataFrame(df_rows)
    df.to_csv(target_path, index=False)

In [4]:
# Score every JSON file found under the scraped top-subreddits folder. The
# call returns nothing; its result is a CSV written to disk.
# NOTE(review): the output recorded for this cell is an NLTK LookupError for
# the 'punkt_tab' resource — confirm that nltk.download('punkt_tab') succeeded
# in this environment before re-running.
directory_path = 'scraping/top_subreddits_data'
mass_calculate_feature_scores(directory_path)

LookupError: 
**********************************************************************
  Resource punkt_tab not found.
  Please use the NLTK Downloader to obtain the resource:

  >>> import nltk
  >>> nltk.download('punkt_tab')

  For more information see: https://www.nltk.org/data.html

  Attempted to load tokenizers/punkt_tab/english/

  Searched in:
    - '/Users/sanapandey/nltk_data'
    - '/Users/sanapandey/miniconda3/envs/constructive-ranking/nltk_data'
    - '/Users/sanapandey/miniconda3/envs/constructive-ranking/share/nltk_data'
    - '/Users/sanapandey/miniconda3/envs/constructive-ranking/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************
