# Summary

Test the Reddit data collection class


## Import libraries

In [71]:
import os, sys
import pandas as pd

# Environmental variables
import dotenv

import asyncio

# Reddit class
reddit_class_path = '../../../code/reddit'
sys.path.insert(0, reddit_class_path)
import reddit_data_fetcher as rdf




## Check the logic for cleaning keywords

In [45]:
import re
def clean_keyword_text(text):
    '''
    Method to clean Reddit and keyword texts in either the original post
    or associated comments.

    '''

    # Remove these words from search terms
    stop_words = ['stop', 'the', 'to', 'and', 'a', 'in', 'it',
                  'is', 'I', 'i', 'that', 'had', 'on', 'for', 'were', 'was',
                  'through', 'of', 'way', 'end', 'our', 'place', 'home',
                  'support', 'city', 'visitor', 'women', 'men', 'need', 'idea',
                  'north', 'south', 'east', 'west', 'ready', 'save', 'salt', 'win',
                  'lose', 'loss', 'family', 'working', 'hope', 'love', 'house']

#     # Split words with some characters
# #     pat = r"\/"
#     pat = "\\/"
#     text = re.sub(pat, ' ', text)

    # Remove unwanted characters

    pat = r"\\n|r/|[^a-zA-Z0-9 ]"
    text = re.sub(pat, ' ', text)

    # cohvert to lowercase
    text = text.strip().lower()

    # Remove stop words
    pat = "|".join(["\\b{}\\b".format(w) for w in stop_words])
    text = re.sub(pat, ' ', text)

    # remove extra spaces
    pat = r"\s{2,}"
    text = re.sub(pat, ' ', text)

    # Remove leading and trailing spaces
    text = text.strip()

    return text

In [46]:
keywords_file_path = '../../../data/keywords'
keywords_files = ['ac.csv']

keywords = []

# read each file with keywords
for kwf in keywords_files:

    # Read the files into a dataframe
    df_kw = pd.read_csv(os.path.join(keywords_file_path, kwf), index_col=0)

    # Get the cleaned keywords
    keywords.extend([clean_keyword_text(k) for k in df_kw[df_kw.columns[0]].tolist()])

# Ensure keywords are strings and remove any duplicates
search_terms = [k for k in set(keywords) if isinstance(k, str) and len(k) > 1]
        



In [47]:
search_terms

['esquimalt nation office',
 'greater victoria placemaking network',
 'first metropolitan united church',
 'cool aid society',
 'burnside gorge community association',
 'society st vincent de paul',
 'greater yyj politics',
 'society',
 'peers victoria resources society',
 'svdp',
 'rentsmart',
 'achievement foundation victoria',
 'victoria harbour cats',
 'victoria tenant action group',
 'greater victoria coalition homelessness',
 'avi',
 'tenant action group victoria',
 'quadra village community centre',
 'red cedar caf',
 'workbc centre victoria',
 'vancouver island mental health society',
 'aehcr',
 'district saanich',
 'sanctuary youth centre',
 'greater victoria acting together',
 'spring exchange',
 'oak bay united church',
 'victoria aids resource community services',
 'mental health society greater victoria',
 'james bay community project',
 'existence project',
 'victoria s transition',
 'victoria court youth justice committee',
 'victoria youth council',
 'upstream prevent y

In [48]:
test_text = "addicted/addict"
clean_keyword_text(test_text)

test_text = "addicted/addict$#pppp-newline/r/"
clean_keyword_text(test_text)


'addicted addict pppp newline'

## Test the Reddit crawler class

In [67]:
dotenv.load_dotenv('../../../data/environment/.env')

True

In [75]:
# Attributes to modify using the **kwargs parameter
posts_file_path = "../../data_tests/reddit_tests/posts"
logs_file_path = "../../data_tests/reddit_tests/logs"
keywords_file_path = '../../../data/keywords'
limit_num=10


# Initialize Reddit object
data_fetcher = rdf.GVCEHReddit(client_id=os.environ.get("REDDIT_CLIENT_ID"),
                              client_secret=os.environ.get("REDDIT_CLIENT_SECRET"),
                              user_agent=os.environ.get("REDDIT_USER_AGENT"),
                              posts_file_path=posts_file_path,
                              logs_file_path=logs_file_path,
                              keywords_file_path=keywords_file_path,
                              limit_num=limit_num)

#Define subreddits to search
subreddit_names = ['britishcolumbia']

await data_fetcher.fetch_data(subreddit_names=subreddit_names)


time: 2024-02-15 14:02:54: event: start fetch: subreddit_name: britishcolumbia


  raise AttributeError(


time: 2024-02-15 14:04:09: event: fetch complete
