<a href="https://colab.research.google.com/github/randawilly/sg-cb23db64/blob/main/william3_data_scrape_randevteam_reddit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install asyncpraw
!pip install firebase-admin
!pip install nest_asyncio
!pip install ipdb



Import libraries

In [None]:
import asyncpraw
import json
import asyncio
import firebase_admin
import nest_asyncio
from firebase_admin import credentials, firestore
from datetime import datetime
from firebase_admin import db
import requests
from google.colab import userdata
from google.colab import drive
import ipdb

# Allow nested event loops
nest_asyncio.apply()

Initialize the Firebase application

In [None]:
# Initialize firebase realtime database
# Mount Google Drive first: the service-account key file is read from there.
drive.mount('/content/drive')
certificate_path = '/content/drive/My Drive/Colab_Notebooks/redditproblemdatascrape-firebase-adminsdk-4f29s-613e1106c3.json'
database_url = userdata.get('database_url')

# Re-running this cell must not call initialize_app() twice for the default
# app. get_app() is the public way to probe for an existing app (it raises
# ValueError when none has been created yet), so the private
# firebase_admin._apps registry does not need to be inspected.
try:
    default_app = firebase_admin.get_app()
except ValueError:
    # No default app yet: create it from the service-account certificate.
    cred = credentials.Certificate(certificate_path)
    firebase_admin.initialize_app(cred, {
        'databaseURL': database_url
    })
    print("firebase connexion")
    ref = db.reference()
else:
    print("Firebase app is already initialized.")
    # Bind the root reference to the existing default app instance.
    ref = db.reference(app=default_app)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Firebase app is already initialized.


Create Reddit Instance

In [None]:
# Create reddit instance
reddit = asyncpraw.Reddit(
    client_id = userdata.get('client_id'),
    client_secret = userdata.get('client_secret'),
    password = userdata.get('reddit_password'),
    user_agent = userdata.get('user_agent'),
    username = userdata.get('reddit_username'),
    check_for_async=False
)
#print(f"Connection successful! Logged in as: {reddit.user.me()}")

# Create Subreddit instance
surbreddit_name = userdata.get('subreddit_name')
subreddit = await reddit.subreddit(surbreddit_name)

Get the ID of the last modified post

In [None]:
def get_last_modified_post_id():
    """Return the key of the record with the highest 'modified_timestamp'.

    Queries the database root ``ref`` ordered by the 'modified_timestamp'
    child and limited to the last entry, then returns the first key of the
    result.

    Returns:
        The key of that record, or None when the query returns nothing.
    """
    print("get_last_modified_post_id function")
    latest = ref.order_by_child('modified_timestamp').limit_to_last(1).get()

    # Guard clause: nothing stored yet (or an empty result).
    if not latest:
        print("No posts found. get_last_modified_post_id function")
        return None

    print("get_last_modified_post_id function result : ")
    # A truthy query result holds at least one entry; only its first key is needed.
    newest_id = next(iter(latest))
    print(newest_id)
    return newest_id

get_last_modified_post_id()

get_last_modified_post_id function
get_last_modified_post_id function result : 
1fr5q6s


'1fr5q6s'

Save a post to Firebase Realtime Database

In [None]:
def save_to_firebase(post):
    """Create or update the database record for a single post.

    The record is stored under the root reference ``ref`` using the post's
    ID as its key. Failures while talking to the database are reported with
    ``print`` instead of being raised, so a caller looping over many posts
    keeps going after a failed save.

    Args:
        post: dict describing the post. It must contain the keys "id"
            (used as the record key) and "title" (used in log messages).
    """
    post_id = post["id"]
    post_ref = ref.child(post_id)

    try:
        # The existence check is a network read too, so it lives inside the
        # try block: a failed read must not escape the handlers below.
        # shallow=True avoids downloading the whole stored post (including
        # every comment) when only "does it exist?" is being asked.
        existing_post = post_ref.get(shallow=True)

        if existing_post:
            # If the post exists, update it
            post_ref.update(post)
            print(f"Post '{post['title']}' updated successfully.")
        else:
            # If the post does not exist, add it
            post_ref.set(post)
            print(f"Post '{post['title']}' added successfully.")
    except requests.exceptions.RequestException as e:
        # Handle any request-related errors here
        print(f"Failed to save post '{post['title']}': {e}")
    except Exception as e:
        # Catch all other exceptions to prevent breaking the loop
        print(f"An error occurred while saving post '{post['title']}': {e}")



Function to convert a UTC timestamp to a formatted UTC datetime string

In [None]:
def convert_utc_to_datetime(utc_timestamp):
    """Format a POSIX timestamp as a 'YYYY-MM-DD HH:MM:SS' string in UTC.

    Args:
        utc_timestamp: seconds since the Unix epoch (int or float).

    Returns:
        The corresponding UTC date and time as a string.
    """
    # Local import: the file only imports the `datetime` class at module level.
    from datetime import timezone

    # datetime.utcfromtimestamp() is deprecated since Python 3.12; building an
    # aware UTC datetime yields the same formatted text.
    moment = datetime.fromtimestamp(utc_timestamp, tz=timezone.utc)
    return moment.strftime('%Y-%m-%d %H:%M:%S')

Method to get replies from a comment

In [None]:
async def get_replies(comment):
    """Collect every reply below ``comment`` into one flat list of dicts.

    Replies are visited depth-first: each direct reply is appended, followed
    immediately by its own descendants.

    Args:
        comment: a comment object exposing ``id`` and an iterable ``replies``.

    Returns:
        A list of dicts with the keys "id", "body", "created", "score",
        "name", "author" and "parent_id". "parent_id" is the ID of the
        comment the reply was found under.
    """
    replies = []
    print("get_replies function start")
    # Iterate synchronously: the replies are already in memory at this point,
    # and `async for` over a comment forest is deprecated in Async PRAW and
    # emits a DeprecationWarning on every call.
    for reply in comment.replies:
        replies.append({
            "id": reply.id,
            "body": reply.body,
            "created": convert_utc_to_datetime(reply.created_utc),
            "score": reply.score,
            "name": reply.name,
            # Deleted accounts have no author object.
            "author": str(reply.author) if reply.author else None,
            "parent_id": comment.id
        })

        # Recursively get replies to this reply
        replies.extend(await get_replies(reply))

    return replies

Method to fetch post comments

In [None]:
async def fetch_comments(submission):
    """Serialize the comments of ``submission`` into a list of dicts.

    Every comment returned by ``submission.comments.list()`` becomes one dict
    holding its own fields plus the flat list produced by ``get_replies`` for
    that comment and the length of that list.

    NOTE(review): ``comments.list()`` presumably returns the flattened
    forest, in which case a nested reply is stored both as its own entry and
    inside the "replies" of each of its ancestors - confirm this is intended.

    Args:
        submission: a submission whose comment forest has been loaded.

    Returns:
        A list of dicts with the keys "id", "body", "created", "author",
        "score", "name", "replies" and "replies_count".
    """
    print("fetch_comments function start")

    async def _describe(item):
        # The replies are resolved first because their count is stored too.
        thread = await get_replies(item)
        return {
            "id": item.id,
            "body": item.body,
            "created": convert_utc_to_datetime(item.created_utc),
            "author": str(item.author) if item.author else None,
            "score": item.score,
            "name": item.name,
            "replies": thread,
            "replies_count": len(thread)
        }

    return [await _describe(item) for item in submission.comments.list()]

Method to get posts from Reddit

In [None]:
async def get_reddit_posts(limit, after=None):
    """Fetch top posts from ``subreddit`` and save each one that has comments.

    Args:
        limit: maximum number of submissions requested from the listing.
        after: optional pagination cursor. Either a bare submission ID
            ("1eyv4up") or a fullname ("t3_1eyv4up") is accepted.

    Each submission with at least one comment is converted to a dict
    (including its serialized comments) and handed to ``save_to_firebase``.
    """
    # Local import: the file only imports the `datetime` class at module level.
    from datetime import timezone

    print("get_reddit_posts function start")

    # Build the listing parameters. The cursor must be a fullname; callers
    # may already pass one, so the "t3_" prefix is added only when missing
    # (blindly prefixing turned "t3_x" into the invalid "t3_t3_x"). When no
    # cursor is given, the parameter is left out rather than sent as None.
    params = {}
    if after:
        after = str(after)
        params['after'] = after if after.startswith('t3_') else f't3_{after}'

    subreddit_randev_data = subreddit.top(limit=limit, params=params)
    async for submission in subreddit_randev_data:
        print("TITLE OF POST FROM REDDIT --------->")
        print(submission.title)

        # Skip comment-less posts before the expensive load()/replace_more()
        # round trips; the listing entry is assumed to carry num_comments,
        # as it does title.
        if submission.num_comments == 0:
            # Skip posts that have no comments
            print("this post do not have comments")
            continue

        await submission.load()
        # limit=None resolves every "load more comments" placeholder.
        await submission.comments.replace_more(limit=None)

        comments = await fetch_comments(submission)
        print("this post have comments")

        # datetime.utcnow() is deprecated since Python 3.12. tzinfo is
        # stripped so the stored string keeps the same naive ISO format as
        # existing records.
        modified_at = datetime.now(timezone.utc).replace(tzinfo=None)

        post = {
            "id": submission.id,
            "title": submission.title,
            "score": submission.score,
            # Deleted accounts have no author object.
            "author": str(submission.author.name) if submission.author else None,
            "content": submission.selftext,
            "created": convert_utc_to_datetime(submission.created_utc),
            "url": submission.url,
            "comments_count": submission.num_comments,
            "name": submission.name,
            "upvote_ratio": submission.upvote_ratio,
            "comments": comments,
            "modified_timestamp": modified_at.isoformat()
        }

        save_to_firebase(post)

Run the main function to fetch data from Reddit and save it to the database

In [None]:
async def main():
    after = None
    last_post_id = get_last_modified_post_id()

    print("main function getting last_post_id start")
    #ipdb.set_trace()

    if last_post_id:
      after = f"t3_{last_post_id}"
      print("main function getting last_post_id : ***")
      print(last_post_id)
      print(after)

    await get_reddit_posts(limit=10000, after=after)

# Run the main function
await main()

get_last_modified_post_id function
get_last_modified_post_id function result : 
1eyv4up
main function getting last_post_id start
main function getting last_post_id : ***
1eyv4up
t3_1eyv4up
get_reddit_posts function start
TITLE OF POST FROM REDDIT --------->
Best Solution for Plantar Fasciitis pain
fetch_comments function start
get_replies function start
get_replies function start
get_replies function start
get_replies function start
get_replies function start
this post have comments


  resolved_replies = await get_replies(comment)
  replies += await get_replies(reply)


Post 'Best Solution for Plantar Fasciitis pain' updated successfully.
TITLE OF POST FROM REDDIT --------->
Is anyone into Nipple pain? Specifically females? If so, how do you do it? And how does it make you feel?
fetch_comments function start
get_replies function start
get_replies function start
get_replies function start
get_replies function start
get_replies function start
get_replies function start
get_replies function start
get_replies function start
get_replies function start
get_replies function start
this post have comments
Post 'Is anyone into Nipple pain? Specifically females? If so, how do you do it? And how does it make you feel?' updated successfully.
TITLE OF POST FROM REDDIT --------->
Begging For Help After Almost 10 Years of Constant Pain. PLEASE!!!
fetch_comments function start
get_replies function start
get_replies function start
get_replies function start
get_replies function start
get_replies function start
get_replies function start
get_replies function start
get_r

Count total posts in firebase realtime database

In [None]:
def count_object_ids_in_firebase():
    """Report how many top-level records exist under ``ref``.

    A shallow read is used so only the keys are fetched, not the records.

    Returns:
        The string "Total objects: <n>" when records exist; otherwise the
        integer 0 (after printing a notice). Note the two different return
        types.
    """
    keys_only = ref.get(shallow=True)

    if not keys_only:
        print("No objects found.")
        return 0

    return f"Total objects: {len(keys_only)}"

count_object_ids_in_firebase()

'Total objects: 321'