In [20]:
#!pip install python-dotenv

import os
from dotenv import load_dotenv
import praw

import json

import pandas as pd

In [21]:
# Pull the Reddit API credentials from the local .env file into the environment
load_dotenv()

# Build the PRAW client from the REDDIT_* environment variables
reddit_credentials = {
    "client_id": os.getenv("REDDIT_CLIENT_ID"),
    "client_secret": os.getenv("REDDIT_CLIENT_SECRET"),
    "user_agent": os.getenv("REDDIT_USER_AGENT"),
}
reddit = praw.Reddit(**reddit_credentials)

# Thread to scrape (alternative thread kept for reference):
#url = 'https://www.reddit.com/r/ireland/comments/w5lhur/we_need_to_get_out_of_animal_farming_all_together/'
url = 'https://www.reddit.com/r/ireland/comments/1aghrfn/irish_farmers_protest_in_solidarity_with_eu/'
submission = reddit.submission(url=url)

# Drop the "load more comments" placeholders (limit=0 removes them without
# fetching) so that the flattened list below contains only real comments.
submission.comments.replace_more(limit=0)

# Gather one {'author', 'statement'} record per comment in the thread
comments = []
for comment in submission.comments.list():
    if comment.author:
        author = comment.author.name
    else:
        # The account behind this comment no longer exists
        author = "Deleted"
    comments.append(dict(author=author, statement=comment.body))


In [22]:
# Write the comments to a JSON file.
# The encoding is pinned to UTF-8 instead of relying on the platform default,
# and ensure_ascii=False keeps non-ASCII characters readable in the file,
# which is the same way the forum data is written out in this notebook.
with open('extracted_redditfarming.json', 'w', encoding='utf-8') as f:
    json.dump(comments, f, ensure_ascii=False, indent=4)

In [23]:
import requests
from bs4 import BeautifulSoup
import json
import re

# Base URL of the forum thread
base_url = "https://www.forum4farming.com/forum/index.php?threads/cap-2023-2027.20587"

# Number of thread pages to scrape, and the per-request timeout in seconds
LAST_PAGE = 6
REQUEST_TIMEOUT_S = 30

# Regular expression to match "X said:" and capture X and the statement.
# Because the pattern is applied with re.DOTALL and the second group is a
# greedy ".*", a match always runs to the end of the post text, so each post
# yields at most one (author, statement) pair.
# NOTE(review): "\w+" only matches usernames made of word characters, and the
# captured statement keeps any "Click to expand..." text found in the post
# (visible in the saved data) - confirm whether either should be handled.
pattern = r"(\w+) said:\s*(.*)"
quote_re = re.compile(pattern, re.DOTALL)  # compiled once, reused for every post

# List to store the filtered data
extracted_data = []

# Loop through the pages of the thread
for page_num in range(1, LAST_PAGE + 1):
    # The first page lives at the thread URL itself; later pages add /page-N
    url = f"{base_url}/page-{page_num}" if page_num > 1 else base_url + "/"

    # Fetch the page content; the timeout stops a stalled connection from
    # hanging the notebook indefinitely
    response = requests.get(url, timeout=REQUEST_TIMEOUT_S)
    if response.status_code != 200:
        # Keep the best-effort behaviour (skip the page) but say so, so that
        # a short dataset can be traced back to the page that failed
        print(f"Skipping page {page_num}: HTTP {response.status_code} for {url}")
        continue

    # Parse the HTML content and pick out the individual posts
    soup = BeautifulSoup(response.content, 'html.parser')
    posts = soup.find_all('article', class_='message--post')
    for post in posts:
        body = post.find('div', class_='bbWrapper')
        if body is None:
            # No message body in this post: nothing to extract
            continue
        text = body.text

        # Find all matches of the pattern
        matches = quote_re.findall(text)
        for match in matches:
            name, statement = match
            # Clean the statement of newlines and extra spaces
            statement = re.sub(r'\s+', ' ', statement.strip())
            extracted_data.append({"author": name, "statement": statement})

# Save the extracted data to a JSON file
output_file_path = 'extracted_forum4farming.json'
with open(output_file_path, 'w', encoding='utf-8') as file:
    json.dump(extracted_data, file, ensure_ascii=False, indent=4)

In [24]:
# Read the saved Reddit comments back in as a DataFrame
df_reddit = pd.read_json('extracted_redditfarming.json')

# Preview the top of the frame to confirm the load worked
reddit_preview = df_reddit.head()
print(reddit_preview)

# Report how many comments were loaded
reddit_row_count = df_reddit.shape[0]
print(f"Total rows on reddit DataFrame: {reddit_row_count}")

              author                                          statement
0       Franz_Werfel  the famers protests on the continent were abou...
1         lamahorses      Over 30% of the European budget is on CAP etc
2  ConnolysMoustache  What are they even protesting?\n\nThe IFA repr...
3            bintags                     These people are brainwashed. 
4  gofuckyoureself21  Stop bitching at each other and get out and su...
Total rows on reddit DataFrame: 45


In [25]:
# Read the saved forum4farming statements back in as a DataFrame
df_f4f = pd.read_json('extracted_forum4farming.json')

# Preview the top of the frame to confirm the load worked
f4f_preview = df_f4f.head()
print(f4f_preview)

# Report how many statements were loaded
f4f_row_count = df_f4f.shape[0]
print(f"Total rows on f4f DataFrame: {f4f_row_count}")


       author                                          statement
0      eire23  Surprised there wasn't a thread on this with s...
1  muckymanor  I see that female farmers are going to get 60%...
2         boy  Discrimination Click to expand... 60% for youn...
3    Burdizzo  60% for young farmers - Ageism Click to expand...
4  muckymanor  I see that female farmers are going to get 60%...
Total rows on f4f DataFrame: 85


In [29]:
# Stack the Reddit rows on top of the forum rows under a fresh 0..n-1 index
source_frames = [df_reddit, df_f4f]
df_combined = pd.concat(objs=source_frames, ignore_index=True)

# Show the first 47 rows of the combined frame
combined_preview = df_combined.head(47)
print(combined_preview)

# Report the total number of rows after combining
combined_row_count = df_combined.shape[0]
print(f"Total rows in combined DataFrames: {combined_row_count}")


                  author                                          statement
0           Franz_Werfel  the famers protests on the continent were abou...
1             lamahorses      Over 30% of the European budget is on CAP etc
2      ConnolysMoustache  What are they even protesting?\n\nThe IFA repr...
3                bintags                     These people are brainwashed. 
4      gofuckyoureself21  Stop bitching at each other and get out and su...
5         No-Category-38       People with jobs are too busy for this shit.
6               af_lt274  >massive amount of cognitive dissonance for ir...
7   DiscussionUnusual466  Well now , just to break in down quickly for y...
8      TruthSeeker101110  Don't they already get [€1.2 billion in subsid...
9                bintags  The cognitive dissonance is all that’s keeping...
10    Healthy-Travel3105  I thought their issue was having to compete wi...
11             JRR_STARK  Alright, you know more about agriculture then ...
12          

In [31]:
# Destination for the combined dataset in JSON form
output_file_path = 'extracted_combined.json'

# DataFrame.to_json writes one JSON object per row (orient='records'),
# pretty-printed, with non-ASCII characters left unescaped
df_combined.to_json(
    path_or_buf=output_file_path,
    orient='records',
    indent=4,
    force_ascii=False,
)

In [32]:
# Destination for the combined dataset in CSV form
output_file_path = 'extracted_combined.csv'

# Write the rows as UTF-8 text, leaving out the DataFrame index column
df_combined.to_csv(
    path_or_buf=output_file_path,
    encoding='utf-8',
    index=False,
)