In [18]:
import csv
import os
import pandas as pd
from datetime import datetime, timedelta
import chardet

news_companies = {
    "Inquirer.net": "https://www.inquirer.net/",
    "Manila Bulletin": "https://mb.com.ph/",
    "The Asian Journal USA": "https://asianjournal.com/",
    "The Manila Times": "https://www.manilatimes.net/",
    "Business World": "https://www.bworldonline.com/",
    "Eagle News": "https://www.eaglenews.ph/",
    "Metro Cebu News": "https://metrocebu.news/",
    "Tempo": "https://tempo.com.ph/",
    "Abante Tonite": "https://tonite.abante.com.ph/",
    "Philippine News Agency": "https://www.pna.gov.ph/",
    "InterAksyon": "https://interaksyon.philstar.com/",
    "Business Mirror": "https://businessmirror.com.ph/",
    "The Summit Express": "https://www.thesummitexpress.com/",
    "Our Daily News Online": "https://ourdailynewsonline.com/",
    "Current PH": "https://currentph.com/",
    "SunStar Philippines": "https://www.sunstar.com.ph/",
    "Rappler": "https://www.rappler.com/",
    "The Bohol Chronicle": "https://www.boholchronicle.com.ph/",
    "Baguio Midland Courier": "https://www.baguiomidlandcourier.com.ph/",
    "GMA News Online": "https://www.gmanetwork.com/news/",
    "Cebu Daily News": "https://cebudailynews.inquirer.net/",
    "ABS-CBN News": "https://news.abs-cbn.com/",
    "Philstar.com": "https://www.philstar.com/",
    "Manila Standard": "https://manilastandard.net/",
    "Daily Tribune": "https://tribune.net.ph/",
    "Davao Today": "https://davaotoday.com/",
    "Sunday Punch": "https://punch.dagupan.com/",
    "Visayan Daily Star": "https://visayandailystar.com/",
    "PTV News": "https://ptvnews.ph/",
    "Mindanao Times": "https://mindanaotimes.com.ph/",
    "PhilNews.XYZ": "https://philnews.xyz/",
    "Northern Dispatch": "https://nordis.net/"
}

news_topics = [
    "inflation",
    "economy",
    "business",
    "technology",
    "health",
    "environment",
    "welfare",
    "politics",
    "foreign_affairs"
]

raw_folder = {
  'business' : ['Business1.csv', 'Business2.csv', 'Business3.csv', 'Business5.csv', 'Business6.csv', 'Business7.csv', 'Business8.csv', ],
  'economy' : ['Economy1.csv','Economy2.csv'],
  'environment' : ['Environment1.csv','Environment2.csv','Environment3.csv','Environment4.csv',],
  'foreign_affairs' : ['ForeignAffairs1.csv'],
  'health': ['Health1.csv','Health2.csv','Health3.csv','Health4.csv',],
  'inflation':['Inflation1.csv'],
  'politics':['Politics1.csv','Politics2.csv'],
  'technology':['Technology1.csv','Technology2.csv','Technology3.csv','Technology4.csv','Technology5.csv',],
  'welfare':['Welfare1.csv']
}

filtered_folder = {
  'business':'business_tally.csv',
  'economy':'economy_tally.csv',
  'environment' :'environment_tally.csv',
  'foreign_affairs' :'foreign_affairs_tally.csv',
  'health':'health_tally.csv',
  'inflation':'inflation_tally.csv',
  'politics':'politics_tally.csv',
  'technology':'technology_tally.csv',
  'welfare':'welfare_tally.csv'
}

In [2]:
# Creates an empty dataset with columns month, day and all the news sources
def empty_dataset(topic):
    # Define the header columns
    header_columns = ["Month", "Day"] + list(news_companies.keys())

    # Specify the folder and filename
    folder_name = "filtered_datasets"
    filename = os.path.join(folder_name, topic + "_tally.csv")

    # Ensure the folder exists
    os.makedirs(folder_name, exist_ok=True)

    # Generate the date range from January 1 to April 30
    start_date = datetime(2024, 1, 1)
    end_date = datetime(2024, 4, 30)
    date_range = [start_date + timedelta(days=x) for x in range((end_date - start_date).days + 1)]

    # Prepare the rows with preset values
    rows = []
    for date in date_range:
        month = date.strftime("%B")  # Full month name
        day = date.day
        row = [month, day] + [0] * len(news_companies)
        rows.append(row)

    # Open the file in write mode
    with open(filename, 'w', newline='') as csvfile:
        # Create a csv writer object
        csvwriter = csv.writer(csvfile)

        # Write the header row
        csvwriter.writerow(header_columns)

        # Write the rows with preset values
        csvwriter.writerows(rows)
    print(f"Finished creating '{topic}' dataset as '{topic}_tally.csv'.")

# Creates a dataset per topic included in news_topics
def create_empty_datasets():
    for topic in news_topics:
        print(f"Creating final dataset for {topic}.")
        empty_dataset(topic)


In [3]:
# Function that increments the cell specified by the dataset, month, day and source
def record(filename, month, day, header):
    df = pd.read_csv(filename)
    # Find the row that matches the month and day
    row_index = df[(df['Month'] == month) & (df['Day'] == day)].index

    if not row_index.empty:
        # Increment the value in the specified column
        df.at[row_index[0], header] += 1
    else:
        print(f"No matching row found for {month} {day}")

    df.to_csv(filename, index=False)

    print(f"'{filename}''{day}''{month}''{header}' recorded")

# Sample call
# record("health_tally.csv","January", 1, "Inquirer.net")

In [4]:
def is_url_from_list(url):
    # Ensure url is a string to avoid TypeError
    if not isinstance(url, str):
        return False
    for key, value in news_companies.items():
        if value in url:
            return key
    return False

# Sample calls
url = "https://www.inquiresr.net/some-article/d"
result = is_url_from_list(url)
print(result)  # Output: Inquirer.net


url = "https://www.inquirer.net/some-article/d"
result = is_url_from_list(url)
print(result)  # Output: Flase

False
Inquirer.net


In [5]:
# Function to parse the date and extract month and day
def parse_date(row):
    if isinstance(row, str):  # Check if the input is a string
        try:
            date_str = row.split('\t')[0]
            date_time_str = date_str.strip()
            date_obj = datetime.strptime(date_time_str, '%d-%b-%Y %I:%M%p')
            return date_obj
        except ValueError:
            return None  # Return None for invalid dates
    else:
        return None  # Return None for non-string inputs

# Function to filter each row in each dataset provided the filename of the raw dataset, and the filename of the topic dataset
def raw_data_processing(filename, topicDataset):
    print(f"Filtering raw dataset: '{filename}'.")
    # Dictionary to map month numbers to month names
    month_dict = {
        1: "January", 2: "February", 3: "March", 4: "April",
        5: "May", 6: "June", 7: "July", 8: "August",
        9: "September", 10: "October", 11: "November", 12: "December"
    }

    input_filename = os.path.join("raw_datasets", filename)
    output_filename = os.path.join("filtered_datasets", topicDataset)

    # Detect the encoding of the CSV file
    with open(input_filename, 'rb') as f:
        result = chardet.detect(f.read())
        encoding = result['encoding']

    # Read the CSV file with error handling and correct delimiter
    df = pd.read_csv(input_filename, encoding=encoding, sep='\t', on_bad_lines='skip')

    # Iterate through each row in the DataFrame
    for index, row in df.iterrows():
        date_str = row['Date']
        url = row['URL']

        # Parse the date to get month and day
        date_obj = parse_date(date_str)
        if date_obj:
            month = month_dict[date_obj.month]
            day = date_obj.day
            # Print or store the values as needed
            header = is_url_from_list(url)
            if header != False:
                record(output_filename, month, day, header)
        else:
            print(f"Invalid date format in row {index}")
    print(f"Finished filtering raw dataset: '{filename}'.")

In [6]:
# Main code block for execution
create_empty_datasets()

Creating final dataset for inflation.
Finished creating 'inflation' dataset as 'inflation_tally.csv'.
Creating final dataset for economy.
Finished creating 'economy' dataset as 'economy_tally.csv'.
Creating final dataset for business.
Finished creating 'business' dataset as 'business_tally.csv'.
Creating final dataset for technology.
Finished creating 'technology' dataset as 'technology_tally.csv'.
Creating final dataset for health.
Finished creating 'health' dataset as 'health_tally.csv'.
Creating final dataset for environment.
Finished creating 'environment' dataset as 'environment_tally.csv'.
Creating final dataset for welfare.
Finished creating 'welfare' dataset as 'welfare_tally.csv'.
Creating final dataset for politics.
Finished creating 'politics' dataset as 'politics_tally.csv'.
Creating final dataset for foreign_affairs.
Finished creating 'foreign_affairs' dataset as 'foreign_affairs_tally.csv'.


In [19]:
def collect_data():
  # Loop through each topic
  for topic in news_topics:
      # Loop through each file in the current topic's folder
      for filename in raw_folder[topic]:
          # Construct the output filename using the filtered_folder dictionary
          output_filename = filtered_folder[topic]
          # Call the processing function with the constructed filenames
          raw_data_processing(filename, output_filename)
  print(f"Data collection completed.")

collect_data()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
'filtered_datasets/politics_tally.csv''28''March''The Manila Times' recorded
'filtered_datasets/politics_tally.csv''28''March''The Manila Times' recorded
'filtered_datasets/politics_tally.csv''28''March''The Manila Times' recorded
'filtered_datasets/politics_tally.csv''28''March''The Manila Times' recorded
'filtered_datasets/politics_tally.csv''28''March''The Manila Times' recorded
'filtered_datasets/politics_tally.csv''28''March''The Manila Times' recorded
'filtered_datasets/politics_tally.csv''28''March''The Manila Times' recorded
'filtered_datasets/politics_tally.csv''28''March''The Manila Times' recorded
'filtered_datasets/politics_tally.csv''28''March''The Manila Times' recorded
'filtered_datasets/politics_tally.csv''28''March''The Manila Times' recorded
'filtered_datasets/politics_tally.csv''28''March''The Manila Times' recorded
'filtered_datasets/politics_tally.csv''28''March''The Manila Times' recorded
'filtered_d