In [1]:
import pandas as pd
from datetime import datetime, timedelta
import chardet

# Display name -> homepage URL for every news outlet being tracked.
# The keys become the per-outlet count columns of the output table, and the
# URLs (with the trailing slash removed) are substring-matched against each
# CSV row's 'Parent URL' to attribute the row to an outlet.
news_companies = {
    "Inquirer.net": "https://www.inquirer.net/",
    "Manila Bulletin": "https://mb.com.ph/",
    "The Asian Journal USA": "https://asianjournal.com/",
    "The Manila Times": "https://www.manilatimes.net/",
    "Business World": "https://www.bworldonline.com/",
    "Eagle News": "https://www.eaglenews.ph/",
    "Metro Cebu News": "https://metrocebu.news/",
    "Tempo": "https://tempo.com.ph/",
    "Abante Tonite": "https://tonite.abante.com.ph/",
    "Philippine News Agency": "https://www.pna.gov.ph/",
    "InterAksyon": "https://interaksyon.philstar.com/",
    "Business Mirror": "https://businessmirror.com.ph/",
    "The Summit Express": "https://www.thesummitexpress.com/",
    "Our Daily News Online": "https://ourdailynewsonline.com/",
    "Current PH": "https://currentph.com/",
    "SunStar Philippines": "https://www.sunstar.com.ph/",
    "Rappler": "https://www.rappler.com/",
    "The Bohol Chronicle": "https://www.boholchronicle.com.ph/",
    "Baguio Midland Courier": "https://www.baguiomidlandcourier.com.ph/",
    "GMA News Online": "https://www.gmanetwork.com/news/",
    "Cebu Daily News": "https://cebudailynews.inquirer.net/",
    "ABS-CBN News": "https://news.abs-cbn.com/",
    "Philstar.com": "https://www.philstar.com/",
    "Manila Standard": "https://manilastandard.net/",
    "Daily Tribune": "https://tribune.net.ph/",
    "Davao Today": "https://davaotoday.com/",
    "Sunday Punch": "https://punch.dagupan.com/",
    "Visayan Daily Star": "https://visayandailystar.com/",
    "PTV News": "https://ptvnews.ph/",
    "Mindanao Times": "https://mindanaotimes.com.ph/",
    "PhilNews.XYZ": "https://philnews.xyz/",
    "Northern Dispatch": "https://nordis.net/"
}

# Sniff the character encoding from the file's raw bytes so pandas can
# decode the text with the same codec.
with open('BusinessJan1-14.csv', 'rb') as f:
    result = chardet.detect(f.read())
encoding = result['encoding']

# Load the tab-separated export; malformed lines are skipped instead of
# aborting the whole read.
df = pd.read_csv(
    'BusinessJan1-14.csv',
    sep='\t',
    encoding=encoding,
    on_bad_lines='skip',
)

def parse_date(row):
    """Parse the leading timestamp of a tab-separated record.

    Only the text before the first tab is considered. It is stripped of
    surrounding whitespace and matched against ``%d-%b-%Y %I:%M%p``
    (for example ``01-Jan-2024 09:30AM``).

    Args:
        row: Raw cell value; anything other than a string is rejected.

    Returns:
        The parsed ``datetime``, or ``None`` when ``row`` is not a string
        or its leading field does not match the expected format.
    """
    if not isinstance(row, str):
        return None

    # Everything before the first tab; the whole string when there is none.
    leading_field, _, _ = row.partition('\t')
    try:
        return datetime.strptime(leading_field.strip(), '%d-%b-%Y %I:%M%p')
    except ValueError:
        return None

# Parse the timestamp held in the first column of every CSV row; rows whose
# first column cannot be parsed get a null 'Date'.
df['Date'] = df.iloc[:, 0].apply(parse_date)

# Reporting window (inclusive): one output row per calendar day.
start_date = datetime(2024, 1, 1)
end_date = datetime(2024, 4, 30)
date_range = pd.date_range(start_date, end_date)

# Output layout: month name, day of month, then one counter per news outlet.
columns = ['Month', 'Day'] + list(news_companies.keys())

# Start every counter at zero.
data = [
    [date.strftime('%B'), date.day] + [0] * len(news_companies)
    for date in date_range
]
new_df = pd.DataFrame(data, columns=columns)

# Drop trailing slashes so that a homepage URL also matches deeper article URLs.
normalized_news_companies = {k: v.rstrip('/') for k, v in news_companies.items()}

# Map each calendar date in the window to its row label in new_df (the default
# RangeIndex follows date_range order). Keying on the full date, not just
# (month, day), keeps records from other years out of the 2024 counts and
# replaces a full-table boolean mask per CSV row with a dict lookup.
row_for_day = {stamp.date(): label for label, stamp in enumerate(date_range)}

# Tally one hit per (day, outlet) for every CSV row whose 'Parent URL' contains
# the outlet's normalized homepage URL.
for parsed, url in zip(df['Date'], df['Parent URL']):
    if pd.isnull(parsed) or pd.isnull(url):
        continue  # unparseable date or missing URL
    row_label = row_for_day.get(parsed.date())
    if row_label is None:
        continue  # dated outside the reporting window
    for source, source_url in normalized_news_companies.items():
        if source_url in url:
            new_df.at[row_label, source] += 1

# Export the counts to a CSV file
new_df.to_csv('news_counts.csv', index=False)

# Print the counts for a quick visual check
print(new_df)


       Month  Day  Inquirer.net  Manila Bulletin  The Asian Journal USA  \
0    January    1             0                0                      0   
1    January    2             0                0                      0   
2    January    3             0                0                      0   
3    January    4             0                0                      0   
4    January    5             0                0                      0   
..       ...  ...           ...              ...                    ...   
116    April   26             0                0                      0   
117    April   27             0                0                      0   
118    April   28             0                0                      0   
119    April   29             0                0                      0   
120    April   30             0                0                      0   

     The Manila Times  Business World  Eagle News  Metro Cebu News  Tempo  \
0                   0 