In [47]:
import pandas as pd
from datetime import datetime, timedelta
import chardet

# Lookup table of the news outlets being tracked: display name -> home-page URL.
# The keys are used as the 'Source' labels of the per-day count table, and the
# URL values are the strings searched for inside each CSV row's URL.
# NOTE(review): the search is a plain substring test, so a link served from a
# different subdomain or without the 'www.' prefix would presumably not match
# the entry listed here - confirm against the data.
news_companies = {
    "Inquirer.net": "https://www.inquirer.net/",
    "Manila Bulletin": "https://mb.com.ph/",
    "The Asian Journal USA": "https://asianjournal.com/",
    "The Manila Times": "https://www.manilatimes.net/",
    "Business World": "https://www.bworldonline.com/",
    "Eagle News": "https://www.eaglenews.ph/",
    "Metro Cebu News": "https://metrocebu.news/",
    "Tempo": "https://tempo.com.ph/",
    "Abante Tonite": "https://tonite.abante.com.ph/",
    "Philippine News Agency": "https://www.pna.gov.ph/",
    "InterAksyon": "https://interaksyon.philstar.com/",
    "Business Mirror": "https://businessmirror.com.ph/",
    "The Summit Express": "https://www.thesummitexpress.com/",
    "Our Daily News Online": "https://ourdailynewsonline.com/",
    "Current PH": "https://currentph.com/",
    "SunStar Philippines": "https://www.sunstar.com.ph/",
    "Rappler": "https://www.rappler.com/",
    "The Bohol Chronicle": "https://www.boholchronicle.com.ph/",
    "Baguio Midland Courier": "https://www.baguiomidlandcourier.com.ph/",
    "GMA News Online": "https://www.gmanetwork.com/news/",
    "Cebu Daily News": "https://cebudailynews.inquirer.net/",
    "ABS-CBN News": "https://news.abs-cbn.com/",
    "Philstar.com": "https://www.philstar.com/",
    "Manila Standard": "https://manilastandard.net/",
    "Daily Tribune": "https://tribune.net.ph/",
    "Davao Today": "https://davaotoday.com/",
    "Sunday Punch": "https://punch.dagupan.com/",
    "Visayan Daily Star": "https://visayandailystar.com/",
    "PTV News": "https://ptvnews.ph/",
    "Mindanao Times": "https://mindanaotimes.com.ph/",
    "PhilNews.XYZ": "https://philnews.xyz/",
    "Northern Dispatch": "https://nordis.net/"
}

# Sniff the text encoding of the export from its raw bytes before parsing it
csv_path = 'BusinessJan1-14.csv'
with open(csv_path, 'rb') as raw_file:
    raw_bytes = raw_file.read()
result = chardet.detect(raw_bytes)
encoding = result['encoding']

# Load the tab-separated file with the detected encoding; lines that pandas
# cannot parse are skipped rather than aborting the read
df = pd.read_csv(
    csv_path,
    sep='\t',
    encoding=encoding,
    on_bad_lines='skip',
)

# Preview the leading rows to check how the file was parsed
print(df.head())

def parse_date(row):
    """Convert one raw cell value into a ``datetime``.

    Parameters
    ----------
    row : object
        Usually a string whose first tab-separated field looks like
        ``'14-Jan-2024 11:59PM'`` (format ``%d-%b-%Y %I:%M%p``). Surrounding
        whitespace is ignored and any text after the first tab is discarded.

    Returns
    -------
    datetime or None
        The parsed ``datetime`` for a well-formed string; the input itself
        when it is already a ``datetime`` (so re-applying the function to an
        already parsed column does not wipe it); ``None`` for strings that do
        not match the format and for every other input type.
    """
    # Already parsed (datetime or a subclass of it): pass through unchanged so
    # the conversion is idempotent.
    if isinstance(row, datetime):
        return row

    # Missing values (NaN floats, None, ...) and other non-text inputs.
    if not isinstance(row, str):
        return None

    # Only the first tab-separated field carries the timestamp.
    date_time_str = row.split('\t')[0].strip()
    try:
        return datetime.strptime(date_time_str, '%d-%b-%Y %I:%M%p')
    except ValueError:
        return None  # Text that does not match the expected format

# Run parse_date over every value of the first column and store the results
# under the 'Date' label
raw_dates = df.iloc[:, 0]
df['Date'] = raw_dates.apply(parse_date)


                  Date                                           Headline  \
0  14-Jan-2024 11:59PM  Novationwire Expands Dubai Presence with Launc...   
1  14-Jan-2024 11:59PM  Get that bread: Barcelona bakery allows Daquis...   
2  14-Jan-2024 11:59PM  Novationwire Launches Retail & Hospitality Pre...   
3  14-Jan-2024 11:59PM  The Steam Team Restoration of Austin, Texas, H...   
4  14-Jan-2024 11:45PM              83 countries hold Ukraine peace talks   

                                                 URL  \
0  https://manilapr.com/top-story/novationwire-ex...   
1  https://www.onesports.ph/pvl/article/18364/bus...   
2  https://manilapr.com/top-story/novationwire-la...   
3  https://manilapr.com/top-story/the-steam-team-...   
4  https://tribune.net.ph/2024/01/83-countries-ho...   

                                        Opening Text  \
0  Dubai, UAE – Novationwire, a global leader in ...   
1  By opening the first Filipino bakery in Spain,...   
2  Dubai, UAE – Novationwire, a 

In [32]:
# Show the column labels of `df` to check how the CSV header was parsed.
# NOTE(review): the output recorded for this cell lists one tab-joined header
# label plus 'Date', unlike the split columns printed by the loading cell, and
# its execution count (32) is lower than the loading cell's (47) - it looks
# stale; re-run after loading to confirm.
print(df.columns)

Index(['Date\tHeadline\tURL\tOpening Text\tHit Sentence\tSource\tInfluencer\tCountry\tSubregion\tLanguage\tReach\tDesktop Reach\tMobile Reach\tTwitter Social Echo\tFacebook Social Echo\tReddit Social Echo\tNational Viewership\tEngagement\tAVE\tSentiment\tKey Phrases\tInput Name\tKeywords\tTwitter Authority\tTweet Id\tTwitter Id\tTwitter Client\tTwitter Screen Name\tUser Profile Url\tTwitter Bio\tTwitter Followers\tTwitter Following\tAlternate Date Format\tTime\tState\tCity\tSocial Echo Total\tEditorial Echo\tViews\tEstimated Views\tLikes\tReplies\tRetweets\tComments\tShares\tReactions\tThreads\tIs Verified\tParent URL\tDocument Tags\tDocument ID', 'Date'], dtype='object')


In [48]:
# Build a table with one row per (calendar day, news source) for
# 1 January - 30 April 2024 and count how many CSV rows link to each source
# on each day.
start_date = datetime(2024, 1, 1)
end_date = datetime(2024, 4, 30)
date_range = pd.date_range(start_date, end_date)

# Column of `df` whose value is searched for each outlet's URL.
# NOTE(review): the output previously recorded for this cell shows only zero
# counts in its visible rows, while the preview of `df` shows article links in
# the 'URL' column - confirm that 'Parent URL' is the intended column.
url_column = 'Parent URL'

# Tally matches in a single pass over the CSV rows, keyed by (day, source).
# Keying on the full calendar day rather than (month name, day of month) keeps
# dates from other years from being counted in this 2024 window.
match_counts = {}
for date, url in zip(df['Date'], df[url_column]):
    # Skip rows whose date could not be parsed (None/NaT) - calling strftime on
    # them would raise - and rows whose URL is missing or not text.
    if pd.isnull(date) or not isinstance(url, str):
        continue
    day_key = pd.Timestamp(date).normalize()  # drop the time of day
    for source, source_url in news_companies.items():
        # A row counts for a source when the source URL occurs anywhere in it
        if source_url in url:
            key = (day_key, source)
            match_counts[key] = match_counts.get(key, 0) + 1

# Materialise the full grid in day-major, source-minor order; combinations
# with no matching row get a count of 0.
data = [
    [day.strftime('%B'), day.day, source, match_counts.get((day, source), 0)]
    for day in date_range
    for source in news_companies
]
new_df = pd.DataFrame(data, columns=['Month', 'Day', 'Source', 'Count'])

# Print the new DataFrame to verify the counts
print(new_df)

        Month  Day                 Source  Count
0     January    1           Inquirer.net      0
1     January    1        Manila Bulletin      0
2     January    1  The Asian Journal USA      0
3     January    1       The Manila Times      0
4     January    1         Business World      0
...       ...  ...                    ...    ...
3867    April   30     Visayan Daily Star      0
3868    April   30               PTV News      0
3869    April   30         Mindanao Times      0
3870    April   30           PhilNews.XYZ      0
3871    April   30      Northern Dispatch      0

[3872 rows x 4 columns]
