In [32]:
# Load the newspaper data from its JSON file into memory
import json
with open("newspapers_bs4.json", encoding="utf-8") as json_file:
    newspapersDict = json.load(json_file)

In [33]:
# Collect the (lowercased) month token from every entry whose first field
# has the shape "<digits> <month text> <4-digit year>"; other entries are skipped.
import re
months = []
month_pattern = re.compile(r'^\d+\s+(.*?)\s+\d{4}$')
for entries in newspapersDict.values():
    for entry in entries:
        found = month_pattern.search(entry[0])
        if found:
            months.append(found.group(1).lower())


In [34]:
# Display the distinct month spellings collected above: building a set drops
# the duplicates. The result is only shown as the cell output, not stored.
set(months)

{',ilktesrin',
 'agusotos',
 'aout',
 'april',
 'aralik',
 'august',
 'avril',
 'ağustos',
 'birinci kanun',
 'birinci teşrin',
 'decembre',
 'dezember',
 'ekim',
 'eylül',
 'februar',
 'fevrier',
 'haziran',
 'iikincikanun',
 'ikinci tesrin',
 'ikincikanıun',
 'ikincitestin',
 'ikinicikanun',
 'i̇kinci kanun',
 'i̇kinci teşrin',
 'i̇lk kanun',
 'i̇lk teşrin',
 'januar',
 'janvier',
 'juillet',
 'juin',
 'juli',
 'juni',
 'kannunusani',
 'kanunisani',
 'kanunu evvel',
 'kanunu sani',
 'kanunuevel',
 'kasim',
 'mai',
 'mars',
 'mart',
 'marz',
 'mayis',
 'mayıs',
 'nisan',
 'november',
 'novembre',
 'ocak',
 'october',
 'octobre',
 'octombre',
 'september',
 'septembre',
 'son kanun',
 'son teşrin',
 'sonkanun',
 'temmuz',
 'teşrin-i evvel',
 'teşrin-i sani',
 'şubat'}

In [38]:
import re

# Translation table used by normalize(): maps Turkish-specific lowercase
# letters to their closest ASCII letter and deletes U+0307 (COMBINING DOT
# ABOVE). Python lowercases "İ" (U+0130) to "i" followed by U+0307, so without
# the deletion "İlk" and "ilk" would normalize to different strings.
_TURKISH_FOLD_TABLE = str.maketrans({
    '\u0131': 'i',   # ı (dotless i)
    '\u015f': 's',   # ş
    '\u00e7': 'c',   # ç
    '\u011f': 'g',   # ğ
    '\u00f6': 'o',   # ö
    '\u00fc': 'u',   # ü
    '\u0307': None,  # combining dot left behind by "İ".lower()
})


# Normalization function for Turkish-specific characters
def normalize(text):
    """Lowercase ``text`` and fold Turkish-specific characters.

    Steps, in order:
      1. lowercase the text;
      2. replace ı, ş, ç, ğ, ö, ü with i, s, c, g, o, u and drop the
         combining dot (U+0307) produced when "İ" is lowercased;
      3. strip leading and trailing whitespace.

    Args:
        text: the string to normalize (e.g. a raw month name).

    Returns:
        The normalized string, suitable as a dictionary lookup key.
    """
    return text.lower().translate(_TURKISH_FOLD_TABLE).strip()


# Month mapping dictionary with various language variants.
# Keys are month numbers (1-12); each value is the list of spellings that
# should resolve to that month. The lists mix several languages and also keep
# misspelled forms (e.g. 'agusotos', 'ikincitestin', ',ilktesrin') so that
# those inputs still resolve.
# Some entries are deliberately written with an "i" followed by a combining
# dot (U+0307), which is what str.lower() produces for "İ".
# NOTE(review): 'ilkkanun' is listed twice under 12; this is harmless because
# the list is only used to build a lookup dictionary.
month_number_map = {
    1: ['ocak', 'januar', 'janvier', 'kanunisani', 'kanunu sani',
        'ikincikanun', 'iikincikanun', 'ikincikanıun', 'ikinicikanun',
        'i̇kinci kanun', 'son kanun', 'sonkanun', 'ikinci kanun', 'kannunusani'],
    2: ['subat', 'şubat', 'februar', 'fevrier'],
    3: ['mart', 'mars', 'marz'],
    4: ['nisan', 'april', 'avril'],
    5: ['mayis', 'mayıs', 'mai'],
    6: ['haziran', 'juin', 'juni'],
    7: ['temmuz', 'juillet', 'juli'],
    8: ['agustos', 'ağustos', 'august', 'aout', 'agusotos'],
    9: ['eylul', 'eylül', 'september', 'septembre'],
    10: ['ekim', 'tesrinievvel', 'tesrin-i evvel', 'birinci tesrin',
         'i̇lk tesrin', 'ilk tesrin', 'ilk teşrin', ',ilktesrin', 'octomber', 'octobre', 'octombre', 'october'],
    11: ['kasim', 'tesrinisani', 'tesrin-i sani', 'ikinci tesrin',
         'i̇kinci teşrin', 'son teşrin', 'ikincitestin', 'nobember', 'novembre', 'november', 'ikincitesrin'],
    12: ['aralik', 'decembre', 'dezember', 'kanunu evvel',
         'kanunuevel', 'kanunuevvel', 'birinci kanun',
         'i̇lk kanun', 'ilk kanun', 'ilkkanun', 'birincikanun', 'ilkkanun']
}

# Flatten month_number_map into a single lookup table:
# normalized spelling -> month number. Every spelling is passed through
# normalize() so lookups can use normalized input directly; if two spellings
# normalize to the same key, the later one wins.
month_mapping = {
    normalize(alias): number
    for number, aliases in month_number_map.items()
    for alias in aliases
}

# Function to convert all dates in the dataset
def convert_all_dates(dataset):
    """Convert every ``[date_string, url]`` entry to a ``DD-MM-YYYY`` date.

    Relies on the module-level ``normalize`` function and ``month_mapping``
    dictionary to resolve month names.

    For each entry the rules are applied in this order:

    1. If the URL contains ``<year>_ilkkanun_<day>``, the date is taken from
       the URL (month 12) and the date string is ignored.
    2. ``"01 Temmuz 1936"``         -> day, month name, year.
    3. ``"5.pdf İkinci Kanun 1936"`` -> day, month name, year.
    4. ``"İlk Kanun 1936"``          -> month name and year; the day is set
       to the placeholder ``"01"``.
    5. Anything else is recorded as ``"INVALID_DATE: <date string>"``.

    A month name is looked up by exact normalized key first; if that fails,
    the first known key that contains it, or is contained in it, is used.
    Months that still cannot be resolved are recorded as
    ``"UNKNOWN_MONTH: <raw month>"``.

    Args:
        dataset: mapping of newspaper name -> iterable of ``[date_string, url]``
            pairs.

    Returns:
        A 3-tuple ``(converted_all, unknown_months, invalid_dates)`` where
        ``converted_all`` maps each newspaper to a list of
        ``[formatted_date_or_marker, url]``, ``unknown_months`` is a list of
        ``(newspaper, raw_month, date_str, url)`` tuples, and ``invalid_dates``
        is a list of ``(newspaper, date_str, url)`` tuples.
    """
    converted_all = {}
    unknown_months = []
    invalid_dates = []

    for newspaper, entries in dataset.items():
        converted_entries = []
        # Process each entry in the newspaper
        for entry in entries:
            date_str, url = entry
            date_str = date_str.strip()

            # The URL takes precedence when it carries a "<year>_ilkkanun_<day>" marker.
            url_date_match = re.search(r'(\d{4})_ilkkanun_(\d+)(?:[^0-9].*|\.pdf)?', url)
            if url_date_match:
                year = url_date_match.group(1)
                day_num = url_date_match.group(2)
                # Zero-pad the day so this branch emits the same DD-MM-YYYY
                # shape as the date-string branches below.
                formatted_date = f"{int(day_num):02d}-12-{year}"
                converted_entries.append([formatted_date, url])
                continue

            # 1. Format: 01 Temmuz 1936
            match = re.match(r"(\d{1,2})\s+(.+?)\s+(\d{4})", date_str)
            if not match:
                # 2. Format: 5.pdf İkinci Kanun 1936
                match = re.match(r"(\d{1,2})\.pdf\s+(.+?)\s+(\d{4})", date_str, re.IGNORECASE)

            if match:
                day, raw_month, year = match.groups()
            else:
                # 3. Format: İlk Kanun 1936 (no day available)
                match = re.match(r"(.+?)\s+(\d{4})", date_str)
                if not match:
                    converted_entries.append([f"INVALID_DATE: {date_str}", url])
                    invalid_dates.append((newspaper, date_str, url))
                    continue
                raw_month, year = match.groups()
                day = "01"

            # Normalize the month name and try an exact lookup first
            month_key = normalize(raw_month)
            month_num = month_mapping.get(month_key)

            # Fall back to the first known key that overlaps with the month text
            if not month_num:
                for known_key in month_mapping:
                    if month_key in known_key or known_key in month_key:
                        month_num = month_mapping[known_key]
                        break

            # If the month is still not found, log it as unknown
            if not month_num:
                converted_entries.append([f"UNKNOWN_MONTH: {raw_month}", url])
                unknown_months.append((newspaper, raw_month, date_str, url))
                continue

            final_day = day.zfill(2) if day.isdigit() else "??"
            formatted_date = f"{final_day}-{str(month_num).zfill(2)}-{year}"
            converted_entries.append([formatted_date, url])

        converted_all[newspaper] = converted_entries

    return converted_all, unknown_months, invalid_dates


In [39]:
# Convert all dates in the dataset.
# convert_all_dates returns a 3-tuple, so `converted` holds
# (converted_all, unknown_months, invalid_dates); the converted dataset itself
# is converted[0].
converted = convert_all_dates(newspapersDict)


In [40]:
# Helper that totals the entries of a newspaper -> entries mapping.
def count_total_links(dataset):
    """Return the number of entries summed over every newspaper in ``dataset``.

    Args:
        dataset: mapping whose values are sized collections of entries.

    Returns:
        The combined length of all values (0 for an empty mapping).
    """
    return sum(len(links) for links in dataset.values())

# Sanity check: the conversion should neither drop nor add entries, so the
# two totals printed below are expected to be equal.
# Count the total number of links in the converted dataset
# (converted[0] is the first element of the tuple returned by convert_all_dates)
print(count_total_links(converted[0]))
# Count the total number of links in the original dataset
print(count_total_links(newspapersDict))

64286
64286
