In [1]:
import glob
import os


folder_path = "../Data/NCAA_Newspaper_articles"

txt_files = glob.glob(os.path.join(folder_path, "*.txt"))

In [2]:
import pandas as pd

all_texts = []

for file_path in txt_files:
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
            all_texts.append({
                'filename': os.path.basename(file_path),
                'content': content
            })
    except Exception as e:
        print(f"Error reading {file_path}: {e}")

df = pd.DataFrame(all_texts)

In [3]:
df['content']

0      ______________________________________________...
1      ______________________________________________...
2      ______________________________________________...
3      ______________________________________________...
4      ______________________________________________...
                             ...                        
186    ______________________________________________...
187    ______________________________________________...
188    ______________________________________________...
189    ______________________________________________...
190    ______________________________________________...
Name: content, Length: 191, dtype: object

In [4]:
import glob
import os
import pandas as pd
import re

def parse_newspaper_file(file_path):
    """Parse a single newspaper text file and extract structured data"""
    
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return None
    
    # Split content by the separator lines
    articles = content.split('_' * 60)  # Split by lines of underscores
    
    parsed_articles = []
    
    for article_text in articles:
        article_text = article_text.strip()
        if not article_text:  # Skip empty sections
            continue
            
        article_data = {}
        
        # Extract title (first line that's not empty)
        lines = article_text.split('\n')
        for line in lines:
            line = line.strip()
            if line and not line.startswith('Author:'):
                article_data['title'] = line
                break
        
        # Extract Author
        author_match = re.search(r'Author:\s*(.+?)(?:\n|$)', article_text, re.MULTILINE)
        article_data['author'] = author_match.group(1).strip() if author_match else ''
        
        # Extract Publication info
        pub_info_match = re.search(r'Publication info:\s*(.+?)(?:\n|http)', article_text, re.MULTILINE | re.DOTALL)
        article_data['publication_info'] = pub_info_match.group(1).strip() if pub_info_match else ''
        
        # Extract URLs (multiple URLs possible)
        urls = re.findall(r'http[s]?://[^\s\n]+', article_text)
        article_data['urls'] = ' | '.join(urls) if urls else ''
        
        # Extract Abstract
        abstract_match = re.search(r'Abstract:\s*(.+?)(?:\nLinks:|\nFull text:)', article_text, re.MULTILINE | re.DOTALL)
        article_data['abstract'] = abstract_match.group(1).strip() if abstract_match else ''
        
        # Extract Links section
        links_match = re.search(r'Links:\s*(.+?)(?:\nFull text:)', article_text, re.MULTILINE | re.DOTALL)
        article_data['links'] = links_match.group(1).strip() if links_match else ''
        
        # Extract Full text
        full_text_match = re.search(r'Full text:\s*(.+?)(?:\nCompany / organization:|People:|Title:)', article_text, re.MULTILINE | re.DOTALL)
        article_data['full_text'] = full_text_match.group(1).strip() if full_text_match else ''
        
        # Extract company/ organization
        company_match = re.search(r'Company / organization: Name:\s*(.+?)(?:\nTitle:|Publication title:|Pages:)', article_text, re.MULTILINE | re.DOTALL)
        article_data['company_match'] = company_match.group(1).strip() if company_match else ''
        
        # Extract Title
        title_match = re.search(r'Title:\s*(.+?)(?:\nPublication title:|Pages:)', article_text, re.MULTILINE | re.DOTALL)
        article_data['title_match'] = title_match.group(1).strip() if title_match else ''
        
        # Extract Publication title
        pub_title_match = re.search(r'Publication title:\s*(.+?)(?:\n|$)', article_text, re.MULTILINE)
        article_data['publication_title'] = pub_title_match.group(1).strip() if pub_title_match else ''
        
        # Extract Pages
        pages_match = re.search(r'Pages:\s*(.+?)(?:\n|$)', article_text, re.MULTILINE)
        article_data['pages'] = pages_match.group(1).strip() if pages_match else ''
        
        # Extract Publication year
        pub_year_match = re.search(r'Publication year:\s*(.+?)(?:\n|$)', article_text, re.MULTILINE)
        article_data['publication_year'] = pub_year_match.group(1).strip() if pub_year_match else ''
        
        # Extract Publication date
        pub_date_match = re.search(r'Publication date:\s*(.+?)(?:\n|$)', article_text, re.MULTILINE)
        article_data['publication_date'] = pub_date_match.group(1).strip() if pub_date_match else ''
        
        # Extract Section
        section_match = re.search(r'Section:\s*(.+?)(?:\n|$)', article_text, re.MULTILINE)
        article_data['section'] = section_match.group(1).strip() if section_match else ''
        
        # Extract Publisher
        publisher_match = re.search(r'Publisher:\s*(.+?)(?:\n|$)', article_text, re.MULTILINE)
        article_data['publisher'] = publisher_match.group(1).strip() if publisher_match else ''
        
        # Extract Place of publication
        place_match = re.search(r'Place of publication:\s*(.+?)(?:\n|$)', article_text, re.MULTILINE)
        article_data['place_of_publication'] = place_match.group(1).strip() if place_match else ''
        
        # Extract Publication subject
        publication_subject_match = re.search(r'Publication subject:\s*(.+?)(?:\n|$)', article_text, re.MULTILINE)
        article_data['publication_subject_match'] = publication_subject_match.group(1).strip() if publication_subject_match else ''
        
        # Extract ISSN
        issn_match = re.search(r'ISSN:\s*(.+?)(?:\n|$)', article_text, re.MULTILINE)
        article_data['issn'] = issn_match.group(1).strip() if issn_match else ''
        
        # Extract Document type
        doc_type_match = re.search(r'Document type:\s*(.+?)(?:\n|$)', article_text, re.MULTILINE)
        article_data['document_type'] = doc_type_match.group(1).strip() if doc_type_match else ''
        
        # Extract ProQuest document ID
        doc_id_match = re.search(r'ProQuest document ID:\s*(.+?)(?:\n|$)', article_text, re.MULTILINE)
        article_data['proquest_id'] = doc_id_match.group(1).strip() if doc_id_match else ''
        
        # Extract Database
        database_match = re.search(r'Database:\s*(.+?)(?:\n|$)', article_text, re.MULTILINE)
        article_data['database'] = database_match.group(1).strip() if database_match else ''
        
        # Add source file name
        article_data['source_file'] = os.path.basename(file_path)
        
        # Only add if we have some meaningful data
        if article_data.get('title') or article_data.get('author') or article_data.get('abstract'):
            parsed_articles.append(article_data)
    
    return parsed_articles

def process_all_files(folder_path="../data/NCAA_Newspaper_articles", output_file="newspaper_data.csv"):
    """Process all text files in the folder and create CSV"""
    
    # Get all text files
    txt_files = glob.glob(os.path.join(folder_path, "*.txt"))
    
    if not txt_files:
        print(f"No .txt files found in {folder_path}")
        return
    
    print(f"Found {len(txt_files)} text files to process...")
    
    all_articles = []
    
    for file_path in txt_files:
        print(f"Processing {os.path.basename(file_path)}...")
        articles = parse_newspaper_file(file_path)
        if articles:
            all_articles.extend(articles)
    
    if not all_articles:
        print("No articles were successfully parsed.")
        return
    
    # Create DataFrame
    df = pd.DataFrame(all_articles)
    
    # Reorder columns for better readability
    column_order = [
        'source_file', 'title', 'author', 'publication_info', 'abstract', 
        'urls', 'links', 'publication_title', 'publication_date', 'publication_year',
        'section', 'pages', 'publisher', 'place_of_publication', 'issn',
        'document_type', 'proquest_id', 'database', 'full_text'
    ]
    
    # Reorder columns (only include columns that exist)
    existing_columns = [col for col in column_order if col in df.columns]
    df = df[existing_columns]
    
    # Save to CSV
    df.to_csv(output_file, index=False, encoding='utf-8')
    
    print(f"\nSuccessfully processed {len(all_articles)} articles!")
    print(f"Data saved to: {output_file}")
    print(f"Columns: {list(df.columns)}")
    print("\nFirst few rows preview:")
    print(df[['title', 'author', 'publication_title', 'publication_date']].head())
    
    return df

# Run the processing
if __name__ == "__main__":
    # Process all files and create CSV
    df = process_all_files()
    
    # Optional: Display some statistics
    if df is not None:
        print(f"\nDataset statistics:")
        print(f"- Total articles: {len(df)}")
        print(f"- Articles with abstracts: {df['abstract'].notna().sum()}")
        print(f"- Unique publications: {df['publication_title'].nunique()}")
        print(f"- Date range: {df['publication_date'].min()} to {df['publication_date'].max()}")

Found 191 text files to process...
Processing 2002 1.txt...
Processing 2002 10.txt...
Processing 2002 11.txt...
Processing 2002 2.txt...
Processing 2002 3.txt...
Processing 2002 4.txt...
Processing 2002 5.txt...
Processing 2002 6.txt...
Processing 2002 7.txt...
Processing 2002 8.txt...
Processing 2002 9.txt...
Processing 2003 1.txt...
Processing 2003 10.txt...
Processing 2003 11.txt...
Processing 2003 12.txt...
Processing 2003 13.txt...
Processing 2003 2.txt...
Processing 2003 3.txt...
Processing 2003 4.txt...
Processing 2003 5.txt...
Processing 2003 6.txt...
Processing 2003 7.txt...
Processing 2003 8.txt...
Processing 2003 9.txt...
Processing 2004 1.txt...
Processing 2004 10.txt...
Processing 2004 11.txt...
Processing 2004 12.txt...
Processing 2004 2.txt...
Processing 2004 3.txt...
Processing 2004 4.txt...
Processing 2004 5.txt...
Processing 2004 6.txt...
Processing 2004 7 .txt...
Processing 2004 8.txt...
Processing 2004 9.txt...
Processing 2005 1.txt...
Processing 2005 10.txt...
Proc

In [5]:
import glob
import os
import pandas as pd
import re
from collections import defaultdict

def parse_newspaper_file(file_path):
    """Parse a single newspaper text file and extract structured data"""
    
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return None
    
    # Split content by the separator lines
    articles = content.split('_' * 60)  # Split by lines of underscores
    
    parsed_articles = []
    
    for article_text in articles:
        article_text = article_text.strip()
        if not article_text:  # Skip empty sections
            continue
             
        article_data = {}
        
        # Extract title (first line that's not empty)
        lines = article_text.split('\n')
        for line in lines:
            line = line.strip()
            if line and not line.startswith('Author:'):
                article_data['title'] = line
                break
        
        # Extract Author
        author_match = re.search(r'Author:\s*(.+?)(?:\n|$)', article_text, re.MULTILINE)
        article_data['author'] = author_match.group(1).strip() if author_match else ''
        
        # Extract Publication info
        pub_info_match = re.search(r'Publication info:\s*(.+?)(?:\n|http)', article_text, re.MULTILINE | re.DOTALL)
        article_data['publication_info'] = pub_info_match.group(1).strip() if pub_info_match else ''
        
        # Extract URLs (multiple URLs possible)
        urls = re.findall(r'http[s]?://[^\s\n]+', article_text)
        article_data['urls'] = ' | '.join(urls) if urls else ''
        
        # Extract Abstract
        abstract_match = re.search(r'Abstract:\s*(.+?)(?:\nLinks:|\nFull text:)', article_text, re.MULTILINE | re.DOTALL)
        article_data['abstract'] = abstract_match.group(1).strip() if abstract_match else ''
        
        # Extract Links section
        links_match = re.search(r'Links:\s*(.+?)(?:\nFull text:)', article_text, re.MULTILINE | re.DOTALL)
        article_data['links'] = links_match.group(1).strip() if links_match else ''
        
        # Extract Full text
        full_text_match = re.search(r'Full text:\s*(.+?)(?:\nCompany / organization:|People:|Title:)', article_text, re.MULTILINE | re.DOTALL)
        article_data['full_text'] = full_text_match.group(1).strip() if full_text_match else ''
        
        # Extract company/ organization
        company_match = re.search(r'Company / organization: Name:\s*(.+?)(?:\nTitle:|Publication title:|Pages:)', article_text, re.MULTILINE | re.DOTALL)
        article_data['company_match'] = company_match.group(1).strip() if company_match else ''
        
        # Extract Title
        title_match = re.search(r'Title:\s*(.+?)(?:\nPublication title:|Pages:)', article_text, re.MULTILINE | re.DOTALL)
        article_data['title_match'] = title_match.group(1).strip() if title_match else ''
        
        # Extract Publication title
        pub_title_match = re.search(r'Publication title:\s*(.+?)(?:\n|$)', article_text, re.MULTILINE)
        article_data['publication_title'] = pub_title_match.group(1).strip() if pub_title_match else ''
        
        # Extract Pages
        pages_match = re.search(r'Pages:\s*(.+?)(?:\n|$)', article_text, re.MULTILINE)
        article_data['pages'] = pages_match.group(1).strip() if pages_match else ''
        
        # Extract Publication year
        pub_year_match = re.search(r'Publication year:\s*(.+?)(?:\n|$)', article_text, re.MULTILINE)
        article_data['publication_year'] = pub_year_match.group(1).strip() if pub_year_match else ''
        
        # Extract Publication date
        pub_date_match = re.search(r'Publication date:\s*(.+?)(?:\n|$)', article_text, re.MULTILINE)
        article_data['publication_date'] = pub_date_match.group(1).strip() if pub_date_match else ''
        
        # Extract Section
        section_match = re.search(r'Section:\s*(.+?)(?:\n|$)', article_text, re.MULTILINE)
        article_data['section'] = section_match.group(1).strip() if section_match else ''
        
        # Extract Publisher
        publisher_match = re.search(r'Publisher:\s*(.+?)(?:\n|$)', article_text, re.MULTILINE)
        article_data['publisher'] = publisher_match.group(1).strip() if publisher_match else ''
        
        # Extract Place of publication
        place_match = re.search(r'Place of publication:\s*(.+?)(?:\n|$)', article_text, re.MULTILINE)
        article_data['place_of_publication'] = place_match.group(1).strip() if place_match else ''
        
        # Extract Publication subject
        publication_subject_match = re.search(r'Publication subject:\s*(.+?)(?:\n|$)', article_text, re.MULTILINE)
        article_data['publication_subject_match'] = publication_subject_match.group(1).strip() if publication_subject_match else ''
        
        # Extract ISSN
        issn_match = re.search(r'ISSN:\s*(.+?)(?:\n|$)', article_text, re.MULTILINE)
        article_data['issn'] = issn_match.group(1).strip() if issn_match else ''
        
        # Extract Document type
        doc_type_match = re.search(r'Document type:\s*(.+?)(?:\n|$)', article_text, re.MULTILINE)
        article_data['document_type'] = doc_type_match.group(1).strip() if doc_type_match else ''
        
        # Extract ProQuest document ID
        doc_id_match = re.search(r'ProQuest document ID:\s*(.+?)(?:\n|$)', article_text, re.MULTILINE)
        article_data['proquest_id'] = doc_id_match.group(1).strip() if doc_id_match else ''
        
        # Extract Database
        database_match = re.search(r'Database:\s*(.+?)(?:\n|$)', article_text, re.MULTILINE)
        article_data['database'] = database_match.group(1).strip() if database_match else ''
        
        # Add source file name
        article_data['source_file'] = os.path.basename(file_path)
        
        # Only add if we have some meaningful data
        if article_data.get('title') or article_data.get('author') or article_data.get('abstract'):
            parsed_articles.append(article_data)
    
    return parsed_articles

def extract_year_from_filename(filename):
    """Extract year from filename like '2002 1.txt' or '2024 2.txt'"""
    # Use regex to find 4-digit year at the start of filename
    year_match = re.match(r'(\d{4})', filename)
    if year_match:
        return year_match.group(1)
    return None

def process_all_files_by_year(folder_path="../data/NCAA_Newspaper_articles", output_folder="output_by_year"):
    """Process all text files and create separate CSV files for each year"""
    
    # Create output folder if it doesn't exist
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    
    # Get all text files
    txt_files = glob.glob(os.path.join(folder_path, "*.txt"))
    
    if not txt_files:
        print(f"No .txt files found in {folder_path}")
        return
    
    print(f"Found {len(txt_files)} text files to process...")
    
    # Dictionary to store articles by year
    articles_by_year = defaultdict(list)
    
    # Process each file
    for file_path in txt_files:
        filename = os.path.basename(file_path)
        print(f"Processing {filename}...")
        
        # Extract year from filename
        year = extract_year_from_filename(filename)
        if not year:
            print(f"  Warning: Could not extract year from {filename}")
            year = "unknown"
        
        # Parse the file
        articles = parse_newspaper_file(file_path)
        if articles:
            articles_by_year[year].extend(articles)
            print(f"  Found {len(articles)} articles for year {year}")
        else:
            print(f"  No articles found in {filename}")
    
    if not articles_by_year:
        print("No articles were successfully parsed.")
        return
    
    # Create separate CSV files for each year
    all_dataframes = {}
    
    for year, articles in articles_by_year.items():
        print(f"\nProcessing {len(articles)} articles for year {year}...")
        
        # Create DataFrame for this year
        df = pd.DataFrame(articles)
        
        # Reorder columns for better readability
        column_order = [
            'source_file', 'title', 'author', 'publication_info', 'abstract', 
            'urls', 'links', 'publication_title', 'publication_date', 'publication_year',
            'section', 'pages', 'publisher', 'place_of_publication', 'issn',
            'document_type', 'proquest_id', 'database', 'company_match', 
            'title_match', 'publication_subject_match', 'full_text'
        ]
        
        # Reorder columns (only include columns that exist)
        existing_columns = [col for col in column_order if col in df.columns]
        df = df[existing_columns]
        
        # Save to CSV
        output_file = os.path.join(output_folder, f"newspaper_data_{year}.csv")
        df.to_csv(output_file, index=False, encoding='utf-8')
        
        # Store dataframe for summary
        all_dataframes[year] = df
        
        # Get file size
        file_size_mb = os.path.getsize(output_file) / (1024**2)
        print(f"Saved {output_file}: {len(df)} articles, {file_size_mb:.2f} MB")
        
        # Show preview for this year
        if len(df) > 0:
            print(f"Sample from {year}:")
            sample_cols = ['title', 'author', 'publication_title', 'publication_date']
            available_cols = [col for col in sample_cols if col in df.columns]
            print(df[available_cols].head(2))
    
    # Print summary statistics
    print(f"\n{'='*50}")
    print("SUMMARY STATISTICS BY YEAR")
    print(f"{'='*50}")
    
    total_articles = 0
    for year in sorted(all_dataframes.keys()):
        df = all_dataframes[year]
        articles_count = len(df)
        total_articles += articles_count
        
        abstracts_count = df['abstract'].notna().sum() if 'abstract' in df.columns else 0
        unique_pubs = df['publication_title'].nunique() if 'publication_title' in df.columns else 0
        
        print(f"Year {year}:")
        print(f"  - Total articles: {articles_count}")
        print(f"  - Articles with abstracts: {abstracts_count}")
        print(f"  - Unique publications: {unique_pubs}")
        
        if 'publication_date' in df.columns:
            date_range = f"{df['publication_date'].min()} to {df['publication_date'].max()}"
            print(f"  - Date range: {date_range}")
        print()
    
    print(f"TOTAL ARTICLES ACROSS ALL YEARS: {total_articles}")
    print(f"CSV files created in: {output_folder}/")
    
    return all_dataframes

# Run the processing
if __name__ == "__main__":
    # Process all files and create separate CSV files by year
    dataframes_by_year = process_all_files_by_year()
    
    # Optional: Create a combined summary file with just key statistics
    if dataframes_by_year:
        summary_data = []
        for year, df in dataframes_by_year.items():
            summary_data.append({
                'year': year,
                'total_articles': len(df),
                'articles_with_abstracts': df['abstract'].notna().sum() if 'abstract' in df.columns else 0,
                'unique_publications': df['publication_title'].nunique() if 'publication_title' in df.columns else 0,
                'unique_authors': df['author'].nunique() if 'author' in df.columns else 0
            })
        
        summary_df = pd.DataFrame(summary_data)
        summary_df.to_csv("output_by_year/yearly_summary.csv", index=False)
        print("Created yearly summary file: output_by_year/yearly_summary.csv")

Found 191 text files to process...
Processing 2002 1.txt...
  Found 1001 articles for year 2002
Processing 2002 10.txt...
  Found 1000 articles for year 2002
Processing 2002 11.txt...
  Found 649 articles for year 2002
Processing 2002 2.txt...
  Found 1001 articles for year 2002
Processing 2002 3.txt...
  Found 1001 articles for year 2002
Processing 2002 4.txt...
  Found 1001 articles for year 2002
Processing 2002 5.txt...
  Found 1002 articles for year 2002
Processing 2002 6.txt...
  Found 1001 articles for year 2002
Processing 2002 7.txt...
  Found 501 articles for year 2002
Processing 2002 8.txt...
  Found 1001 articles for year 2002
Processing 2002 9.txt...
  Found 1001 articles for year 2002
Processing 2003 1.txt...
  Found 1001 articles for year 2003
Processing 2003 10.txt...
  Found 1001 articles for year 2003
Processing 2003 11.txt...
  Found 1001 articles for year 2003
Processing 2003 12.txt...
  Found 1001 articles for year 2003
Processing 2003 13.txt...
  Found 23 articles f

In [7]:
file_path = "output_by_year/newspaper_data_2024.csv"

df = pd.read_csv(file_path, encoding='utf-8')

df.head()

Unnamed: 0,source_file,title,author,publication_info,abstract,urls,links,publication_title,publication_date,publication_year,...,publisher,place_of_publication,issn,document_type,proquest_id,database,company_match,title_match,publication_subject_match,full_text
0,2024 1.txt,Kenny Dillingham makes Arizona State evolution...,"Wolken, Dan","Wolken, Dan.",None available.,http://ezproxy.newcastle.edu.au/login?url=http...,https://newcastle.primo.exlibrisgroup.com/disc...,USA Today (Online); Arlington,"Dec 31, 2024",2024,...,"USA Today, a division of Gannett Satellite Inf...",Arlington,21651779.0,News,3150269804,U.S. Newsstream Collection,National Collegiate Athletic Association--NCAA...,Kenny Dillingham makes Arizona State evolution...,General Interest Periodicals--United States,"ATLANTA — It all lined up perfectly, the 32-ye..."
1,2024 1.txt,What is the NCAA rushing record? Can Ashton Je...,"Kassim, Ehsan","Kassim, Ehsan.",None available.,http://ezproxy.newcastle.edu.au/login?url=http...,https://newcastle.primo.exlibrisgroup.com/disc...,USA Today (Online); Arlington,"Dec 31, 2024",2024,...,"USA Today, a division of Gannett Satellite Inf...",Arlington,21651779.0,News,3150269789,U.S. Newsstream Collection,National Collegiate Athletic Association--NCAA...,What is the NCAA rushing record? Can Ashton Je...,General Interest Periodicals--United States,Ashton Jeanty came up just short in his quest ...
2,2024 1.txt,RED STORM NEED A SIGNATURE WIN: BEATING BLUEJA...,"Rubin, Roger","Rubin, Roger.",None available.,http://ezproxy.newcastle.edu.au/login?url=http...,https://newcastle.primo.exlibrisgroup.com/disc...,"Newsday, Combined editions; Long Island, N.Y.","Dec 31, 2024",2024,...,Newsday LLC,"Long Island, N.Y.",,News,3150216422,U.S. Newsstream Collection,National Collegiate Athletic Association--NCAA...,RED STORM NEED A SIGNATURE WIN: Beating Blue...,General Interest Periodicals--United States,"Roger Rubin\nroger.rubin@newsday.com OMAHA, Ne..."
3,2024 1.txt,WIN trumps all: Boise State's Jeanty focused o...,,,None available.,http://ezproxy.newcastle.edu.au/login?url=http...,https://newcastle.primo.exlibrisgroup.com/disc...,"Arizona Republic; Phoenix, Ariz.","Dec 31, 2024",2024,...,Gannett Media Corp,"Phoenix, Ariz.",8928711.0,News,3150206898,U.S. Newsstream Collection,National Collegiate Athletic Association--NCAA...,WIN trumps all: Boise State's Jeanty focused...,General Interest Periodicals--United States,"Nobody could ever really stop Barry Sanders, b..."
4,2024 1.txt,SPORTS; Will this Duck be like Bo or Deion? Or...,"De Leon, Anthony","De Leon, Anthony.",None available.,http://ezproxy.newcastle.edu.au/login?url=http...,https://newcastle.primo.exlibrisgroup.com/disc...,"Los Angeles Times; Los Angeles, Calif.","Dec 31, 2024",2024,...,Los Angeles Times Communications LLC,"Los Angeles, Calif.",4583035.0,News,3150203937,U.S. Newsstream Collection,National Collegiate Athletic Association--NCAA...,SPORTS; Will this Duck be like Bo or Deion? Or...,General Interest Periodicals--United States,Oregon's Bryce Boettcher has the potential to ...


In [8]:
import pandas as pd
import glob
import os

def combine_essential_data(input_folder="output_by_year", output_file="essential_newspaper_data.csv"):
    """
    Combine all year-based CSV files and keep only essential columns:
    - Year of publication
    - Date of publication
    - Title
    - Main text (full_text)
    - Name of publication (publication_title)
    """
    
    print("Combining CSV files with essential data only...")
    
    # Get all CSV files (excluding summary file)
    csv_pattern = os.path.join(input_folder, "newspaper_data_*.csv")
    csv_files = glob.glob(csv_pattern)
    
    if not csv_files:
        print(f"No CSV files found matching pattern: {csv_pattern}")
        return None
    
    print(f"Found {len(csv_files)} CSV files to combine...")
    
    combined_data = []
    
    for csv_file in sorted(csv_files):  # Sort to process years in order
        print(f"Processing {os.path.basename(csv_file)}...")
        
        try:
            # Read the CSV file
            df = pd.read_csv(csv_file)
            
            # Extract year from filename as backup
            filename = os.path.basename(csv_file)
            year_from_filename = filename.replace("newspaper_data_", "").replace(".csv", "")
            
            # Create essential data dictionary
            essential_cols = {
                'year': 'publication_year',
                'date': 'publication_date',
                'title': 'title', 
                'main_text': 'full_text',
                'publication_name': 'publication_title'
            }
            
            # Check which columns exist in the dataframe
            available_data = {}
            for new_name, orig_name in essential_cols.items():
                if orig_name in df.columns:
                    available_data[new_name] = df[orig_name]
                else:
                    print(f"  Warning: Column '{orig_name}' not found in {filename}")
                    available_data[new_name] = ''  # Empty string if column missing
            
            # Use year from filename if publication_year is missing/empty
            if 'year' in available_data:
                # Fill missing years with year from filename
                available_data['year'] = available_data['year'].fillna(year_from_filename)
                available_data['year'] = available_data['year'].replace('', year_from_filename)
            
            # Create DataFrame with essential columns
            essential_df = pd.DataFrame(available_data)
            
            # Add to combined data
            combined_data.append(essential_df)
            
            print(f"  Added {len(essential_df)} articles from {year_from_filename}")
            
        except Exception as e:
            print(f"Error processing {csv_file}: {e}")
            continue
    
    if not combined_data:
        print("No data was successfully processed.")
        return None
    
    # Combine all dataframes
    print("\nCombining all data...")
    final_df = pd.concat(combined_data, ignore_index=True)
    
    # Clean up the data
    print("Cleaning data...")
    
    # Remove rows where all essential fields are empty
    essential_fields = ['title', 'main_text', 'publication_name']
    mask = final_df[essential_fields].notna().any(axis=1)
    final_df = final_df[mask]
    
    # Convert year to string and clean it
    final_df['year'] = final_df['year'].astype(str).str.strip()
    
    # Remove extra whitespace from text fields
    text_columns = ['title', 'main_text', 'publication_name']
    for col in text_columns:
        if col in final_df.columns:
            final_df[col] = final_df[col].astype(str).str.strip()
    
    # Sort by year and title
    final_df = final_df.sort_values(['year', 'title'], na_position='last')
    
    # Reset index
    final_df = final_df.reset_index(drop=True)
    
    # Save to CSV
    final_df.to_csv(output_file, index=False, encoding='utf-8')
    
    # Print summary
    print(f"\n{'='*50}")
    print("ESSENTIAL DATA SUMMARY")
    print(f"{'='*50}")
    print(f"Total articles: {len(final_df)}")
    print(f"Years covered: {final_df['year'].nunique()} unique years")
    print(f"Year range: {final_df['year'].min()} to {final_df['year'].max()}")
    print(f"Unique publications: {final_df['publication_name'].nunique()}")
    
    # Show articles per year
    print(f"\nArticles per year:")
    year_counts = final_df['year'].value_counts().sort_index()
    for year, count in year_counts.items():
        print(f"  {year}: {count} articles")
    
    # Show top publications
    print(f"\nTop 10 publications:")
    pub_counts = final_df['publication_name'].value_counts().head(10)
    for pub, count in pub_counts.items():
        print(f"  {pub}: {count} articles")
    
    # Show data quality stats
    print(f"\nData quality:")
    print(f"  Articles with titles: {final_df['title'].notna().sum()}")
    print(f"  Articles with main text: {final_df['main_text'].notna().sum()}")
    print(f"  Articles with publication name: {final_df['publication_name'].notna().sum()}")
    
    # Show file size
    file_size_mb = os.path.getsize(output_file) / (1024**2)
    print(f"\nOutput file: {output_file}")
    print(f"File size: {file_size_mb:.2f} MB")
    
    # Show sample data
    print(f"\nSample data (first 3 rows):")
    sample_df = final_df[['year', 'title', 'publication_name']].head(3)
    print(sample_df.to_string(index=False))
    
    return final_df

def create_even_more_compact_version(df, output_file="compact_newspaper_data.csv"):
    """Create an even more compact version by removing very long texts"""
    
    print(f"\nCreating compact version...")
    
    # Calculate text lengths
    df['title_length'] = df['title'].astype(str).str.len()
    df['text_length'] = df['main_text'].astype(str).str.len()
    
    print(f"Text length statistics:")
    print(f"  Average title length: {df['title_length'].mean():.0f} characters")
    print(f"  Average main text length: {df['text_length'].mean():.0f} characters")
    print(f"  Max title length: {df['title_length'].max()}")
    print(f"  Max main text length: {df['text_length'].max()}")
    
    # Create compact version - you can adjust these limits
    MAX_TITLE_LENGTH = 200  # Characters
    MAX_TEXT_LENGTH = 5000  # Characters (about 1000 words)
    
    compact_df = df.copy()
    
    # Truncate very long titles
    long_titles = compact_df['title_length'] > MAX_TITLE_LENGTH
    if long_titles.sum() > 0:
        print(f"  Truncating {long_titles.sum()} very long titles")
        compact_df.loc[long_titles, 'title'] = compact_df.loc[long_titles, 'title'].str[:MAX_TITLE_LENGTH] + '...'
    
    # Truncate very long texts
    long_texts = compact_df['text_length'] > MAX_TEXT_LENGTH
    if long_texts.sum() > 0:
        print(f"  Truncating {long_texts.sum()} very long texts")
        compact_df.loc[long_texts, 'main_text'] = compact_df.loc[long_texts, 'main_text'].str[:MAX_TEXT_LENGTH] + '...'
    
    # Remove length columns
    compact_df = compact_df.drop(['title_length', 'text_length'], axis=1)
    
    # Save compact version
    compact_df.to_csv(output_file, index=False, encoding='utf-8')
    
    # Compare file sizes
    original_size = os.path.getsize("essential_newspaper_data.csv") / (1024**2)
    compact_size = os.path.getsize(output_file) / (1024**2)
    
    print(f"Original file: {original_size:.2f} MB")
    print(f"Compact file: {compact_size:.2f} MB")
    print(f"Size reduction: {100*(1-compact_size/original_size):.1f}%")
    
    return compact_df

# Main execution
if __name__ == "__main__":
    # Combine essential data
    df = combine_essential_data(
        input_folder="output_by_year", 
        output_file="essential_newspaper_data.csv"
    )
    
    if df is not None:
        # Optional: Create compact version if file is still too large
        file_size_mb = os.path.getsize("essential_newspaper_data.csv") / (1024**2)
        if file_size_mb > 100:  # If larger than 100MB
            print(f"\nFile is {file_size_mb:.2f} MB - creating compact version...")
            compact_df = create_even_more_compact_version(df, "compact_newspaper_data.csv")
        else:
            print(f"\nFile size ({file_size_mb:.2f} MB) is manageable.")
        
        print(f"\nDone! Your essential data is ready for analysis.")

Combining CSV files with essential data only...
Found 24 CSV files to combine...
Processing newspaper_data_2002.csv...
  Added 10159 articles from 2002
Processing newspaper_data_2003.csv...
  Added 11340 articles from 2003
Processing newspaper_data_2004.csv...
  Added 10850 articles from 2004
Processing newspaper_data_2005.csv...
  Added 10572 articles from 2005
Processing newspaper_data_2006.csv...
  Added 9805 articles from 2006
Processing newspaper_data_2007.csv...
  Added 8679 articles from 2007
Processing newspaper_data_2008.csv...
  Added 8490 articles from 2008
Processing newspaper_data_2009.csv...
  Added 8023 articles from 2009
Processing newspaper_data_2010.csv...
  Added 7978 articles from 2010
Processing newspaper_data_2011.csv...
  Added 7454 articles from 2011
Processing newspaper_data_2012.csv...
  Added 7717 articles from 2012
Processing newspaper_data_2013.csv...
  Added 6544 articles from 2013
Processing newspaper_data_2014.csv...
  Added 6032 articles from 2014
Proce

In [9]:
import os
import glob

# Check current directory and look for CSV files
print("Current directory:", os.getcwd())
print("\nLooking for CSV files...")

# Check common locations
locations_to_check = [
    ".",  # Current directory
    "output_by_year",
    "data",
    "../data"
]

for location in locations_to_check:
    if os.path.exists(location):
        files = os.listdir(location)
        csv_files = [f for f in files if f.endswith('.csv')]
        print(f"\nIn '{location}':")
        print(f"  All files: {files[:10]}...")  # Show first 10 files
        print(f"  CSV files: {csv_files}")
    else:
        print(f"\n'{location}' does not exist")

Current directory: c:\Users\rafsu\OneDrive - The University Of Newcastle\NCAA_Alex\complete_data_raw\codes

Looking for CSV files...

In '.':
  All files: ['cleaning_report.txt', 'compact_newspaper_data.csv', 'data_cleaning.ipynb', 'data_loading.ipynb', 'essential_newspaper_data.csv', 'newspaper_data.csv', 'output_by_year', 'text_normalizer.py']...
  CSV files: ['compact_newspaper_data.csv', 'essential_newspaper_data.csv', 'newspaper_data.csv']

In 'output_by_year':
  All files: ['newspaper_data_2002.csv', 'newspaper_data_2003.csv', 'newspaper_data_2004.csv', 'newspaper_data_2005.csv', 'newspaper_data_2006.csv', 'newspaper_data_2007.csv', 'newspaper_data_2008.csv', 'newspaper_data_2009.csv', 'newspaper_data_2010.csv', 'newspaper_data_2011.csv']...
  CSV files: ['newspaper_data_2002.csv', 'newspaper_data_2003.csv', 'newspaper_data_2004.csv', 'newspaper_data_2005.csv', 'newspaper_data_2006.csv', 'newspaper_data_2007.csv', 'newspaper_data_2008.csv', 'newspaper_data_2009.csv', 'newspaper_d

In [10]:
import os

# Check what's in the output_by_year folder
folder_path = "output_by_year"
if os.path.exists(folder_path):
    files = os.listdir(folder_path)
    print(f"Files in {folder_path}:")
    for file in files:
        print(f"  {file}")
        
    csv_files = [f for f in files if f.endswith('.csv')]
    print(f"\nCSV files found: {len(csv_files)}")
    for csv_file in csv_files:
        print(f"  {csv_file}")
else:
    print(f"Folder {folder_path} does not exist")
    

Files in output_by_year:
  newspaper_data_2002.csv
  newspaper_data_2003.csv
  newspaper_data_2004.csv
  newspaper_data_2005.csv
  newspaper_data_2006.csv
  newspaper_data_2007.csv
  newspaper_data_2008.csv
  newspaper_data_2009.csv
  newspaper_data_2010.csv
  newspaper_data_2011.csv
  newspaper_data_2012.csv
  newspaper_data_2013.csv
  newspaper_data_2014.csv
  newspaper_data_2015.csv
  newspaper_data_2016.csv
  newspaper_data_2017.csv
  newspaper_data_2018.csv
  newspaper_data_2019.csv
  newspaper_data_2020.csv
  newspaper_data_2021.csv
  newspaper_data_2022.csv
  newspaper_data_2023.csv
  newspaper_data_2024.csv
  newspaper_data_2025.csv
  yearly_summary.csv

CSV files found: 25
  newspaper_data_2002.csv
  newspaper_data_2003.csv
  newspaper_data_2004.csv
  newspaper_data_2005.csv
  newspaper_data_2006.csv
  newspaper_data_2007.csv
  newspaper_data_2008.csv
  newspaper_data_2009.csv
  newspaper_data_2010.csv
  newspaper_data_2011.csv
  newspaper_data_2012.csv
  newspaper_data_2013.c

In [11]:
import pandas as pd
import glob
import os

def combine_essential_data(input_folder="output_by_year", output_file="essential_newspaper_data.csv"):
    """
    Combine all year-based CSV files and keep only essential columns:
    - Year of publication
    - Date of publication
    - Title
    - Main text (full_text)
    - Name of publication (publication_title)
    """
    
    print("Combining CSV files with essential data only...")
    
    # Check if folder exists
    if not os.path.exists(input_folder):
        print(f"Folder '{input_folder}' does not exist!")
        print("Please provide the correct folder path where your year-based CSV files are located.")
        return None
    
    # List all files in the folder to debug
    all_files = os.listdir(input_folder)
    print(f"Files in '{input_folder}': {all_files}")
    
    # Try different patterns to find CSV files
    patterns = [
        "newspaper_data_*.csv",  # Original pattern
        "*_*.csv",               # Any CSV with underscore
        "*.csv"                  # Any CSV file
    ]
    
    csv_files = []
    for pattern in patterns:
        csv_pattern = os.path.join(input_folder, pattern)
        found_files = glob.glob(csv_pattern)
        if found_files:
            print(f"Found {len(found_files)} files matching pattern '{pattern}'")
            csv_files = found_files
            break
    
    # Exclude summary file if it exists
    csv_files = [f for f in csv_files if 'summary' not in os.path.basename(f).lower()]
    
    if not csv_files:
        print(f"No CSV files found in '{input_folder}'")
        print("Please check:")
        print("1. The folder path is correct")
        print("2. The CSV files are in this folder")
        print("3. The files have .csv extension")
        return None
    
    print(f"Found {len(csv_files)} CSV files to combine...")
    
    combined_data = []
    
    for csv_file in sorted(csv_files):  # Sort to process years in order
        print(f"Processing {os.path.basename(csv_file)}...")
        
        try:
            # Read the CSV file
            df = pd.read_csv(csv_file)
            
            # Extract year from filename as backup
            filename = os.path.basename(csv_file)
            year_from_filename = filename.replace("newspaper_data_", "").replace(".csv", "")
            
            # Create essential data dictionary
            essential_cols = {
                'year': 'publication_year',
                'date': 'publication_date',
                'title': 'title', 
                'main_text': 'full_text',
                'publication_name': 'publication_title'
            }
            
            # Check which columns exist in the dataframe
            available_data = {}
            for new_name, orig_name in essential_cols.items():
                if orig_name in df.columns:
                    available_data[new_name] = df[orig_name]
                else:
                    print(f"  Warning: Column '{orig_name}' not found in {filename}")
                    available_data[new_name] = ''  # Empty string if column missing
            
            # Use year from filename if publication_year is missing/empty
            if 'year' in available_data:
                # Fill missing years with year from filename
                available_data['year'] = available_data['year'].fillna(year_from_filename)
                available_data['year'] = available_data['year'].replace('', year_from_filename)
            
            # Create DataFrame with essential columns
            essential_df = pd.DataFrame(available_data)
            
            # Add to combined data
            combined_data.append(essential_df)
            
            print(f"  Added {len(essential_df)} articles from {year_from_filename}")
            
        except Exception as e:
            print(f"Error processing {csv_file}: {e}")
            continue
    
    if not combined_data:
        print("No data was successfully processed.")
        return None
    
    # Combine all dataframes
    print("\nCombining all data...")
    final_df = pd.concat(combined_data, ignore_index=True)
    
    # Clean up the data
    print("Cleaning data...")
    
    # Remove rows where all essential fields are empty
    essential_fields = ['title', 'main_text', 'publication_name']
    mask = final_df[essential_fields].notna().any(axis=1)
    final_df = final_df[mask]
    
    # Convert year to string and clean it
    final_df['year'] = final_df['year'].astype(str).str.strip()
    
    # Remove extra whitespace from text fields
    text_columns = ['title', 'main_text', 'publication_name']
    for col in text_columns:
        if col in final_df.columns:
            final_df[col] = final_df[col].astype(str).str.strip()
    
    # Sort by year and title
    final_df = final_df.sort_values(['year', 'title'], na_position='last')
    
    # Reset index
    final_df = final_df.reset_index(drop=True)
    
    # Save to CSV
    final_df.to_csv(output_file, index=False, encoding='utf-8')
    
    # Print summary
    print(f"\n{'='*50}")
    print("ESSENTIAL DATA SUMMARY")
    print(f"{'='*50}")
    print(f"Total articles: {len(final_df)}")
    print(f"Years covered: {final_df['year'].nunique()} unique years")
    print(f"Year range: {final_df['year'].min()} to {final_df['year'].max()}")
    print(f"Unique publications: {final_df['publication_name'].nunique()}")
    
    # Show articles per year
    print(f"\nArticles per year:")
    year_counts = final_df['year'].value_counts().sort_index()
    for year, count in year_counts.items():
        print(f"  {year}: {count} articles")
    
    # Show top publications
    print(f"\nTop 10 publications:")
    pub_counts = final_df['publication_name'].value_counts().head(10)
    for pub, count in pub_counts.items():
        print(f"  {pub}: {count} articles")
    
    # Show data quality stats
    print(f"\nData quality:")
    print(f"  Articles with titles: {final_df['title'].notna().sum()}")
    print(f"  Articles with main text: {final_df['main_text'].notna().sum()}")
    print(f"  Articles with publication name: {final_df['publication_name'].notna().sum()}")
    
    # Show file size
    file_size_mb = os.path.getsize(output_file) / (1024**2)
    print(f"\nOutput file: {output_file}")
    print(f"File size: {file_size_mb:.2f} MB")
    
    # Show sample data
    print(f"\nSample data (first 3 rows):")
    sample_df = final_df[['year', 'title', 'publication_name']].head(3)
    print(sample_df.to_string(index=False))
    
    return final_df

def create_even_more_compact_version(df, output_file="compact_newspaper_data.csv"):
    """Create an even more compact version by removing very long texts"""
    
    print(f"\nCreating compact version...")
    
    # Calculate text lengths
    df['title_length'] = df['title'].astype(str).str.len()
    df['text_length'] = df['main_text'].astype(str).str.len()
    
    print(f"Text length statistics:")
    print(f"  Average title length: {df['title_length'].mean():.0f} characters")
    print(f"  Average main text length: {df['text_length'].mean():.0f} characters")
    print(f"  Max title length: {df['title_length'].max()}")
    print(f"  Max main text length: {df['text_length'].max()}")
    
    # Create compact version - you can adjust these limits
    MAX_TITLE_LENGTH = 200  # Characters
    MAX_TEXT_LENGTH = 5000  # Characters (about 1000 words)
    
    compact_df = df.copy()
    
    # Truncate very long titles
    long_titles = compact_df['title_length'] > MAX_TITLE_LENGTH
    if long_titles.sum() > 0:
        print(f"  Truncating {long_titles.sum()} very long titles")
        compact_df.loc[long_titles, 'title'] = compact_df.loc[long_titles, 'title'].str[:MAX_TITLE_LENGTH] + '...'
    
    # Truncate very long texts
    long_texts = compact_df['text_length'] > MAX_TEXT_LENGTH
    if long_texts.sum() > 0:
        print(f"  Truncating {long_texts.sum()} very long texts")
        compact_df.loc[long_texts, 'main_text'] = compact_df.loc[long_texts, 'main_text'].str[:MAX_TEXT_LENGTH] + '...'
    
    # Remove length columns
    compact_df = compact_df.drop(['title_length', 'text_length'], axis=1)
    
    # Save compact version
    compact_df.to_csv(output_file, index=False, encoding='utf-8')
    
    # Compare file sizes
    original_size = os.path.getsize("essential_newspaper_data.csv") / (1024**2)
    compact_size = os.path.getsize(output_file) / (1024**2)
    
    print(f"Original file: {original_size:.2f} MB")
    print(f"Compact file: {compact_size:.2f} MB")
    print(f"Size reduction: {100*(1-compact_size/original_size):.1f}%")
    
    return compact_df

# Main execution
if __name__ == "__main__":
    # Combine essential data
    df = combine_essential_data(
        input_folder="output_by_year", 
        output_file="essential_newspaper_data.csv"
    )
    
    if df is not None:
        # Optional: Create compact version if file is still too large
        file_size_mb = os.path.getsize("essential_newspaper_data.csv") / (1024**2)
        if file_size_mb > 100:  # If larger than 100MB
            print(f"\nFile is {file_size_mb:.2f} MB - creating compact version...")
            compact_df = create_even_more_compact_version(df, "compact_newspaper_data.csv")
        else:
            print(f"\nFile size ({file_size_mb:.2f} MB) is manageable.")
        
        print(f"\nDone! Your essential data is ready for analysis.")

Combining CSV files with essential data only...
Files in 'output_by_year': ['newspaper_data_2002.csv', 'newspaper_data_2003.csv', 'newspaper_data_2004.csv', 'newspaper_data_2005.csv', 'newspaper_data_2006.csv', 'newspaper_data_2007.csv', 'newspaper_data_2008.csv', 'newspaper_data_2009.csv', 'newspaper_data_2010.csv', 'newspaper_data_2011.csv', 'newspaper_data_2012.csv', 'newspaper_data_2013.csv', 'newspaper_data_2014.csv', 'newspaper_data_2015.csv', 'newspaper_data_2016.csv', 'newspaper_data_2017.csv', 'newspaper_data_2018.csv', 'newspaper_data_2019.csv', 'newspaper_data_2020.csv', 'newspaper_data_2021.csv', 'newspaper_data_2022.csv', 'newspaper_data_2023.csv', 'newspaper_data_2024.csv', 'newspaper_data_2025.csv', 'yearly_summary.csv']
Found 24 files matching pattern 'newspaper_data_*.csv'
Found 24 CSV files to combine...
Processing newspaper_data_2002.csv...
  Added 10159 articles from 2002
Processing newspaper_data_2003.csv...
  Added 11340 articles from 2003
Processing newspaper_dat

In [12]:
# Run the combiner - it should work now
df = combine_essential_data(
    input_folder="output_by_year", 
    output_file="essential_newspaper_data.csv"
)

Combining CSV files with essential data only...
Files in 'output_by_year': ['newspaper_data_2002.csv', 'newspaper_data_2003.csv', 'newspaper_data_2004.csv', 'newspaper_data_2005.csv', 'newspaper_data_2006.csv', 'newspaper_data_2007.csv', 'newspaper_data_2008.csv', 'newspaper_data_2009.csv', 'newspaper_data_2010.csv', 'newspaper_data_2011.csv', 'newspaper_data_2012.csv', 'newspaper_data_2013.csv', 'newspaper_data_2014.csv', 'newspaper_data_2015.csv', 'newspaper_data_2016.csv', 'newspaper_data_2017.csv', 'newspaper_data_2018.csv', 'newspaper_data_2019.csv', 'newspaper_data_2020.csv', 'newspaper_data_2021.csv', 'newspaper_data_2022.csv', 'newspaper_data_2023.csv', 'newspaper_data_2024.csv', 'newspaper_data_2025.csv', 'yearly_summary.csv']
Found 24 files matching pattern 'newspaper_data_*.csv'
Found 24 CSV files to combine...
Processing newspaper_data_2002.csv...
  Added 10159 articles from 2002
Processing newspaper_data_2003.csv...
  Added 11340 articles from 2003
Processing newspaper_dat

##### **© 2024–2025 MD Rafsun Sheikh**
##### **Licensed under the Apache License, Version 2.0.**