In [2]:
import feedparser
import datetime
import csv
from typing import List, Dict, Optional
import requests
from requests.exceptions import RequestException
import socket
import time

class PRWireParser:
    """
    Download a PR-wire RSS feed and flatten its entries into plain dictionaries.

    Typical use: construct with a feed URL, then call ``get_entries()``
    (which fetches on demand) or call ``fetch_feed()`` explicitly first.
    """

    def __init__(self, feed_url: str, timeout: int = 10):
        """
        Initialize the PR Wire parser with a feed URL.

        Args:
            feed_url (str): The URL of the RSS feed to parse
            timeout (int): Timeout in seconds for feed requests
        """
        self.feed_url = feed_url
        self.timeout = timeout
        self.feed_data = None
        # The timeout is passed to requests.get() in fetch_feed(). No
        # process-wide socket default is set here: that would silently change
        # the timeout of every other socket opened in the same kernel.

    @staticmethod
    def _lookup(obj, key: str, default=None):
        """
        Read ``key`` from ``obj`` whether it is a mapping or an attribute object.

        Objects exposing a callable ``get`` are queried with
        ``obj.get(key, default)``; anything else falls back to
        ``getattr(obj, key, default)``.
        """
        getter = getattr(obj, 'get', None)
        if callable(getter):
            return getter(key, default)
        return getattr(obj, key, default)

    def fetch_feed(self) -> bool:
        """
        Fetch and parse the RSS feed with timeout handling.

        The raw bytes are downloaded with ``requests`` (so the per-request
        timeout applies) and then handed to ``feedparser``.

        Returns:
            bool: True if the feed was fetched and contains at least one
            entry, False on any request/parsing error or an empty feed
        """
        start_time = time.time()

        try:
            # First get the raw feed content with timeout
            response = requests.get(self.feed_url, timeout=self.timeout)
            response.raise_for_status()

            # Parse the feed content
            self.feed_data = feedparser.parse(response.content)

            # Check if parsing was successful
            if not hasattr(self.feed_data, 'entries'):
                print(f"No entries found in feed: {self.feed_url}")
                return False

            print(f"Feed fetched in {time.time() - start_time:.2f} seconds")
            return len(self.feed_data.entries) > 0

        except RequestException as e:
            print(f"Request error fetching feed {self.feed_url}: {str(e)}")
            return False
        except Exception as e:
            print(f"Error parsing feed {self.feed_url}: {str(e)}")
            return False

    def get_entries(self, limit: Optional[int] = None) -> List[Dict]:
        """
        Get parsed entries from the feed.

        Entries without a title or link are skipped and do not count towards
        ``limit``. An entry that cannot be processed is reported and skipped
        without discarding the entries that follow it.

        Args:
            limit (Optional[int]): Maximum number of entries to return.
                ``None`` means no limit; zero or a negative value yields an
                empty list.

        Returns:
            List[Dict]: Parsed entries with the keys ``title``, ``link``,
            ``published``, ``summary``, ``company`` and ``categories``
        """
        if limit is not None and limit <= 0:
            return []

        if not self.feed_data:
            if not self.fetch_feed():
                return []

        entries = []
        for entry in getattr(self.feed_data, 'entries', None) or []:
            try:
                parsed_entry = self._parse_entry(entry)
            except Exception as e:
                # Best effort: report the bad entry and keep going
                print(f"Error processing entries: {str(e)}")
                continue

            if parsed_entry is None:
                continue
            entries.append(parsed_entry)

            # Stop as soon as enough valid entries have been collected
            if limit is not None and len(entries) >= limit:
                break

        return entries

    def _parse_entry(self, entry) -> Optional[Dict]:
        """
        Convert one feed entry into a flat dictionary.

        Args:
            entry: Feed entry supporting ``.get(key, default)``

        Returns:
            Optional[Dict]: The flattened entry, or None when the entry has
            no title or no link
        """
        title = entry.get('title')
        link = entry.get('link')
        if not title or not link:
            return None

        # A tag may carry no term; drop those so the join cannot fail
        terms = [self._lookup(tag, 'term') for tag in (entry.get('tags') or [])]

        return {
            'title': title,
            'link': link,
            'published': entry.get('published') or '',
            'summary': (entry.get('summary') or '')[:500],  # Limit summary length
            'company': self._extract_company(entry),
            'categories': ','.join(str(term) for term in terms if term),
        }

    def _extract_company(self, entry: Dict) -> str:
        """
        Extract a company name from entry metadata.

        Candidates are tried in order: the title of the entry's ``source``,
        the entry's ``author``, and finally the text before the first
        ``' - '`` on the first line of the first ``content`` item. Empty
        candidates are skipped so a later one can still match.

        Args:
            entry (Dict): Feed entry (mapping or attribute-style object)

        Returns:
            str: Extracted company name or empty string
        """
        source = self._lookup(entry, 'source')
        if source:
            source_title = self._lookup(source, 'title')
            if isinstance(source_title, str) and source_title:
                return source_title

        author = self._lookup(entry, 'author')
        if isinstance(author, str) and author.strip():
            return author

        # Only check content if other methods fail
        content = self._lookup(entry, 'content')
        if content:
            try:
                body = self._lookup(content[0], 'value') or ''
                first_line = body.split('\n')[0]
            except (IndexError, KeyError, TypeError, AttributeError):
                first_line = ''
            if ' - ' in first_line:
                return first_line.split(' - ')[0].strip()

        return ''

def run2(feed_urls: Optional[List[str]] = None) -> None:
    """
    Fetch the latest entry of each feed and save it to its own CSV file.

    For every URL a ``PRWireParser`` is created with a 10 second timeout,
    the first valid entry is retrieved, and it is written to
    ``pr_wire_news_<YYYYmmdd_HHMMSS>.csv`` in the current directory. If that
    name is already taken (for example two feeds processed within the same
    second), a numeric suffix is appended instead of overwriting the file.

    Args:
        feed_urls (Optional[List[str]]): Feed URLs to process. Defaults to
            the PR Newswire "news releases" feed when omitted.
    """
    if feed_urls is None:
        feed_urls = [
            'https://www.prnewswire.com/rss/news-releases-list.rss'
        ]

    for url in feed_urls:
        start_time = time.time()
        parser = PRWireParser(url, timeout=10)  # 10 second timeout

        if not parser.fetch_feed():
            print(f"Failed to fetch feed from {url}")
            continue

        # Get latest entry
        entries = parser.get_entries(limit=1)
        if not entries:
            print(f"No valid entries found for {url}")
            continue

        # Save to CSV with timestamp
        timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
        filename = f'pr_wire_news_{timestamp}.csv'

        # Mode 'x' fails if the file exists, so an earlier export is never
        # clobbered; retry with a numeric suffix until a free name is found.
        attempt = 0
        while True:
            try:
                csvfile = open(filename, 'x', newline='', encoding='utf-8')
                break
            except FileExistsError:
                attempt += 1
                filename = f'pr_wire_news_{timestamp}_{attempt}.csv'

        with csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=list(entries[0].keys()))
            writer.writeheader()
            writer.writerows(entries)

        print(f"Saved {len(entries)} entries from {url} to {filename}")
        print(f"Total processing time: {time.time() - start_time:.2f} seconds")


def main(feed_urls: Optional[List[str]] = None) -> Dict[str, Dict]:
    """
    Fetch the latest entry of each feed and collect them in a dictionary.

    For every URL a ``PRWireParser`` is created with a 10 second timeout and
    its first valid entry is stored under a ``YYYYmmdd_HHMMSS`` timestamp
    key. When two entries are stored within the same second, later keys get
    a numeric suffix (``_1``, ``_2``, ...) so that no entry is overwritten.

    Args:
        feed_urls (Optional[List[str]]): Feed URLs to process. Defaults to
            the PR Newswire "news releases" feed when omitted.

    Returns:
        Dict[str, Dict]: Mapping of timestamp key to the parsed entry; feeds
        that fail or yield no valid entry contribute nothing
    """
    if feed_urls is None:
        feed_urls = [
            'https://www.prnewswire.com/rss/news-releases-list.rss'
        ]

    # Dictionary to store results
    results = {}

    for url in feed_urls:
        start_time = time.time()
        parser = PRWireParser(url, timeout=10)  # 10 second timeout

        if not parser.fetch_feed():
            print(f"Failed to fetch feed from {url}")
            continue

        # Get latest entry
        entries = parser.get_entries(limit=1)
        if not entries:
            print(f"No valid entries found for {url}")
            continue

        # Store the entry under a timestamp key, disambiguating collisions
        timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
        key = timestamp
        duplicate = 1
        while key in results:
            key = f'{timestamp}_{duplicate}'
            duplicate += 1
        results[key] = entries[0]  # Store the single entry

        print(f"Stored entry from {url} in results dictionary")
        print(f"Total processing time: {time.time() - start_time:.2f} seconds")

    return results  # Return the dictionary containing all entries

In [18]:
# Example record mixing several built-in value types:
# str, int, bool, list, nested dict, tuple and float
sample_dict = dict(
    name='John Smith',
    age=30,
    is_student=False,
    grades=[85, 92, 78, 90],
    address=dict(
        street='123 Main St',
        city='Boston',
        state='MA',
        zip='02108',
    ),
    courses=('Math', 'History', 'Science'),
    gpa=3.75,
)

print(sample_dict)

{'name': 'John Smith', 'age': 30, 'is_student': False, 'grades': [85, 92, 78, 90], 'address': {'street': '123 Main St', 'city': 'Boston', 'state': 'MA', 'zip': '02108'}, 'courses': ('Math', 'History', 'Science'), 'gpa': 3.75}


In [4]:
# Run main() (fetches the feed over the network) and keep the dictionary it returns
results = main()

Feed fetched in 0.16 seconds
Stored entry from https://www.prnewswire.com/rss/news-releases-list.rss in results dictionary
Total processing time: 0.16 seconds


In [5]:
# Bare last expression: the notebook renders the dictionary returned by main()
results

{'20250215_160053': {'title': 'DEAN TUCCI FILES OPPOSITION MOTION AGAINST CFPB',
  'link': 'https://www.prnewswire.com/news-releases/dean-tucci-files-opposition-motion-against-cfpb-302377572.html',
  'published': 'Sat, 15 Feb 2025 20:01:00 +0000',
  'summary': '<p>PALATINE, Ill., Feb. 15, 2025 /PRNewswire/ -- In November of 2020, the CFPB filed a lawsuit against FDATR, Inc., Ken Halverson, and Dean Tucci for allegations that the Federal Telemarketing Sales Rule, 16 C.F.R. Part 310 ("TSR") had been violated. Dean Tucci started and owned FDATR, Inc....</p>',
  'company': '',
  'categories': 'LAW'}}