In [5]:
import requests
import pandas as pd
from utils.helper import Proxies

# Resolve a proxy address through the project helper and expose it in the
# {"scheme": address} mapping shape that `requests` accepts for `proxies=`.
# NOTE(review): `annoymous` is spelled as the helper's keyword is called here — verify against utils.helper.
proxies = Proxies(annoymous=False, from_env=True)
proxy_ip = proxies.get_proxy_ip()
proxy = {scheme: proxy_ip for scheme in ("http", "https")}

# Custom request headers / user agent are currently disabled; kept for reference:
# useragent = proxies.get_proxy_useragent()
# headers = {"Connection" : "close",  "User-Agent" : useragent}  
# print("User Agent:", useragent)
# print("Proxy ip:", proxy_ip)


# Google News

In [None]:
import requests
import json
import traceback
import time
import bs4 as bs
import pandas as pd
import urllib.parse
from datetime import datetime, timedelta, date
from IPython.display import display

class GoogleRSSFeedScraper():
    """Scrape Google News RSS search results over a date range, one window at a time.

    The requested date range is split into consecutive windows of `scrape_period`
    days. Each window becomes one RSS search request; the items of all windows are
    merged, de-duplicated and sorted by publication date. Progress is mirrored in
    `self.state` so that an aborted run can be inspected and, optionally, persisted
    as JSON.
    """

    def __init__(self, proxy=None, request_timeout=30):
        """
        Args:
            proxy (dict | None): Mapping passed to `requests.get(proxies=...)`.
            request_timeout (float): Seconds to wait for each HTTP request before giving up.
        """
        self.proxy = proxy
        self.request_timeout = request_timeout
        self.state = {}
        self.base_url = "https://news.google.com/rss/search?"

    def scrape(self, search_query, start_time=None, end_time=None, scrape_period=7,
               add_url_parameters=None, save_state=False, save_result=False,
               save_folder="./data", preview_result=True, sleep_interval=5):
        """
        Web scrape Google News RSS feeds.
        - Use date(YYYY, MM, DD) to initiate date arguments. eg: date(2025, 2, 19)
        Args:
            search_query (String): Search query
            start_time (Date): First day to scrape. Defaults to today (evaluated at call time).
            end_time (Date): Last day to scrape. Defaults to today (evaluated at call time).
            scrape_period (int): no. of days for each scrape instance (inclusive of start and end: must be at least 1)
            add_url_parameters (dict): Additional url parameters
            save_state (boolean): Save state progress in json file. format: f"/scrape_state_{datetime.now().strftime('%Y%m%d%H%M%S')}.json"
            save_result (boolean): Save dataframe as pickle file. format: f"/scrape_result_{datetime.now().strftime('%Y%m%d%H%M%S')}.pkl"
            save_folder (String): root folder to save state and result (created if missing)
            preview_result (boolean): Show pandas dataframe after processing
            sleep_interval (int): Print progress and sleep for 3 seconds every X iterations. 0/None disables the pause.
        Returns:
            pandas.DataFrame: All records collected so far (partial if the run was aborted by an error).
        Raises:
            ValueError: If end_time is earlier than start_time or scrape_period is below 1.
        """
        # Resolve defaults at call time: a default of `datetime.now().date()` in the
        # signature would be frozen at the moment the class was defined.
        if start_time is None:
            start_time = datetime.now().date()
        if end_time is None:
            end_time = datetime.now().date()
        if add_url_parameters is None:
            add_url_parameters = {}

        if end_time < start_time:
            raise ValueError("End time must not be earlier than start time!")
        elif scrape_period < 1:
            raise ValueError("Scrape period must be at least 1!")

        # Save state
        self.state = {
            "parameters": {
                "search_query": search_query,
                "start_time": start_time,
                "end_time": end_time,
                "scrape_period": scrape_period,
                "add_url_parameters": add_url_parameters
            },
            "iter_no": 0
        }
        to_time = None
        df = pd.DataFrame()
        # Both window bounds are sent to the search, so a window of N days spans N - 1 day steps.
        window_span = timedelta(days=scrape_period - 1)
        window_step = timedelta(days=scrape_period)
        try:
            while start_time <= end_time:
                to_time = min(start_time + window_span, end_time)
                # Format and encode URL
                # NOTE(review): whether Google treats `before:` as inclusive is not established here — confirm.
                url_parameters = {"q": f"{search_query} after:{start_time.strftime('%Y-%m-%d')} before:{to_time.strftime('%Y-%m-%d')}", **add_url_parameters}
                url = self.base_url + urllib.parse.urlencode(url_parameters)

                self.state["to_time"] = to_time
                self.state["url"] = url

                # GET request and parse
                df_intermediate = self.fetch_and_parse(url, pandas=True)
                if len(df_intermediate) > 0:
                    df = self._merge_results(df, df_intermediate)

                self.state["iter_no"] += 1
                # A falsy sleep_interval disables the pause instead of dividing by zero.
                if sleep_interval and self.state["iter_no"] % sleep_interval == 0:
                    print(f"-- Current state: {start_time}-{to_time}, record_count: {len(df)}, iter: {self.state['iter_no']}")
                    time.sleep(3)
                start_time += window_step
        except Exception as e:
            # Best effort: report where the run stopped and keep whatever was collected.
            print(f"########## Error: {start_time}-{to_time} {end_time} #########")
            print(traceback.format_exc())
        finally:
            self.state["record_count"] = len(df)
            if preview_result:
                print(f"############## Result: {len(df)} records ##############")
                display(df.tail())
            self._save_outputs(df, save_state, save_result, save_folder)
        return df

    @staticmethod
    def _merge_results(df, df_intermediate):
        """Append one window's records, then de-duplicate and re-sort by publication date."""
        # Skip the empty accumulator on the first merge so pandas is not asked to concat an empty frame.
        merged = df_intermediate if len(df) == 0 else pd.concat([df, df_intermediate])
        return (
            merged
            .drop_duplicates(subset=["pubDate", "title", "source"])
            .sort_values(by='pubDate')
            .reset_index(drop=True)
        )

    def _save_outputs(self, df, save_state, save_result, save_folder):
        """Persist the state JSON and/or the result pickle, creating save_folder if needed."""
        write_result = save_result and len(df) > 0
        if not (save_state or write_result):
            return
        import os  # local import keeps this class self-contained within its notebook cell

        if save_folder:
            os.makedirs(save_folder, exist_ok=True)
        root = save_folder.rstrip("/")
        stamp = datetime.now().strftime('%Y%m%d%H%M%S')
        state_path = root + f"/scrape_state_{stamp}.json"
        result_path = root + f"/scrape_result_{stamp}.pkl"
        # Record the paths before dumping so the state file also names the intended result file.
        if save_state:
            self.state["save_state_path"] = state_path
        if write_result:
            self.state["save_result_path"] = result_path

        if save_state:
            with open(state_path, "w") as f:
                json.dump(self.state, f, indent=4, default=str)
            print("Saved state to:", state_path)
        if write_result:
            df.to_pickle(result_path)
            print("Saved pickle to:", result_path)

    @staticmethod
    def _tag_text(item, name):
        """Return the text of child tag `name`, or None when the item has no such tag."""
        node = item.find(name)
        return node.text if node is not None else None

    def fetch_and_parse(self, url, pandas=True):
        """
        Fetch one RSS url and extract its <item> entries.
        Args:
            url (String): Fully encoded RSS url
            pandas (boolean): Return a DataFrame (pubDate parsed as UTC datetime) instead of a list of dicts
        Returns:
            pandas.DataFrame | list[dict]: One record per item with title, link, pubDate, source, description.
                An empty DataFrame (or empty list) when the feed has no items.
        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        response = requests.get(url, proxies=self.proxy, timeout=self.request_timeout)
        # Surface HTTP errors instead of parsing an error page as a feed with zero items.
        response.raise_for_status()
        soup = bs.BeautifulSoup(response.text, 'xml')

        fields = ("title", "link", "pubDate", "source", "description")
        result = [
            {field: self._tag_text(item, field) for field in fields}
            for item in soup.find_all('item')
        ]
        if not pandas:
            return result
        if len(result) == 0:
            return pd.DataFrame()
        df = pd.DataFrame(result)
        df['pubDate'] = pd.to_datetime(df['pubDate'], format='%a, %d %b %Y %H:%M:%S %Z', utc=True)
        return df

In [7]:
# Scraper instance shared by the cells below. No proxy argument is passed, so the
# constructor's default (proxy=None) is used for every request.
# NOTE(review): the `proxy` dict built in the first cell is not passed here — confirm that is intended.
rss_feed_scraper = GoogleRSSFeedScraper()

In [74]:
# Run one "<bank> litigation" Google News scrape per bank, starting 2000-01-01 and
# ending at the scraper's default end_time, in 30-day windows. State and results
# are written to the scraper's default save folder for each bank.
banks_arr = [
    "UBS",
    "Citibank",
    "HSBC",
    "JPMorgan",
    "Goldman Sachs",
    "Credit Suisse",
    "Deutsche Bank",
    "Barclays",
    "Bank of America",
]
for each_bank in banks_arr:
    print(f"-------------- {each_bank}  --------------")
    rss_feed_scraper.scrape(
        f"{each_bank} litigation",
        start_time=date(2000, 1, 1),
        scrape_period=30,
        save_state=True,
        save_result=True,
        preview_result=True,
    )

-------------- UBS  --------------
-- Current state: 2000-05-30-2000-05-29, record_count: 0, iter: 5
-- Current state: 2000-10-27-2000-10-26, record_count: 1, iter: 10
-- Current state: 2001-03-26-2001-03-25, record_count: 1, iter: 15
-- Current state: 2001-08-23-2001-08-22, record_count: 1, iter: 20
-- Current state: 2002-01-20-2002-01-19, record_count: 1, iter: 25
-- Current state: 2002-06-19-2002-06-18, record_count: 1, iter: 30
-- Current state: 2002-11-16-2002-11-15, record_count: 2, iter: 35
-- Current state: 2003-04-15-2003-04-14, record_count: 2, iter: 40
-- Current state: 2003-09-12-2003-09-11, record_count: 2, iter: 45
-- Current state: 2004-02-09-2004-02-08, record_count: 3, iter: 50
-- Current state: 2004-07-08-2004-07-07, record_count: 3, iter: 55
-- Current state: 2004-12-05-2004-12-04, record_count: 4, iter: 60
-- Current state: 2005-05-04-2005-05-03, record_count: 4, iter: 65
-- Current state: 2005-10-01-2005-09-30, record_count: 4, iter: 70
-- Current state: 2006-02-28

Unnamed: 0,title,link,pubDate,source,description
617,Switzerland's Financial Sector Faces Major Scr...,https://news.google.com/rss/articles/CBMiqwFBV...,2025-02-06 17:25:51+00:00,Evrim Ağacı,"<a href=""https://news.google.com/rss/articles/..."
618,International Investors File Claims Against Sw...,https://news.google.com/rss/articles/CBMitAFBV...,2025-02-07 08:00:00+00:00,Evrim Ağacı,"<a href=""https://news.google.com/rss/articles/..."
619,UBS whistleblower verdict thrown out despite U...,https://news.google.com/rss/articles/CBMiugFBV...,2025-02-10 08:00:00+00:00,Reuters,"<a href=""https://news.google.com/rss/articles/..."
620,2nd Circ. Backs UBS In Retaliation Case That J...,https://news.google.com/rss/articles/CBMivgFBV...,2025-02-10 20:12:00+00:00,Law360,"<a href=""https://news.google.com/rss/articles/..."
621,Axsome therapeutics CFO sells $1.95 million in...,https://news.google.com/rss/articles/CBMiywFBV...,2025-02-15 03:38:31+00:00,MSN,"<a href=""https://news.google.com/rss/articles/..."


Saved state to: ./data/scrape_state_20250219095309.json
Saved pickle to: ./data/scrape_result_20250219095309.pkl
-------------- Citibank  --------------
-- Current state: 2000-05-30-2000-05-29, record_count: 0, iter: 5
-- Current state: 2000-10-27-2000-10-26, record_count: 0, iter: 10
-- Current state: 2001-03-26-2001-03-25, record_count: 0, iter: 15
-- Current state: 2001-08-23-2001-08-22, record_count: 0, iter: 20
-- Current state: 2002-01-20-2002-01-19, record_count: 0, iter: 25
-- Current state: 2002-06-19-2002-06-18, record_count: 0, iter: 30
-- Current state: 2002-11-16-2002-11-15, record_count: 0, iter: 35
-- Current state: 2003-04-15-2003-04-14, record_count: 0, iter: 40
-- Current state: 2003-09-12-2003-09-11, record_count: 0, iter: 45
-- Current state: 2004-02-09-2004-02-08, record_count: 0, iter: 50
-- Current state: 2004-07-08-2004-07-07, record_count: 0, iter: 55
-- Current state: 2004-12-05-2004-12-04, record_count: 0, iter: 60
-- Current state: 2005-05-04-2005-05-03, rec

Unnamed: 0,title,link,pubDate,source,description
510,DC Law Firms Saw Revenue and PEP Up 9% Last Ye...,https://news.google.com/rss/articles/CBMixgFBV...,2025-02-12 20:09:00+00:00,Law.com,"<a href=""https://news.google.com/rss/articles/..."
511,FEMA funding fracas in NYC - POLITICO,https://news.google.com/rss/articles/CBMiugFBV...,2025-02-12 22:42:25+00:00,POLITICO,"<a href=""https://news.google.com/rss/articles/..."
512,Trump’s Funding Freeze Raises a New Question: ...,https://news.google.com/rss/articles/CBMivAFBV...,2025-02-14 14:48:01+00:00,The New York Times,"<a href=""https://news.google.com/rss/articles/..."
513,Citibank Class Action Settlement Payout 2025: ...,https://news.google.com/rss/articles/CBMid0FVX...,2025-02-15 11:02:11+00:00,Bakhtiyarpur College of ...,"<a href=""https://news.google.com/rss/articles/..."
514,Kossoff Ch. 7 Trustee Can Pursue Clawbacks Aft...,https://news.google.com/rss/articles/CBMipgFBV...,2025-02-18 21:12:00+00:00,Law360,"<a href=""https://news.google.com/rss/articles/..."


Saved state to: ./data/scrape_state_20250219095734.json
Saved pickle to: ./data/scrape_result_20250219095734.pkl
-------------- HSBC  --------------
-- Current state: 2000-05-30-2000-05-29, record_count: 0, iter: 5
-- Current state: 2000-10-27-2000-10-26, record_count: 0, iter: 10
-- Current state: 2001-03-26-2001-03-25, record_count: 0, iter: 15
-- Current state: 2001-08-23-2001-08-22, record_count: 0, iter: 20
-- Current state: 2002-01-20-2002-01-19, record_count: 0, iter: 25
-- Current state: 2002-06-19-2002-06-18, record_count: 0, iter: 30
-- Current state: 2002-11-16-2002-11-15, record_count: 0, iter: 35
-- Current state: 2003-04-15-2003-04-14, record_count: 0, iter: 40
-- Current state: 2003-09-12-2003-09-11, record_count: 0, iter: 45
-- Current state: 2004-02-09-2004-02-08, record_count: 0, iter: 50
-- Current state: 2004-07-08-2004-07-07, record_count: 0, iter: 55
-- Current state: 2004-12-05-2004-12-04, record_count: 0, iter: 60
-- Current state: 2005-05-04-2005-05-03, record_

Unnamed: 0,title,link,pubDate,source,description
586,Hogan Lovells Hires Financial Crime Pro From H...,https://news.google.com/rss/articles/CBMikwFBV...,2025-01-29 08:00:00+00:00,Law360,"<a href=""https://news.google.com/rss/articles/..."
587,"Sun Pharma shares in focus: Jefferies, HSBC bu...",https://news.google.com/rss/articles/CBMi0wFBV...,2025-02-01 01:53:33+00:00,Business Upturn,"<a href=""https://news.google.com/rss/articles/..."
588,Lebanon sues HSBC Switzerland in Riad Salameh ...,https://news.google.com/rss/articles/CBMikAFBV...,2025-02-05 08:00:00+00:00,The New Arab,"<a href=""https://news.google.com/rss/articles/..."
589,"B.C. Court of Appeal overturns lower court, ce...",https://news.google.com/rss/articles/CBMi1wFBV...,2025-02-06 21:56:15+00:00,Law360 Canada,"<a href=""https://news.google.com/rss/articles/..."
590,"HSBC Seeks To Quash Discrimination, Whistleblo...",https://news.google.com/rss/articles/CBMimwFBV...,2025-02-07 17:46:00+00:00,Law360,"<a href=""https://news.google.com/rss/articles/..."


Saved state to: ./data/scrape_state_20250219100200.json
Saved pickle to: ./data/scrape_result_20250219100200.pkl
-------------- JPMorgan  --------------
-- Current state: 2000-05-30-2000-05-29, record_count: 0, iter: 5
-- Current state: 2000-10-27-2000-10-26, record_count: 0, iter: 10
-- Current state: 2001-03-26-2001-03-25, record_count: 0, iter: 15
-- Current state: 2001-08-23-2001-08-22, record_count: 0, iter: 20
-- Current state: 2002-01-20-2002-01-19, record_count: 0, iter: 25
-- Current state: 2002-06-19-2002-06-18, record_count: 0, iter: 30
-- Current state: 2002-11-16-2002-11-15, record_count: 0, iter: 35
-- Current state: 2003-04-15-2003-04-14, record_count: 0, iter: 40
-- Current state: 2003-09-12-2003-09-11, record_count: 0, iter: 45
-- Current state: 2004-02-09-2004-02-08, record_count: 1, iter: 50
-- Current state: 2004-07-08-2004-07-07, record_count: 1, iter: 55
-- Current state: 2004-12-05-2004-12-04, record_count: 1, iter: 60
-- Current state: 2005-05-04-2005-05-03, rec

Unnamed: 0,title,link,pubDate,source,description
1195,Exclusive: JPMorgan CEO Dimon derides in-offic...,https://news.google.com/rss/articles/CBMivwFBV...,2025-02-13 17:42:33+00:00,Reuters.com,"<a href=""https://news.google.com/rss/articles/..."
1196,Bausch Health to Participate in the J.P. Morga...,https://news.google.com/rss/articles/CBMi_AFBV...,2025-02-14 13:12:11+00:00,The Globe and Mail,"<a href=""https://news.google.com/rss/articles/..."
1197,Ex-JPMorgan Atty Pleads Guilty To NYC Housing ...,https://news.google.com/rss/articles/CBMiowFBV...,2025-02-14 20:28:00+00:00,Law360,"<a href=""https://news.google.com/rss/articles/..."
1198,JPMORGAN CHASE & CO SEC 10-K Report - TradingView,https://news.google.com/rss/articles/CBMimwFBV...,2025-02-14 21:28:01+00:00,TradingView,"<a href=""https://news.google.com/rss/articles/..."
1199,The dark art of deregulating - The Banker,https://news.google.com/rss/articles/CBMiekFVX...,2025-02-18 09:57:37+00:00,The Banker,"<a href=""https://news.google.com/rss/articles/..."


Saved state to: ./data/scrape_state_20250219100630.json
Saved pickle to: ./data/scrape_result_20250219100630.pkl
-------------- Goldman Sachs  --------------
-- Current state: 2000-05-30-2000-05-29, record_count: 0, iter: 5
-- Current state: 2000-10-27-2000-10-26, record_count: 0, iter: 10
-- Current state: 2001-03-26-2001-03-25, record_count: 0, iter: 15
-- Current state: 2001-08-23-2001-08-22, record_count: 0, iter: 20
-- Current state: 2002-01-20-2002-01-19, record_count: 0, iter: 25
-- Current state: 2002-06-19-2002-06-18, record_count: 0, iter: 30
-- Current state: 2002-11-16-2002-11-15, record_count: 0, iter: 35
-- Current state: 2003-04-15-2003-04-14, record_count: 0, iter: 40
-- Current state: 2003-09-12-2003-09-11, record_count: 0, iter: 45
-- Current state: 2004-02-09-2004-02-08, record_count: 0, iter: 50
-- Current state: 2004-07-08-2004-07-07, record_count: 0, iter: 55
-- Current state: 2004-12-05-2004-12-04, record_count: 0, iter: 60
-- Current state: 2005-05-04-2005-05-03

Unnamed: 0,title,link,pubDate,source,description
1686,Tesla’s $1B Lawsuit Against Matthews Dismissed...,https://news.google.com/rss/articles/CBMimgFBV...,2025-02-18 13:14:07+00:00,PUNE.NEWS,"<a href=""https://news.google.com/rss/articles/..."
1687,Competition Group Of The Year: Covington - Law360,https://news.google.com/rss/articles/CBMiUEFVX...,2025-02-18 21:03:00+00:00,Law360,"<a href=""https://news.google.com/rss/articles/..."
1688,Complex Financial Instruments Group Of The Yea...,https://news.google.com/rss/articles/CBMiUEFVX...,2025-02-18 21:03:00+00:00,Law360,"<a href=""https://news.google.com/rss/articles/..."
1689,Ex-Goldman Atty Squires Expected To Be Named U...,https://news.google.com/rss/articles/CBMiowFBV...,2025-02-18 21:12:00+00:00,Law360,"<a href=""https://news.google.com/rss/articles/..."
1690,Fund Formation Group Of The Year: Fried Frank ...,https://news.google.com/rss/articles/CBMipwFBV...,2025-02-18 21:28:00+00:00,Law360,"<a href=""https://news.google.com/rss/articles/..."


Saved state to: ./data/scrape_state_20250219101105.json
Saved pickle to: ./data/scrape_result_20250219101105.pkl
-------------- Credit Suisse  --------------
-- Current state: 2000-05-30-2000-05-29, record_count: 0, iter: 5
-- Current state: 2000-10-27-2000-10-26, record_count: 1, iter: 10
-- Current state: 2001-03-26-2001-03-25, record_count: 1, iter: 15
-- Current state: 2001-08-23-2001-08-22, record_count: 1, iter: 20
-- Current state: 2002-01-20-2002-01-19, record_count: 1, iter: 25
-- Current state: 2002-06-19-2002-06-18, record_count: 1, iter: 30
-- Current state: 2002-11-16-2002-11-15, record_count: 2, iter: 35
-- Current state: 2003-04-15-2003-04-14, record_count: 2, iter: 40
-- Current state: 2003-09-12-2003-09-11, record_count: 2, iter: 45
-- Current state: 2004-02-09-2004-02-08, record_count: 2, iter: 50
-- Current state: 2004-07-08-2004-07-07, record_count: 2, iter: 55
-- Current state: 2004-12-05-2004-12-04, record_count: 3, iter: 60
-- Current state: 2005-05-04-2005-05-03

Unnamed: 0,title,link,pubDate,source,description
630,International Investors File Claims Against Sw...,https://news.google.com/rss/articles/CBMitAFBV...,2025-02-07 08:00:00+00:00,Evrim Ağacı,"<a href=""https://news.google.com/rss/articles/..."
631,Greensill files disqualification bid against j...,https://news.google.com/rss/articles/CBMiogFBV...,2025-02-10 16:50:14+00:00,Lawyerly,"<a href=""https://news.google.com/rss/articles/..."
632,Greenpeace files lawsuit against US pipeline c...,https://news.google.com/rss/articles/CBMiuAFBV...,2025-02-11 17:59:18+00:00,Reuters,"<a href=""https://news.google.com/rss/articles/..."
633,Greenpeace files lawsuit against US pipeline c...,https://news.google.com/rss/articles/CBMiugFBV...,2025-02-12 03:12:11+00:00,MSN,"<a href=""https://news.google.com/rss/articles/..."
634,Judge who acted in ‘hot contest’ shouldn’t hea...,https://news.google.com/rss/articles/CBMiswFBV...,2025-02-14 15:00:38+00:00,Lawyerly,"<a href=""https://news.google.com/rss/articles/..."


Saved state to: ./data/scrape_state_20250219101532.json
Saved pickle to: ./data/scrape_result_20250219101532.pkl
-------------- Deutsche Bank  --------------
-- Current state: 2000-05-30-2000-05-29, record_count: 0, iter: 5
-- Current state: 2000-10-27-2000-10-26, record_count: 0, iter: 10
-- Current state: 2001-03-26-2001-03-25, record_count: 1, iter: 15
-- Current state: 2001-08-23-2001-08-22, record_count: 1, iter: 20
-- Current state: 2002-01-20-2002-01-19, record_count: 1, iter: 25
-- Current state: 2002-06-19-2002-06-18, record_count: 1, iter: 30
-- Current state: 2002-11-16-2002-11-15, record_count: 1, iter: 35
-- Current state: 2003-04-15-2003-04-14, record_count: 1, iter: 40
-- Current state: 2003-09-12-2003-09-11, record_count: 2, iter: 45
-- Current state: 2004-02-09-2004-02-08, record_count: 2, iter: 50
-- Current state: 2004-07-08-2004-07-07, record_count: 3, iter: 55
-- Current state: 2004-12-05-2004-12-04, record_count: 3, iter: 60
-- Current state: 2005-05-04-2005-05-03

Unnamed: 0,title,link,pubDate,source,description
1226,High Court win for law firm over billionaire’s...,https://news.google.com/rss/articles/CBMiygFBV...,2025-02-06 08:00:00+00:00,The Australian Financial Review,"<a href=""https://news.google.com/rss/articles/..."
1227,GSK PLC Stockholder Notice: Shareholder Rights...,https://news.google.com/rss/articles/CBMipwJBV...,2025-02-06 08:00:00+00:00,GlobeNewswire,"<a href=""https://news.google.com/rss/articles/..."
1228,"Mizuho raises Axsome stock target to $195, mai...",https://news.google.com/rss/articles/CBMivAFBV...,2025-02-12 06:16:02+00:00,Investing.com Nigeria,"<a href=""https://news.google.com/rss/articles/..."
1229,Axsome therapeutics CFO sells $1.95 million in...,https://news.google.com/rss/articles/CBMitgFBV...,2025-02-15 02:39:34+00:00,Investing.com,"<a href=""https://news.google.com/rss/articles/..."
1230,"INVESTOR ALERT: Faruqi & Faruqi, LLP Investiga...",https://news.google.com/rss/articles/CBMi2AFBV...,2025-02-19 00:18:31+00:00,Markets Insider,"<a href=""https://news.google.com/rss/articles/..."


Saved state to: ./data/scrape_state_20250219102003.json
Saved pickle to: ./data/scrape_result_20250219102003.pkl
-------------- Barclays  --------------
-- Current state: 2000-05-30-2000-05-29, record_count: 0, iter: 5
-- Current state: 2000-10-27-2000-10-26, record_count: 0, iter: 10
-- Current state: 2001-03-26-2001-03-25, record_count: 1, iter: 15
-- Current state: 2001-08-23-2001-08-22, record_count: 1, iter: 20
-- Current state: 2002-01-20-2002-01-19, record_count: 1, iter: 25
-- Current state: 2002-06-19-2002-06-18, record_count: 1, iter: 30
-- Current state: 2002-11-16-2002-11-15, record_count: 1, iter: 35
-- Current state: 2003-04-15-2003-04-14, record_count: 1, iter: 40
-- Current state: 2003-09-12-2003-09-11, record_count: 1, iter: 45
-- Current state: 2004-02-09-2004-02-08, record_count: 2, iter: 50
-- Current state: 2004-07-08-2004-07-07, record_count: 2, iter: 55
-- Current state: 2004-12-05-2004-12-04, record_count: 2, iter: 60
-- Current state: 2005-05-04-2005-05-03, rec

Unnamed: 0,title,link,pubDate,source,description
852,"Barclays' Q4 Earnings Up on Solid IB, 2025 NII...",https://news.google.com/rss/articles/CBMigAFBV...,2025-02-13 12:40:00+00:00,Yahoo Finance,"<a href=""https://news.google.com/rss/articles/..."
853,Barclays: Strong Revenue Growth and Solid Outl...,https://news.google.com/rss/articles/CBMiqAFBV...,2025-02-13 13:01:46+00:00,TipRanks,"<a href=""https://news.google.com/rss/articles/..."
854,Barclays Reveals FCA Probe Over Money Launderi...,https://news.google.com/rss/articles/CBMitwFBV...,2025-02-13 19:26:00+00:00,Law360,"<a href=""https://news.google.com/rss/articles/..."
855,Barclays reserves £90m for motor finance claim...,https://news.google.com/rss/articles/CBMinAFBV...,2025-02-14 10:36:43+00:00,Law Gazette,"<a href=""https://news.google.com/rss/articles/..."
856,General counsels describe internal investigati...,https://news.google.com/rss/articles/CBMi4wFBV...,2025-02-18 12:26:30+00:00,Global Investigations Review,"<a href=""https://news.google.com/rss/articles/..."


Saved state to: ./data/scrape_state_20250219102428.json
Saved pickle to: ./data/scrape_result_20250219102428.pkl
-------------- Bank of America  --------------
-- Current state: 2000-05-30-2000-05-29, record_count: 0, iter: 5
-- Current state: 2000-10-27-2000-10-26, record_count: 0, iter: 10
-- Current state: 2001-03-26-2001-03-25, record_count: 0, iter: 15
-- Current state: 2001-08-23-2001-08-22, record_count: 1, iter: 20
-- Current state: 2002-01-20-2002-01-19, record_count: 1, iter: 25
-- Current state: 2002-06-19-2002-06-18, record_count: 1, iter: 30
-- Current state: 2002-11-16-2002-11-15, record_count: 1, iter: 35
-- Current state: 2003-04-15-2003-04-14, record_count: 1, iter: 40
-- Current state: 2003-09-12-2003-09-11, record_count: 1, iter: 45
-- Current state: 2004-02-09-2004-02-08, record_count: 1, iter: 50
-- Current state: 2004-07-08-2004-07-07, record_count: 3, iter: 55
-- Current state: 2004-12-05-2004-12-04, record_count: 3, iter: 60
-- Current state: 2005-05-04-2005-05-

Unnamed: 0,title,link,pubDate,source,description
1976,Peloton wins dismissal of shareholder lawsuit ...,https://news.google.com/rss/articles/CBMiwwFBV...,2025-02-14 23:42:57+00:00,Reuters,"<a href=""https://news.google.com/rss/articles/..."
1977,Mexican president threatens to sue Google over...,https://news.google.com/rss/articles/CBMiqwFBV...,2025-02-18 16:06:03+00:00,Global News Toronto,"<a href=""https://news.google.com/rss/articles/..."
1978,XRP ETF Gets A Shot! Why MEMX’s SEC Filing Is ...,https://news.google.com/rss/articles/CBMiX0FVX...,2025-02-18 16:30:00+00:00,TronWeekly,"<a href=""https://news.google.com/rss/articles/..."
1979,Merrill Lynch Ex-Advisers Look to Pause Deferr...,https://news.google.com/rss/articles/CBMitgFBV...,2025-02-18 18:52:00+00:00,Bloomberg Law,"<a href=""https://news.google.com/rss/articles/..."
1980,"FreightCar America, Inc. Announces New $35 Mil...",https://news.google.com/rss/articles/CBMi7gFBV...,2025-02-18 21:33:04+00:00,The Manila Times,"<a href=""https://news.google.com/rss/articles/..."


Saved state to: ./data/scrape_state_20250219102912.json
Saved pickle to: ./data/scrape_result_20250219102912.pkl


# Bloomberg Law

In [None]:
import requests
import json
import traceback
import time
import bs4 as bs
import pandas as pd
import urllib.parse
from datetime import datetime, timedelta, date
from IPython.display import display

class BloombergLawScraper():
    """Scrape the Bloomberg Law litigation RSS endpoint over a date range, one window at a time.

    The requested date range is walked in steps of `scrape_period` days. Each step
    becomes one request with `startDate`/`endDate` parameters; the items of all
    requests are merged, de-duplicated and sorted by publication date. Progress is
    mirrored in `self.state` so that an aborted run can be inspected and, optionally,
    persisted as JSON.
    """

    def __init__(self, proxy=None, request_timeout=30):
        """
        Args:
            proxy (dict | None): Mapping passed to `requests.get(proxies=...)`.
            request_timeout (float): Seconds to wait for each HTTP request before giving up.
        """
        self.proxy = proxy
        self.request_timeout = request_timeout
        self.state = {}
        self.base_url = "https://news.bloomberglaw.com/api/v1/rss/litigation?"

    def scrape(self, search_query, start_time=None, end_time=None, fetch_limit=10, scrape_period=7,
               add_url_parameters=None, save_state=False, save_result=False,
               save_folder="./data", preview_result=True, sleep_interval=5):
        """
        Web scrape Bloomberg Law litigation rss feeds.
        - Use date(YYYY, MM, DD) to initiate date arguments. eg: date(2025, 2, 19)
        Args:
            search_query (String): Search query
            start_time (Date): First day to scrape. Defaults to today (evaluated at call time).
            end_time (Date): Last day to scrape. Defaults to today (evaluated at call time).
            fetch_limit (int): Number of items fetch (default is 10)
            scrape_period (int): no. of days between the startDate and endDate of each request (must be at least 1)
            add_url_parameters (dict): Additional url parameters
            save_state (boolean): Save state progress in json file. format: f"/scrape_state_{datetime.now().strftime('%Y%m%d%H%M%S')}.json"
            save_result (boolean): Save dataframe as pickle file. format: f"/scrape_result_{datetime.now().strftime('%Y%m%d%H%M%S')}.pkl"
            save_folder (String): root folder to save state and result (created if missing)
            preview_result (boolean): Show pandas dataframe after processing
            sleep_interval (int): Print progress and sleep for 3 seconds every X iterations. 0/None disables the pause.
        Returns:
            pandas.DataFrame: All records collected so far (partial if the run was aborted by an error).
        Raises:
            ValueError: If end_time is earlier than start_time or scrape_period is below 1.
        """
        # Resolve defaults at call time: a default of `datetime.now().date()` in the
        # signature would be frozen at the moment the class was defined.
        if start_time is None:
            start_time = datetime.now().date()
        if end_time is None:
            end_time = datetime.now().date()
        if add_url_parameters is None:
            add_url_parameters = {}

        if end_time < start_time:
            raise ValueError("End time must not be earlier than start time!")
        elif scrape_period < 1:
            raise ValueError("Scrape period must be at least 1!")

        # Save state
        self.state = {
            "parameters": {
                "search_query": search_query,
                "start_time": start_time,
                "end_time": end_time,
                "scrape_period": scrape_period,
                "add_url_parameters": add_url_parameters,
                "fetch_limit": fetch_limit
            },
            "iter_no": 0
        }
        to_time = None
        df = pd.DataFrame()
        # Consecutive requests share a boundary day: the endDate of one is the startDate of the next.
        window_step = timedelta(days=scrape_period)
        try:
            while start_time <= end_time:
                to_time = min(start_time + window_step, end_time)
                # Format and encode URL
                url_parameters = {"query": search_query, "startDate": start_time.strftime('%Y-%m-%d'), "endDate": to_time.strftime('%Y-%m-%d'), "limit": fetch_limit, **add_url_parameters}
                url = self.base_url + urllib.parse.urlencode(url_parameters)

                self.state["to_time"] = to_time
                self.state["url"] = url

                # GET request and parse
                df_intermediate = self.fetch_and_parse(url, pandas=True)
                if len(df_intermediate) > 0:
                    df = self._merge_results(df, df_intermediate)

                self.state["iter_no"] += 1
                # A falsy sleep_interval disables the pause instead of dividing by zero.
                if sleep_interval and self.state["iter_no"] % sleep_interval == 0:
                    print(f"-- Current state: {start_time}-{to_time}, record_count: {len(df)}, iter: {self.state['iter_no']}")
                    time.sleep(3)
                start_time += window_step
        except Exception as e:
            # Best effort: report where the run stopped and keep whatever was collected.
            print(f"########## Error: {start_time}-{to_time} {end_time} #########")
            print(traceback.format_exc())
        finally:
            self.state["record_count"] = len(df)
            if preview_result:
                print(f"############## Result: {len(df)} records ##############")
                display(df.tail())
            self._save_outputs(df, save_state, save_result, save_folder)
        return df

    @staticmethod
    def _merge_results(df, df_intermediate):
        """Append one request's records, then de-duplicate and re-sort by publication date."""
        # Skip the empty accumulator on the first merge so pandas is not asked to concat an empty frame.
        merged = df_intermediate if len(df) == 0 else pd.concat([df, df_intermediate])
        return (
            merged
            .drop_duplicates(subset=["pubDate", "title"])
            .sort_values(by='pubDate')
            .reset_index(drop=True)
        )

    def _save_outputs(self, df, save_state, save_result, save_folder):
        """Persist the state JSON and/or the result pickle, creating save_folder if needed."""
        write_result = save_result and len(df) > 0
        if not (save_state or write_result):
            return
        import os  # local import keeps this class self-contained within its notebook cell

        if save_folder:
            os.makedirs(save_folder, exist_ok=True)
        root = save_folder.rstrip("/")
        stamp = datetime.now().strftime('%Y%m%d%H%M%S')
        state_path = root + f"/scrape_state_{stamp}.json"
        result_path = root + f"/scrape_result_{stamp}.pkl"
        # Record the paths before dumping so the state file also names the intended result file.
        if save_state:
            self.state["save_state_path"] = state_path
        if write_result:
            self.state["save_result_path"] = result_path

        if save_state:
            with open(state_path, "w") as f:
                json.dump(self.state, f, indent=4, default=str)
            print("Saved state to:", state_path)
        if write_result:
            df.to_pickle(result_path)
            print("Saved pickle to:", result_path)

    @staticmethod
    def _tag_text(item, name):
        """Return the text of child tag `name`, or None when the item has no such tag."""
        node = item.find(name)
        return node.text if node is not None else None

    def fetch_and_parse(self, url, pandas=True):
        """
        Fetch one RSS url and extract its <item> entries.
        Args:
            url (String): Fully encoded RSS url
            pandas (boolean): Return a DataFrame (pubDate parsed as UTC datetime) instead of a list of dicts
        Returns:
            pandas.DataFrame | list[dict]: One record per item with title, link, pubDate, description
                and topic (list of the texts of the item's md:topic tags).
                An empty DataFrame (or empty list) when the feed has no items.
        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        response = requests.get(url, proxies=self.proxy, timeout=self.request_timeout)
        # Surface HTTP errors instead of parsing an error page as a feed with zero items.
        response.raise_for_status()
        soup = bs.BeautifulSoup(response.text, 'xml')

        result = []
        for item in soup.find_all('item'):
            record = {
                field: self._tag_text(item, field)
                for field in ("title", "link", "pubDate", "description")
            }
            record["topic"] = [topic.text for topic in item.find_all('md:topic')]
            result.append(record)
        if not pandas:
            return result
        if len(result) == 0:
            return pd.DataFrame()
        df = pd.DataFrame(result)
        df['pubDate'] = pd.to_datetime(df['pubDate'], format='%a, %d %b %Y %H:%M:%S %z', utc=True)
        return df

In [27]:
# Scraper instance shared by the cells below. No proxy argument is passed, so the
# constructor's default (proxy=None) is used for every request.
# NOTE(review): the `proxy` dict built in the first cell is not passed here — confirm that is intended.
bloombergLawScraper = BloombergLawScraper()

In [28]:
# Run one Bloomberg Law scrape per bank name, starting 2000-01-01 and ending at the
# scraper's default end_time, stepping 30 days per request with up to 100 items
# requested each time. State and results are written to the scraper's default
# save folder for each bank.
banks_arr = [
    "UBS",
    "Citibank",
    "HSBC",
    "JPMorgan",
    "Goldman Sachs",
    "Credit Suisse",
    "Deutsche Bank",
    "Barclays",
    "Bank of America",
]
for each_bank in banks_arr:
    print(f"-------------- {each_bank}  --------------")
    bloombergLawScraper.scrape(
        each_bank,
        start_time=date(2000, 1, 1),
        fetch_limit=100,
        scrape_period=30,
        save_state=True,
        save_result=True,
        preview_result=True,
    )

-------------- UBS  --------------
-- Current state: 2000-04-30-2000-05-30, record_count: 0, iter: 5
-- Current state: 2000-09-27-2000-10-27, record_count: 0, iter: 10
-- Current state: 2001-02-24-2001-03-26, record_count: 0, iter: 15
-- Current state: 2001-07-24-2001-08-23, record_count: 0, iter: 20
-- Current state: 2001-12-21-2002-01-20, record_count: 0, iter: 25
-- Current state: 2002-05-20-2002-06-19, record_count: 0, iter: 30
-- Current state: 2002-10-17-2002-11-16, record_count: 0, iter: 35
-- Current state: 2003-03-16-2003-04-15, record_count: 0, iter: 40
-- Current state: 2003-08-13-2003-09-12, record_count: 0, iter: 45
-- Current state: 2004-01-10-2004-02-09, record_count: 0, iter: 50
-- Current state: 2004-06-08-2004-07-08, record_count: 0, iter: 55
-- Current state: 2004-11-05-2004-12-05, record_count: 0, iter: 60
-- Current state: 2005-04-04-2005-05-04, record_count: 0, iter: 65
-- Current state: 2005-09-01-2005-10-01, record_count: 0, iter: 70
-- Current state: 2006-01-29

  df['pubDate'] = pd.to_datetime(df['pubDate'], format='%a, %d %b %Y %H:%M:%S %z')


-- Current state: 2023-04-30-2023-05-30, record_count: 59, iter: 285
-- Current state: 2023-09-27-2023-10-27, record_count: 89, iter: 290


  df['pubDate'] = pd.to_datetime(df['pubDate'], format='%a, %d %b %Y %H:%M:%S %z')
  df['pubDate'] = pd.to_datetime(df['pubDate'], format='%a, %d %b %Y %H:%M:%S %z')


-- Current state: 2024-02-24-2024-03-25, record_count: 106, iter: 295
-- Current state: 2024-07-23-2024-08-22, record_count: 158, iter: 300


  df['pubDate'] = pd.to_datetime(df['pubDate'], format='%a, %d %b %Y %H:%M:%S %z')


-- Current state: 2024-12-20-2025-01-19, record_count: 176, iter: 305
############## Result: 182 records ##############


Unnamed: 0,title,link,pubDate,description,topic
177,Archegos CFO Gets Eight Years in Prison for De...,https://news.bloomberglaw.com/litigation/arche...,2025-01-27 14:21:13-05:00,Former Archegos Capital Management CFO Patrick...,"[witnesses, market manipulation, sentencing]"
178,Serta Lenders Seek Full Appeals Court Review o...,https://news.bloomberglaw.com/litigation/serta...,2025-02-05 15:57:46-05:00,A group of lenders who participated in the con...,"[confirmation of plan, indemnification, natura..."
179,UBS Whistleblower’s Trial Win Axed Over Unfit ...,https://news.bloomberglaw.com/litigation/ubs-w...,2025-02-10 15:52:23-05:00,A former UBS Group AG researcher saw his trial...,"[whistleblowing by employee, damages, reductio..."
180,Credit Suisse Volatility Index Investors Get A...,https://news.bloomberglaw.com/litigation/credi...,2025-02-12 15:01:34-05:00,UBS unit Credit Suisse Group AG faces an addit...,"[market manipulation, class certification, lea..."
181,ComEd Four Seek Stay in Madigan Bribery Case P...,https://news.bloomberglaw.com/litigation/comed...,2025-02-18 12:11:19-05:00,Four former Commonwealth Edison executives and...,"[state environmental legislation, sentencing, ..."


Saved state to: ./data/scrape_state_20250219121917.json
Saved pickle to: ./data/scrape_result_20250219121917.pkl
-------------- Citibank  --------------
-- Current state: 2000-04-30-2000-05-30, record_count: 0, iter: 5
-- Current state: 2000-09-27-2000-10-27, record_count: 0, iter: 10
-- Current state: 2001-02-24-2001-03-26, record_count: 0, iter: 15
-- Current state: 2001-07-24-2001-08-23, record_count: 0, iter: 20
-- Current state: 2001-12-21-2002-01-20, record_count: 0, iter: 25
-- Current state: 2002-05-20-2002-06-19, record_count: 0, iter: 30
-- Current state: 2002-10-17-2002-11-16, record_count: 0, iter: 35
-- Current state: 2003-03-16-2003-04-15, record_count: 0, iter: 40
-- Current state: 2003-08-13-2003-09-12, record_count: 0, iter: 45
-- Current state: 2004-01-10-2004-02-09, record_count: 0, iter: 50
-- Current state: 2004-06-08-2004-07-08, record_count: 0, iter: 55
-- Current state: 2004-11-05-2004-12-05, record_count: 0, iter: 60
-- Current state: 2005-04-04-2005-05-04, rec

  df['pubDate'] = pd.to_datetime(df['pubDate'], format='%a, %d %b %Y %H:%M:%S %z')
  df['pubDate'] = pd.to_datetime(df['pubDate'], format='%a, %d %b %Y %H:%M:%S %z')


-- Current state: 2024-02-24-2024-03-25, record_count: 34, iter: 295
-- Current state: 2024-07-23-2024-08-22, record_count: 46, iter: 300
-- Current state: 2024-12-20-2025-01-19, record_count: 56, iter: 305
############## Result: 58 records ##############


Unnamed: 0,title,link,pubDate,description,topic
53,Property Manager Can’t Block Bank Records Subp...,https://news.bloomberglaw.com/litigation/prope...,2024-11-13 12:13:54-05:00,A property manager can't challenge a summons t...,"[guilty pleas, restitution, indemnification, s..."
54,"Citi Sues Ex-Bankers, Saying They Took Client ...",https://news.bloomberglaw.com/litigation/citi-...,2024-11-22 11:42:42-05:00,Citibank NA accused two private bankers of ill...,"[notice, injunctions, trade secret misappropri..."
55,Citi Wins Court Order Against Ex-Banker Who Le...,https://news.bloomberglaw.com/litigation/citi-...,2024-11-27 10:53:50-05:00,Citibank NA won a court order requiring a form...,"[breach of contract, trade secret misappropria..."
56,Citigroup Fails to Shake New York’s Online Fra...,https://news.bloomberglaw.com/litigation/citig...,2025-01-21 17:23:52-05:00,Citigroup Inc. lost its bid to avoid some clai...,"[civil fraud, unfair and deceptive trade pract..."
57,Citigroup Wins Bid to Arbitrate Veterans’ Cred...,https://news.bloomberglaw.com/litigation/citig...,2025-01-27 14:20:13-05:00,Citigroup Inc. can arbitrate a proposed class ...,"[veterans, mandatory arbitration provisions, c..."


Saved state to: ./data/scrape_state_20250219122708.json
Saved pickle to: ./data/scrape_result_20250219122708.pkl
-------------- HSBC  --------------
-- Current state: 2000-04-30-2000-05-30, record_count: 0, iter: 5
-- Current state: 2000-09-27-2000-10-27, record_count: 0, iter: 10
-- Current state: 2001-02-24-2001-03-26, record_count: 0, iter: 15
-- Current state: 2001-07-24-2001-08-23, record_count: 0, iter: 20
-- Current state: 2001-12-21-2002-01-20, record_count: 0, iter: 25
-- Current state: 2002-05-20-2002-06-19, record_count: 0, iter: 30
-- Current state: 2002-10-17-2002-11-16, record_count: 0, iter: 35
-- Current state: 2003-03-16-2003-04-15, record_count: 0, iter: 40
-- Current state: 2003-08-13-2003-09-12, record_count: 0, iter: 45
-- Current state: 2004-01-10-2004-02-09, record_count: 0, iter: 50
-- Current state: 2004-06-08-2004-07-08, record_count: 0, iter: 55
-- Current state: 2004-11-05-2004-12-05, record_count: 0, iter: 60
-- Current state: 2005-04-04-2005-05-04, record_

  df['pubDate'] = pd.to_datetime(df['pubDate'], format='%a, %d %b %Y %H:%M:%S %z')
  df['pubDate'] = pd.to_datetime(df['pubDate'], format='%a, %d %b %Y %H:%M:%S %z')


-- Current state: 2024-02-24-2024-03-25, record_count: 67, iter: 295
-- Current state: 2024-07-23-2024-08-22, record_count: 79, iter: 300
-- Current state: 2024-12-20-2025-01-19, record_count: 84, iter: 305
############## Result: 87 records ##############


Unnamed: 0,title,link,pubDate,description,topic
82,Mexican Bank Units Must Face Bond-Manipulation...,https://news.bloomberglaw.com/litigation/mexic...,2025-01-15 17:40:57-05:00,Mexican affiliates of big banks including Bank...,"[dismissal, class actions, indemnification, ma..."
83,Lawyers in Metals Price-Fixing Case Awarded $6...,https://news.bloomberglaw.com/litigation/lawye...,2025-01-17 14:09:30-05:00,Attorneys representing plaintiffs in a case cl...,"[settlements, futures, attorney fee awards, cl..."
84,"HSBC Bank Must Pay $550,000 in Attorney Fees i...",https://news.bloomberglaw.com/litigation/hsbc-...,2025-01-22 16:08:39-05:00,HSBC Bank USA will have to pay more than half ...,"[settlements, credit reports, witnesses, disco..."
85,HSBC Workers Should Lose Bid for Wage Class St...,https://news.bloomberglaw.com/litigation/hsbc-...,2025-01-27 14:56:51-05:00,HSBC Bank USA NA personal bankers failed to me...,"[overtime, timecards and timesheets, joinder, ..."
86,PayPal Fined for Withholding Funds in Body Sho...,https://news.bloomberglaw.com/litigation/paypa...,2025-01-30 14:58:44-05:00,"A judge declared PayPal Holdings, Inc. in civi...","[payment systems, bankruptcy trustees and exam..."


Saved state to: ./data/scrape_state_20250219123500.json
Saved pickle to: ./data/scrape_result_20250219123500.pkl
-------------- JPMorgan  --------------
-- Current state: 2000-04-30-2000-05-30, record_count: 0, iter: 5
-- Current state: 2000-09-27-2000-10-27, record_count: 0, iter: 10
-- Current state: 2001-02-24-2001-03-26, record_count: 0, iter: 15
-- Current state: 2001-07-24-2001-08-23, record_count: 0, iter: 20
-- Current state: 2001-12-21-2002-01-20, record_count: 0, iter: 25
-- Current state: 2002-05-20-2002-06-19, record_count: 0, iter: 30
-- Current state: 2002-10-17-2002-11-16, record_count: 0, iter: 35
-- Current state: 2003-03-16-2003-04-15, record_count: 0, iter: 40
-- Current state: 2003-08-13-2003-09-12, record_count: 0, iter: 45
-- Current state: 2004-01-10-2004-02-09, record_count: 0, iter: 50
-- Current state: 2004-06-08-2004-07-08, record_count: 0, iter: 55
-- Current state: 2004-11-05-2004-12-05, record_count: 0, iter: 60
-- Current state: 2005-04-04-2005-05-04, rec

  df['pubDate'] = pd.to_datetime(df['pubDate'], format='%a, %d %b %Y %H:%M:%S %z')


-- Current state: 2020-11-11-2020-12-11, record_count: 24, iter: 255
-- Current state: 2021-04-10-2021-05-10, record_count: 36, iter: 260
-- Current state: 2021-09-07-2021-10-07, record_count: 60, iter: 265
-- Current state: 2022-02-04-2022-03-06, record_count: 81, iter: 270
-- Current state: 2022-07-04-2022-08-03, record_count: 100, iter: 275


  df['pubDate'] = pd.to_datetime(df['pubDate'], format='%a, %d %b %Y %H:%M:%S %z')


-- Current state: 2022-12-01-2022-12-31, record_count: 119, iter: 280


  df['pubDate'] = pd.to_datetime(df['pubDate'], format='%a, %d %b %Y %H:%M:%S %z')


-- Current state: 2023-04-30-2023-05-30, record_count: 177, iter: 285
-- Current state: 2023-09-27-2023-10-27, record_count: 250, iter: 290


  df['pubDate'] = pd.to_datetime(df['pubDate'], format='%a, %d %b %Y %H:%M:%S %z')
  df['pubDate'] = pd.to_datetime(df['pubDate'], format='%a, %d %b %Y %H:%M:%S %z')


-- Current state: 2024-02-24-2024-03-25, record_count: 304, iter: 295
-- Current state: 2024-07-23-2024-08-22, record_count: 357, iter: 300


  df['pubDate'] = pd.to_datetime(df['pubDate'], format='%a, %d %b %Y %H:%M:%S %z')


-- Current state: 2024-12-20-2025-01-19, record_count: 404, iter: 305
############## Result: 420 records ##############


Unnamed: 0,title,link,pubDate,description,topic
415,JPMorgan Can Pursue Trade Secrets Suit Against...,https://news.bloomberglaw.com/litigation/jpmor...,2025-02-06 14:02:50-05:00,JPMorgan Chase Bank can proceed with its feder...,"[trade secret misappropriation, amendment of p..."
416,"Cybereason CEO Sues Mnuchin, SoftBank Fund Ove...",https://news.bloomberglaw.com/litigation/cyber...,2025-02-11 13:26:36-05:00,The chief executive officer of Cybereason Inc....,"[corporate officers, tax liability of debtor, ..."
417,‘Greenhushing’ Emerges as Anti-ESG Pressures M...,https://news.bloomberglaw.com/litigation/green...,2025-02-13 05:00:01-05:00,The Trump administration's attacks on companie...,"[environmental reporting, investment professio..."
418,Judge Halts CFPB Action After Alleged Plans to...,https://news.bloomberglaw.com/litigation/sensi...,2025-02-14 18:05:53-05:00,A federal judge on Friday ordered the Consumer...,"[bank supervision, consumer finance, mass layo..."
419,JPMorgan Set to Relive ‘Huge Mistake’ at Javic...,https://news.bloomberglaw.com/litigation/jpmor...,2025-02-18 07:00:00-05:00,JPMorgan executives are expected to take the s...,"[witnesses, securities fraud, internal investi..."


Saved state to: ./data/scrape_state_20250219124253.json
Saved pickle to: ./data/scrape_result_20250219124253.pkl
-------------- Goldman Sachs  --------------
-- Current state: 2000-04-30-2000-05-30, record_count: 0, iter: 5
-- Current state: 2000-09-27-2000-10-27, record_count: 0, iter: 10
-- Current state: 2001-02-24-2001-03-26, record_count: 0, iter: 15
-- Current state: 2001-07-24-2001-08-23, record_count: 0, iter: 20
-- Current state: 2001-12-21-2002-01-20, record_count: 0, iter: 25
-- Current state: 2002-05-20-2002-06-19, record_count: 0, iter: 30
-- Current state: 2002-10-17-2002-11-16, record_count: 0, iter: 35
-- Current state: 2003-03-16-2003-04-15, record_count: 0, iter: 40
-- Current state: 2003-08-13-2003-09-12, record_count: 0, iter: 45
-- Current state: 2004-01-10-2004-02-09, record_count: 0, iter: 50
-- Current state: 2004-06-08-2004-07-08, record_count: 0, iter: 55
-- Current state: 2004-11-05-2004-12-05, record_count: 0, iter: 60
-- Current state: 2005-04-04-2005-05-04

  df['pubDate'] = pd.to_datetime(df['pubDate'], format='%a, %d %b %Y %H:%M:%S %z')


-- Current state: 2020-11-11-2020-12-11, record_count: 8, iter: 255
-- Current state: 2021-04-10-2021-05-10, record_count: 22, iter: 260
-- Current state: 2021-09-07-2021-10-07, record_count: 45, iter: 265
-- Current state: 2022-02-04-2022-03-06, record_count: 63, iter: 270


  df['pubDate'] = pd.to_datetime(df['pubDate'], format='%a, %d %b %Y %H:%M:%S %z')


-- Current state: 2022-07-04-2022-08-03, record_count: 85, iter: 275
-- Current state: 2022-12-01-2022-12-31, record_count: 96, iter: 280
-- Current state: 2023-04-30-2023-05-30, record_count: 125, iter: 285
-- Current state: 2023-09-27-2023-10-27, record_count: 163, iter: 290


  df['pubDate'] = pd.to_datetime(df['pubDate'], format='%a, %d %b %Y %H:%M:%S %z')
  df['pubDate'] = pd.to_datetime(df['pubDate'], format='%a, %d %b %Y %H:%M:%S %z')


-- Current state: 2024-02-24-2024-03-25, record_count: 208, iter: 295
-- Current state: 2024-07-23-2024-08-22, record_count: 259, iter: 300


  df['pubDate'] = pd.to_datetime(df['pubDate'], format='%a, %d %b %Y %H:%M:%S %z')


-- Current state: 2024-12-20-2025-01-19, record_count: 286, iter: 305
############## Result: 293 records ##############


Unnamed: 0,title,link,pubDate,description,topic
288,"Pfizer, Anti-DEI Group End Suit Over Diversity...",https://news.bloomberglaw.com/litigation/pfize...,2025-01-31 16:38:16-05:00,Pfizer Inc. and a conservative group agreed to...,"[preliminary injunctions, standing, mootness, ..."
289,Ex-Goldman Banker Advances Some Pay Claims Aga...,https://news.bloomberglaw.com/litigation/ex-go...,2025-02-07 09:56:06-05:00,An investment banker who left Goldman Sachs & ...,"[investment professional compensation, resigna..."
290,‘Greenhushing’ Emerges as Anti-ESG Pressures M...,https://news.bloomberglaw.com/litigation/green...,2025-02-13 05:00:01-05:00,The Trump administration's attacks on companie...,"[environmental reporting, investment professio..."
291,Judge Halts CFPB Action After Alleged Plans to...,https://news.bloomberglaw.com/litigation/sensi...,2025-02-14 18:05:53-05:00,A federal judge on Friday ordered the Consumer...,"[bank supervision, consumer finance, mass layo..."
292,Ozy Media CEO Hit With $96 Million Restitution...,https://news.bloomberglaw.com/litigation/ozy-m...,2025-02-17 11:23:12-05:00,Ozy Media chief executive Carlos Watson and hi...,"[forfeiture, corporate officers, securities fr..."


Saved state to: ./data/scrape_state_20250219125044.json
Saved pickle to: ./data/scrape_result_20250219125044.pkl
-------------- Credit Suisse  --------------
-- Current state: 2000-04-30-2000-05-30, record_count: 0, iter: 5
-- Current state: 2000-09-27-2000-10-27, record_count: 0, iter: 10
-- Current state: 2001-02-24-2001-03-26, record_count: 0, iter: 15
-- Current state: 2001-07-24-2001-08-23, record_count: 0, iter: 20
-- Current state: 2001-12-21-2002-01-20, record_count: 0, iter: 25
-- Current state: 2002-05-20-2002-06-19, record_count: 0, iter: 30
-- Current state: 2002-10-17-2002-11-16, record_count: 0, iter: 35
-- Current state: 2003-03-16-2003-04-15, record_count: 0, iter: 40
-- Current state: 2003-08-13-2003-09-12, record_count: 0, iter: 45
-- Current state: 2004-01-10-2004-02-09, record_count: 0, iter: 50
-- Current state: 2004-06-08-2004-07-08, record_count: 0, iter: 55
-- Current state: 2004-11-05-2004-12-05, record_count: 0, iter: 60
-- Current state: 2005-04-04-2005-05-04

  df['pubDate'] = pd.to_datetime(df['pubDate'], format='%a, %d %b %Y %H:%M:%S %z')


-- Current state: 2023-04-30-2023-05-30, record_count: 58, iter: 285
-- Current state: 2023-09-27-2023-10-27, record_count: 77, iter: 290


  df['pubDate'] = pd.to_datetime(df['pubDate'], format='%a, %d %b %Y %H:%M:%S %z')
  df['pubDate'] = pd.to_datetime(df['pubDate'], format='%a, %d %b %Y %H:%M:%S %z')


-- Current state: 2024-02-24-2024-03-25, record_count: 95, iter: 295
-- Current state: 2024-07-23-2024-08-22, record_count: 128, iter: 300


  df['pubDate'] = pd.to_datetime(df['pubDate'], format='%a, %d %b %Y %H:%M:%S %z')


-- Current state: 2024-12-20-2025-01-19, record_count: 152, iter: 305
############## Result: 156 records ##############


Unnamed: 0,title,link,pubDate,description,topic
151,Ex-Mozambique Minister Gets 8 1/2 Years in US ...,https://news.bloomberglaw.com/litigation/ex-mo...,2025-01-17 17:33:15-05:00,A former Mozambique finance minister was sente...,"[securities fraud, guarantee, wire fraud, sent..."
152,Archegos CFO Gets Eight Years in Prison for De...,https://news.bloomberglaw.com/litigation/arche...,2025-01-27 14:21:13-05:00,Former Archegos Capital Management CFO Patrick...,"[witnesses, market manipulation, sentencing]"
153,Archegos Co-CEO Drops Bid for Hwang Money Afte...,https://news.bloomberglaw.com/litigation/arche...,2025-01-28 14:58:20-05:00,Archegos Capital Management’s former co-chief ...,"[corporate officers, witnesses, restitution, c..."
154,Serta Lenders Seek Full Appeals Court Review o...,https://news.bloomberglaw.com/litigation/serta...,2025-02-05 15:57:46-05:00,A group of lenders who participated in the con...,"[confirmation of plan, indemnification, natura..."
155,Credit Suisse Volatility Index Investors Get A...,https://news.bloomberglaw.com/litigation/credi...,2025-02-12 15:01:34-05:00,UBS unit Credit Suisse Group AG faces an addit...,"[market manipulation, class certification, lea..."


Saved state to: ./data/scrape_state_20250219125836.json
Saved pickle to: ./data/scrape_result_20250219125836.pkl
-------------- Deutsche Bank  --------------
-- Current state: 2000-04-30-2000-05-30, record_count: 0, iter: 5
-- Current state: 2000-09-27-2000-10-27, record_count: 0, iter: 10
-- Current state: 2001-02-24-2001-03-26, record_count: 0, iter: 15
-- Current state: 2001-07-24-2001-08-23, record_count: 0, iter: 20
-- Current state: 2001-12-21-2002-01-20, record_count: 0, iter: 25
-- Current state: 2002-05-20-2002-06-19, record_count: 0, iter: 30
-- Current state: 2002-10-17-2002-11-16, record_count: 0, iter: 35
-- Current state: 2003-03-16-2003-04-15, record_count: 0, iter: 40
-- Current state: 2003-08-13-2003-09-12, record_count: 0, iter: 45
-- Current state: 2004-01-10-2004-02-09, record_count: 0, iter: 50
-- Current state: 2004-06-08-2004-07-08, record_count: 0, iter: 55
-- Current state: 2004-11-05-2004-12-05, record_count: 0, iter: 60
-- Current state: 2005-04-04-2005-05-04

  df['pubDate'] = pd.to_datetime(df['pubDate'], format='%a, %d %b %Y %H:%M:%S %z')


-- Current state: 2022-07-04-2022-08-03, record_count: 105, iter: 275
-- Current state: 2022-12-01-2022-12-31, record_count: 121, iter: 280


  df['pubDate'] = pd.to_datetime(df['pubDate'], format='%a, %d %b %Y %H:%M:%S %z')


-- Current state: 2023-04-30-2023-05-30, record_count: 166, iter: 285
-- Current state: 2023-09-27-2023-10-27, record_count: 213, iter: 290


  df['pubDate'] = pd.to_datetime(df['pubDate'], format='%a, %d %b %Y %H:%M:%S %z')
  df['pubDate'] = pd.to_datetime(df['pubDate'], format='%a, %d %b %Y %H:%M:%S %z')


-- Current state: 2024-02-24-2024-03-25, record_count: 264, iter: 295
-- Current state: 2024-07-23-2024-08-22, record_count: 291, iter: 300
-- Current state: 2024-12-20-2025-01-19, record_count: 304, iter: 305
############## Result: 307 records ##############


Unnamed: 0,title,link,pubDate,description,topic
302,Prudential Beats 401(k) Fund Class Action With...,https://news.bloomberglaw.com/litigation/prude...,2024-12-20 09:45:15-05:00,Prudential Insurance Co. of America used an ap...,"[summary judgment, class actions, mutual funds..."
303,Mexican Bank Units Must Face Bond-Manipulation...,https://news.bloomberglaw.com/litigation/mexic...,2025-01-15 17:40:57-05:00,Mexican affiliates of big banks including Bank...,"[dismissal, class actions, indemnification, ma..."
304,Deutsche Bank Again Can’t Recoup Billionaire’s...,https://news.bloomberglaw.com/litigation/deuts...,2025-01-30 14:25:42-05:00,Deutsche Bank AG can't collect a $236 million ...,"[civil fraud, state corporate regulation, inte..."
305,Investors Sue Drug Maker GSK Over Zantac Cance...,https://news.bloomberglaw.com/litigation/inves...,2025-02-05 11:59:42-05:00,Global pharmaceutical company GSK Plc allegedl...,"[settlements, class actions, securities violat..."
306,Unicredit Backs Down From UK Lawsuit After Rus...,https://news.bloomberglaw.com/litigation/unicr...,2025-02-11 10:55:37-05:00,UniCredit SpA will stop pursuing a high stakes...,"[multinational corporations, injunctions, inte..."


Saved state to: ./data/scrape_state_20250219130629.json
Saved pickle to: ./data/scrape_result_20250219130629.pkl
-------------- Barclays  --------------
-- Current state: 2000-04-30-2000-05-30, record_count: 0, iter: 5
-- Current state: 2000-09-27-2000-10-27, record_count: 0, iter: 10
-- Current state: 2001-02-24-2001-03-26, record_count: 0, iter: 15
-- Current state: 2001-07-24-2001-08-23, record_count: 0, iter: 20
-- Current state: 2001-12-21-2002-01-20, record_count: 0, iter: 25
-- Current state: 2002-05-20-2002-06-19, record_count: 0, iter: 30
-- Current state: 2002-10-17-2002-11-16, record_count: 0, iter: 35
-- Current state: 2003-03-16-2003-04-15, record_count: 0, iter: 40
-- Current state: 2003-08-13-2003-09-12, record_count: 0, iter: 45
-- Current state: 2004-01-10-2004-02-09, record_count: 0, iter: 50
-- Current state: 2004-06-08-2004-07-08, record_count: 0, iter: 55
-- Current state: 2004-11-05-2004-12-05, record_count: 0, iter: 60
-- Current state: 2005-04-04-2005-05-04, rec

  df['pubDate'] = pd.to_datetime(df['pubDate'], format='%a, %d %b %Y %H:%M:%S %z')


-- Current state: 2020-11-11-2020-12-11, record_count: 10, iter: 255
-- Current state: 2021-04-10-2021-05-10, record_count: 19, iter: 260
-- Current state: 2021-09-07-2021-10-07, record_count: 30, iter: 265
-- Current state: 2022-02-04-2022-03-06, record_count: 40, iter: 270


  df['pubDate'] = pd.to_datetime(df['pubDate'], format='%a, %d %b %Y %H:%M:%S %z')


-- Current state: 2022-07-04-2022-08-03, record_count: 60, iter: 275
-- Current state: 2022-12-01-2022-12-31, record_count: 77, iter: 280


  df['pubDate'] = pd.to_datetime(df['pubDate'], format='%a, %d %b %Y %H:%M:%S %z')


-- Current state: 2023-04-30-2023-05-30, record_count: 105, iter: 285
-- Current state: 2023-09-27-2023-10-27, record_count: 148, iter: 290


  df['pubDate'] = pd.to_datetime(df['pubDate'], format='%a, %d %b %Y %H:%M:%S %z')
  df['pubDate'] = pd.to_datetime(df['pubDate'], format='%a, %d %b %Y %H:%M:%S %z')


-- Current state: 2024-02-24-2024-03-25, record_count: 173, iter: 295
-- Current state: 2024-07-23-2024-08-22, record_count: 198, iter: 300


  df['pubDate'] = pd.to_datetime(df['pubDate'], format='%a, %d %b %Y %H:%M:%S %z')


-- Current state: 2024-12-20-2025-01-19, record_count: 218, iter: 305
############## Result: 222 records ##############


Unnamed: 0,title,link,pubDate,description,topic
217,Three Federal Tax Cases That Practitioners Mus...,https://news.bloomberglaw.com/litigation/three...,2024-12-26 04:45:01-05:00,The most important federal tax cases in 2025 i...,"[digital currency, jobs credits, federal tax, ..."
218,Hearst Units Beat Ex-Worker’s Bid to Revive Va...,https://news.bloomberglaw.com/litigation/hears...,2025-01-21 11:52:00-05:00,Two Hearst Corp. units reasonably relied on me...,"[summary judgment, disabilities discrimination..."
219,"This Week in Chancery Court: Apollo, Fidelity,...",https://news.bloomberglaw.com/litigation/this-...,2025-02-03 05:00:00-05:00,Apollo Global Management Inc. will ask the Del...,"[dismissal, state corporate regulation, privat..."
220,Air Marshals Union Advances First Amendment Cl...,https://news.bloomberglaw.com/litigation/air-m...,2025-02-05 10:19:11-05:00,A union that represents air marshals employed ...,"[freedom of speech, dismissal, law enforcement..."
221,NY Counties Can Be Held Liable for Abuse of Fo...,https://news.bloomberglaw.com/litigation/ny-co...,2025-02-18 12:38:01-05:00,Local governments in New York are responsible ...,"[negligence, foster care, burden of proof, chi..."


Saved state to: ./data/scrape_state_20250219131422.json
Saved pickle to: ./data/scrape_result_20250219131422.pkl
-------------- Bank of America  --------------
-- Current state: 2000-04-30-2000-05-30, record_count: 0, iter: 5
-- Current state: 2000-09-27-2000-10-27, record_count: 0, iter: 10
-- Current state: 2001-02-24-2001-03-26, record_count: 0, iter: 15
-- Current state: 2001-07-24-2001-08-23, record_count: 0, iter: 20
-- Current state: 2001-12-21-2002-01-20, record_count: 0, iter: 25
-- Current state: 2002-05-20-2002-06-19, record_count: 0, iter: 30
-- Current state: 2002-10-17-2002-11-16, record_count: 0, iter: 35
-- Current state: 2003-03-16-2003-04-15, record_count: 0, iter: 40
-- Current state: 2003-08-13-2003-09-12, record_count: 0, iter: 45
-- Current state: 2004-01-10-2004-02-09, record_count: 0, iter: 50
-- Current state: 2004-06-08-2004-07-08, record_count: 0, iter: 55
-- Current state: 2004-11-05-2004-12-05, record_count: 0, iter: 60
-- Current state: 2005-04-04-2005-05-

  df['pubDate'] = pd.to_datetime(df['pubDate'], format='%a, %d %b %Y %H:%M:%S %z')


-- Current state: 2020-11-11-2020-12-11, record_count: 41, iter: 255
-- Current state: 2021-04-10-2021-05-10, record_count: 90, iter: 260
-- Current state: 2021-09-07-2021-10-07, record_count: 143, iter: 265
-- Current state: 2022-02-04-2022-03-06, record_count: 193, iter: 270


  df['pubDate'] = pd.to_datetime(df['pubDate'], format='%a, %d %b %Y %H:%M:%S %z')


-- Current state: 2022-07-04-2022-08-03, record_count: 267, iter: 275


  df['pubDate'] = pd.to_datetime(df['pubDate'], format='%a, %d %b %Y %H:%M:%S %z')


-- Current state: 2022-12-01-2022-12-31, record_count: 326, iter: 280


  df['pubDate'] = pd.to_datetime(df['pubDate'], format='%a, %d %b %Y %H:%M:%S %z')


-- Current state: 2023-04-30-2023-05-30, record_count: 421, iter: 285
-- Current state: 2023-09-27-2023-10-27, record_count: 597, iter: 290


  df['pubDate'] = pd.to_datetime(df['pubDate'], format='%a, %d %b %Y %H:%M:%S %z')
  df['pubDate'] = pd.to_datetime(df['pubDate'], format='%a, %d %b %Y %H:%M:%S %z')


-- Current state: 2024-02-24-2024-03-25, record_count: 771, iter: 295
-- Current state: 2024-07-23-2024-08-22, record_count: 1021, iter: 300


  df['pubDate'] = pd.to_datetime(df['pubDate'], format='%a, %d %b %Y %H:%M:%S %z')


-- Current state: 2024-12-20-2025-01-19, record_count: 1193, iter: 305
############## Result: 1242 records ##############


Unnamed: 0,title,link,pubDate,description,topic
1237,Seven Cases to Watch as the Courts Weigh Trump...,https://news.bloomberglaw.com/litigation/seven...,2025-02-15 08:00:22-05:00,President Donald Trump is now defending his wa...,"[gender identity discrimination, unfair and de..."
1238,Willkie Removal in Bankruptcy Case Is Lesson i...,https://news.bloomberglaw.com/litigation/willk...,2025-02-18 05:00:01-05:00,Willkie Farr & Gallagher LLP's disqualificatio...,"[attorney disqualification, leveraged buyout, ..."
1239,Black Church Breaks New Legal Ground Taking Ov...,https://news.bloomberglaw.com/litigation/black...,2025-02-18 05:05:02-05:00,A historic Black church unable to compel the f...,"[default judgment, goodwill, freedom of speech..."
1240,What J&J Is Trying to Achieve in Bankruptcy Co...,https://news.bloomberglaw.com/litigation/what-...,2025-02-18 10:00:03-05:00,Johnson & Johnson begins a two-week trial on F...,"[settlements, personal use product safety, pha..."
1241,Merrill Lynch Ex-Advisers Look to Pause Deferr...,https://news.bloomberglaw.com/litigation/merri...,2025-02-18 13:52:33-05:00,Seven former financial advisers for Bank of Am...,"[forfeiture, commission payments, mandatory ar..."


Saved state to: ./data/scrape_state_20250219132236.json
Saved pickle to: ./data/scrape_result_20250219132236.pkl
