# Stock prices near earnings
## In this notebook, we gather data for the following features 
###    i) the closing price for a stock the (trading) day after and before earnings 
###    ii) 15 and 7 (trading) days before earnings 
###   iii) percentage changes for the same

In [1]:
import pandas as pd
import requests
import numpy as np
import matplotlib.pyplot as plt
import re
from datetime import datetime
from pandas.tseries.offsets import BDay
from datetime import datetime, timedelta


from seaborn import set_style
set_style("whitegrid")

import os

from bs4 import BeautifulSoup
import yfinance as yf

## Getting ticker and date from the earnings call transcript

In [2]:
def extract_earnings_call_info(content):
    # Regex pattern to find the ticker symbol
    ticker_pattern = re.compile(r'\((NASDAQ|NYSE):(.*?)\)')
    # More flexible regex pattern to capture various date and time formats
    date_time_pattern = re.compile(
        r'(\b\d{1,2}\s+[A-Za-z]+\s+\d{4}|\b[A-Za-z]+\s+\d{1,2},\s+\d{4}),?\s*(?:at)?\s*(\d{1,2}:\d{2}\s*(AM|PM|am|pm)?)?\s*(ET|PT|CT|MT)?',
        re.IGNORECASE
    )
    
    # Search for the ticker symbol
    ticker_search = ticker_pattern.search(content)
    ticker = ticker_search.group(2) if ticker_search else "Ticker Symbol Not Found"
    
    # Search for the date and time
    date_time_search = date_time_pattern.search(content)
    if date_time_search:
        date = date_time_search.group(1)
        if '0000' in date:  # If '0000' found, search for the next match
            start_pos = date_time_search.end()
            date_time_search = date_time_pattern.search(content, pos=start_pos)  # Start new search after the first match
            
        # Check if there's a valid second match or if the first was valid
        if date_time_search:
            date = date_time_search.group(1)
            time = date_time_search.group(2) if date_time_search.group(2) else "Time Not Provided"  # Time if present
            am_pm = date_time_search.group(3).upper() if date_time_search.group(3) else ""
            timezone = date_time_search.group(4) if date_time_search.group(4) else "Timezone Not Provided"
            date_time = f"{date} at {time} {timezone}".strip()
        else:
            date_time = "Valid Date and Time Not Found"
    else:
        date_time = "Date and Time Not Found"
    
    return ticker, date_time


## Checking if the above function is detecting ticker and date_time for most companies

In [3]:
# Assuming extract_earnings_call_info is defined elsewhere and imported

base_path = r"C:\Users\spatan5\Desktop\EarningsNLP\Earnings Call Transcripts"

counter = 0
missed_earnings = []
weird_filename = []
data_records = []  # List to hold all records for DataFrame

for company_folder in os.listdir(base_path):
    company_path = os.path.join(base_path, company_folder)
    if os.path.isdir(company_path):
        for file_name in os.listdir(company_path):
            file_path = os.path.join(company_path, file_name)
            if os.path.isfile(file_path):
                with open(file_path, 'rb') as file:
                    content = file.read()
                    soup = BeautifulSoup(content, 'html.parser')
                    text_content = soup.get_text()
                    ticker, date_time = extract_earnings_call_info(text_content)
        
                    
                    if file_name[-4:].isdigit():
                        counter += 1
                    if not file_name[-4:].isdigit():
                        weird_filename.append(file_name)
                    if file_name[-4:].isdigit() and date_time == "Date and Time Not Found":
                        missed_earnings.append(file_name)
                    if ticker == "Ticker Symbol Not Found":
                        q_index = file_name.upper().find('Q')
                        if q_index != -1:
                            ticker = file_name[:q_index]
                    # Append each record to the data_records list
                    if ticker == "FB":
                        ticker = "META"
                    data_records.append({
                        "File Name": file_name,
                        "Ticker": ticker,
                        "Earnings Date and Time": date_time
                    })
                    print(f"File: {file_name}, Ticker: {ticker}, Date and Time: {date_time}")


File: .DS_Store, Ticker: Ticker Symbol Not Found, Date and Time: Date and Time Not Found
File: AAPLQ12015, Ticker: AAPL, Date and Time: January 27, 2015 at 5:00 PM ET
File: AAPLQ12016, Ticker: AAPL, Date and Time: January 26, 2016 at 5:00 PM ET
File: AAPLQ12017, Ticker: AAPL, Date and Time: January 31, 2017 at 5:00 PM ET
File: AAPLQ12018, Ticker: AAPL, Date and Time: February 1, 2018 at 5:00 PM ET
File: AAPLQ12019, Ticker: AAPL, Date and Time: January 29, 2019 at 5:00 PM ET
File: AAPLQ12020, Ticker: AAPL, Date and Time: January 28, 2020 at 5:00 PM ET
File: AAPLQ12021, Ticker: AAPL, Date and Time: January 27, 2021 at 5:00 PM ET
File: AAPLQ12022, Ticker: AAPL, Date and Time: January 27, 2022 at 5:00 PM ET
File: AAPLQ12023, Ticker: AAPL, Date and Time: February 2, 2023 at 5:00 PM ET
File: AAPLQ12024, Ticker: AAPL, Date and Time: February 1, 2024 at 5:00 PM ET
File: AAPLQ22015, Ticker: AAPL, Date and Time: April 27, 2015 at 5:00 PM ET
File: AAPLQ22016, Ticker: AAPL, Date and Time: April 26

  soup = BeautifulSoup(content, 'html.parser')


File: UNPQ32019, Ticker: UNP, Date and Time: October 17, 2019 at 8:45 AM ET
File: UNPQ32020, Ticker: UNP, Date and Time: October 22, 2020 at 8:45 AM ET
File: UNPQ32021, Ticker: UNP, Date and Time: October 21, 2021 at 8:45 AM ET
File: UNPQ32022, Ticker: UNP, Date and Time: October 20, 2022 at 8:45 AM ET
File: UNPQ32023, Ticker: UNP, Date and Time: October 19, 2023 at 8:45 AM ET
File: UNPQ42014, Ticker: UNP, Date and Time: Date and Time Not Found
File: UNPQ42015, Ticker: UNP, Date and Time: January 20, 2016 at 8:45 AM ET
File: UNPQ42016, Ticker: UNP, Date and Time: January 19, 2017 at 8:45 AM ET
File: UNPQ42017, Ticker: UNP, Date and Time: January 25, 2017 at 8:45 PM ET
File: UNPQ42018, Ticker: UNP, Date and Time: January 24, 2019 at 8:45 AM ET
File: UNPQ42019, Ticker: UNP, Date and Time: January 23, 2020 at 8:45 AM ET
File: UNPQ42020, Ticker: UNP, Date and Time: January 21, 2021 at 8:45 AM ET
File: UNPQ42021, Ticker: UNP, Date and Time: January 20, 2021 at 8:45 AM ET
File: UNPQ42022, Ti

In [5]:
# Create DataFrame from the list of records
earnings_df = pd.DataFrame(data_records)

# Display the DataFrame to verify contents
print(earnings_df.sample(10))

# Optional: Save the DataFrame to a CSV file
earnings_df.to_csv("earnings_data.csv", index=False)

       File Name Ticker            Earnings Date and Time
143    ACNQ42015    ACN  September 24, 2015 at 8:00 AM ET
342    AMDQ42016    AMD    January 31, 2017 at 5:00 PM ET
3159  SBUXQ42022   SBUX    November 3, 2022 at 5:00 PM ET
608    BACQ22021    BAC       July 14, 2021 at 9:00 AM ET
2553  NFLXQ12022   NFLX      April 19, 2022 at 6:00 PM ET
2369   MRKQ12021    MRK      April 29, 2021 at 8:00 AM ET
2730  ORCLQ22021   ORCL   December 10, 2020 at 5:00 PM ET
2649   NOWQ32019    NOW    October 23, 2019 at 5:00 PM ET
2408    MSQ22018     MS       July 18, 2018 at 8:30 AM ET
2623   NKEQ42023    NKE       June 29, 2023 at 5:00 PM ET


In [6]:
print(counter)

3725


In [7]:
print(missed_earnings)

['PLDQ42018', 'UNPQ32014', 'UNPQ42014']


In [11]:
len(missed_earnings)

3

In [8]:
print(weird_filename, len(weird_filename))

['.DS_Store', '.DS_Store', '.DS_Store', '.DS_Store', '.DS_Store', '.DS_Store', '.DS_Store', '.DS_Store', '.DS_Store', '.DS_Store', '.DS_Store', '.DS_Store', '.DS_Store', '.DS_Store', '.DS_Store', '.DS_Store', '.DS_Store', '.DS_Store', '.DS_Store', '.DS_Store', '.DS_Store', '.DS_Store', '.DS_Store', '.DS_Store', '.DS_Store', '.DS_Store', '.DS_Store', '.DS_Store', '.DS_Store', '.DS_Store', '.DS_Store', '.DS_Store', '.DS_Store', '.DS_Store', "The Walt Disney's (DIS) CEO Bob Iger on F4Q 2014 Results - Earnings Call Transcript", '.DS_Store', '.DS_Store', '.DS_Store', '.DS_Store', '.DS_Store', '.DS_Store', '.DS_Store', '.DS_Store', '.DS_Store', '.DS_Store', '.DS_Store', '.DS_Store', "Intuit's (INTU) CEO Brad Smith on F1Q 2015 Results - Earnings Call Transcript", '.DS_Store', '.DS_Store', '.DS_Store', '.DS_Store', '.DS_Store', '.DS_Store', '.DS_Store', '.DS_Store', '.DS_Store', '.DS_Store', '.DS_Store', '.DS_Store', '.DS_Store', '.DS_Store', '.DS_Store', '.DS_Store', '.DS_Store', '.DS_Store',

## Extracting 
### i) closing price of stock just on the trading day just after the earnings call
### ii) closing price of the stock from the last 7, 15 days before the earnings call

In [9]:

def parse_date_time(date_time_str):
    """Parse the datetime string by correcting format errors and converting to datetime object."""
    # Normalize spaces and remove timezone abbreviation.
    normalized_str = ' '.join(date_time_str.strip().split())
    cleaned_str = re.sub(r'\s+[A-Z]{2}$', '', normalized_str, flags=re.IGNORECASE)
    
    # Fix for incorrect AM/PM notation with 24-hour format
    match = re.search(r'(\d{1,2}):(\d{2})\s*(AM|PM)', cleaned_str, re.IGNORECASE)
    if match:
        hour = int(match.group(1))
        minute = match.group(2)
        period = match.group(3).upper()
        if hour > 12 and 'PM' in period:
            hour -= 12
        elif hour == 0 and 'AM' in period:
            hour = 12
        corrected_time = f"{hour}:{minute} {period}"
        cleaned_str = re.sub(r'\d{1,2}:\d{2}\s*(AM|PM)', corrected_time, cleaned_str, flags=re.IGNORECASE)

    # Attempt to parse the datetime with AM/PM
    try:
        return datetime.strptime(cleaned_str, "%B %d, %Y at %I:%M %p")
    except ValueError:
        # Attempt to parse without AM/PM if previous fails
        try:
            return datetime.strptime(cleaned_str, "%B %d, %Y at %H:%M")
        except ValueError:
            return None


In [10]:
def get_trading_day(ticker, date_time_str):
    """ Find the next trading day after the earnings call. """
    dt = parse_date_time(date_time_str)
    if dt.hour < 16:  # Before market close
        trading_day = dt
    else:
        trading_day = (dt + BDay(1)).date()
    
    ticker_info = yf.Ticker(ticker)
    for _ in range(4):  # Check next 4 business days
        data = ticker_info.history(start=trading_day, end=trading_day + timedelta(days=1))
        if not data.empty:
            return trading_day
        trading_day = (trading_day + BDay(1)).date()

In [11]:
def get_closing_price(ticker, date):
    """ Get closing price for a given ticker and date.
        If data is not available, checks up to four business days prior.
    """
    ticker_info = yf.Ticker(ticker)
    for i in range(5):  # Check the specified day and up to four business days before
        try:
            # Adjust the date by subtracting business days
            current_date = date - BDay(i)
            data = ticker_info.history(start=current_date, end=current_date + timedelta(days=1))
            if not data.empty:
                return data['Close'].iloc[0]
        except Exception as e:
            continue  # Try the next earlier day if there's an error

    return None  # Return None if no data is found after all attempts

In [12]:
def trading_day_before(ticker, date_time_str):
    """ Get the trading day before the earnings call. """
    dt = parse_date_time(date_time_str)
    if dt.hour >= 16:  # After market close
        trading_day = dt
    else:
        trading_day = (dt - BDay(1)).date()

    ticker_info = yf.Ticker(ticker)
    for _ in range(4):  # Check previous 4 business days
        data = ticker_info.history(start=trading_day, end=trading_day + timedelta(days=1))
        if not data.empty:
            return trading_day
        trading_day = (trading_day - BDay(1)).date()

In [13]:
def get_prior_prices(ticker, date_time_str, days_before):
    """ Get closing prices a specified number of business days before the base date. """
    base_date = parse_date_time(date_time_str)
    trading_day = (base_date - BDay(days_before)).date()
    return get_closing_price(ticker, trading_day)

In [14]:
ticker = 'ACN'
date_time_str = 'September 23, 2021 at 8:00 AM ET'
next_trading_day = get_trading_day(ticker, date_time_str)
closing_price_next_day = get_closing_price(ticker, next_trading_day)
trading_day_previous = trading_day_before(ticker, date_time_str)
closing_price_previous_day = get_closing_price(ticker, trading_day_previous)
price_7_days_before = get_prior_prices(ticker, date_time_str, 7)
price_15_days_before = get_prior_prices(ticker, date_time_str, 15)

print(f"Next trading day: {next_trading_day}")
print(f"Closing price next day: {closing_price_next_day}")
print(f"Previous trading day: {trading_day_previous}")
print(f"Closing price previous day: {closing_price_previous_day}")
print(f"Price 7 days before: {price_7_days_before}")
print(f"Price 15 days before: {price_15_days_before}")

Next trading day: 2021-09-23 08:00:00
Closing price next day: 329.62091064453125
Previous trading day: 2021-09-22
Closing price previous day: 321.6062927246094
Price 7 days before: 327.42987060546875
Price 15 days before: 327.698974609375


## Creating final dataframe with file_name and various stock prices and percentage earnings changes

In [15]:
# Assuming all functions and earnings_df are defined and imported correctly.

def compute_stock_changes(row):
    try:
        if row['Ticker'] == "Ticker Symbol Not Found" or row['Earnings Date and Time'] == "Date and Time Not Found":
            return pd.Series([None, None, None, None, None, None, None])

        # Calculate trading days and prices
        next_trading_day = get_trading_day(row['Ticker'], row['Earnings Date and Time'])
        prev_trading_day = trading_day_before(row['Ticker'], row['Earnings Date and Time'])
        price_next_day = get_closing_price(row['Ticker'], next_trading_day)
        price_prev_day = get_closing_price(row['Ticker'], prev_trading_day)
        price_7_days_before = get_prior_prices(row['Ticker'], row['Earnings Date and Time'], 7)
        price_15_days_before = get_prior_prices(row['Ticker'], row['Earnings Date and Time'], 15)

        # Calculate percentage changes
        perc_change_next_prev = ((price_next_day - price_prev_day) / price_prev_day * 100) if price_prev_day else None
        perc_change_prev_7 = ((price_prev_day - price_7_days_before) / price_7_days_before * 100) if price_7_days_before else None
        perc_change_7_15 = ((price_7_days_before - price_15_days_before) / price_15_days_before * 100) if price_15_days_before else None

        # Return a series of computed values
        return pd.Series([
            price_next_day,
            price_prev_day,
            price_7_days_before,
            price_15_days_before,
            round(perc_change_next_prev, 2) if perc_change_next_prev else None,
            round(perc_change_prev_7, 2) if perc_change_prev_7 else None,
            round(perc_change_7_15, 2) if perc_change_7_15 else None
        ])
    except Exception as e:
        # To handle unexpected errors
        print(f"Error processing {row['File Name']}: {str(e)}")
        return pd.Series([None]*7)


In [16]:
# Define the new columns to add
new_cols = ['closing_price_next_day', 'closing_price_previous_day', 'price_7_days_before', 'price_15_days_before',
            'perc_change_next_prev', 'perc_change_prev_7', 'perc_change_7_15']

# Apply the computation function
new_data = earnings_df.apply(compute_stock_changes, axis=1)
new_data.columns = new_cols

# Concatenate with the original DataFrame
final_df = pd.concat([earnings_df, new_data], axis=1)

# Display the new DataFrame to verify contents
print(final_df.head())

AAPL: No price data found, symbol may be delisted (1d 2021-01-18 00:00:00 -> 2021-01-19 00:00:00)
AAPL: No price data found, symbol may be delisted (1d 2019-04-19 00:00:00 -> 2019-04-20 00:00:00)
ABBV: No price data found, symbol may be delisted (1d 2020-04-10 00:00:00 -> 2020-04-11 00:00:00)
ABT: No price data found, symbol may be delisted (1d 2017-01-16 00:00:00 -> 2017-01-17 00:00:00)
ABT: No price data found, symbol may be delisted (1d 2020-01-01 00:00:00 -> 2020-01-02 00:00:00)
ABT: No price data found, symbol may be delisted (1d 2021-01-18 00:00:00 -> 2021-01-19 00:00:00)
ABT: No price data found, symbol may be delisted (1d 2023-01-16 00:00:00 -> 2023-01-17 00:00:00)
ABT: No price data found, symbol may be delisted (1d 2024-01-15 00:00:00 -> 2024-01-16 00:00:00)
ACN: No price data found, symbol may be delisted (1d 2014-11-27 00:00:00 -> 2014-11-28 00:00:00)
ACN: No price data found, symbol may be delisted (1d 2015-11-26 00:00:00 -> 2015-11-27 00:00:00)
ACN: No price data found, s

Error processing BKNGQ12021: 'NoneType' object has no attribute 'hour'


BKNG: No price data found, symbol may be delisted (1d 2019-02-18 00:00:00 -> 2019-02-19 00:00:00)
BKNG: No price data found, symbol may be delisted (1d 2020-02-17 00:00:00 -> 2020-02-18 00:00:00)
BKNG: No price data found, symbol may be delisted (1d 2021-02-15 00:00:00 -> 2021-02-16 00:00:00)
BLK: No price data found, symbol may be delisted (1d 2021-07-05 00:00:00 -> 2021-07-06 00:00:00)
BLK: No price data found, symbol may be delisted (1d 2014-12-25 00:00:00 -> 2014-12-26 00:00:00)
BLK: No price data found, symbol may be delisted (1d 2015-12-25 00:00:00 -> 2015-12-26 00:00:00)
BLK: No price data found, symbol may be delisted (1d 2019-12-25 00:00:00 -> 2019-12-26 00:00:00)
BLK: No price data found, symbol may be delisted (1d 2021-12-24 00:00:00 -> 2021-12-25 00:00:00)
BMY: No price data found, symbol may be delisted (1d 2014-04-18 00:00:00 -> 2014-04-19 00:00:00)
BMY: No price data found, symbol may be delisted (1d 2013-07-04 00:00:00 -> 2013-07-05 00:00:00)
BMY: No price data found, s

Error processing COPQ12020: 'NoneType' object has no attribute 'hour'
Error processing COPQ42017: 'NoneType' object has no attribute 'hour'


COST: No price data found, symbol may be delisted (1d 2017-11-23 00:00:00 -> 2017-11-24 00:00:00)
COST: No price data found, symbol may be delisted (1d 2018-11-22 00:00:00 -> 2018-11-23 00:00:00)
COST: No price data found, symbol may be delisted (1d 2023-11-23 00:00:00 -> 2023-11-24 00:00:00)
CRM: No price data found, symbol may be delisted (1d 2015-02-16 00:00:00 -> 2015-02-17 00:00:00)
CRM: No price data found, symbol may be delisted (1d 2016-02-15 00:00:00 -> 2016-02-16 00:00:00)
CRM: No price data found, symbol may be delisted (1d 2018-02-19 00:00:00 -> 2018-02-20 00:00:00)
CRM: No price data found, symbol may be delisted (1d 2023-02-20 00:00:00 -> 2023-02-21 00:00:00)
CRM: No price data found, symbol may be delisted (1d 2024-02-19 00:00:00 -> 2024-02-20 00:00:00)
CSCO: No price data found, symbol may be delisted (1d 2020-11-21 16:30:00 -> 2020-11-22 16:30:00)
CVS: No price data found, symbol may be delisted (1d 2010-01-18 00:00:00 -> 2010-01-19 00:00:00)
CVS: No price data found, 

Error processing MDTQ42014: 'NoneType' object has no attribute 'hour'


MDT: No price data found, symbol may be delisted (1d 2016-05-30 -> 2016-05-31)
META: No price data found, symbol may be delisted (1d 2020-01-20 00:00:00 -> 2020-01-21 00:00:00)
META: No price data found, symbol may be delisted (1d 2021-01-18 00:00:00 -> 2021-01-19 00:00:00)
MRK: No price data found, symbol may be delisted (1d 2019-04-19 00:00:00 -> 2019-04-20 00:00:00)


Error processing MSQ42014: index 0 is out of bounds for axis 0 with size 0


MS: No price data found, symbol may be delisted (1d 2016-01-18 -> 2016-01-19)
MS: No price data found, symbol may be delisted (1d 2017-01-16 -> 2017-01-17)
MS: No price data found, symbol may be delisted (1d 2023-01-16 -> 2023-01-17)
MS: No price data found, symbol may be delisted (1d 2024-01-15 -> 2024-01-16)
MSFT: No price data found, symbol may be delisted (1d 2019-01-21 00:00:00 -> 2019-01-22 00:00:00)
MSFT: No price data found, symbol may be delisted (1d 2020-01-20 00:00:00 -> 2020-01-21 00:00:00)
MSFT: No price data found, symbol may be delisted (1d 2022-04-15 00:00:00 -> 2022-04-16 00:00:00)
MSFT: No price data found, symbol may be delisted (1d 2023-07-04 00:00:00 -> 2023-07-05 00:00:00)
MU: No price data found, symbol may be delisted (1d 2023-06-19 00:00:00 -> 2023-06-20 00:00:00)
NEE: No price data found, symbol may be delisted (1d 2018-07-04 00:00:00 -> 2018-07-05 00:00:00)
NEE: No price data found, symbol may be delisted (1d 2020-07-03 00:00:00 -> 2020-07-04 00:00:00)
NEE: N

Error processing ORCLQ32024: 'NoneType' object has no attribute 'hour'
Error processing PANWQ22021: 'NoneType' object has no attribute 'hour'


PEP: No price data found, symbol may be delisted (1d 2022-04-15 00:00:00 -> 2022-04-16 00:00:00)
PEP: No price data found, symbol may be delisted (1d 2023-07-04 00:00:00 -> 2023-07-05 00:00:00)
PFE: No price data found, symbol may be delisted (1d 2021-01-02 10:00:00 -> 2021-01-03 10:00:00)
PFE: No price data found, symbol may be delisted (1d 2021-01-01 -> 2021-01-02)
PLD: No price data found, symbol may be delisted (1d 2020-04-10 00:00:00 -> 2020-04-11 00:00:00)
PLD: No price data found, symbol may be delisted (1d 2023-04-07 00:00:00 -> 2023-04-08 00:00:00)
PLD: No price data found, symbol may be delisted (1d 2020-01-01 00:00:00 -> 2020-01-02 00:00:00)
PM: No price data found, symbol may be delisted (1d 2020-04-10 00:00:00 -> 2020-04-11 00:00:00)
QCOM: No price data found, symbol may be delisted (1d 2015-01-19 00:00:00 -> 2015-01-20 00:00:00)
QCOM: No price data found, symbol may be delisted (1d 2016-01-18 00:00:00 -> 2016-01-19 00:00:00)
QCOM: No price data found, symbol may be delist

    File Name                   Ticker          Earnings Date and Time  \
0   .DS_Store  Ticker Symbol Not Found         Date and Time Not Found   
1  AAPLQ12015                     AAPL  January 27, 2015 at 5:00 PM ET   
2  AAPLQ12016                     AAPL  January 26, 2016 at 5:00 PM ET   
3  AAPLQ12017                     AAPL  January 31, 2017 at 5:00 PM ET   
4  AAPLQ12018                     AAPL  February 1, 2018 at 5:00 PM ET   

   closing_price_next_day  closing_price_previous_day  price_7_days_before  \
0                     NaN                         NaN                  NaN   
1               25.771795                   24.392805            23.688774   
2               21.235128                   22.728550            22.078440   
3               29.917074                   28.197569            27.883873   
4               37.894421                   39.613239            41.799549   

   price_15_days_before  perc_change_next_prev  perc_change_prev_7  \
0               

In [17]:
print(final_df.sample(10))

       File Name                   Ticker           Earnings Date and Time  \
2147  LRCXQ42016                     LRCX      July 27, 2016 at 5:00 PM ET   
3544   UPSQ22016                      UPS      July 29, 2016 at 8:30 AM ET   
2902   PGRQ22018                      PGR     August 1, 2018 at 1:30 PM ET   
920     CIQ22021                       CI     August 5, 2021 at 8:30 AM ET   
479   AVGOQ12021                     AVGO      March 4, 2021 at 5:00 PM ET   
698    BMYQ12017                      BMY    April 27, 2017 at 10:30 AM ET   
2694  NVDAQ32017                     NVDA  November 10, 2016 at 5:00 PM ET   
2982   .DS_Store  Ticker Symbol Not Found          Date and Time Not Found   
255    ADPQ32017                      ADP        May 3, 2017 at 8:30 AM ET   
741    BSXQ12019                      BSX     April 24, 2019 at 8:00 AM ET   

      closing_price_next_day  closing_price_previous_day  price_7_days_before  \
2147               82.849335                   81.852440    

In [18]:
# Assuming 'final_df' is your DataFrame
nan_counts = final_df.isna().sum()

# To display the count of NaN values for each column
print(nan_counts)


File Name                       0
Ticker                          0
Earnings Date and Time          0
closing_price_next_day        110
closing_price_previous_day    110
price_7_days_before           110
price_15_days_before          111
perc_change_next_prev         114
perc_change_prev_7            110
perc_change_7_15              114
dtype: int64


In [19]:
# Calculate the number of NaNs per row
nan_counts = final_df.isna().sum(axis=1)

# Filter rows where the count of NaNs is more than 4
rows_with_many_nans = final_df[nan_counts > 4]

# Print these rows
print(rows_with_many_nans)

      File Name                   Ticker   Earnings Date and Time  \
0     .DS_Store  Ticker Symbol Not Found  Date and Time Not Found   
40    .DS_Store  Ticker Symbol Not Found  Date and Time Not Found   
70    .DS_Store  Ticker Symbol Not Found  Date and Time Not Found   
111   .DS_Store  Ticker Symbol Not Found  Date and Time Not Found   
152   .DS_Store  Ticker Symbol Not Found  Date and Time Not Found   
...         ...                      ...                      ...   
3614  .DS_Store  Ticker Symbol Not Found  Date and Time Not Found   
3654  .DS_Store  Ticker Symbol Not Found  Date and Time Not Found   
3694  .DS_Store  Ticker Symbol Not Found  Date and Time Not Found   
3735  .DS_Store  Ticker Symbol Not Found  Date and Time Not Found   
3767  .DS_Store  Ticker Symbol Not Found  Date and Time Not Found   

      closing_price_next_day  closing_price_previous_day  price_7_days_before  \
0                        NaN                         NaN                  NaN   
40       

In [20]:
filtered_df = final_df[final_df['File Name'] != '.DS_Store']

# If you want to ensure the changes are saved back to final_df
final_df = filtered_df

In [21]:
# Calculate the number of NaNs per row
nan_counts = filtered_df.isna().sum(axis=1)

# Filter rows where the count of NaNs is more than 4
rows_with_many_nans = filtered_df[nan_counts > 4]

# Print these rows
print(rows_with_many_nans, len(rows_with_many_nans))

       File Name Ticker                             Earnings Date and Time  \
633   BKNGQ12021   BKNG                      Valid Date and Time Not Found   
979    COPQ12020    COP             April 30, 2020 at Time Not Provided ET   
1005   COPQ42017    COP           February 1, 2018 at Time Not Provided ET   
1433    FIQ32014   FISV                     October 28, 2014 at 5:00 PM ET   
2296   MDTQ42014    MDT  May 20, 2014 at Time Not Provided Timezone Not...   
2424    MSQ42014     MS                     January 20, 2014 at 9:30 AM ET   
2743  ORCLQ32024   ORCL                  CallMarch 11, 2024 at 05:00 PM ET   
2771  PANWQ22021   PANW                   February 22, 2021 at 16:30 AM ET   
2976   PLDQ42018    PLD                            Date and Time Not Found   
3510   UNPQ32014    UNP                            Date and Time Not Found   
3520   UNPQ42014    UNP                            Date and Time Not Found   

      closing_price_next_day  closing_price_previous_day  price

In [22]:
filtered_df.to_csv("filtered_data.csv", index=False)

In [27]:
filtered_df.sample(10)

Unnamed: 0,File Name,Ticker,Earnings Date and Time,closing_price_next_day,closing_price_previous_day,price_7_days_before,price_15_days_before,perc_change_next_prev,perc_change_prev_7,perc_change_7_15
2272,MDTQ12020,MDT,"August 20, 2019 at 8:00 AM ET",94.216362,91.810478,89.598511,91.431557,2.62,2.47,-2.0
3256,TJXQ22020,TJX,"August 20, 2019 at 11:00 AM ET",48.554245,48.591957,49.706585,51.602501,-0.08,-2.24,-3.67
781,CQ12018,C,"April 13, 2018 at 11:30 AM ET",57.58247,58.490677,56.203926,55.060558,-1.55,4.07,2.08
1416,FIQ12017,FI,"April 26, 2017 at 5:00 PM ET",59.945,59.994999,58.224998,57.785,-0.08,3.04,0.76
2727,ORCLQ22018,ORCL,"December 14, 2017 at 5:00 PM ET",43.48735,45.189026,43.208241,43.739456,-3.77,4.58,-1.21
3518,UNPQ32022,UNP,"October 20, 2022 at 8:45 AM ET",179.821854,192.948029,185.791794,192.118607,-6.8,3.85,-3.29
1694,IBMQ12023,IBM,"April 19, 2023 at 5:00 PM ET",120.758972,120.720741,125.22197,123.96048,0.03,-3.59,1.02
2344,MMCQ22023,MMC,"July 20, 2023 at 8:30 AM ET",186.481216,184.667648,184.214264,183.258209,0.98,0.25,0.52
1308,DISQ22016,DIS,"May 10, 2016 at 5:00 PM ET",96.191299,100.244331,97.103477,96.520439,-4.04,3.23,0.6
3052,QCOMQ32023,QCOM,"August 2, 2023 at 4:45 PM ET",116.528656,126.905296,121.829865,116.509026,-8.18,4.17,4.57


In [24]:
filtered_df.shape

(3727, 10)

In [25]:
final_df = filtered_df[~filtered_df['File Name'].str.startswith('FB', na=False)]

In [26]:
# Filter rows where the count of NaNs is more than 4
rows_with_many_nans = final_df[nan_counts > 4]

# Print these rows
print(rows_with_many_nans, len(rows_with_many_nans))

       File Name Ticker                             Earnings Date and Time  \
633   BKNGQ12021   BKNG                      Valid Date and Time Not Found   
979    COPQ12020    COP             April 30, 2020 at Time Not Provided ET   
1005   COPQ42017    COP           February 1, 2018 at Time Not Provided ET   
1433    FIQ32014   FISV                     October 28, 2014 at 5:00 PM ET   
2296   MDTQ42014    MDT  May 20, 2014 at Time Not Provided Timezone Not...   
2424    MSQ42014     MS                     January 20, 2014 at 9:30 AM ET   
2743  ORCLQ32024   ORCL                  CallMarch 11, 2024 at 05:00 PM ET   
2771  PANWQ22021   PANW                   February 22, 2021 at 16:30 AM ET   
2976   PLDQ42018    PLD                            Date and Time Not Found   
3510   UNPQ32014    UNP                            Date and Time Not Found   
3520   UNPQ42014    UNP                            Date and Time Not Found   

      closing_price_next_day  closing_price_previous_day  price