In [None]:
# NLP package used to aid in text manipulation
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

# Machine Learning modules used to prepare and measure text
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import BertTokenizer, BertForSequenceClassification, pipeline
import torch
import pandas as pd

from tqdm.notebook import trange  # Progress bar
import pdfplumber  # PDF text extraction

# HTML text processing
from bs4 import BeautifulSoup

# Helper modules
import matplotlib.pyplot as plt
from tqdm.notebook import trange  # Progress bar


import eikon as ek
from datetime import datetime
import os

# Cap the text shown per DataFrame cell at 60 characters when frames are displayed
pd.set_option('display.max_colwidth', 60)


In [None]:
import configparser
import eikon as ek

# Load the Eikon application key from a local config file so it is not stored in the notebook
config = configparser.ConfigParser()
if not config.read('eikon.cfg'):
    # ConfigParser.read() skips missing files without raising; fail early with a clear message
    raise FileNotFoundError("eikon.cfg not found; it must define [Refinitiv_API] researchNLP_API_KEY")

# Retrieve the API key
api_key = config['Refinitiv_API']['researchNLP_API_KEY']

# Register the key that was just read (an empty string was passed before, leaving api_key unused)
ek.set_app_key(api_key)

# Tokenizer data needed by nltk's sent_tokenize / word_tokenize
nltk.download('punkt')

In [None]:
# Checkpoint shared by the forward-looking-statement (FLS) tokenizer and classifier
FLS_CHECKPOINT = 'yiyanghkust/finbert-fls'

# Tokenizer that converts raw text into the inputs the FLS classifier consumes
tokenizer = BertTokenizer.from_pretrained(FLS_CHECKPOINT)

# Three-label sequence classifier loaded from the same checkpoint
finbert = BertForSequenceClassification.from_pretrained(FLS_CHECKPOINT, num_labels=3)

# Text-classification pipeline that bundles the tokenizer and the classifier
nlp = pipeline("text-classification", model=finbert, tokenizer=tokenizer)

The *finbert-fls* model was specifically designed to analyze financial text to provide a score for each of the predictive states defined by the model:
* Specific FLS
* Non-specific FLS
* Not FLS

For example, we can feed in any text into the model to demonstrate:

In [None]:
# Request the top 3 labels; the classifier was loaded with num_labels=3, so every class is scored
prediction = nlp("The future for next years sales will increase by 10 %.", top_k=3)
prediction

The first 2 outcomes (Specific and Non-specific) represent statements that are **Forward Looking Statements**.  However, the main difference is that 'specific' statements can be interpreted as more precise or specific.  For example:

> *The future for next years sales will increase by 10 %.*

The above statement is specific in that it is precise regarding the increase in sales by 10%.

In [None]:
# Conversely, if we change the above statement slightly (the explicit "by 10 %" figure is removed)...
prediction = nlp("The future for next years sales will increase.", top_k=3)
prediction

However, the following statement:

> *The future for next years sales will increase.*

leans less towards specific and more towards non-specific, as it is not clear what the sales increase will be.  In general, non-specific statements can be ambiguous and potentially lead to uncertainty, whereas specific statements provide more confidence and clarity.

#### Sentiment
As part of the analysis, I plan to evaluate each Forward Looking Statement to measure its sentiment.  While Forward Looking Statements can provide predictions about future business conditions, these conditions are not necessarily positive.  It is therefore useful to measure the overall tone of the forward looking statements during the analysis.

In [None]:
# Checkpoint shared by the sentiment tokenizer and the sentiment classifier
SENTIMENT_CHECKPOINT = "ProsusAI/finbert"

# Sentiment - tokenizer used to encode each sentence for the classifier
sent_tokenizer = AutoTokenizer.from_pretrained(SENTIMENT_CHECKPOINT)

# Sentiment - sequence-classification model whose logits are read in evaluate()
model = AutoModelForSequenceClassification.from_pretrained(SENTIMENT_CHECKPOINT)

#### PDF Text Extraction Function

In [None]:
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as a single string.

    Args:
        pdf_path: Path of the PDF, passed straight to ``pdfplumber.open``.

    Returns:
        str: The page texts joined with a newline. A page whose
        ``extract_text()`` returns ``None`` contributes an empty string.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # Join with a newline so the last word of one page is not glued to the
        # first word of the next page (which would corrupt sentence splitting).
        # str.join also avoids repeated string re-allocation from `+=`.
        return '\n'.join(page.extract_text() or '' for page in pdf.pages)

### PDF Cleaning

In [None]:
import re

def clean_text(text, remove_floats=True):
    """Strip extraction noise from filing text.

    Args:
        text: Raw text to clean.
        remove_floats: When True (the default, matching the previous
            behaviour) decimal numbers such as ``3.5`` are deleted. Pass
            False to keep them, since financial figures are often relevant.

    Returns:
        str: The cleaned text with whitespace normalised.
    """
    # Remove file paths and document headers
    text = re.sub(r'Filings/[\w\.-]+', '', text)

    # Replace each run of non-ASCII characters with a single space
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)

    # Remove citation-like numbers in brackets, e.g. "[12]"
    text = re.sub(r'\[\d+\]', '', text)

    # Optional: remove decimal numbers (careful: financial data is often numerical)
    if remove_floats:
        text = re.sub(r'\b\d+\.\d+\b', '', text)

    # Whitespace normalisation runs last so that the gaps left by the removals
    # above are collapsed too (previously float removal left double spaces).
    text = re.sub(r'[\r\n]+', '\n', text)  # Normalize new lines
    text = re.sub(r'\s{2,}', ' ', text)  # Collapse runs of whitespace

    return text

# Example usage
with open('Filings copy.txt', 'r', encoding='utf-8') as file:
    raw_text = file.read()

cleaned_text = clean_text(raw_text)
# Slice so that only a preview is printed, as the comment always intended
print(cleaned_text[:500])  # Print first 500 characters to check the output


### Filings
The Filings service available within Refinitiv provides the ability to search and extract Global Filings documents for thousands of companies spanning over 50 years of history.  As part of the service, the interface supports the extraction of specific sections within a document.  For example, 10-Q filings are well-structured documents containing specific sections such as Management Discussion, Exhibits, Cashflow Statements, etc.  By utilizing the capabilities offered within the Filings GraphQL interface, I can explicitly choose a specific section for analysis.  Because the <b>entire Filings document</b> is typically complex and contains many standard sections, this will introduce too much insignificant text that may skew the analysis.  Instead, our goal is to utilize the power of the service to choose a specific section that can offer text that may dictate the tone representative of the overall sentiment of the Filing.
#### Define the input properties required for analysis
There are a number of ways I can collect filings data and the strategy I can use to evaluate the text.  For this analysis, I've chosen to use a single company and capture multiple reports over time.  One thing I considered was that I intentionally avoided mixing filings from multiple companies.  It's quite possible that we may get more accurate results if we focus on the writing style and tone from the same company.

To begin, I'll prepare criteria that acts as the basis for the data sets by choosing a company and a time frame to capture content.
### Filings text extraction
The above request performed the query to retrieve quarterly SEC filings for the company Tesla.  The response contains metadata related to the filings as well as the key information requested - the text of the "Management Discussion" section.  At this point, I plan to prepare for the next step of my analysis by extracting this text for each filings document reported.
#### Capture Closing Prices
As part of the analysis, I want to capture the closing price based on the Filings date.  The goal is to look for any possible correlation with the intelligence we extract from the text and the performance of the stock.  While we know performance is based on many factors, the point of this measure is to evaluate one of many possible trends as a way to look for correlation.
### AI model evaluation
For each of the above filings, break down the entire "Management Discussions and Analysis" segment into individual sentences and run each through the pretrained FinBert models to generate our predictions.  The *evaluate()* function below performs the following analysis for each filings report:

For each sentence:
* Evaluate the FLS predictions
* Evaluate the Sentiment

Collect the above FLS results to measure the percentage of sentences that are Forward-looking statements.

In [None]:
def evaluate(text):
    """Measure how forward-looking a document is and the tone of those statements.

    The text is split into sentences. Each sentence is classified by the FLS
    pipeline (``nlp``); sentences whose top label starts with "Specific" or
    "Non" count as forward-looking statements (FLS). Only those sentences are
    scored by the sentiment model, and their softmax probabilities are averaged.

    Args:
        text: Document text to analyse.

    Returns:
        tuple: ``(fls_pct, dominant_sentiment, sentiment_confidence)`` where
        ``fls_pct`` is the percentage of sentences that are FLS,
        ``dominant_sentiment`` is the ``model.config.id2label`` entry with the
        highest average probability, and ``sentiment_confidence`` is that
        average probability as a percentage. When there are no sentences, or
        none is an FLS, the sentiment is ``'neutral'`` with confidence ``0``.
    """
    # Tokenize the text into individual sentences
    sentences = sent_tokenize(text)
    num_sentences = len(sentences)
    fls_count = 0
    total_sentiments = torch.zeros(1, 3)

    for sentence in sentences:
        # Top-ranked FLS label; the sentence is cut to 512 characters first
        prediction = nlp(sentence[:512], top_k=3)[0]['label']

        # Skip anything that is not a Forward Looking Statement
        if not (prediction.startswith("Specific") or prediction.startswith("Non")):
            continue
        fls_count += 1

        # Tokenize for sentiment analysis
        encoded_input = sent_tokenizer(sentence, return_tensors="pt", truncation=True)

        with torch.no_grad():
            # Accumulate the per-class probabilities of this sentence
            output = model(**encoded_input)
            total_sentiments += torch.nn.functional.softmax(output.logits, dim=-1)

    # Percentage of sentences that are FLS (0 for an empty document)
    fls_pct = (fls_count / num_sentences) * 100 if num_sentences > 0 else 0

    # Guard on fls_count, not num_sentences: averaging over zero FLS sentences
    # divided by zero and produced NaN probabilities and a meaningless label.
    if fls_count == 0:
        return fls_pct, 'neutral', 0

    average_sentiments = total_sentiments / fls_count
    dominant_sentiment_index = average_sentiments.argmax(dim=1).item()
    dominant_sentiment = model.config.id2label[dominant_sentiment_index]
    sentiment_confidence = average_sentiments[0][dominant_sentiment_index].item() * 100

    return fls_pct, dominant_sentiment, sentiment_confidence


#### Main Workflow
This is where we integrate the PDF processing into our existing workflow:

In [None]:
def identify_relevant_text(text):
    """Keep only the sentences that mention a forward-looking keyword.

    The text is split with ``sent_tokenize``; a sentence is kept when its
    lower-cased form contains any of the keywords as a substring. The kept
    sentences are returned in their original order, joined by single spaces.
    """
    # Words that might indicate forward-looking statements
    markers = ('future', 'forecast', 'project', 'estimate', 'expect', 'anticipate')

    def mentions_marker(candidate):
        lowered = candidate.lower()
        return any(marker in lowered for marker in markers)

    return ' '.join(filter(mentions_marker, sent_tokenize(text)))


In [None]:
# Daily CLOSE prices for INVESTB.AD from 2000-01-01 onwards, re-indexed to calendar days;
# method='pad' fills in all missing dates with the previous day's value
timeseries = ek.get_timeseries(
    "INVESTB.AD",
    start_date="2000-01-01",
    end_date=None,
    interval="daily",
    fields="CLOSE",
).asfreq('D', method='pad')
timeseries

In [None]:
# Example date (YYYY-MM-DD) used to look up a row of the padded daily series
formatted_date_str = "2008-09-30"


# CLOSE value of the row selected by that date label
stock_price = timeseries.loc[formatted_date_str]['CLOSE']
print(stock_price)


In [None]:

def sanitize_folder_name(name):
    """Return ``name`` stripped of surrounding whitespace, with the characters
    ``: \\ / * ? " < > |`` deleted from what remains."""
    forbidden = str.maketrans('', '', ':\\/*?"<>|')
    trimmed = name.strip()
    return trimmed.translate(forbidden)

# Sort every filing in base_path into a sub-folder named after the third
# " - "-separated field of its file name, and tabulate the parsed name parts.
divisions = []
base_path = "Filings"
# Names for the " - "-separated parts of a file name, in order
column_names = ['desc', 'date', 'ticker', 'full name', 'type', 'filing', 'extension', 'dk']

for f in os.listdir(base_path):
    # Define the current file path
    current_file_path = os.path.join(base_path, f)

    # os.listdir also returns the sub-folders created by an earlier run of this
    # cell; skip them so re-running does not treat folders as filings.
    if not os.path.isfile(current_file_path):
        continue

    print(f)
    division = f.split(" - ")
    divisions.append(division)

    if len(division) < 5:
        print(f"Skipping file {f} as it doesn't have enough parts when split.")
        continue
    sanitized_folder_name = sanitize_folder_name(division[2])
    folder_path = os.path.join(base_path, sanitized_folder_name)

    # Check and create the folder if it doesn't exist
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
        print(f"Folder '{folder_path}' created.")

    # Define the new file path in the destination folder
    new_file_path = os.path.join(folder_path, f)

    # If the file already exists at the destination, delete it
    if os.path.exists(new_file_path):
        os.remove(new_file_path)
        print(f"Existing file '{new_file_path}' has been deleted.")

    # Move the file to the new folder; OSError also covers permission and
    # cross-device failures, not only a missing source file
    try:
        os.rename(current_file_path, new_file_path)
        print(f"File '{f}' moved to '{folder_path}'.")
    except OSError as e:
        print(f"Error moving file {f}: {e}")


df = pd.DataFrame(divisions)

# Label only the columns that exist: assigning a fixed list of 8 names raised
# ValueError whenever the file names did not split into exactly 8 parts.
df = df.rename(columns=dict(enumerate(column_names)))
df



In [None]:
import psycopg2
from contextlib import contextmanager


# Connect to the database
@contextmanager
def get_cursor():
    try:
        conn = psycopg2.connect(
            host="localhost",
            dbname="neuralfin-gpt",
            user="postgres",
            password="1Y9k5t0mniR"
        )
        cursor = conn.cursor()
        yield cursor
        conn.commit()
    except Exception as e:
        conn.rollback()
        print(f"Database error: {e}")
    finally:
        cursor.close()
        conn.close()

# Connectivity check: the query is executed but its rows are not fetched here
with get_cursor() as cursor:
    # Execute a query
    cursor.execute("SELECT * FROM companies;")

In [None]:
# get_cursor() closes its cursor when the `with` block exits, so the cursor left
# over from the previous cell cannot be reused; open a fresh one for these reads.
with get_cursor() as cursor:
    cursor.execute("SELECT * FROM companies;")
    companies = cursor.fetchall()

    cursor.execute("SELECT * FROM reports_metadata;")
    reports = cursor.fetchall()

for company in companies:
    print(company)

for report in reports:
    print(report)


In [None]:
# Function to save text to a file
def save_text_to_file(folder_path, ticker, date, text):
    """Write ``text`` to ``<folder_path>/<ticker>_<date>.txt`` as UTF-8.

    Args:
        folder_path: Destination directory; created if it does not exist.
        ticker: First component of the file name.
        date: Second component of the file name.
        text: Content to write.

    Returns:
        str: Path of the file that was written.
    """
    # Create the destination on first use instead of failing with
    # FileNotFoundError; an empty folder_path means the current directory.
    if folder_path:
        os.makedirs(folder_path, exist_ok=True)

    filename = f"{ticker}_{date}.txt"
    file_path = os.path.join(folder_path, filename)
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(text)
    return file_path

# Function to check and insert into the companies table
def check_and_insert_company(company_name, ticker):
    """Return the ``company_id`` for ``ticker``, inserting the company first
    when no row with that ticker is found.

    Args:
        company_name: Name stored with a newly inserted company.
        ticker: Ticker used for the lookup and stored on insert.

    Returns:
        The first column of the selected row, or of the row returned by the
        INSERT's ``RETURNING company_id`` clause.
    """
    with get_cursor() as cursor:
        cursor.execute("SELECT company_id FROM companies WHERE ticker = %s", (ticker,))
        row = cursor.fetchone()
        if not row:
            # Unknown ticker: create the company and read back its new id
            cursor.execute(
                "INSERT INTO companies (company_name, ticker) VALUES (%s, %s) RETURNING company_id;",
                (company_name, ticker),
            )
            row = cursor.fetchone()
        return row[0]

# Function to insert into the contents table and return content ID
def insert_into_contents(file_path):
    """Insert ``file_path`` into ``contents`` and return the first column of
    the row produced by ``RETURNING content_id``."""
    insert_sql = "INSERT INTO contents (file_path) VALUES (%s) RETURNING content_id;"
    with get_cursor() as cursor:
        cursor.execute(insert_sql, (file_path,))
        inserted = cursor.fetchone()
        return inserted[0]

# Function to insert data into the metadata table
def insert_data_into_db(data):
    """Insert one row into ``reports_metadata``.

    Args:
        data: Mapping providing 'Company ID', 'Date', 'Doc Type',
            'Sentiment Confidence', 'Content ID', 'Sentiment' and
            'FLS_Percentage'; they are bound, in that order, to the
            placeholders of the INSERT statement.
    """
    with get_cursor() as cursor:
        # Looked up inside the `with` so a missing key is handled by get_cursor like any other error
        values = (
            data['Company ID'],
            data['Date'],
            data['Doc Type'],
            data['Sentiment Confidence'],
            data['Content ID'],
            data['Sentiment'],
            data['FLS_Percentage'],
        )
        cursor.execute("""
        INSERT INTO reports_metadata (company_id, date, doc_type, sentiment_perc, content_id, sentiment_lit, fls_pct) 
        VALUES (%s, %s, %s, %s, %s, %s, %s);
        """, values)
        

        
        
def check_if_report_exists(company_id, document_date):
    """Return the result of an ``EXISTS`` query on ``reports_metadata`` for the
    given ``company_id`` and ``date``."""
    with get_cursor() as cursor:
        # Query to check the database
        cursor.execute(
            "SELECT EXISTS(SELECT 1 FROM reports_metadata WHERE company_id=%s AND date=%s)",
            (company_id, document_date),
        )
        exists_row = cursor.fetchone()
        return exists_row[0]


In [None]:
# This cell will process all the PDF files in the folder and extract the relevant information, then save it to the database
folder_path = "MENA_Filings-v1"
data_per_comp = []

# List all files in the folder and filter for PDF files
pdf_files = [f for f in os.listdir(folder_path) if f.endswith('.pdf')]


for pdf_file in pdf_files:
    try:
        # Build the path before logging it: printing first raised NameError on
        # the first file (so it was never processed) and afterwards logged the
        # previous file's path.
        full_path = os.path.join(folder_path, pdf_file)
        print(f"Processing {full_path}")

        # Extract the required information from the file name
        # (fields are separated by " - ")
        parts = pdf_file.split(" - ")
        document_date = parts[1]
        ticker_symbol = parts[2]
        company_name = parts[3]
        document_type = parts[4]

        # Parse the date string into a datetime object
        date_obj = datetime.strptime(document_date, '%Y-%b-%d')
        formatted_date_str = date_obj.strftime('%Y-%m-%d')

        # Check for an existing report BEFORE extracting text or writing
        # anything, so re-runs neither repeat the PDF extraction nor add a new
        # `contents` row for a report that is then skipped.
        company_id = check_and_insert_company(company_name, ticker_symbol)
        if check_if_report_exists(company_id, formatted_date_str):
            print(f"Report for {ticker_symbol} on {formatted_date_str} already exists in the database. Skipping...")
            continue

        text = clean_text(extract_text_from_pdf(full_path))

        text_file_path = save_text_to_file('filings-txt-v1', ticker_symbol, formatted_date_str, text)
        content_id = insert_into_contents(text_file_path)

        fls_pct, dominant_sentiment, sentiment_confidence = evaluate(text)

        row_data = {
            'Company ID': company_id,
            'Date': formatted_date_str,
            'Doc Type': document_type,
            'Sentiment': dominant_sentiment,
            'Content ID': content_id,
            'FLS_Percentage': fls_pct,
            'Sentiment Confidence': sentiment_confidence
        }

        print(row_data)

        insert_data_into_db(row_data)
        data_per_comp.append(row_data)
    except Exception as e:
        # Best effort: report the failing file and carry on with the next one
        print(f"Error processing file: {pdf_file}, Error: {e}")

In [None]:
# Display the row dictionaries collected in data_per_comp by the processing loop
data_per_comp

In [None]:
# Column names must match the keys of the row dictionaries in data_per_comp.
# The previous list asked for 'Ticker' and 'Company Name', which the rows do not
# contain (all-NaN columns), and dropped 'Company ID', 'Content ID' and
# 'Sentiment Confidence'.
results = pd.DataFrame(
    data_per_comp,
    columns=['Company ID', 'Date', 'Doc Type', 'Sentiment', 'Sentiment Confidence', 'FLS_Percentage', 'Content ID'],
)

# to_csv does not create missing directories
os.makedirs('CSVs', exist_ok=True)
results.to_csv('CSVs/test1.csv', index = False)

sorted_result = results.sort_values(by=['Date'], ascending=True)


# Creating The Dataset

In [None]:
import psycopg2
import pandas as pd

# Connect to the database
def get_connection():
    """Open and return a new psycopg2 connection to the ``neuralfin-gpt`` database.

    The password is read from the ``PGPASSWORD`` environment variable instead
    of being stored in the notebook. When the variable is unset ``None`` is
    passed; libpq's own password lookup (e.g. ``~/.pgpass``) is then expected
    to apply - verify against your PostgreSQL client setup.

    Returns:
        The connection object; the caller is responsible for closing it.
    """
    # Local import keeps this cell runnable on its own, like its other imports
    import os

    return psycopg2.connect(
        host="localhost",
        dbname="neuralfin-gpt",
        user="postgres",
        password=os.environ.get("PGPASSWORD"),
    )

# Execute the query and fetch data as a DataFrame
query = """
    SELECT B.file_path, A.sentiment_perc, A.sentiment_lit, A.fls_pct 
    FROM reports_metadata AS A 
    INNER JOIN contents AS B ON A.content_id = B.content_id 
    WHERE fls_pct > 0;
    """

# psycopg2's `with connection:` only ends the transaction; it does not close
# the connection. Close it explicitly so the session is not leaked.
conn = get_connection()
try:
    data = pd.read_sql(query, conn)
finally:
    conn.close()

print(data)


In [None]:
# Load Content from Files
def read_file_content(file_path):
    """Return the full contents of ``file_path``, decoded as UTF-8."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents

# Add a 'content' column holding the text read from each row's file_path
data['content'] = data['file_path'].apply(read_file_content)


In [None]:
# Format the Dataset

def _to_instruction_record(row):
    """Map one row of `data` to an instruction-style training record."""
    return {
        "instruction": "Analyze the sentiment of the following financial report.",
        "input": row["content"],
        "output": row["sentiment_lit"],
        "fls_pct": row["fls_pct"],
        "sentiment_confidence": row["sentiment_perc"]
    }


# One record per row of `data`, in row order
dataset = [_to_instruction_record(row) for _, row in data.iterrows()]


In [None]:
from datasets import Dataset

# `dataset` starts as a list of records and is rebound to a Dataset below.
# Convert only while it is still a list so re-running this cell does not pass
# an already-converted Dataset back into Dataset.from_list.
if isinstance(dataset, list):
    dataset = Dataset.from_list(dataset)
dataset.to_parquet("NeuralFin-GPT_MENA_Financial_Sentiments.parquet")
