# 1. Bevezetés #

# 2. A Téma elméleti kifejtése #

# 3. Fejlesztői Dokumentáció #

# 4. Implementáció #

## Imports ##

In [2]:
import backtrader as bt
import quantstats
import yfinance as yf
import numpy as np
import plotly.express as px
from matplotlib.dates import DateFormatter
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import pandas as pd
import matplotlib.pyplot as plt
from tornado.test.concurrent_test import MiscFutureTest
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import pipeline
from scipy.special import softmax
from tqdm.notebook import tqdm
import ast

plt.style.use('ggplot')

## NLP Models ##

### DistilRoBERTa Financial Sentiment Model ###

A fine-tuned version of DistilRoberta base model on the Financial PhraseBank dataset.

https://huggingface.co/mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis

In [None]:
# Hugging Face Hub identifier of the DistilRoBERTa financial-news sentiment checkpoint.
model_DistRoBERTa = "mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis"

# Sentiment pipeline built from that checkpoint. The classification head is
# loaded with three labels, and the pipeline is configured with top_k=None,
# padding and truncation (downstream code reads one score per label).
_sentiment_analysis_DistRoBERTa = pipeline(
    "sentiment-analysis",
    model=AutoModelForSequenceClassification.from_pretrained(model_DistRoBERTa, num_labels=3),
    tokenizer=AutoTokenizer.from_pretrained(model_DistRoBERTa),
    top_k=None,
    padding=True,
    truncation=True,
)

### FinBERT ###

A pre-trained NLP model for analyzing the sentiment of financial text.
The BERT model fine-tuned on the Financial PhraseBank dataset.

https://huggingface.co/ProsusAI/finbert

In [None]:
# Hugging Face Hub identifier of the FinBERT checkpoint.
model_FinBERT = "ProsusAI/finbert"

# Sentiment pipeline built from that checkpoint. The classification head is
# loaded with three labels, and the pipeline is configured with top_k=None,
# padding and truncation (downstream code reads one score per label).
_sentiment_analysis_FinBERT = pipeline(
    "sentiment-analysis",
    model=AutoModelForSequenceClassification.from_pretrained(model_FinBERT, num_labels=3),
    tokenizer=AutoTokenizer.from_pretrained(model_FinBERT),
    top_k=None,
    padding=True,
    truncation=True,
)

### FinDeBERTa ###

A fine-tuned version of the DeBERTa-v3 base model for financial news sentiment analysis (the model card does not name the training dataset).

https://huggingface.co/mrm8488/deberta-v3-ft-financial-news-sentiment-analysis

In [None]:
# Hugging Face Hub identifier of the DeBERTa-v3 financial-news sentiment checkpoint.
model_DeBERTa = "mrm8488/deberta-v3-ft-financial-news-sentiment-analysis"

# Sentiment pipeline built from that checkpoint, with a three-label head,
# top_k=None and padding.
# NOTE(review): unlike the DistRoBERTa and FinBERT pipelines in this file,
# truncation is not enabled here -- confirm that this is intentional.
_sentiment_analysis_DeBERTa = pipeline(
    "sentiment-analysis",
    model=AutoModelForSequenceClassification.from_pretrained(model_DeBERTa, num_labels=3),
    tokenizer=AutoTokenizer.from_pretrained(model_DeBERTa),
    top_k=None,
    padding=True,
)

## Functions ##

#### Preprocess Data ####

In [None]:
def _interpolate_gaps(series):
    """
    Fill missing values of a price series by linear interpolation.

    Interior gaps are filled with start + (end - start) * i / (n + 1), where n is
    the number of missing rows and i the 1-based position inside the gap. This is
    what pandas' 'linear' method computes, since it treats rows as equally spaced.
    Gaps at the very start or end take the nearest known value. A series without
    any known value is returned unfilled.
    """
    numeric = pd.to_numeric(series, errors='coerce')
    if not numeric.notna().any():
        return numeric
    return numeric.interpolate(method='linear', limit_direction='both')


def preprocess_data(input_file, ticker="", debug=False,
                    output_dir="/tmp/pycharm_project_520/src/data/input/processed/source"):
    """
    Process stock data by combining news data with Yahoo Finance price data.
    Missing stock data is filled using linear interpolation: i/(n+1) where n is the number
    of missing days and i is the current position in the gap.

    Parameters:
    - input_file: Path to the input CSV file containing news data
      (the ticker branch reads the 'stock', 'date' and 'title' columns)
    - ticker: Ticker symbol of the stock. When empty, the news file is only
      stripped of 'unnamed' columns and saved as-is.
    - debug: If True, prints debug information
    - output_dir: Directory the result is written to as '<ticker>.csv'

    Returns:
    - The processed DataFrame (one row per calendar day when a ticker is given),
      or None when an error was caught and reported on stdout.
    """
    try:
        # Read the CSV file
        data = pd.read_csv(input_file)

        # Remove unnamed columns (typically a stray index written by to_csv)
        unnamed_columns = data.columns[data.columns.str.contains('unnamed', case=False)]
        data = data.drop(columns=unnamed_columns)

        if ticker:
            # Filter data for specific ticker
            data = data[data['stock'] == ticker].copy()
            if data.empty:
                # Without rows there is no date range to download prices for.
                raise ValueError(f"No news rows found for ticker '{ticker}' in '{input_file}'.")

            if debug:
                print(f"Filtered data for ticker '{ticker}':\n", data.head())

            # Convert date to datetime with UTC timezone
            data['date'] = pd.to_datetime(data['date'], utc=True)

            # Drop the timezone (keeping the UTC wall-clock time) and keep the calendar date
            local_dates = data['date'].dt.tz_localize(None).dt.date

            # One row per day, all headlines of that day joined with ' | '
            news_data = (
                data.groupby(local_dates)['title']
                .apply(lambda titles: ' | '.join(titles.dropna().astype(str)))
                .reset_index()
            )
            news_data.rename(columns={'title': 'News', 'date': 'Date'}, inplace=True)

            # Convert Date column to datetime without timezone
            news_data['Date'] = pd.to_datetime(news_data['Date'])

            if debug:
                print("\nNews data sample:\n", news_data.head())
                print("\nNews data Date dtype:", news_data['Date'].dtype)

            # Get date range for stock data
            start_date = local_dates.min()
            end_date = local_dates.max()

            if debug:
                print(f"\nFetching stock data from {start_date} to {end_date}")

            # yfinance treats `end` as exclusive, so ask for one extra day to get
            # real prices for the last news day instead of a forward-filled value.
            fetch_end = (pd.Timestamp(end_date) + pd.Timedelta(days=1)).date()
            stock = yf.download(ticker, start=start_date, end=fetch_end, progress=False)
            stock = stock.reset_index()

            # Flatten multi-level columns if they exist (before touching 'Date')
            stock.columns = [col[0] if isinstance(col, tuple) else col for col in stock.columns]
            stock['Date'] = pd.to_datetime(stock['Date'])

            # Create a complete date range including weekends
            date_range = pd.date_range(start=start_date, end=end_date, freq='D')
            complete_dates = pd.DataFrame({'Date': date_range})

            # Merge stock data with complete date range
            stock_full = pd.merge(complete_dates, stock, on='Date', how='left')

            # Interpolate only the price columns that were actually returned;
            # 'Adj Close' in particular is not always part of the download.
            expected_columns = ('Adj Close', 'Close', 'High', 'Low', 'Open', 'Volume')
            for col in expected_columns:
                if col in stock_full.columns:
                    stock_full[col] = _interpolate_gaps(stock_full[col])

            if debug:
                print("\nStock data sample with interpolated values:\n", stock_full.head(10))

            # Merge with news data
            final_data = pd.merge(stock_full, news_data, on='Date', how='left')

            # Sort by date
            final_data = final_data.sort_values('Date')

            # Convert Date back to date (not datetime) for final output
            final_data['Date'] = final_data['Date'].dt.date

            if debug:
                print("\nFinal merged data sample:\n", final_data.head(10))
                print("\nColumns in final data:", final_data.columns.tolist())
                print(f"\nTotal rows in final data: {len(final_data)}")

        else:
            # If no ticker is specified, just save the cleaned data
            final_data = data

        # Save the processed data
        output_file = f"{output_dir}/{ticker}.csv"
        final_data.to_csv(output_file, index=False)

        return final_data

    except FileNotFoundError:
        print(f"Error: The file '{input_file}' was not found.")
        return None
    except ValueError as ve:
        print(f"Error: {ve}")
        return None
    except Exception as e:
        # Best-effort boundary: report and signal failure with None.
        print(f"An unexpected error occurred: {e}")
        return None

#### Sentiment Scores ####

In [None]:
def get_sentiment_scores_test(df, news, model, debug=False):
    """
    Performs sentiment analysis using a specified pipeline and stores the raw label scores.

    Rows whose text is missing, or for which the pipeline raises IndexError or
    RuntimeError, receive a fully neutral result.

    Args:
        df (pd.DataFrame): DataFrame containing the text data.
        news (str): Column name for the news headlines or text to analyze.
        model (str): The model to use for sentiment analysis ('DistRoBERTa', 'FinBERT', 'DeBERTa').
        debug (bool): If True, also returns the texts the pipeline failed on.

    Returns:
        pd.DataFrame: Copy of df with a '<model>_sentiment_scores' column holding,
        per row, a list of {'label', 'score'} dicts.
        When debug is True, a tuple (DataFrame, fail) where fail maps a running
        counter to each text that raised.

    Raises:
        ValueError: If model is not one of the supported names.
    """
    # Create a copy of the DataFrame to avoid SettingWithCopyWarning
    df = df.copy()

    # Select the appropriate model pipeline
    if model == "DistRoBERTa":
        sent_pipeline = _sentiment_analysis_DistRoBERTa
    elif model == "FinBERT":
        sent_pipeline = _sentiment_analysis_FinBERT
    elif model == "DeBERTa":
        sent_pipeline = _sentiment_analysis_DeBERTa
    else:
        raise ValueError("Invalid model name. Choose 'DistRoBERTa', 'FinBERT', 'DeBERTa'.")

    def neutral_result():
        # A fresh list per row so rows never share one mutable object.
        return [
            {'label': 'neutral', 'score': 1.0},
            {'label': 'negative', 'score': 0},
            {'label': 'positive', 'score': 0},
        ]

    scores = []
    fail = {}

    # Process each row in order; results are kept positionally so that
    # duplicate index labels cannot overwrite each other.
    for text in df[news]:
        if pd.isna(text):
            scores.append(neutral_result())
            continue

        try:
            result = sent_pipeline(text)
        except (IndexError, RuntimeError):
            if debug:
                fail[len(fail)] = text
            scores.append(neutral_result())
        else:
            scores.append(result[0])  # Extract the first list

    # Create a new column with the sentiment scores, aligned row by row
    sentiment_score_col = f"{model}_sentiment_scores"
    df[sentiment_score_col] = pd.Series(scores, index=df.index, dtype=object)

    if debug:
        return df, fail
    return df

In [None]:
def get_sentiment_scores_thread(df, news_column, model, debug=False):
    """
    Runs sentiment analysis on news text through a thread pool and stores one score per row.

    The score is (positive - negative) / (positive + negative), or 0 when both are 0.
    Missing or blank texts, and texts the pipeline fails on, are treated as fully neutral.

    Args:
        df (pd.DataFrame): DataFrame containing the news data
        news_column (str): Name of the column containing news text
        model (str): Model name ('DistRoBERTa', 'FinBERT', or 'DeBERTa')
        debug (bool): If True, prints the error for every text the pipeline fails on

    Returns:
        pd.DataFrame: Copy of df with an added '<model>_sentiment_score' column

    Raises:
        ValueError: If model is not one of the supported names.
    """
    from concurrent.futures import ThreadPoolExecutor

    df = df.copy()

    pipelines = {
        "DistRoBERTa": _sentiment_analysis_DistRoBERTa,
        "FinBERT": _sentiment_analysis_FinBERT,
        "DeBERTa": _sentiment_analysis_DeBERTa
    }

    pipe = pipelines.get(model)
    if pipe is None:
        raise ValueError(f"Invalid model. Choose from: {', '.join(pipelines.keys())}")

    # Same nesting as a pipeline result: a list holding one list of label dicts.
    DEFAULT_SENTIMENT = [[{
        'label': 'negative', 'score': 0
    }, {
        'label': 'neutral', 'score': 1
    }, {
        'label': 'positive', 'score': 0
    }]]

    def process_single_text(text, pipe=pipe):
        """Process a single text item"""
        if pd.isna(text) or not str(text).strip():
            return DEFAULT_SENTIMENT

        try:
            result = pipe(text)
        except Exception as exc:
            # Best effort: a failing text must not abort the whole batch, but
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            if debug:
                print(f"Sentiment analysis failed for {text!r}: {exc}")
            return DEFAULT_SENTIMENT
        return result if isinstance(result, list) else [result]

    # Process texts using thread pool
    texts = df[news_column].values
    with ThreadPoolExecutor(max_workers=4) as executor:  # Adjust max_workers based on your CPU
        all_sentiments = list(tqdm(
            executor.map(process_single_text, texts),
            total=len(texts),
            desc=f"Processing {model} sentiment"
        ))

    def calculate_score(result):
        """Calculate sentiment score from result"""
        result = result[0]
        pos_score = next((item['score'] for item in result if item['label'] == 'positive'), 0)
        neg_score = next((item['score'] for item in result if item['label'] == 'negative'), 0)
        denominator = pos_score + neg_score
        return (pos_score - neg_score) / denominator if denominator > 0 else 0

    # Calculate scores and add to DataFrame
    df[f"{model}_sentiment_score"] = [calculate_score(result) for result in all_sentiments]

    return df

In [None]:
def sentiment_vector(df, scores, model, debug=False):
    """
    Adds a '<model>_sentiment_vector' column holding (neutral, positive, negative) tuples.

    Parameters:
    - df: DataFrame containing the stock data (left untouched; a copy is returned).
    - scores: Column name for the sentiment scores. Each cell is a list of
      {'label', 'score'} dicts, or the string representation of such a list.
    - model: Name of the model, used as the prefix of the new column.
    - debug: If True, prints debugging information.
    """
    def to_vector(cell):
        # Cells read back from CSV arrive as text and are parsed as a Python literal.
        entries = ast.literal_eval(cell) if isinstance(cell, str) else cell
        by_label = {}
        for entry in entries:
            by_label[entry['label']] = entry['score']
        return by_label['neutral'], by_label['positive'], by_label['negative']

    out = df.copy()
    sentiment_vector_col = f"{model}_sentiment_vector"
    out[sentiment_vector_col] = out[scores].apply(to_vector)

    if debug:
        print(f"Sentiment vector column '{sentiment_vector_col}' created successfully.")
        print(out[sentiment_vector_col].head())

    return out

In [None]:
def adjust_scores(df, vectors, model, debug=False):
    """
    Derive two scalar scores from a column of (neutral, positive, negative) vectors.

    Adds to a copy of df:
    - '<model>_adjusted_score': (positive - negative) / (positive + negative),
      or 0 when that sum is not greater than 0.
    - '<model>_raw_score': the positive score if it is larger, the negated
      negative score if that is larger, 0 on a tie.

    Parameters:
    - df: DataFrame containing the vector column.
    - vectors: Column name of the sentiment vectors.
    - model: Name of the model, used as the prefix of the new columns.
    - debug: If True, prints summary statistics of the adjusted score.
    """
    def polarity_ratio(vector):
        _, pos, neg = vector
        directional_mass = pos + neg  # the neutral share is deliberately left out
        if directional_mass > 0:
            return (pos - neg) / directional_mass
        return 0

    def dominant_signed(vector):
        _, pos, neg = vector
        if pos > neg:
            return pos
        if neg > pos:
            return -1 * neg
        return 0

    out = df.copy()
    out[f"{model}_adjusted_score"] = out[vectors].apply(polarity_ratio)
    out[f"{model}_raw_score"] = out[vectors].apply(dominant_signed)

    if debug:
        print(f"Adjusted scores distribution:\n{out[f'{model}_adjusted_score'].describe()}")

    return out

#### Validate Predictions ####

In [None]:
def validate_predictions(df, tar, sco, vol, model, score_type, target_factor=0.15, score_factor = 0.5, debug=False):
    """
    Mark, row by row, whether the sentiment score agreed with the price move.

    A row counts as a hit (1) when either
    - both the move and the score exceed their neutral thresholds and have the
      same sign, or
    - both stay strictly below their neutral thresholds.
    Everything else, including rows with a missing value, counts as a miss (0).

    Args:
        df (pd.DataFrame): DataFrame containing stock data with calculated volatility.
        tar (str): Column name for the target price change (e.g., 'Daily Return').
        sco (str): Column name for the sentiment score.
        vol (str): Column name for the volatility used to scale the target threshold.
        model (str): Model name to include in the predictions column.
        score_type (str): Score variant to include in the predictions column name.
        target_factor (float): Multiplier for volatility to define the neutral threshold
            for the target; the resulting threshold is capped at 1.
        score_factor (float): Threshold for neutral sentiment score.
        debug (bool): If True, shows a progress bar and returns details of failures.

    Returns:
        pd.DataFrame: DataFrame with a '<model>_<score_type>_predictions' column.
        When debug is True, a tuple (DataFrame, fail).

    Raises:
        ValueError: If any of the tar, sco or vol columns is missing.
    """
    absent = [name for name in [tar, sco, vol] if name not in df.columns]
    if absent:
        raise ValueError(f"Missing required columns: {absent}")

    outcomes = {}
    fail = {}
    failure_count = 0

    for idx, row in tqdm(df.iterrows(), total=len(df), disable=not debug):
        try:
            target = row[tar]
            score = row[sco]

            # Rows with any missing input cannot be judged and count as a miss.
            if pd.isna(target) or pd.isna(score) or pd.isna(row[vol]):
                outcomes[idx] = 0
                continue

            # Volatility-scaled neutral band for the price move, capped at 1.
            scaled_volatility = abs(row[vol] * target_factor)
            target_band = scaled_volatility if scaled_volatility <= 1 else 1
            score_band = score_factor

            if abs(target) > target_band and abs(score) > score_band:
                # Directional case: a positive product means matching signs
                # (a zero on either side yields 0 and therefore a miss).
                outcomes[idx] = 1 if (target * score) > 0 else 0
            elif abs(score) < score_band and abs(target) < target_band:
                # Neutral case: both stayed inside their bands.
                outcomes[idx] = 1
            else:
                outcomes[idx] = 0

        except KeyError as e:
            if debug:
                print(f"KeyError at index {idx}: {str(e)}")
            outcomes[idx] = 0
        except Exception as e:
            if debug:
                fail[failure_count] = {'index': idx, 'error': str(e)}
                failure_count += 1
            outcomes[idx] = 0

    predictions = pd.Series(outcomes, name=f"{model}_{score_type}_predictions")

    # Replace a column left over from an earlier run instead of colliding in join().
    if predictions.name in df.columns:
        df = df.drop(columns=[predictions.name])
    df = df.join(predictions)

    return (df, fail) if debug else df

In [None]:
def add_trend(df, close, day=1, debug=False):
    """
    Adds a 'trend_<day>d' column: the percentage change of the close price over
    `day` rows, shifted up by one row so each row carries the following row's value.

    The column is written into `df` itself, which is also returned.
    NOTE(review): the shift is always one row regardless of `day` -- confirm
    that this is intended for day > 1.

    Parameters:
    - df: DataFrame containing the stock data (modified in place).
    - close: Column name for the close price.
    - day: Number of days to calculate the trend.
    - debug: If True, a missing `close` column is reported on stdout;
      either way the DataFrame is returned without the trend column.
    """
    trend_column = f"trend_{day}d"
    try:
        change = df[close].pct_change(periods=day)
        df[trend_column] = change.shift(-1)
    except KeyError as e:
        if debug:
            print(f"KeyError: {str(e)}")
    return df

In [None]:
def add_volatility(df, close, window=5, trading_days=252, debug=False):
    """
    Add log returns and their annualized rolling volatility to a copy of `df`.

    Parameters:
    -----------
    df : pandas.DataFrame
        Input DataFrame containing price data (not modified)
    close : str
        Name of the column containing closing prices
    window : int, optional
        Rolling window size for volatility calculation (default: 5)
    trading_days : int, optional
        Annual trading days used for the sqrt(trading_days) scaling (default: 252)
    debug : bool, optional
        If True, also adds 'rolling_mean' and 'rolling_variance' of the log
        returns over the same window (default: False)

    Returns:
    --------
    pandas.DataFrame
        Copy of `df` with 'log_returns' and 'volatility_<window>d' columns
    """
    out = df.copy()
    prices = out[close]

    # Log return of each row relative to the previous row.
    out['log_returns'] = np.log(prices / prices.shift(1))

    windowed_returns = out['log_returns'].rolling(window=window)
    annualization = np.sqrt(trading_days)
    out[f'volatility_{window}d'] = windowed_returns.std() * annualization

    if debug:
        out['rolling_mean'] = out['log_returns'].rolling(window=window).mean()
        out['rolling_variance'] = out['log_returns'].rolling(window=window).var()

    return out

#### Process Data ####

In [None]:
def process_data(input_file, ticker="", models=('DistRoBERTa', 'FinBERT', 'DeBERTa'), debug=False):
    """
    Run the full pipeline for one ticker: preprocess, add trend and volatility,
    score the news with every requested model, validate the predictions, save
    the result and print the share of correct predictions per model.

    Parameters:
    - input_file: Path to the input CSV file containing news data
    - ticker: Ticker symbol of the stock
    - models: Names of the sentiment models to run
    - debug: If True, preprocessing prints debug information

    Returns:
    - DataFrame with all derived columns; it is also written to
      '/tmp/pycharm_project_520/src/data/output/<ticker>_sentiment.csv'.

    Raises:
    - RuntimeError: If preprocessing failed, so a stale file from an earlier
      run is never processed by accident.
    """
    if preprocess_data(input_file, ticker=ticker, debug=debug) is None:
        raise RuntimeError(f"Preprocessing failed for '{input_file}' (ticker '{ticker}').")

    # Re-read the file preprocess_data just wrote.
    data = pd.read_csv(f'/tmp/pycharm_project_520/src/data/input/processed/source/{ticker}.csv')

    data = add_trend(data, "Close")
    data = add_volatility(data, 'Close')
    for model in models:
        data = get_sentiment_scores_test(data, 'News', model=model)
        data = sentiment_vector(data, f'{model}_sentiment_scores', model)
        data = adjust_scores(data, f'{model}_sentiment_vector', model)
        data = validate_predictions(data, 'trend_1d', f'{model}_adjusted_score', 'volatility_5d', model, 'adjusted')
        data = validate_predictions(data, 'trend_1d', f'{model}_raw_score', 'volatility_5d', model, 'raw')

    data.to_csv(f'/tmp/pycharm_project_520/src/data/output/{ticker}_sentiment.csv', index=False)

    # Report only the models that were actually run; the prediction columns
    # hold 0/1 flags, so their mean is the share of correct predictions.
    for model in models:
        raw_accuracy = data[f'{model}_raw_predictions'].mean()
        adjusted_accuracy = data[f'{model}_adjusted_predictions'].mean()
        print(f"{model} Accuracy:\nRaw: {raw_accuracy},\tAdjusted: {adjusted_accuracy}")

    return data

#### Plot ####

In [None]:
def create_sentiment_plot(df, data_name, size=4, debug=False):
    """
    Build a Plotly scatter figure of the adjusted sentiment score per model,
    with separate traces (and shades) for correct and incorrect predictions.

    Parameters:
    - df: DataFrame with a 'Date' column and, per model, '<model>_adjusted_score'
      plus a 0/1 prediction column. '<model>_predictions' is used when present;
      otherwise '<model>_adjusted_predictions' (the name validate_predictions
      produces for the adjusted score) is used.
    - data_name: Label used in the figure title.
    - size: Marker size.
    - debug: If True, prints which prediction column was picked per model.

    Returns:
    - plotly.graph_objects.Figure

    Raises:
    - KeyError: If a required column is missing from df.
    """
    fig = go.Figure()

    # Define color schemes for each model: (true_color, false_color)
    color_schemes = {
        'DistRoBERTa': ('#2ecc71', '#a9dfbf'),  # Green shades
        'FinBERT': ('#3498db', '#aed6f1'),      # Blue shades
        'DeBERTa': ('#9b59b6', '#d7bde2')       # Purple shades
    }

    for model_name, (true_color, false_color) in color_schemes.items():
        score_col = f'{model_name}_adjusted_score'

        # Prefer the legacy column name, fall back to the one written by
        # validate_predictions(..., score_type='adjusted').
        legacy_pred_col = f'{model_name}_predictions'
        if legacy_pred_col in df.columns:
            pred_col = legacy_pred_col
        else:
            pred_col = f'{model_name}_adjusted_predictions'

        if debug:
            print(f"{model_name}: using prediction column '{pred_col}'")

        # Create scatter plot for each prediction value (0 and 1)
        for pred_value in [0, 1]:
            mask = df[pred_col] == pred_value

            fig.add_trace(
                go.Scatter(
                    x=df.loc[mask, 'Date'],
                    y=df.loc[mask, score_col],
                    name=f'{model_name} (Prediction={bool(pred_value)})',
                    mode='markers',
                    marker=dict(
                        color=true_color if pred_value == 1 else false_color,
                        size=size,
                        symbol='circle'
                    )
                )
            )

    fig.update_layout(
        title=f'{data_name} Sentiment Scores Over Time by Model',
        xaxis_title='Date',
        yaxis_title='Sentiment Score',
        template='plotly_white',
        hovermode='x unified',
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1
        ),
        xaxis=dict(showgrid=True),
        yaxis=dict(showgrid=True)
    )
    return fig

In [None]:
def plot_sentiment_and_stock(data, model_name, debug=False):
    """
    Plot sentiment scores and stock price changes over time.

    Draws three lines against the DataFrame index, each on its own y-axis:
    '<model_name>_adjusted_score_ma' (left axis), 'close_diff' (right axis) and
    '<model_name>_raw_score_ma' (second right axis, offset outward so that it
    does not sit on top of the first one). Rows containing any NaN are dropped
    before plotting. The figure is shown with plt.show(); nothing is returned.

    Parameters:
    - data: DataFrame containing sentiment and stock data (not modified).
    - model_name: String, name of the sentiment analysis model (e.g., 'DeBERTa').
    - debug: Currently unused.

    Raises:
    - KeyError: If one of the three plotted columns is missing.
    """
    plot_data = data.dropna()

    fig, ax1 = plt.subplots(figsize=(22, 6))

    # Plot adjusted sentiment score moving average
    adjusted_score_col = f"{model_name}_adjusted_score_ma"
    ax1.plot(plot_data.index, plot_data[adjusted_score_col], color='blue', label=f'{model_name} Adjusted Score MA')
    ax1.set_title(f'{model_name} Sentiment Scores Over Time by Model')
    ax1.set_xlabel('Date')
    ax1.set_ylabel('Adjusted Sentiment Score', color='blue')
    ax1.tick_params(axis='y', labelcolor='blue')

    ax1.xaxis.set_major_locator(plt.MaxNLocator(12))
    ax1.xaxis.set_major_formatter(DateFormatter('%y-%m-%d'))

    # Add stock price change on secondary y-axis
    ax2 = ax1.twinx()
    ax2.plot(plot_data.index, plot_data['close_diff'], color='red', label='Close Price Change')
    ax2.set_ylabel('Price Change', color='red')
    ax2.tick_params(axis='y', labelcolor='red')

    # Add raw sentiment score moving average on another y-axis
    ax3 = ax1.twinx()
    # Both twin axes share the right edge by default; push this one outward so
    # its ticks and label stay readable.
    ax3.spines['right'].set_position(('outward', 60))
    raw_score_col = f"{model_name}_raw_score_ma"
    ax3.plot(plot_data.index, plot_data[raw_score_col], color='orange', label=f'{model_name} Raw Score MA')
    ax3.set_ylabel('Raw Sentiment Score', color='orange')
    ax3.tick_params(axis='y', labelcolor='orange')

    # Combine legends
    lines1, labels1 = ax1.get_legend_handles_labels()
    lines2, labels2 = ax2.get_legend_handles_labels()
    lines3, labels3 = ax3.get_legend_handles_labels()
    ax1.legend(lines1 + lines2 + lines3, labels1 + labels2 + labels3, loc='upper left', fontsize=12)

    plt.show()

In [None]:
def organize_data_for_plot(data, models=('DistRoBERTa', 'FinBERT', 'DeBERTa'), stock_window=120, sentiment_window=90, resample='D', debug=False):
    """
    Organizes the data for plotting.

    The input is indexed by its 'Date' column, stripped of columns that are not
    plotted, resampled to a regular frequency (mean per bin), gap-filled by
    linear interpolation and extended with rolling-mean columns:
    '<model>_adjusted_score_ma' / '<model>_raw_score_ma' for every model whose
    score column is present, plus 'close_ma' and 'close_diff'
    (Close minus its rolling mean) when 'Close' is present.

    Parameters:
    - data: DataFrame containing the data to organize; must have a 'Date' column.
      It is not modified.
    - models: Tuple of model names to process.
    - stock_window: Rolling window size for stock prices.
    - sentiment_window: Rolling window size for sentiment scores.
    - resample: Resampling frequency (e.g., 'D' for daily).
    - debug: If True, prints intermediate debug information.

    Returns:
    - DataFrame organized for plotting, indexed by date.

    Raises:
    - ValueError: if `data` has no 'Date' column.
    """
    if 'Date' not in data.columns:
        raise ValueError("The DataFrame must contain a 'Date' column.")

    # Work on a copy so the caller's frame keeps its 'Date' column and contents.
    plot_data = data.copy()
    plot_data['Date'] = pd.to_datetime(plot_data['Date'], errors='coerce')
    plot_data = plot_data.set_index('Date')

    # Drop columns that are never plotted
    unused_columns = ['High', 'Low', 'Open', 'Volume', 'Adj Close', 'News', 'log_returns']
    plot_data = plot_data.drop(columns=unused_columns, errors='ignore')

    if debug:
        print("Remaining columns after initial drop:", plot_data.columns)

    # Drop the per-model columns that are not used for plotting
    model_columns = [
        f'{model}_{suffix}'
        for model in models
        for suffix in ('sentiment_scores', 'sentiment_vector', 'predictions')
    ]
    plot_data = plot_data.drop(columns=model_columns, errors='ignore')

    # Resample and interpolate missing values. numeric_only=True keeps the
    # aggregation working when text columns survive the drops above; without it
    # pandas >= 2.0 raises TypeError instead of silently skipping them.
    plot_data = plot_data.resample(resample).mean(numeric_only=True)
    plot_data = plot_data.interpolate(method='linear')

    # Calculate rolling averages for sentiment scores
    for model in models:
        for score_kind in ('adjusted', 'raw'):
            score_col = f'{model}_{score_kind}_score'
            if score_col in plot_data.columns:
                plot_data[f'{score_col}_ma'] = plot_data[score_col].rolling(window=sentiment_window, center=False).mean()

    # Calculate rolling averages and differences for stock prices
    if 'Close' in plot_data.columns:
        plot_data['close_ma'] = plot_data['Close'].rolling(window=stock_window, center=False).mean()
        plot_data['close_diff'] = plot_data['Close'] - plot_data['close_ma']

    return plot_data

## Sentiment Analysis ##

### NVDA ###

#### Process Data ####

In [None]:
NVDA_data = process_data("/tmp/pycharm_project_520/src/data/input/raw/source/analyst_ratings_processed.csv", "/tmp/pycharm_project_520/src/data/input/processed/source/", ticker="NVDA")

#### Plot Data ####

In [None]:
NVDA_data.plot.line(y="Close", x='Date', figsize=(22, 6))

In [None]:
NVDA_data.plot.line(y="trend_1d", x='Date', figsize=(22, 6))

In [None]:
NVDA_data.plot.line(y="volatility_5d", x='Date', figsize=(22, 6))

In [None]:
NVDA_plot_data = organize_data_for_plot(NVDA_data, resample='W', sentiment_window=3, stock_window=3)

In [None]:
plot_sentiment_and_stock(NVDA_plot_data, 'DeBERTa')

In [None]:
plot_sentiment_and_stock(NVDA_plot_data, 'FinBERT')

In [None]:
plot_sentiment_and_stock(NVDA_plot_data, 'DistRoBERTa')

#### Correlation ####

In [None]:
NVDA_data[['DeBERTa_adjusted_score', 'FinBERT_adjusted_score', 'DistRoBERTa_adjusted_score', 'trend_1d']].rolling(window=5, center=False).mean().corr(method='spearman')

In [None]:
NVDA_data[['DeBERTa_adjusted_score', 'FinBERT_adjusted_score', 'DistRoBERTa_adjusted_score', 'trend_1d']].rolling(window=5, center=False).mean().corr()

In [None]:
NVDA_data[['DeBERTa_raw_score', 'FinBERT_raw_score', 'DistRoBERTa_raw_score', 'trend_1d']].rolling(window=5, center=False).mean().corr(method='spearman')

In [None]:
NVDA_data[['DeBERTa_raw_score', 'FinBERT_raw_score', 'DistRoBERTa_raw_score', 'trend_1d']].rolling(window=5, center=False).mean().corr()

### O ###

#### Process Data ####

In [None]:
O_data = process_data("/tmp/pycharm_project_520/src/data/input/raw/source/analyst_ratings_processed.csv", "/tmp/pycharm_project_520/src/data/input/processed/source/", ticker="O")

#### Plot Data ####

In [None]:
O_data.plot.line(y="Close", x='Date', figsize=(22, 6))

In [None]:
O_data.plot.line(y="trend_1d", x='Date', figsize=(22, 6))

In [None]:
O_data.plot.line(y="volatility_5d", x='Date', figsize=(22, 6))

In [None]:
O_plot_data = organize_data_for_plot(O_data, resample='D')

In [None]:
plot_sentiment_and_stock(O_plot_data, 'DeBERTa')

In [None]:
plot_sentiment_and_stock(O_plot_data, 'FinBERT')

In [None]:
plot_sentiment_and_stock(O_plot_data, 'DistRoBERTa')

#### Correlation ####

In [None]:
O_data[['DeBERTa_adjusted_score', 'FinBERT_adjusted_score', 'DistRoBERTa_adjusted_score', 'trend_1d']].rolling(window=28, center=False).mean().corr(method='spearman')

In [None]:
O_data[['DeBERTa_adjusted_score', 'FinBERT_adjusted_score', 'DistRoBERTa_adjusted_score', 'trend_1d']].rolling(window=28, center=False).mean().corr()

In [None]:
O_data[['DeBERTa_raw_score', 'FinBERT_raw_score', 'DistRoBERTa_raw_score', 'trend_1d']].rolling(window=28, center=False).mean().corr(method='spearman')

In [None]:
O_data[['DeBERTa_raw_score', 'FinBERT_raw_score', 'DistRoBERTa_raw_score', 'trend_1d']].rolling(window=28, center=False).mean().corr()

### MU ###

#### Process Data ####

In [None]:
MU_data = process_data("/tmp/pycharm_project_520/src/data/input/raw/source/analyst_ratings_processed.csv", "/tmp/pycharm_project_520/src/data/input/processed/source/", ticker="MU")

#### Plot Data ####

In [None]:
MU_data.plot.line(y="Close", x='Date', figsize=(22, 6))

In [None]:
MU_data.plot.line(y="trend_1d", x='Date', figsize=(22, 6))

In [None]:
MU_data.plot.line(y="volatility_5d", x='Date', figsize=(22, 6))

In [None]:
MU_plot_data = organize_data_for_plot(MU_data, resample='D')

In [None]:
plot_sentiment_and_stock(MU_plot_data, 'DeBERTa')

In [None]:
plot_sentiment_and_stock(MU_plot_data, 'FinBERT')

In [None]:
plot_sentiment_and_stock(MU_plot_data, 'DistRoBERTa')

#### Correlation ####

In [None]:
MU_data[['DeBERTa_adjusted_score', 'FinBERT_adjusted_score', 'DistRoBERTa_adjusted_score', 'trend_1d']].rolling(window=28, center=False).mean().corr(method='spearman')

In [None]:
MU_data[['DeBERTa_adjusted_score', 'FinBERT_adjusted_score', 'DistRoBERTa_adjusted_score', 'trend_1d']].rolling(window=28, center=False).mean().corr()

In [None]:
MU_data[['DeBERTa_raw_score', 'FinBERT_raw_score', 'DistRoBERTa_raw_score', 'trend_1d']].rolling(window=28, center=False).mean().corr(method='spearman')

In [None]:
MU_data[['DeBERTa_raw_score', 'FinBERT_raw_score', 'DistRoBERTa_raw_score', 'trend_1d']].rolling(window=28, center=False).mean().corr()

### MS ###

#### Process Data ####

In [None]:
MS_data = process_data("/tmp/pycharm_project_520/src/data/input/raw/source/analyst_ratings_processed.csv", "/tmp/pycharm_project_520/src/data/input/processed/source/", ticker="MS")

#### Plot Data ####

In [None]:
MS_data.plot.line(y="Close", x='Date', figsize=(22, 6))

In [None]:
MS_data.plot.line(y="trend_1d", x='Date', figsize=(22, 6))

In [None]:
MS_data.plot.line(y="volatility_5d", x='Date', figsize=(22, 6))

In [None]:
MS_plot_data = organize_data_for_plot(MS_data, resample='D')

In [None]:
plot_sentiment_and_stock(MS_plot_data, 'DeBERTa')

In [None]:
plot_sentiment_and_stock(MS_plot_data, 'FinBERT')

In [None]:
plot_sentiment_and_stock(MS_plot_data, 'DistRoBERTa')

#### Correlation ####

In [None]:
MS_data[['DeBERTa_adjusted_score', 'FinBERT_adjusted_score', 'DistRoBERTa_adjusted_score', 'trend_1d']].rolling(window=28, center=False).mean().corr(method='spearman')

In [None]:
MS_data[['DeBERTa_adjusted_score', 'FinBERT_adjusted_score', 'DistRoBERTa_adjusted_score', 'trend_1d']].rolling(window=28, center=False).mean().corr()

In [None]:
MS_data[['DeBERTa_raw_score', 'FinBERT_raw_score', 'DistRoBERTa_raw_score', 'trend_1d']].rolling(window=28, center=False).mean().corr(method='spearman')

In [None]:
MS_data[['DeBERTa_raw_score', 'FinBERT_raw_score', 'DistRoBERTa_raw_score', 'trend_1d']].rolling(window=28, center=False).mean().corr()

### GOOG ###

#### Process Data ####

In [None]:
GOOG_data = process_data("/tmp/pycharm_project_520/src/data/input/raw/source/analyst_ratings_processed.csv", "/tmp/pycharm_project_520/src/data/input/processed/source/", ticker="GOOG")

#### Plot Data ####

In [None]:
GOOG_data.plot.line(y="Close", x='Date', figsize=(22, 6))

In [None]:
GOOG_data.plot.line(y="trend_1d", x='Date', figsize=(22, 6))

In [None]:
GOOG_data.plot.line(y="volatility_5d", x='Date', figsize=(22, 6))

In [None]:
GOOG_plot_data = organize_data_for_plot(GOOG_data, resample='D')

In [None]:
plot_sentiment_and_stock(GOOG_plot_data, 'DeBERTa')

In [None]:
plot_sentiment_and_stock(GOOG_plot_data, 'FinBERT')

In [None]:
plot_sentiment_and_stock(GOOG_plot_data, 'DistRoBERTa')

#### Correlation ####

In [None]:
GOOG_data[['DeBERTa_adjusted_score', 'FinBERT_adjusted_score', 'DistRoBERTa_adjusted_score', 'trend_1d']].rolling(window=28, center=False).mean().corr(method='spearman')

In [None]:
GOOG_data[['DeBERTa_adjusted_score', 'FinBERT_adjusted_score', 'DistRoBERTa_adjusted_score', 'trend_1d']].rolling(window=28, center=False).mean().corr()

In [None]:
GOOG_data[['DeBERTa_raw_score', 'FinBERT_raw_score', 'DistRoBERTa_raw_score', 'trend_1d']].rolling(window=28, center=False).mean().corr(method='spearman')

In [None]:
GOOG_data[['DeBERTa_raw_score', 'FinBERT_raw_score', 'DistRoBERTa_raw_score', 'trend_1d']].rolling(window=28, center=False).mean().corr()

### MRK ###

#### Process Data ####

In [None]:
MRK_data = process_data("/tmp/pycharm_project_520/src/data/input/raw/source/analyst_ratings_processed.csv", "/tmp/pycharm_project_520/src/data/input/processed/source/", ticker="MRK")

#### Plot Data ####

In [None]:
MRK_data.plot.line(y="Close", x='Date', figsize=(22, 6))

In [None]:
MRK_data.plot.line(y="trend_1d", x='Date', figsize=(22, 6))

In [None]:
MRK_data.plot.line(y="volatility_5d", x='Date', figsize=(22, 6))

In [None]:
MRK_plot_data = organize_data_for_plot(MRK_data, resample='D')

In [None]:
plot_sentiment_and_stock(MRK_plot_data, 'DeBERTa')

In [None]:
plot_sentiment_and_stock(MRK_plot_data, 'FinBERT')

In [None]:
plot_sentiment_and_stock(MRK_plot_data, 'DistRoBERTa')

#### Correlation ####

In [None]:
MRK_data[['DeBERTa_adjusted_score', 'FinBERT_adjusted_score', 'DistRoBERTa_adjusted_score', 'trend_1d']].rolling(window=28, center=False).mean().corr(method='spearman')

In [None]:
MRK_data[['DeBERTa_adjusted_score', 'FinBERT_adjusted_score', 'DistRoBERTa_adjusted_score', 'trend_1d']].rolling(window=28, center=False).mean().corr()

In [None]:
MRK_data[['DeBERTa_raw_score', 'FinBERT_raw_score', 'DistRoBERTa_raw_score', 'trend_1d']].rolling(window=28, center=False).mean().corr(method='spearman')

In [None]:
MRK_data[['DeBERTa_raw_score', 'FinBERT_raw_score', 'DistRoBERTa_raw_score', 'trend_1d']].rolling(window=28, center=False).mean().corr()

### QQQ ###

#### Process Data ####

In [None]:
QQQ_data = process_data("/tmp/pycharm_project_520/src/data/input/raw/source/analyst_ratings_processed.csv", "/tmp/pycharm_project_520/src/data/input/processed/source/", ticker="QQQ")

#### Plot Data ####

In [None]:
QQQ_data.plot.line(y="Close", x='Date', figsize=(22, 6))

In [None]:
QQQ_data.plot.line(y="trend_1d", x='Date', figsize=(22, 6))

In [None]:
QQQ_data.plot.line(y="volatility_5d", x='Date', figsize=(22, 6))

In [None]:
QQQ_plot_data = organize_data_for_plot(QQQ_data, resample='D')

In [None]:
plot_sentiment_and_stock(QQQ_plot_data, 'DeBERTa')

In [None]:
plot_sentiment_and_stock(QQQ_plot_data, 'FinBERT')

In [None]:
plot_sentiment_and_stock(QQQ_plot_data, 'DistRoBERTa')

#### Correlation ####

In [None]:
QQQ_data[['DeBERTa_adjusted_score', 'FinBERT_adjusted_score', 'DistRoBERTa_adjusted_score', 'trend_1d']].rolling(window=28, center=False).mean().corr(method='spearman')

In [None]:
QQQ_data[['DeBERTa_adjusted_score', 'FinBERT_adjusted_score', 'DistRoBERTa_adjusted_score', 'trend_1d']].rolling(window=28, center=False).mean().corr()

In [None]:
QQQ_data[['DeBERTa_raw_score', 'FinBERT_raw_score', 'DistRoBERTa_raw_score', 'trend_1d']].rolling(window=28, center=False).mean().corr(method='spearman')

In [None]:
QQQ_data[['DeBERTa_raw_score', 'FinBERT_raw_score', 'DistRoBERTa_raw_score', 'trend_1d']].rolling(window=28, center=False).mean().corr()

### MSFT ###

#### Process Data ####

In [None]:
MSFT_data = process_data("/tmp/pycharm_project_520/src/data/input/raw/source/MicrosoftNews.csv", "/tmp/pycharm_project_520/src/data/input/processed/source/", ticker="MSFT")

#### Plot Data ####

In [None]:
MSFT_data.plot.line(y="Close", x='Date', figsize=(22, 6))

In [None]:
MSFT_data.plot.line(y="trend_1d", x='Date', figsize=(22, 6))

In [None]:
MSFT_data.plot.line(y="volatility_5d", x='Date', figsize=(22, 6))

In [None]:
MSFT_plot_data = organize_data_for_plot(MSFT_data, resample='D')

In [None]:
plot_sentiment_and_stock(MSFT_plot_data, 'DeBERTa')

In [None]:
plot_sentiment_and_stock(MSFT_plot_data, 'FinBERT')

In [None]:
plot_sentiment_and_stock(MSFT_plot_data, 'DistRoBERTa')

#### Correlation ####

In [None]:
MSFT_data[['DeBERTa_adjusted_score', 'FinBERT_adjusted_score', 'DistRoBERTa_adjusted_score', 'trend_1d']].rolling(window=28, center=False).mean().corr(method='spearman')

In [None]:
MSFT_data[['DeBERTa_adjusted_score', 'FinBERT_adjusted_score', 'DistRoBERTa_adjusted_score', 'trend_1d']].rolling(window=28, center=False).mean().corr()

In [None]:
MSFT_data[['DeBERTa_raw_score', 'FinBERT_raw_score', 'DistRoBERTa_raw_score', 'trend_1d']].rolling(window=28, center=False).mean().corr(method='spearman')

In [None]:
MSFT_data[['DeBERTa_raw_score', 'FinBERT_raw_score', 'DistRoBERTa_raw_score', 'trend_1d']].rolling(window=28, center=False).mean().corr()

### AAPL ###

#### Process Data ####

In [None]:
AAPL_data = process_data("/tmp/pycharm_project_520/src/data/input/raw/source/AAPL.csv", ticker="AAPL")

#### Plot Data ####

In [None]:
AAPL_data.plot.line(y="Close", x='Date', figsize=(22, 6))

In [None]:
AAPL_data.plot.line(y="trend_1d", x='Date', figsize=(22, 6))

In [None]:
AAPL_data.plot.line(y="volatility_5d", x='Date', figsize=(22, 6))

In [None]:
AAPL_plot_data = organize_data_for_plot(AAPL_data, resample='B')

In [None]:
plot_sentiment_and_stock(AAPL_plot_data, 'DeBERTa')

In [None]:
plot_sentiment_and_stock(AAPL_plot_data, 'FinBERT')

In [None]:
plot_sentiment_and_stock(AAPL_plot_data, 'DistRoBERTa')

#### Correlation ####

In [None]:
AAPL_data[['DeBERTa_adjusted_score', 'FinBERT_adjusted_score', 'DistRoBERTa_adjusted_score', 'trend_1d']].rolling(window=28, center=False).mean().corr(method='spearman')

In [None]:
AAPL_data[['DeBERTa_adjusted_score', 'FinBERT_adjusted_score', 'DistRoBERTa_adjusted_score', 'trend_1d']].rolling(window=28, center=False).mean().corr()

In [None]:
AAPL_data[['DeBERTa_raw_score', 'FinBERT_raw_score', 'DistRoBERTa_raw_score', 'trend_1d']].rolling(window=28, center=False).mean().corr(method='spearman')

In [None]:
AAPL_data[['DeBERTa_raw_score', 'FinBERT_raw_score', 'DistRoBERTa_raw_score', 'trend_1d']].rolling(window=28, center=False).mean().corr()

## TO DO ##

Histogram Equalization
3d megbízhatóság számolás

### Plot Sentiment Scores ###

In [None]:
aapl_fig = create_sentiment_plot(AAPL_data, "Apple", size=4)
aapl_fig.show()

In [None]:
msft_fig = create_sentiment_plot(MSFT_data, "Microsoft", size=4)
msft_fig.show()

## Testing ##

In [47]:
MU_data = pd.read_csv('/tmp/pycharm_project_520/src/data/output/MU_sentiment.csv')

In [106]:
def process_sentiment_data(data, model, score_type, window, resample):
    """
    Build the OHLCV + sentiment frame that is fed to the backtest.

    Parameters:
    - data: DataFrame with 'Date', 'Close', 'High', 'Low', 'Open', 'Volume' and
      a '<model>_<score_type>_score' column. It is not modified.
    - model: Model name used as the score column prefix (e.g. 'FinBERT').
    - score_type: Score variant used in the column name (e.g. 'raw').
    - window: Rolling window size applied to the sentiment series.
    - resample: Frequency used to average the sentiment before smoothing.

    Returns:
    - DataFrame indexed by 'Date' with the price columns and a smoothed
      'sentiment' column; remaining gaps are interpolated, then set to 0.
    """
    score_column = f'{model}_{score_type}_score'
    price_columns = ['Close', 'High', 'Low', 'Open', 'Volume']

    source = data.copy()
    source['sentiment'] = source[score_column]

    # Keep only the feed columns and move the parsed dates into the index.
    processed_df = source.loc[:, ['Date', *price_columns, 'sentiment']]
    processed_df['Date'] = pd.to_datetime(processed_df['Date'])
    processed_df = processed_df.set_index('Date')

    # Average the sentiment per resampling bin; assigning the result back
    # aligns it on the existing index, so rows without a matching bin label
    # become NaN before the rolling mean is taken.
    binned_sentiment = processed_df['sentiment'].resample(resample).mean()
    processed_df['sentiment'] = binned_sentiment
    smoothed_sentiment = processed_df['sentiment'].rolling(window=window, center=False).mean()
    processed_df['sentiment'] = smoothed_sentiment

    # Fill interior gaps linearly, then zero whatever is still missing.
    processed_df = processed_df.interpolate(method='linear')
    processed_df = processed_df.fillna(0)
    return processed_df


In [4]:
class PandasSent(bt.feeds.PandasData):
    """
    Pandas data feed for backtrader that carries a 'sentiment' line in
    addition to the lines inherited from bt.feeds.PandasData.

    The params below name the DataFrame column that supplies each line.
    """
    # Extra line exposed to strategies as `data.sentiment`.
    lines = ('sentiment',)
    params = (
        # NOTE(review): presumably None means "take the timestamps from the
        # DataFrame index" (backtrader PandasData convention) — confirm.
        ('datetime', None),
        ('open', 'Open'),
        ('high', 'High'),
        ('low', 'Low'),
        ('close', 'Close'),
        ('volume', 'Volume'),
        # Column holding the smoothed sentiment score for the extra line.
        ('sentiment', 'sentiment'),
    )


In [1]:
def run_backtest(msft_data, model='FinBERT', score_type='raw', window=5, resample='D'):
    """
    Run SentimentStrategy on one ticker's data and print the portfolio result.

    Parameters:
    - msft_data: DataFrame passed to process_sentiment_data. Despite the
      parameter name, nothing in this function is specific to MSFT.
    - model: Model name selecting the sentiment score column.
    - score_type: Score variant selecting the sentiment score column.
    - window: Rolling window size for smoothing the sentiment.
    - resample: Resampling frequency for the sentiment.

    Returns:
    - Tuple (results, cerebro): the value returned by cerebro.run() and the
      Cerebro instance that was run.
    """

    # Prepare the OHLCV + sentiment frame and wrap it as a backtrader feed.
    processed_data = process_sentiment_data(msft_data, model, score_type, window, resample)
    data_feed = PandasSent(dataname=processed_data)
    cerebro = bt.Cerebro()
    cerebro.addstrategy(SentimentStrategy)
    cerebro.adddata(data_feed)
    cerebro.broker.setcash(100000.0)
    # NOTE(review): the previous comment here said "0.1% commission", but 0.01
    # read as a fraction is 1% per trade — confirm which rate is intended.
    cerebro.broker.setcommission(commission=0.01)
    cerebro.addanalyzer(bt.analyzers.PyFolio, _name='PyFolio')
    # Capture the portfolio value before and after the run to report the return.
    start_value = cerebro.broker.getvalue()
    results = cerebro.run()
    final_value = cerebro.broker.getvalue()
    print('Starting Portfolio Value: %.2f' % start_value)
    print('Final Portfolio Value: %.2f' % final_value)
    print('Return: %.2f%%' % ((final_value - start_value) / start_value * 100))

    return results, cerebro

In [206]:
class SentimentStrategy(bt.Strategy):
    """
    Backtrader strategy that trades on the feed's 'sentiment' line.

    While flat, it buys when the current sentiment is above 0.1 (size scaled
    by the sentiment) or below -6 (fixed size). While in a position, it sells
    when the sentiment drops below -1. Every bar, order and closed trade is
    logged with print().
    """
    params = (
        ('exitbars', 3),
    )

    def __init__(self):
        feed = self.datas[0]
        self.dataclose = feed.close
        self.datasentiment = feed.sentiment
        # Pending order (if any) plus details of the last executed buy.
        self.order = None
        self.buyprice = None
        self.buycomm = None

    def log(self, txt, dt=None):
        """Print `txt` prefixed with `dt`, or with the current bar's date."""
        stamp = dt if dt else self.datas[0].datetime.date(0)
        print(f'{stamp.isoformat()}: {txt}')

    def notify_order(self, order):
        """Log order executions/failures and clear the pending-order marker."""
        # Still in flight: nothing to report and the order stays pending.
        if order.status in (order.Submitted, order.Accepted):
            return

        if order.status == order.Completed:
            if order.isbuy():
                message = (
                    f"BUY EXECUTED, Price: {order.executed.price:.2f}, "
                    f"Cost: {order.executed.value:.2f}, "
                    f"Comm: {order.executed.comm:.2f}"
                )
                self.log(message)
                self.buyprice = order.executed.price
                self.buycomm = order.executed.comm
            else:
                message = (
                    f"SELL EXECUTED, Price: {order.executed.price:.2f}, "
                    f"Cost: {order.executed.value:.2f}, "
                    f"Comm: {order.executed.comm:.2f}"
                )
                self.log(message)

            # Remember the bar on which the last order was filled.
            self.bar_executed = len(self)

        elif order.status in (order.Canceled, order.Margin, order.Rejected):
            self.log(f"Order Canceled/Margin/Rejected")

        self.order = None

    def notify_trade(self, trade):
        """Log gross and net profit once a trade is closed."""
        if trade.isclosed:
            self.log(f"OPERATION PROFIT, GROSS: {trade.pnl:.2f}, NET: {trade.pnlcomm:.2f}")

    def next(self):
        """Log the bar, then create at most one order path per position state."""
        self.log(f"Close: {self.dataclose[0]:.2f}, Sentiment: {self.datasentiment[0]:.3f}")

        # Never stack a new order on top of one that is still pending.
        if self.order:
            return

        if self.position:
            # Exit rule is sentiment-based; the bar-count exit
            # (bar_executed + exitbars) is not used.
            if self.datasentiment[0] < -1:
                self.log(f'SELL CREATE, {self.dataclose[0]:.2f}')
                self.order = self.sell()
            return

        # Flat: positive sentiment buys with a size proportional to the score.
        if self.datasentiment[0] > 0.1:
            self.log(f'BUY CREATE, {self.dataclose[0]:.2f}')
            self.order = self.buy(size=self.datasentiment[0]*10000)
        # Flat: strongly negative sentiment also buys, with a fixed size.
        if self.datasentiment[0] < -6:
            self.log(f'BUY CREATE, {self.dataclose[0]:.2f}')
            self.order = self.buy(size=10000)


In [207]:
results, cerebro = run_backtest(MS_data, 'DeBERTa', 'adjusted', window=30, resample='D')


2010-01-20: Close: 30.63, Sentiment: 0.000
2010-01-21: Close: 29.34, Sentiment: 0.000
2010-01-22: Close: 27.80, Sentiment: 0.000
2010-01-23: Close: 27.78, Sentiment: 0.000
2010-01-24: Close: 27.76, Sentiment: 0.000
2010-01-25: Close: 27.74, Sentiment: 0.000
2010-01-26: Close: 27.33, Sentiment: 0.000
2010-01-27: Close: 27.95, Sentiment: 0.000
2010-01-28: Close: 27.49, Sentiment: 0.000
2010-01-29: Close: 26.78, Sentiment: 0.000
2010-01-30: Close: 27.04, Sentiment: 0.000
2010-01-31: Close: 27.30, Sentiment: 0.000
2010-02-01: Close: 27.56, Sentiment: 0.000
2010-02-02: Close: 28.06, Sentiment: 0.000
2010-02-03: Close: 27.89, Sentiment: 0.000
2010-02-04: Close: 26.63, Sentiment: 0.000
2010-02-05: Close: 27.26, Sentiment: 0.000
2010-02-06: Close: 27.04, Sentiment: 0.000
2010-02-07: Close: 26.82, Sentiment: 0.000
2010-02-08: Close: 26.60, Sentiment: 0.000
2010-02-09: Close: 27.13, Sentiment: 0.000
2010-02-10: Close: 27.51, Sentiment: 0.000
2010-02-11: Close: 27.12, Sentiment: 0.000
2010-02-12:

In [8]:
 class CustomPlotter:
    """
    Matplotlib-based plot of a finished backtest.

    Wraps a Cerebro instance that has already been run and draws the close
    price (with optional buy/sell markers) above the sentiment series.
    """

    # Feed line names that may carry the sentiment series, in lookup order.
    # 'sentiment' is the line defined by PandasSent; the second name is kept
    # for feeds that use the older line name.
    SENTIMENT_LINE_NAMES = ('sentiment', 'deberta_sentiment_scores')

    def __init__(self, cerebro_instance):
        self.cerebro = cerebro_instance

    def plot_results(self, title='MSFT Price and Trading Activity'):
        """
        Plot the trading results including:
        - Close price, plus buy/sell markers when the strategy records them
        - Sentiment scores, when the data feed exposes a sentiment line

        Parameters:
        - title: Title of the price panel. The default keeps the previous
          hard-coded text; pass the actual ticker for other data.

        Returns:
        - The matplotlib Figure holding both panels.
        """
        # Get the first strategy from the cerebro
        strategy = self.cerebro.runstrats[0][0]

        # Two stacked panels: price on top (twice as tall), sentiment below
        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 10), height_ratios=[2, 1])

        # Get the data from the data feed
        datas = strategy.datas[0]

        # Convert dates to list for x-axis
        dates = [bt.num2date(x) for x in datas.lines.datetime.plot()]

        # Plot close prices
        ax1.plot(dates, datas.lines.close.plot(), label='Close Price', color='blue')
        ax1.set_title(title)
        ax1.set_ylabel('Price')
        ax1.grid(True)

        # Add buy/sell markers if the strategy recorded them. Only the first
        # marker of each kind is labelled, so the legend has one entry per kind.
        marker_specs = (
            ('buy_points', '^', 'green', 'Buy'),
            ('sell_points', 'v', 'red', 'Sell'),
        )
        for attr_name, marker, color, label in marker_specs:
            for position, point in enumerate(getattr(strategy, attr_name, None) or ()):
                ax1.plot(bt.num2date(point[0]), point[1], marker,
                         color=color, markersize=10,
                         label=label if position == 0 else None)

        # Draw the legend after the markers so they are included in it.
        ax1.legend()

        # Plot sentiment scores if the feed exposes one of the known lines
        sentiment_line = None
        for line_name in self.SENTIMENT_LINE_NAMES:
            if hasattr(datas.lines, line_name):
                sentiment_line = getattr(datas.lines, line_name)
                break

        if sentiment_line is not None:
            ax2.plot(dates, sentiment_line.plot(),
                     label='Sentiment Score', color='green')
            ax2.set_title('Sentiment Scores')
            ax2.set_ylabel('Score')
            ax2.grid(True)
            ax2.legend()

        # Format x-axis
        for ax in (ax1, ax2):
            ax.tick_params(axis='x', rotation=45)

        # Adjust layout and hand the figure back to the caller
        plt.tight_layout()
        return fig

def plot_backtest(cerebro_instance):
    """
    Build the backtest figure for an already-run Cerebro instance.

    Convenience wrapper around CustomPlotter; returns the figure produced by
    its plot_results() method.
    """
    return CustomPlotter(cerebro_instance).plot_results()

In [9]:
fig = plot_backtest(cerebro)
plt.show()

<IPython.core.display.Javascript object>

# 5. Összefoglalás #

# 6. Summary #

# 7. Irodalomjegyzék #