In [62]:
import matplotlib as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error as mse
from datetime import datetime
import statsmodels.api as sm

In [67]:
def balanced_weighted_average(sentiment_scores, decay_factor):
    """
    Calculate a balanced weighted average of sentiment scores without biasing towards negative values.
    
    Parameters:
    - sentiment_scores: A list or pandas Series of sentiment scores (positive and negative).
    - decay_factor: A value between 0 and 1 to control the decay rate of weights; defaults to 0.95.
    
    Returns:
    - A single balanced weighted average score.
    """
    # Initialize positive and negative scores with respective weights
    positive_scores = sentiment_scores[sentiment_scores > 0]
    negative_scores = sentiment_scores[sentiment_scores < 0]
    
    # Calculate decay weights for each score in reverse order (older scores get smaller weights)
    decay_weights = decay_factor ** np.arange(len(sentiment_scores))[::-1]

    # Separate weights for positive and negative scores
    pos_weights = decay_weights[:len(positive_scores)]
    neg_weights = decay_weights[:len(negative_scores)]

    # Calculate the weighted average for positive and negative scores separately
    pos_weighted_avg = (positive_scores * pos_weights).sum() / pos_weights.sum() if len(pos_weights) > 0 else 0
    neg_weighted_avg = (negative_scores * neg_weights).sum() / neg_weights.sum() if len(neg_weights) > 0 else 0

    # Return the balanced average by combining positive and negative averages equally
    balanced_avg = (pos_weighted_avg + neg_weighted_avg) / 2

    return balanced_avg

In [68]:
def compute_stock_sentiment_sma_changes(stock_symbols, interval_days, decay_factor, start_date, end_date):
    # Initialize dictionaries to hold data
    df_stock_news_sentiment_scores_dict = {}
    df_news_sentiment_scores_dict = {}
    df_stock_data_dict = {}
    results = {}

    # Ensure start_date and end_date are datetime.date objects
    if isinstance(start_date, str):
        start_date = datetime.strptime(start_date, '%Y-%m-%d').date()
    if isinstance(end_date, str):
        end_date = datetime.strptime(end_date, '%Y-%m-%d').date()

    for symbol in stock_symbols:
        # Load data and convert date columns to datetime.date
        try:
            df_stock_news_sentiment_scores_dict[symbol] = pd.read_csv(f'/Users/rishabhbhardwaj/Desktop/Bootcamp project/Sentiment_scores/stock_news_sentiment_scores/stock_news_sentiment_analysis_results_{symbol}.csv')
            df_news_sentiment_scores_dict[symbol] = pd.read_csv(f'/Users/rishabhbhardwaj/Desktop/Bootcamp project/Sentiment_scores/news_sentiment_scores/2000-2024/sentiment_analysis_results_{symbol}.csv')
            df_stock_data_dict[symbol] = pd.read_csv(f'/Users/rishabhbhardwaj/Desktop/Bootcamp project/stocks data/stock_data_{symbol}.csv')
        except FileNotFoundError as e:
            print(f"Error loading data for {symbol}: {e}")
            continue

        # Rename columns to distinguish sentiment sources
        df_news_sentiment_scores_dict[symbol] = df_news_sentiment_scores_dict[symbol].rename(columns={'weighted compound sentiment score': 'weighted compound news sentiment score'})
        df_stock_news_sentiment_scores_dict[symbol] = df_stock_news_sentiment_scores_dict[symbol].rename(columns={'weighted compound sentiment score': 'weighted compound stock sentiment score'})

        # Convert 'Date' to datetime.date format
        for df in [df_stock_news_sentiment_scores_dict[symbol], df_news_sentiment_scores_dict[symbol], df_stock_data_dict[symbol]]:
            df['Date'] = pd.to_datetime(df['Date']).dt.date

        # Filter data within date range
        filtered_stock_sentiment = df_stock_news_sentiment_scores_dict[symbol][
            (df_stock_news_sentiment_scores_dict[symbol]['Date'] >= start_date) & 
            (df_stock_news_sentiment_scores_dict[symbol]['Date'] <= end_date)
        ]
        filtered_news_sentiment = df_news_sentiment_scores_dict[symbol][
            (df_news_sentiment_scores_dict[symbol]['Date'] >= start_date) & 
            (df_news_sentiment_scores_dict[symbol]['Date'] <= end_date)
        ]
        filtered_stock = df_stock_data_dict[symbol][
            (df_stock_data_dict[symbol]['Date'] >= start_date) & 
            (df_stock_data_dict[symbol]['Date'] <= end_date)
        ]

        # Merge data on 'Date'
        merged_data = pd.merge(filtered_news_sentiment, filtered_stock_sentiment, on='Date', how='inner')
        merged_data = pd.merge(merged_data, filtered_stock, on='Date', how='inner')
        merged_data.sort_values(by='Date', inplace=True)

        # Check if data is sufficient
        if len(merged_data) < interval_days:
            print(f"Not enough data for {symbol} with interval_days = {interval_days}. Skipping.")
            continue

        # Initialize lists to store results
        price_diff_list = []
        balanced_avg_stock_sentiment_list = []
        balanced_avg_news_sentiment_list = []
        sma_list = []
        date_list = []

        # Calculate SMA, price difference, and balanced weighted average sentiment
        for i in range(len(merged_data) - interval_days + 1):
            date_d = merged_data['Date'].iloc[i + interval_days - 1]

            # Calculate price difference
            close_d = merged_data['Close'].iloc[i]
            close_d_T = merged_data['Close'].iloc[i + interval_days - 1]
            price_diff = close_d_T - close_d

            # Calculate SMA
            sma = merged_data['Close'].iloc[i:i + interval_days].mean()

            # Calculate balanced weighted averages for sentiments
            news_sentiment_scores = merged_data['weighted compound news sentiment score'].iloc[i:i + interval_days]
            stock_sentiment_scores = merged_data['weighted compound stock sentiment score'].iloc[i:i + interval_days]

            balanced_avg_stock_sentiment = balanced_weighted_average(stock_sentiment_scores, decay_factor)
            balanced_avg_news_sentiment = balanced_weighted_average(news_sentiment_scores, decay_factor)

            # Append results
            date_list.append(date_d)
            price_diff_list.append(price_diff)
            sma_list.append(sma)
            balanced_avg_stock_sentiment_list.append(balanced_avg_stock_sentiment)
            balanced_avg_news_sentiment_list.append(balanced_avg_news_sentiment)

        # Store results for the symbol
        results[symbol] = pd.DataFrame({
            'Date': date_list,
            f'{symbol}_Price_Diff_{interval_days}d': price_diff_list,
            f'{symbol}_SMA_{interval_days}d': sma_list,
            f'{symbol}_Balanced_Avg_Stock_Sentiment_{interval_days}d': balanced_avg_stock_sentiment_list,
            f'{symbol}_Balanced_Avg_News_Sentiment_{interval_days}d': balanced_avg_news_sentiment_list
        })

    return results


In [58]:
stock_symbols = ['GOOG', 'MSFT', 'NVDA','AMZN','AAPL']
interval_days = 14
decay_factor = 0.60
results = compute_stock_sentiment_sma_changes(stock_symbols, interval_days, decay_factor, start_date='2011-05-16', end_date='2024-09-21')

In [59]:
results['NVDA']

Unnamed: 0,Date,NVDA_Price_Diff_14d,NVDA_SMA_14d,NVDA_Balanced_Avg_Stock_Sentiment_14d,NVDA_Balanced_Avg_News_Sentiment_14d
0,2016-10-24,1.402091,0.610644,0.056799,-0.090017
1,2016-12-14,2.035604,0.756274,0.068279,-0.042524
2,2016-12-28,2.34383,0.924198,0.066722,-0.103465
3,2017-05-11,2.816398,1.122249,0.074453,-0.052518
4,2017-06-09,3.356416,1.364398,0.079559,-0.097251
5,2017-08-14,3.811921,1.637301,0.081651,-0.128987
6,2018-01-05,4.885393,1.993076,0.02494,-0.107629
7,2018-02-13,5.251381,2.37247,0.006671,-0.06086
8,2018-03-22,5.456349,2.764122,0.054831,-0.074
9,2018-04-09,4.83569,3.107125,0.024221,-0.060012


In [48]:
def fit_sentiments_vs_metric_as_mlr(stock_symbols, results, interval_days, metric):
    """
    Fits multilinear regression of sentiment scores against either SMA or price difference.

    Parameters:
    - stock_symbols: List of stock symbols to process.
    - results: Dictionary with stock data DataFrames.
    - interval_days: The interval of days over which metrics are calculated.
    - metric: Specify either 'sma' or 'price_diff' to choose which metric to plot.

    Returns:
    - Dictionary containing models and MSEs for each stock symbol.
    """
    # Initialize dictionaries to store linear models and MSEs
    mlr_model = {}
    mlr_mses = {}

    for symbol in stock_symbols:
        # Construct column names based on the actual interval_days and selected metric
        news_sentiment_col = f'{symbol}_Balanced_Avg_News_Sentiment_{interval_days}d'
        stock_sentiment_col = f'{symbol}_Balanced_Avg_Stock_Sentiment_{interval_days}d'
        if metric == 'sma':
            metric_col = f'{symbol}_SMA_{interval_days}d'
        elif metric == 'price_diff':
            metric_col = f'{symbol}_Price_Diff_{interval_days}d'
        else:
            print(f"Unknown metric '{metric}'. Choose 'sma' or 'price_diff'. Skipping.")
            continue

        # Check if the expected columns exist in the DataFrame for the current symbol
        if (
            symbol in results and 
            news_sentiment_col in results[symbol].columns and 
            stock_sentiment_col in results[symbol].columns and 
            metric_col in results[symbol].columns
        ):
            # Extract features (sentiment columns) and target (metric column)
            X = results[symbol][[news_sentiment_col, stock_sentiment_col]].values
            y = results[symbol][metric_col].values

            # Fit a multilinear regression model
            pipeline = Pipeline([
                ('scale', StandardScaler()),
                ('mlr', LinearRegression())
            ])
            pipeline.fit(X, y)
            y_pred = pipeline.predict(X)

            # Store the model and MSE
            mlr_model[symbol] = pipeline
            mlr_mses[symbol] = mse(y, y_pred)

            print(f"{symbol}: Model fitted. MSE = {mlr_mses[symbol]:.4f}")
        else:
            print(f"Required columns for {symbol} with interval {interval_days} days not found. Skipping.")

    return {'models': mlr_model, 'mse': mlr_mses}

In [56]:
fit_sentiments_vs_metric_as_mlr(stock_symbols, results, interval_days, 'sma')

GOOG: Model fitted. MSE = 1456.2790
MSFT: Model fitted. MSE = 6484.8479
NVDA: Model fitted. MSE = 239.5404
AMZN: Model fitted. MSE = 2878.2509
AAPL: Model fitted. MSE = 2626.0585


{'models': {'GOOG': Pipeline(steps=[('scale', StandardScaler()), ('mlr', LinearRegression())]),
  'MSFT': Pipeline(steps=[('scale', StandardScaler()), ('mlr', LinearRegression())]),
  'NVDA': Pipeline(steps=[('scale', StandardScaler()), ('mlr', LinearRegression())]),
  'AMZN': Pipeline(steps=[('scale', StandardScaler()), ('mlr', LinearRegression())]),
  'AAPL': Pipeline(steps=[('scale', StandardScaler()), ('mlr', LinearRegression())])},
 'mse': {'GOOG': np.float64(1456.2790118454427),
  'MSFT': np.float64(6484.847862521754),
  'NVDA': np.float64(239.5404311643511),
  'AMZN': np.float64(2878.250887770791),
  'AAPL': np.float64(2626.0585347191563)}}

In [60]:
def run_multilinear_regression(results, stock_symbol, interval_days, metric):
    """
    Runs a multilinear regression of balanced average news and stock sentiment scores
    against either SMA or price difference.

    Parameters:
    - results: Dictionary with stock data DataFrames.
    - stock_symbol: The symbol of the stock to analyze.
    - interval_days: The interval of days over which metrics are calculated.
    - metric: Specify either 'sma' or 'price_diff' to choose which metric to analyze.
    """
    # Define column names based on interval_days and selected metric
    balanced_avg_news_sentiment_col = f'{stock_symbol}_Balanced_Avg_News_Sentiment_{interval_days}d'
    balanced_avg_stock_sentiment_col = f'{stock_symbol}_Balanced_Avg_Stock_Sentiment_{interval_days}d'
    if metric == 'sma':
        metric_col = f'{stock_symbol}_SMA_{interval_days}d'
    elif metric == 'price_diff':
        metric_col = f'{stock_symbol}_Price_Diff_{interval_days}d'
    else:
        print(f"Unknown metric '{metric}'. Choose 'sma' or 'price_diff'.")
        return

    # Check if the required columns exist in the DataFrame
    if stock_symbol in results and \
       balanced_avg_news_sentiment_col in results[stock_symbol].columns and \
       balanced_avg_stock_sentiment_col in results[stock_symbol].columns and \
       metric_col in results[stock_symbol].columns:
        
        # Extract predictor variables and the dependent variable
        X_news = np.array(results[stock_symbol][balanced_avg_news_sentiment_col])
        X_stock = np.array(results[stock_symbol][balanced_avg_stock_sentiment_col])
        y_stats = np.array(results[stock_symbol][metric_col])
        
        # Combine predictors into a single 2D array
        X_combined = np.column_stack((X_news, X_stock))
        
        # Add a constant to the predictors for the intercept
        X_with_const = sm.add_constant(X_combined)

        # Fit the OLS model
        model_ols = sm.OLS(y_stats, X_with_const)
        results_ols = model_ols.fit()

        # Print the summary to see coefficients and other statistics
        print(results_ols.summary())

        # Get the confidence intervals for the coefficients
        confidence_intervals = results_ols.conf_int(alpha=0.05)  # 95% CI by default
        print("Confidence intervals:\n", confidence_intervals)
    else:
        print(f"Required columns for {stock_symbol} with interval {interval_days} days and metric '{metric}' not found in results.")


In [66]:
# Define the parameters
stock_symbol = 'AAPL'
interval_days = 14
metric = 'sma'  

# Run the multilinear regression
run_multilinear_regression(results, stock_symbol, interval_days, metric)

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.361
Model:                            OLS   Adj. R-squared:                  0.347
Method:                 Least Squares   F-statistic:                     25.66
Date:                Sun, 17 Nov 2024   Prob (F-statistic):           1.45e-09
Time:                        15:01:03   Log-Likelihood:                -503.42
No. Observations:                  94   AIC:                             1013.
Df Residuals:                      91   BIC:                             1020.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         88.6036      6.515     13.599      0.0