# Importing the Libraries

In [1]:
import yfinance as yf
import pandas as pd
import numpy as np
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from newsapi import NewsApiClient
import re

# Collecting News Data

In [2]:
import requests
import pandas as pd
from datetime import datetime, timedelta

# Replace 'YOUR_API_KEY' with your actual Finnhub API key
API_KEY = 'cse9gdhr01qs1ihohca0cse9gdhr01qs1ihohcag'

# Function to fetch news data from Finnhub
def get_news_data(symbol, from_date, to_date):
    url = f'https://finnhub.io/api/v1/company-news?symbol={symbol}&from={from_date}&to={to_date}&token={API_KEY}'
    response = requests.get(url)
    articles = response.json()
    
    # Prepare the news data
    news_data = []
    for article in articles:
        title = article['headline'] if 'headline' in article else ''
        description = article['summary'] if 'summary' in article else ''
        published_at = datetime.fromtimestamp(article['datetime']).strftime('%Y-%m-%d %H:%M:%S')
        news_data.append((published_at, title + ' ' + description))
    
    return pd.DataFrame(news_data, columns=['Date', 'News'])

# Get the current date and the date one year ago
to_date = datetime.now().strftime('%Y-%m-%d')
from_date = (datetime.now() - timedelta(days=365)).strftime('%Y-%m-%d')

# Fetch news data for the last year for Apple Inc. (AAPL)
news_df = get_news_data('JPM', from_date, to_date)

# Show the first 5 rows of the dataframe
print(news_df.head())

                  Date                                               News
0  2024-10-26 09:54:57  JPMorgan: A Cautious Investor's Perspective JP...
1  2024-10-26 08:40:00  Dividend Champion, Contender, And Challenger H...
2  2024-10-26 02:02:47  Will Jamie Dimon break political silence befor...
3  2024-10-25 21:21:02  TDVG: High Dividend Growth, But Competitors Ar...
4  2024-10-25 19:54:00  JPMorgan, Bank Of America, Citi Earnings Summa...


In [3]:
news_df.shape

(204, 2)

# Collecting Historical Price Data

In [4]:
import yfinance as yf
from datetime import datetime, timedelta

# Get the current date and the date one year ago
end_date = datetime.now().strftime('%Y-%m-%d')
start_date = (datetime.now() - timedelta(days=365)).strftime('%Y-%m-%d')

# Download stock data for the last year
stock_data = yf.download('JPM', start=start_date, end=end_date)

# Show the first 5 rows of the stock data
print(stock_data.head())

[*********************100%%**********************]  1 of 1 completed

                  Open        High         Low       Close   Adj Close  \
Date                                                                     
2023-10-27  138.949997  139.100006  135.190002  135.690002  132.525391   
2023-10-30  136.440002  138.039993  136.039993  137.419998  134.215027   
2023-10-31  137.710007  139.240005  137.470001  139.059998  135.816788   
2023-11-01  139.250000  140.529999  138.470001  138.940002  135.699585   
2023-11-02  140.089996  141.479996  139.229996  141.419998  138.121735   

              Volume  
Date                  
2023-10-27  17434700  
2023-10-30   9855500  
2023-10-31   9672500  
2023-11-01   9432000  
2023-11-02  10186200  





# Finding the CAGR, Sharpe Ratio, Sortino Ratio and Maximum Dropdown

In [5]:
import numpy as np

# 1. CAGR (Compound Annual Growth Rate)
def calculate_cagr(data):
    # Assuming data['Adj Close'] is the adjusted closing price
    cumulative_return = data['Adj Close'].iloc[-1] / data['Adj Close'].iloc[0]
    years = len(data) / 252  # Assuming 252 trading days in a year
    cagr = cumulative_return ** (1 / years) - 1
    return cagr

# 2. Sharpe Ratio
def calculate_sharpe_ratio(data, risk_free_rate=0.02):
    returns = data['Adj Close'].pct_change()
    excess_return = returns.mean() - risk_free_rate / 252
    sharpe_ratio = excess_return / returns.std() * np.sqrt(252)
    return sharpe_ratio

# 3. Sortino Ratio
def calculate_sortino_ratio(data, risk_free_rate=0.02):
    returns = data['Adj Close'].pct_change()
    negative_return = returns[returns < 0]
    excess_return = returns.mean() - risk_free_rate / 252
    sortino_ratio = excess_return / negative_return.std() * np.sqrt(252) if negative_return.std() != 0 else np.nan
    return sortino_ratio

# 4. Maximum Drawdown
def calculate_max_drawdown(data):
    cumulative_return = (1 + data['Adj Close'].pct_change()).cumprod()
    cumulative_max = cumulative_return.cummax()
    drawdown = cumulative_return / cumulative_max - 1
    max_drawdown = drawdown.min()
    return max_drawdown

# Calculate the metrics
cagr = calculate_cagr(stock_data)
sharpe_ratio = calculate_sharpe_ratio(stock_data)
sortino_ratio = calculate_sortino_ratio(stock_data)
max_drawdown = calculate_max_drawdown(stock_data)

# Print the results
print(f"CAGR: {cagr:.2%}")
print(f"Sharpe Ratio: {sharpe_ratio:.2f}")
print(f"Sortino Ratio: {sortino_ratio:.2f}")
print(f"Max Drawdown: {max_drawdown:.2%}")

CAGR: 68.10%
Sharpe Ratio: 2.67
Sortino Ratio: 3.01
Max Drawdown: -10.13%


# Cleaning the news data

In [6]:
import re
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Preprocess the news data (clean text)
def preprocess_text(text):
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\d+', '', text)      # Remove numbers
    text = text.lower().strip()          # Convert to lowercase
    return text

news_df['News'] = news_df['News'].apply(preprocess_text)

# Check if 'Date' column exists, and drop it if necessary
if 'Date' in stock_data.columns:
    stock_data = stock_data.drop(columns=['Date'])

# Merging the news data with the historical price data

In [7]:
# Reset the index of stock_data to make 'Date' a column
stock_data = stock_data.reset_index()

# Convert the 'Date' column in both DataFrames to the same format
stock_data['Date'] = pd.to_datetime(stock_data['Date']).dt.date
news_df['Date'] = pd.to_datetime(news_df['Date']).dt.date

# Merge the two DataFrames on the 'Date' column
merged_df = pd.merge(news_df, stock_data, on='Date', how='inner')

# Preview the merged DataFrame
merged_df.head()

Unnamed: 0,Date,News,Open,High,Low,Close,Adj Close,Volume
0,2024-10-25,tdvg high dividend growth but competitors are ...,225.0,225.619995,220.880005,222.309998,222.309998,6366700
1,2024-10-25,jpmorgan bank of america citi earnings summari...,225.0,225.619995,220.880005,222.309998,222.309998,6366700
2,2024-10-25,piper sandler reaffirms their buy rating on jp...,225.0,225.619995,220.880005,222.309998,222.309998,6366700
3,2024-10-25,jpmorgan exceeds percent in the capital of bp...,225.0,225.619995,220.880005,222.309998,222.309998,6366700
4,2024-10-25,corecard corp confidently befitting a five yea...,225.0,225.619995,220.880005,222.309998,222.309998,6366700


In [8]:
merged_df.shape

(185, 8)

# Performing VADER Sentiment Analysis 

In [9]:
analyzer = SentimentIntensityAnalyzer()

# Analyze sentiment for each news article
def get_sentiment_score(text):
    sentiment = analyzer.polarity_scores(text)
    return sentiment['compound']

# Apply sentiment analysis on each news article
merged_df['sentiment_score'] = merged_df['News'].apply(get_sentiment_score)

# Preview the data
merged_df[['Date', 'News', 'Close', 'sentiment_score']].head()

Unnamed: 0,Date,News,Close,sentiment_score
0,2024-10-25,tdvg high dividend growth but competitors are ...,222.309998,0.7899
1,2024-10-25,jpmorgan bank of america citi earnings summari...,222.309998,0.7845
2,2024-10-25,piper sandler reaffirms their buy rating on jp...,222.309998,0.0
3,2024-10-25,jpmorgan exceeds percent in the capital of bp...,222.309998,0.2023
4,2024-10-25,corecard corp confidently befitting a five yea...,222.309998,0.4767


# Setting a signal for buying or selling 

In [10]:
# Function to classify sentiment and provide buy/hold/sell signals
def classify_sentiment_and_signal(sentiment_score):
    if sentiment_score > 0.4:
        return 'Positive', 1  # 1 for "Buy"
    elif sentiment_score < 0:
        return 'Negative', -1  # -1 for "Sell"
    else:
        return 'Neutral', 0  # 0 for "Hold"

# Apply the classification and signal to the DataFrame
merged_df['Sentiment_Label'], merged_df['Signal'] = zip(*merged_df['sentiment_score'].apply(classify_sentiment_and_signal))

# Display the DataFrame with updated sentiment labels and signals
new_df = merged_df[['Date', 'News', 'sentiment_score', 'Sentiment_Label', 'Signal']]

# Show the first few rows to inspect the results
new_df.head()

Unnamed: 0,Date,News,sentiment_score,Sentiment_Label,Signal
0,2024-10-25,tdvg high dividend growth but competitors are ...,0.7899,Positive,1
1,2024-10-25,jpmorgan bank of america citi earnings summari...,0.7845,Positive,1
2,2024-10-25,piper sandler reaffirms their buy rating on jp...,0.0,Neutral,0
3,2024-10-25,jpmorgan exceeds percent in the capital of bp...,0.2023,Neutral,0
4,2024-10-25,corecard corp confidently befitting a five yea...,0.4767,Positive,1


# Based on the historical data, the number of positives or negatives are checked whether buying or selling is valid

In [11]:
# Function to classify overall sentiment and determine Buy/Hold/Sell decision
def calculate_final_decision(merged_df):
    # Count total signals in the dataframe
    positive_count = (merged_df['Signal'] == 1).sum()  # Buy signals (Positive)
    neutral_count = (merged_df['Sentiment_Label'] == 'Neutral').sum()  # Neutral signals
    negative_count = (merged_df['Signal'] == -1).sum()  # Sell signals (Negative)
    
    # Display the counts
    print(f"Total Positive (Buy): {positive_count}")
    print(f"Total Neutral (Hold): {neutral_count}")
    print(f"Total Negative (Sell): {negative_count}")
    
    # Determine the final decision based on the counts
    if positive_count > neutral_count and positive_count > negative_count:
        return "Buy"
    elif neutral_count > positive_count and neutral_count > negative_count:
        return "Hold"
    else:
        return "Sell"

# Example Usage: Calculate the final decision
final_decision = calculate_final_decision(merged_df)

# Display the final decision
print(f"Final Decision: {final_decision}")

Total Positive (Buy): 91
Total Neutral (Hold): 75
Total Negative (Sell): 19
Final Decision: Buy


# Changing the data to fit a ML model

In [12]:
merged_df = merged_df.drop(columns = ['Open', 'High', 'Low', 'Adj Close', 'Volume'])

# Assuming 'Price' is a column in your DataFrame
merged_df['Price_MA'] = merged_df['Close'].rolling(window=5).mean()  # 5-day moving average for example

# Now you can proceed with calculating Price_Change
merged_df['Price_Change'] = merged_df['Price_MA'].pct_change() * 100  # Percent change in price

# Feature Engineering: Moving Averages for Sentiment
merged_df['Sentiment_MA_5'] = merged_df['sentiment_score'].rolling(window=5).mean()
merged_df['Sentiment_MA_10'] = merged_df['sentiment_score'].rolling(window=10).mean()

# Fill missing values after calculating moving averages
merged_df.fillna(0, inplace=True)

In [13]:
merged_df.head()

Unnamed: 0,Date,News,Close,sentiment_score,Sentiment_Label,Signal,Price_MA,Price_Change,Sentiment_MA_5,Sentiment_MA_10
0,2024-10-25,tdvg high dividend growth but competitors are ...,222.309998,0.7899,Positive,1,0.0,0.0,0.0,0.0
1,2024-10-25,jpmorgan bank of america citi earnings summari...,222.309998,0.7845,Positive,1,0.0,0.0,0.0,0.0
2,2024-10-25,piper sandler reaffirms their buy rating on jp...,222.309998,0.0,Neutral,0,0.0,0.0,0.0,0.0
3,2024-10-25,jpmorgan exceeds percent in the capital of bp...,222.309998,0.2023,Neutral,0,0.0,0.0,0.0,0.0
4,2024-10-25,corecard corp confidently befitting a five yea...,222.309998,0.4767,Positive,1,222.309998,0.0,0.45068,0.0


# Model Building and Fitting

In [14]:
import yfinance as yf
from datetime import datetime, timedelta
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression  # Change to LogisticRegression for binary classification
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [15]:
# Define features (X) and target (y)
X = merged_df[['sentiment_score', 'Price_Change', 'Sentiment_MA_5', 'Sentiment_MA_10']]  # Feature set
y = merged_df['Signal']  # Target variable (Buy = 1, Hold = 0, Sell = -1)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [16]:
# Initialize classifiers
logistic_regression = LogisticRegression()
random_forest = RandomForestClassifier()
gradient_boosting = GradientBoostingClassifier()

In [17]:
# Train and evaluate Logistic Regression
logistic_regression.fit(X_train, y_train)
y_pred_lr = logistic_regression.predict(X_test)
print("Logistic Regression Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_lr)}")
print(classification_report(y_test, y_pred_lr))

# Train and evaluate Random Forest
random_forest.fit(X_train, y_train)
y_pred_rf = random_forest.predict(X_test)
print("Random Forest Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_rf)}")
print(classification_report(y_test, y_pred_rf))

# Train and evaluate Gradient Boosting
gradient_boosting.fit(X_train, y_train)
y_pred_gb = gradient_boosting.predict(X_test)
print("Gradient Boosting Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_gb)}")
print(classification_report(y_test, y_pred_gb))

Logistic Regression Performance:
Accuracy: 0.8928571428571429
              precision    recall  f1-score   support

          -1       1.00      0.17      0.29         6
           0       0.78      0.95      0.86        19
           1       0.97      1.00      0.98        31

    accuracy                           0.89        56
   macro avg       0.92      0.70      0.71        56
weighted avg       0.91      0.89      0.87        56

Random Forest Performance:
Accuracy: 1.0
              precision    recall  f1-score   support

          -1       1.00      1.00      1.00         6
           0       1.00      1.00      1.00        19
           1       1.00      1.00      1.00        31

    accuracy                           1.00        56
   macro avg       1.00      1.00      1.00        56
weighted avg       1.00      1.00      1.00        56

Gradient Boosting Performance:
Accuracy: 1.0
              precision    recall  f1-score   support

          -1       1.00      1.00  

# Evaluating

In [18]:
# Make final predictions with the best performing model
final_predictions = logistic_regression.predict(X_test)
print("Final Predictions using the best performing model:")
print(final_predictions)

Final Predictions using the best performing model:
[ 0  0  1  0  1  1  1  1  0  1  1  0  0  1  0  0  0  0  0  1  0  1  1  1
  1  1  0  0  0  1  1  1  0  1  1  1  1  1 -1  1  1  1  1  0  0  0  0  1
  1  1  1  1  0  1  0  0]


# Final decision

In [19]:
# Aggregate predictions to decide final action
def calculate_final_decision(predictions):
    # Count occurrences of each class
    counts = pd.Series(predictions).value_counts()
    most_common = counts.idxmax()  # Get the most frequent class
    return most_common

# Define a mapping from numerical signals to actions
def map_signal_to_action(signal):
    if signal == 1:
        return "Buy"
    elif signal == -1:
        return "Sell"
    elif signal == 0:
        return "Hold"
    else:
        return "Unknown"

# Calculate the final decision
final_decision = calculate_final_decision(final_predictions)

# Map the final decision to the corresponding action
final_action = map_signal_to_action(final_decision)

print(f"Final Decision (Numerical): {final_decision}")
print(f"Final Action: {final_action}")

Final Decision (Numerical): 1
Final Action: Buy
