This notebook walks through the steps for developing a portfolio optimization model that incorporates sentiment-enhanced features. It covers:

1. **Data Collection**: Collecting financial and sentiment data.
2. **Sentiment Analysis**: Using a fine-tuned BERT model for sentiment scoring.
3. **Portfolio Optimization**: Implementing both classical Mean-Variance Optimization and the Black-Litterman model.
4. **Constraints and Objectives**: Applying constraints and objectives in the optimization process.

Here is the content for the Jupyter Notebook:



### Portfolio Optimization with Sentiment Analysis

#### 1. Install Necessary Libraries

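Install the required libraries by running the following in a notebook cell (the leading `!` is Jupyter's shell-escape syntax; omit it when running the commands in a regular terminal):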



```bash
!pip install transformers datasets yfinance cvxpy stable-baselines3 nltk
!pip install gym
```


In [1]:

#### 2. Import Libraries

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import requests
import yfinance as yf
# NOTE: importing `sum` from cvxpy shadows the builtin `sum` for the rest of the notebook.
from cvxpy import Variable, Maximize, quad_form, Problem, sum, Parameter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from stable_baselines3 import PPO
from stable_baselines3.common.env_checker import check_env
# DummyVecEnv is exported by `common.vec_env`, not `common.envs`.
from stable_baselines3.common.vec_env import DummyVecEnv
from transformers import BertTokenizer, BertForSequenceClassification, pipeline

# Download the NLTK resources used by the preprocessing step.
# `word_tokenize` needs the Punkt sentence tokenizer: older NLTK releases load
# it from 'punkt', recent releases from 'punkt_tab', so both are fetched.
# `nltk.download` returns False (it does not raise) when a resource is unavailable.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_resource)



  from .autonotebook import tqdm as notebook_tqdm


ModuleNotFoundError: No module named 'stable_baselines3'


#### 3. Load and Fine-Tune BERT Model (if not already done)

You can skip this section if you already have a fine-tuned BERT model. Otherwise, use the following code to fine-tune and save the model.


In [None]:

from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset

# Tokenizer matching the base checkpoint, plus the IMDB review dataset
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
dataset = load_dataset('imdb')


def tokenize_function(examples):
    """Tokenize a batch of examples, padding/truncating to the model's max length."""
    texts = examples['text']
    return tokenizer(texts, padding="max_length", truncation=True)


# Tokenize every split in batches and expose the target under the name
# "labels" (renamed from "label").
tokenized_datasets = (
    dataset
    .map(tokenize_function, batched=True)
    .rename_column("label", "labels")
)

# Return PyTorch tensors and keep only the listed columns
tokenized_datasets.set_format(
    'torch',
    columns=['input_ids', 'attention_mask', 'labels'],
)

# Fine-tune BERT as a binary sentiment classifier.
# The label names are stored in the model config so that the saved model (and
# any `pipeline` built from it) reports 'NEGATIVE' / 'POSITIVE' instead of the
# generic 'LABEL_0' / 'LABEL_1'. IMDB encodes 0 = negative, 1 = positive.
id2label = {0: 'NEGATIVE', 1: 'POSITIVE'}
label2id = {name: idx for idx, name in id2label.items()}

model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy`; switch the keyword if TrainingArguments rejects it.
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
)

trainer.train()

# Save the fine-tuned model and tokenizer side by side so that both can be
# reloaded from the same directory.
model.save_pretrained('./fine-tuned-bert')
tokenizer.save_pretrained('./fine-tuned-bert')



In [None]:

#### 4. Load the Fine-Tuned BERT Model

from transformers import BertTokenizer, BertForSequenceClassification, pipeline

# Directory containing the saved fine-tuned model and tokenizer
FINE_TUNED_DIR = './fine-tuned-bert'

# Restore the tokenizer and the classifier from that directory
tokenizer = BertTokenizer.from_pretrained(FINE_TUNED_DIR)
model = BertForSequenceClassification.from_pretrained(FINE_TUNED_DIR)

# Wrap both in a ready-to-call sentiment-analysis pipeline
sentiment_model = pipeline(
    'sentiment-analysis',
    model=model,
    tokenizer=tokenizer,
)



In [2]:

#### 5. Collect Financial Data

# Universe of assets and their daily price history.
tickers = ["AAPL", "MSFT", "GOOGL"]
# `auto_adjust=False` keeps the separate 'Adj Close' column that the
# Black-Litterman step reads; with auto-adjustment enabled yfinance folds the
# adjustment into 'Close' and omits 'Adj Close'.
data = yf.download(tickers, start="2020-01-01", end="2023-01-01", auto_adjust=False)



[*********************100%%**********************]  3 of 3 completed


In [None]:

#### 6. Collect and Process Sentiment Data

def get_news_data(ticker, api_key=None, from_date='2020-01-01', timeout=10):
    """Fetch news articles mentioning *ticker* from the NewsAPI `everything` endpoint.

    Args:
        ticker: Search term sent as the `q` query parameter.
        api_key: NewsAPI key. When omitted, the `NEWSAPI_KEY` environment
            variable is used, falling back to the 'YOUR_NEWSAPI_KEY' placeholder.
        from_date: Oldest publication date to request (ISO date string).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON payload of the response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeouts.
    """
    import os  # local import: only needed to look up the API key

    if api_key is None:
        api_key = os.environ.get('NEWSAPI_KEY', 'YOUR_NEWSAPI_KEY')

    # Passing `params` lets requests URL-encode the values instead of
    # splicing raw text into the query string.
    response = requests.get(
        'https://newsapi.org/v2/everything',
        params={
            'q': ticker,
            'from': from_date,
            'sortBy': 'popularity',
            'apiKey': api_key,
        },
        timeout=timeout,
    )
    # Fail here on an error status rather than later on a missing key.
    response.raise_for_status()
    return response.json()

# Example for one stock
news_data = get_news_data("AAPL")

# Extract and preprocess news articles
def preprocess_text(text):
    """Lower-case *text*, keep alphabetic tokens only and drop English stop words.

    Args:
        text: Raw article text; may be None or empty.

    Returns:
        The remaining tokens joined by single spaces ('' when there is no text).
    """
    # Missing or empty text has nothing to tokenize.
    if not text:
        return ''

    # Build the stop-word set once per call: membership tests are O(1) and the
    # word list is no longer re-evaluated for every token.
    stop_words = set(stopwords.words('english'))

    kept = []
    for token in word_tokenize(text):
        if not token.isalpha():
            continue
        lowered = token.lower()
        if lowered not in stop_words:
            kept.append(lowered)
    return ' '.join(kept)

# An error payload carries no 'articles' key; surface that explicitly.
articles = news_data.get('articles')
if articles is None:
    raise RuntimeError(
        f"News API response contains no 'articles': {news_data.get('message', news_data)}"
    )

# Skip articles without a description, then drop texts that are empty after
# preprocessing so that the classifier only sees real content.
descriptions = [article.get('description') for article in articles]
news_articles = [preprocess_text(description) for description in descriptions if description]
news_articles = [text for text in news_articles if text]

# Get sentiment scores (nothing to score when no usable article is left)
sentiment_scores = sentiment_model(news_articles) if news_articles else []

# Compute sentiment index
def compute_sentiment_index(sentiment_scores, *, normalize=False):
    """Collapse per-text classifier outputs into one signed sentiment number.

    Each entry contributes `+score` when labelled positive and `-score` when
    labelled negative; entries with any other label are ignored.

    Args:
        sentiment_scores: Iterable of dicts with a 'label' and a 'score' key.
        normalize: When True, return the mean signed score (bounded by the
            score range) instead of the raw sum.

    Returns:
        The summed (or averaged) signed score; 0.0 when nothing was counted.
    """
    # Local import: the notebook imports `sum` from cvxpy, which shadows the
    # builtin, so the total is computed with math.fsum instead.
    import math

    # Accept the generic names a model without configured label names emits
    # (LABEL_1 = positive class, LABEL_0 = negative class for the binary
    # fine-tuning used here) as well as the explicit ones, case-insensitively.
    positive_labels = {'POSITIVE', 'LABEL_1'}
    negative_labels = {'NEGATIVE', 'LABEL_0'}

    signed_scores = []
    for result in sentiment_scores:
        label = str(result['label']).upper()
        if label in positive_labels:
            signed_scores.append(result['score'])
        elif label in negative_labels:
            signed_scores.append(-result['score'])

    if not signed_scores:
        return 0.0

    total = math.fsum(signed_scores)
    return total / len(signed_scores) if normalize else total

# Aggregate the per-article classifier outputs into a single scalar index
sentiment_index = compute_sentiment_index(sentiment_scores)



In [None]:

#### 7. Black-Litterman Model

# Daily returns and their sample moments.
# Use adjusted closes when the download provides them, otherwise plain closes.
price_fields = data.columns.get_level_values(0)
price_field = 'Adj Close' if 'Adj Close' in price_fields else 'Close'
returns = data[price_field].pct_change().dropna()
mu = returns.mean()
cov_matrix = returns.cov()

# Plain ndarray view of the covariance for the linear algebra below;
# `cov_matrix` itself stays a DataFrame so the asset labels remain available.
sigma = cov_matrix.to_numpy(dtype=float)
n_assets = sigma.shape[0]

# Black-Litterman parameters
tau = 0.025    # scaling factor: uncertainty of the prior relative to sigma
delta = 2.5    # risk-aversion coefficient used for the implied returns
P = np.eye(n_assets)  # Identity matrix for simplicity: one absolute view per asset
# NOTE(review): `sentiment_index` is an aggregate of classifier scores, not a
# return; confirm it is scaled to return units before relying on these views.
Q = np.full(n_assets, sentiment_index, dtype=float)  # Sentiment views

# Implied equilibrium returns by reverse optimisation: pi = delta * sigma * w_mkt.
# NOTE(review): equal weights stand in for market-capitalisation weights,
# which are not available in this notebook; replace them when they are.
market_weights = np.full(n_assets, 1.0 / n_assets)
pi = delta * sigma @ market_weights

# Precision of the prior (inverse of tau * sigma) and of the views.
sigma_inv = np.linalg.inv(sigma)
prior_precision = sigma_inv / tau
omega = np.diag(np.diag(P @ (tau * sigma) @ P.T))  # diagonal view uncertainty
omega_inv = np.diag(1.0 / np.diag(omega))

# Posterior mean:
#   [(tau*sigma)^-1 + P' omega^-1 P]^-1 [(tau*sigma)^-1 pi + P' omega^-1 Q]
# solved as a linear system instead of forming another explicit inverse.
posterior_precision = prior_precision + P.T @ omega_inv @ P
bl_returns = np.linalg.solve(
    posterior_precision,
    prior_precision @ pi + P.T @ omega_inv @ Q,
)



In [None]:

#### 8. Portfolio Optimization with Constraints


n = len(tickers)
w = Variable(n)
gamma = Parameter(nonneg=True)  # Risk aversion coefficient

# Numeric inputs as plain arrays. The covariance is symmetrised to remove
# floating-point asymmetry before it is handed to `quad_form`.
expected = np.asarray(bl_returns, dtype=float).ravel()
sigma = np.asarray(cov_matrix, dtype=float)
sigma = (sigma + sigma.T) / 2

# Constraints
constraints = [
    sum(w) == 1,         # Weights must sum to 1
    w >= 0,              # No short selling
    # Additional constraints such as sector diversification and liquidity can be added here
]

# Objective function: expected return penalised by gamma times variance
objective = Maximize(expected @ w - gamma * quad_form(w, sigma))

# Optimization problem
problem = Problem(objective, constraints)

# Solve for different levels of risk aversion
gamma_values = np.logspace(-2, 2, num=50)
portfolio_weights = []

for gamma_value in gamma_values:
    gamma.value = gamma_value
    problem.solve()
    # A failed solve leaves `w.value` as None; stop instead of storing it.
    if w.value is None:
        raise RuntimeError(
            f"Optimisation failed for gamma={gamma_value:g} (status: {problem.status})"
        )
    portfolio_weights.append(np.array(w.value, dtype=float))

# Convert to DataFrame. The weights follow the asset order of the covariance
# matrix, which is not guaranteed to match the order of `tickers`, so the
# columns are labelled from the covariance matrix itself.
portfolio_weights_df = pd.DataFrame(portfolio_weights, columns=list(cov_matrix.columns))



In [None]:

#### 9. Evaluate Portfolio Performance


# Calculate expected returns and risks for each portfolio
posterior_returns = np.asarray(bl_returns, dtype=float).ravel()
sigma = np.asarray(cov_matrix, dtype=float)
weights_matrix = portfolio_weights_df.to_numpy(dtype=float)

expected_returns = pd.Series(weights_matrix @ posterior_returns, index=portfolio_weights_df.index)
# Row-wise sqrt(w' sigma w) for all portfolios at once
risks = np.sqrt(np.einsum('ij,jk,ik->i', weights_matrix, sigma, weights_matrix))

# Sharpe ratio
risk_free_rate = 0.01  # Assuming a risk-free rate of 1% per year
# The returns above are per trading day, so the annual rate is converted to
# a daily one before it is subtracted.
TRADING_DAYS_PER_YEAR = 252
daily_risk_free_rate = risk_free_rate / TRADING_DAYS_PER_YEAR
sharpe_ratios = (expected_returns - daily_risk_free_rate) / risks

# Plot the results
plt.figure(figsize=(10, 6))
plt.plot(risks, expected_returns, 'o-', label='Optimised portfolios')
# Each frontier point is a whole portfolio, so the ticker labels are attached
# to the individual assets, drawn at their own risk/return coordinates.
asset_names = list(cov_matrix.columns)
asset_risks = np.sqrt(np.diag(sigma))
plt.scatter(asset_risks, posterior_returns, marker='x', color='red', label='Individual assets')
for txt, asset_risk, asset_return in zip(asset_names, asset_risks, posterior_returns):
    plt.annotate(txt, (asset_risk, asset_return))
plt.xlabel('Risk (Standard Deviation)')
plt.ylabel('Expected Return')
plt.title('Efficient Frontier with Black-Litterman Model')
plt.legend()
plt.show()

# Plot Sharpe ratio
plt.figure(figsize=(10, 6))
plt.plot(gamma_values, sharpe_ratios, 'o-')
plt.xlabel('Risk Aversion Coefficient (gamma)')
plt.ylabel('Sharpe Ratio')
plt.title('Sharpe Ratio vs Risk Aversion')
plt.xscale('log')
plt.show()



In [None]:

#### 10. Reinforcement Learning


from gym import Env, spaces

class PortfolioEnv(Env):
    """Single-step environment whose reward is the Sharpe ratio of the chosen weights.

    The observation is constant: the Black-Litterman returns, the flattened
    covariance matrix and the mean historical returns, concatenated into one
    float32 vector. Every episode ends after one step.

    NOTE(review): this follows the classic gym API (`reset()` returns only the
    observation, `step()` returns a 4-tuple). stable-baselines3 >= 2.0 expects
    the gymnasium API - confirm against the installed versions.
    """

    def __init__(self, returns, cov_matrix, bl_returns, risk_free_rate):
        """Store the market inputs and declare the action/observation spaces.

        Args:
            returns: 2-D table of asset returns (one column per asset).
            cov_matrix: Asset covariance matrix (DataFrame or array).
            bl_returns: Expected return per asset.
            risk_free_rate: Risk-free rate in the same period as `bl_returns`.
        """
        super().__init__()
        self.returns = returns
        self.cov_matrix = cov_matrix
        self.bl_returns = bl_returns
        self.risk_free_rate = risk_free_rate
        self.num_assets = returns.shape[1]

        # Plain float arrays for the numerics: a pandas DataFrame has no
        # `.flatten()`, so everything is converted once up front.
        self._expected = np.asarray(bl_returns, dtype=np.float64).ravel()
        self._sigma = np.asarray(cov_matrix, dtype=np.float64)
        mean_returns = np.asarray(returns.mean(axis=0), dtype=np.float64).ravel()

        # The observation never changes; it is built once and cast to float32
        # to match the dtype declared in `observation_space`.
        self._observation = np.concatenate(
            [self._expected, self._sigma.ravel(), mean_returns]
        ).astype(np.float32)

        # Action space: asset weights
        self.action_space = spaces.Box(low=0, high=1, shape=(self.num_assets,), dtype=np.float32)

        # Observation space: returns, covariance matrix, and Black-Litterman expected returns
        self.observation_space = spaces.Box(
            low=-np.inf,
            high=np.inf,
            shape=(self.num_assets + self.num_assets**2 + self.num_assets,),
            dtype=np.float32,
        )

    def reset(self):
        """Return the (constant) initial observation."""
        self.state = self._observation.copy()
        return self.state

    def step(self, action):
        """Score one allocation.

        Args:
            action: Non-negative raw weights, one per asset.

        Returns:
            Tuple `(observation, reward, done, info)` where the reward is the
            Sharpe ratio of the normalised weights and `done` is always True.
        """
        raw = np.asarray(action, dtype=np.float64).ravel()

        # Normalize action to ensure weights sum to 1; an all-zero action
        # cannot be normalised, so it falls back to equal weights.
        total = raw.sum()
        if total > 0:
            weights = raw / total
        else:
            weights = np.full(self.num_assets, 1.0 / self.num_assets)

        # Calculate portfolio return and risk
        portfolio_return = float(weights @ self._expected)
        portfolio_risk = float(np.sqrt(weights @ self._sigma @ weights))

        # Calculate Sharpe ratio as the reward (0 when the risk is zero, to
        # avoid dividing by zero)
        if portfolio_risk > 0:
            sharpe_ratio = (portfolio_return - self.risk_free_rate) / portfolio_risk
        else:
            sharpe_ratio = 0.0

        # For simplicity, consider each step as a terminal step
        done = True

        # Next state equals the constant observation (not relevant in this
        # terminal step setting)
        self.state = self._observation.copy()

        return self.state, sharpe_ratio, done, {}

# Create and check environment.
# The expected returns are per trading day, so the annual risk-free rate is
# converted to a daily one before it enters the Sharpe-ratio reward.
TRADING_DAYS_PER_YEAR = 252
portfolio_env = PortfolioEnv(
    returns,
    cov_matrix,
    bl_returns,
    risk_free_rate / TRADING_DAYS_PER_YEAR,
)
check_env(portfolio_env)

# Train RL model.
# The factory closes over `portfolio_env`, a name that is never rebound, rather
# than over `env`, which is reassigned to the vectorised wrapper on this line.
env = DummyVecEnv([lambda: portfolio_env])
model = PPO('MlpPolicy', env, verbose=1)
model.learn(total_timesteps=10000)

# Get optimized portfolio weights.
# `deterministic=True` reports the policy's mean action instead of a random
# sample, so repeated runs of this cell print the same allocation.
obs = env.reset()
action, _states = model.predict(obs, deterministic=True)
action_total = float(np.sum(action))
if action_total > 0:
    optimized_weights = action / action_total
else:
    # An all-zero action cannot be normalised; fall back to equal weights.
    optimized_weights = np.full_like(action, 1.0 / action.size, dtype=float)
print("Optimized Portfolio Weights:", optimized_weights)
