Connected to base (Python 3.11.4)

In [1]:
import yfinance as yf
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from nltk.sentiment import SentimentIntensityAnalyzer
from newsapi import NewsApiClient

# Initializing the sentiment intensity analyzer from NLTK
sid = SentimentIntensityAnalyzer()

# Initializing the NewsAPI client with API key
newsapi = NewsApiClient(api_key='ec064ce719114fe78bd3affdd71e5db8')  # Replace with your actual API key

def fetch_data(ticker):
    # Fetch 20 years of historical stock data for the provided ticker using yfinance
    t = yf.Ticker(ticker)
    historical_data = t.history(period="20y")
    historical_data.reset_index(inplace=True)
    historical_data['Date'] = historical_data['Date'].dt.tz_localize(None)
    return historical_data

def fetch_news(ticker, company_name=None):
    query = f'"{ticker}"'
    if company_name:
        query += f' OR "{company_name}"'
    
    all_articles = newsapi.get_everything(q=query,
                                          from_param='2024-11-25',
                                          to='2024-12-06',
                                          language='en',
                                          sort_by='relevancy')
    news_data = all_articles.get('articles', [])
    
    if not news_data:
        print(f"No articles found for ticker: {ticker}")
        return pd.DataFrame(columns=['Date', 'title'])

    news_df = pd.DataFrame(news_data)
    
    # Converting publication timestamp to datetime for consistency
    news_df['Date'] = pd.to_datetime(news_df['publishedAt']).dt.date
    news_df['Date'] = pd.to_datetime(news_df['Date'])
    
    return news_df[['Date', 'title']]

def extract_sentiment(news_df, sentiment_scaling_factor):
    news_df['title_sentiment'] = news_df['title'].apply(lambda x: sid.polarity_scores(x)['compound'] * sentiment_scaling_factor)
    return news_df

def merge_news_with_data(data, news):
    # Ensuring 'Date' column in both dataframes is of type datetime
    data['Date'] = pd.to_datetime(data['Date'])
    news['Date'] = pd.to_datetime(news['Date'])

    merged_data = data.merge(news, on='Date', how='left')
    merged_data.fillna(0, inplace=True)
    return merged_data

def process_data(data, look_back=250):
    for i in range(1, look_back + 1):
        col_name = f"lag_{i}"
        data[col_name] = data['Close'].shift(i)
        
    data.dropna(inplace=True)
    X = data.drop(['Close', 'Date', 'title'], axis=1)
    y = data['Close']
    scaler_x = MinMaxScaler()
    scaler_y = MinMaxScaler()
    X_scaled = scaler_x.fit_transform(X)
    y_scaled = scaler_y.fit_transform(y.values.reshape(-1, 1))
    return pd.DataFrame(X_scaled, columns=X.columns), pd.Series(y_scaled.ravel()), scaler_x, scaler_y

def execute(ticker, target_date, sentiment_scaling_factor=10000.0):
    look_back = 60
    data = fetch_data(ticker)
    news = fetch_news(ticker)
    news = extract_sentiment(news, sentiment_scaling_factor)
    data = merge_news_with_data(data, news)
    X, y, scaler_x, scaler_y = process_data(data, look_back)
    
    column_names = X.columns.tolist()

    model = RandomForestRegressor(n_estimators=100)
    model.fit(X, y)
    
    target_data = X.iloc[-1].values.reshape(1, -1)
    pred_scaled = model.predict(target_data)
    pred = scaler_y.inverse_transform([[pred_scaled[0]]])[0, 0]
    print(f"\nThe predicted closing price for {ticker} on {target_date} is {pred}")

    last_known_price = y.iloc[-1]
    if pred > last_known_price:
        print(f"The model suggests buying a CALL option for {ticker} expiring on {target_date}.")
    elif pred < last_known_price:
        print(f"The model suggests buying a PUT option for {ticker} expiring on {target_date}.")
    else:
        print(f"The model suggests no clear direction for {ticker} on {target_date}.")

    last_month = pd.Timestamp(target_date) - pd.DateOffset(months=1)
    filtered_data = data[data['Date'] > last_month]
    data_sorted_by_sentiment = filtered_data.sort_values(by='title_sentiment', key=abs, ascending=False)
    top_10_news = data_sorted_by_sentiment[['Date', 'title', 'title_sentiment']].head(10)
    print("\nTop 10 influential")


# Fetch news for debugging purposes
fetch_news('TSLA')

# Call the main function
execute('TSLA', '2025-2-26', sentiment_scaling_factor=10000.0)

#%%

#   BERT SENTIMENT MODEL
import yfinance as yf
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from newsapi import NewsApiClient
from transformers import pipeline

# Initialize sentiment analysis using BERT from transformers
sentiment_analysis = pipeline("sentiment-analysis")

newsapi = NewsApiClient(api_key='ec064ce719114fe78bd3affdd71e5db8')  

def fetch_data(ticker):
    # Fetch 20 years of historical data for the given ticker using yfinance
    t = yf.Ticker(ticker)
    historical_data = t.history(period="20y")
    historical_data.reset_index(inplace=True)
    # Remove timezone information to make it consistent with other datetime objects
    historical_data['Date'] = historical_data['Date'].dt.tz_localize(None)
    return historical_data

def fetch_news(ticker):
    # Fetch news articles for the given ticker for the past month
    all_articles = newsapi.get_everything(q=f"{ticker}",
    from_param='2024-12-07',
    to='2024-12-30',
    language='en',
    sort_by='relevancy')

    news_data = all_articles.get('articles', [])
    
    if not news_data:
        print(f"No articles found for ticker: {ticker}")
        return pd.DataFrame(columns=['Date', 'title'])

    news_df = pd.DataFrame(news_data)
    # Remove timezone information to make it consistent with other datetime objects
    news_df['Date'] = pd.to_datetime(news_df['publishedAt']).dt.tz_localize(None)
    return news_df[['Date', 'title', 'description', 'content']]

def extract_sentiment(news_df, sentiment_weight=1000.0):
    # Combine title, description, and content for better sentiment analysis
    news_df['combined_text'] = news_df['title'] + ' ' + news_df.get('description', '') + ' ' + news_df.get('content', '')

    # Use BERT sentiment analysis to extract sentiment from the combined text
    news_df['title_sentiment'] = news_df['combined_text'].apply(lambda x: sentiment_weight if sentiment_analysis(x)[0]['label'] == 'POSITIVE' else (-sentiment_weight if sentiment_analysis(x)[0]['label'] == 'NEGATIVE' else 0))
    
    return news_df[['Date', 'title', 'title_sentiment', 'combined_text']]

def merge_news_with_data(data, news):
    # Merge historical stock data with news data on Date
    merged_data = data.merge(news, on='Date', how='left')
    merged_data.fillna(0, inplace=True)
    # Calculate a weighted sentiment score based on sentiment and volume
    merged_data['weighted_sentiment'] = merged_data['title_sentiment'] * merged_data['Volume']
    return merged_data

def process_data(data, look_back=250):
    # Create a list to hold all the lag features
    lag_columns = []

    # Generate lag features and append them to the list
    for i in range(1, look_back + 1):
        lag_column = data['Close'].shift(i)
        lag_column.name = f"lag_{i}"
        lag_columns.append(lag_column)

    # Combine all lag features into a single DataFrame
    lag_features = pd.concat(lag_columns, axis=1)

    # Combine the original data with the lag features
    data = pd.concat([data, lag_features], axis=1)
    data.dropna(inplace=True)

    # Define columns that are not features for the model
    columns_to_drop = ['Close', 'Date', 'title', 'combined_text']
    
    # Split data into features and target variable
    X = data.drop(columns_to_drop, axis=1)
    y = data['Close']

    # Normalize feature data and target data
    scaler_x = MinMaxScaler()
    scaler_y = MinMaxScaler()
    X_scaled = scaler_x.fit_transform(X)
    y_scaled = scaler_y.fit_transform(y.values.reshape(-1, 1))
    return pd.DataFrame(X_scaled, columns=X.columns), pd.Series(y_scaled.ravel()), scaler_x, scaler_y



def execute(ticker, target_date, sentiment_weight=100000.0):
    # Function to execute the entire pipeline
    data = fetch_data(ticker)
    news = fetch_news(ticker)
    news = extract_sentiment(news, sentiment_weight)
    data = merge_news_with_data(data, news)
    X, y, scaler_x, scaler_y = process_data(data)

    # Train a random forest regressor with the processed data
    column_names = X.columns.tolist()
    model = RandomForestRegressor(n_estimators=100)
    model.fit(X, y)

    # Predict the price for the target date
    target_data = X.iloc[-1].values.reshape(1, -1)
    pred_scaled = model.predict(target_data)
    pred = scaler_y.inverse_transform([[pred_scaled[0]]])[0, 0]
    print(f"\nThe predicted closing price for {ticker} on {target_date} is {pred}")
    
    # Provide a trading suggestion based on the prediction
    last_known_price = y.iloc[-1]
    if pred > last_known_price:
        print(f"The model suggests buying a CALL option for {ticker} expiring on {target_date}.")
    elif pred < last_known_price:
        print(f"The model suggests buying a PUT option for {ticker} expiring on {target_date}.")
    else:
        print(f"The model suggests no clear direction for {ticker} on {target_date}.")

    # Display importance of sentiment in the model
    importance = model.feature_importances_
    if 'title_sentiment' in column_names:
        sentiment_index = column_names.index('title_sentiment')
        sentiment_importance = importance[sentiment_index]
        print(f"\nImportance of title_sentiment: {sentiment_importance}")

    # Display top 3 positive news articles
    positive_news = news[news['title_sentiment'] == sentiment_weight].sort_values(by="Date", ascending=False).head(3)
    print("\nTop 3 Positive News Articles (with combined text):")
    for idx, row in positive_news.iterrows():
        print("\nTitle:", row['title'])
        print("Combined Text:", row['combined_text'])

# Call the function with sentiment_weight
execute('TSLA', '2025-12-26', sentiment_weight=100000.0)

NewsAPIException: {'status': 'error', 'code': 'parameterInvalid', 'message': 'You are trying to request results too far in the past. Your plan permits you to request articles as far back as 2024-12-30, but you have requested 2024-11-25. You may need to upgrade to a paid plan.'}

In [2]:
import yfinance as yf
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from nltk.sentiment import SentimentIntensityAnalyzer
from newsapi import NewsApiClient
# Initializing the sentiment intensity analyzer from NLTK

sid = SentimentIntensityAnalyzer()
# Initializing the NewsAPI client with API key

newsapi = NewsApiClient(api_key='ec064ce719114fe78bd3affdd71e5db8')  # Replace with your actual API key
def fetch_data(ticker):
    # Fetch 20 years of historical stock data for the provided ticker using yfinance
    t = yf.Ticker(ticker)
    historical_data = t.history(period="20y")
    historical_data.reset_index(inplace=True)
    historical_data['Date'] = historical_data['Date'].dt.tz_localize(None)
    return historical_data

def fetch_news(ticker, company_name=None):
    query = f'"{ticker}"'
    if company_name:
        query += f' OR "{company_name}"'
    all_articles = newsapi.get_everything(q=query,
                                          from_param='2024-11-25',
                                          to='2024-12-06',
                                          language='en',
                                          sort_by='relevancy')
    news_data = all_articles.get('articles', [])
    if not news_data:
        print(f"No articles found for ticker: {ticker}")
        return pd.DataFrame(columns=['Date', 'title'])
    news_df = pd.DataFrame(news_data)
    # Converting publication timestamp to datetime for consistency
    news_df['Date'] = pd.to_datetime(news_df['publishedAt']).dt.date
    news_df['Date'] = pd.to_datetime(news_df['Date'])
    return news_df[['Date', 'title']]

def extract_sentiment(news_df, sentiment_scaling_factor):
    news_df['title_sentiment'] = news_df['title'].apply(lambda x: sid.polarity_scores(x)['compound'] * sentiment_scaling_factor)
    return news_df

def merge_news_with_data(data, news):
    # Ensuring 'Date' column in both dataframes is of type datetime
    data['Date'] = pd.to_datetime(data['Date'])
    news['Date'] = pd.to_datetime(news['Date'])
    merged_data = data.merge(news, on='Date', how='left')
    merged_data.fillna(0, inplace=True)
    return merged_data

def process_data(data, look_back=250):
    for i in range(1, look_back + 1):
        col_name = f"lag_{i}"
        data[col_name] = data['Close'].shift(i)
    data.dropna(inplace=True)
    X = data.drop(['Close', 'Date', 'title'], axis=1)
    y = data['Close']
    scaler_x = MinMaxScaler()
    scaler_y = MinMaxScaler()
    X_scaled = scaler_x.fit_transform(X)
    y_scaled = scaler_y.fit_transform(y.values.reshape(-1, 1))
    return pd.DataFrame(X_scaled, columns=X.columns), pd.Series(y_scaled.ravel()), scaler_x, scaler_y

def execute(ticker, target_date, sentiment_scaling_factor=10000.0):
    look_back = 30
    data = fetch_data(ticker)
    news = fetch_news(ticker)
    news = extract_sentiment(news, sentiment_scaling_factor)
    data = merge_news_with_data(data, news)
    X, y, scaler_x, scaler_y = process_data(data, look_back)
    column_names = X.columns.tolist()
    model = RandomForestRegressor(n_estimators=100)
    model.fit(X, y)
    target_data = X.iloc[-1].values.reshape(1, -1)
    pred_scaled = model.predict(target_data)
    pred = scaler_y.inverse_transform([[pred_scaled[0]]])[0, 0]
    print(f"\nThe predicted closing price for {ticker} on {target_date} is {pred}")
    last_known_price = y.iloc[-1]
    if pred > last_known_price:
        print(f"The model suggests buying a CALL option for {ticker} expiring on {target_date}.")
    elif pred < last_known_price:
        print(f"The model suggests buying a PUT option for {ticker} expiring on {target_date}.")
    else:
        print(f"The model suggests no clear direction for {ticker} on {target_date}.")
    last_month = pd.Timestamp(target_date) - pd.DateOffset(months=1)
    filtered_data = data[data['Date'] > last_month]
    data_sorted_by_sentiment = filtered_data.sort_values(by='title_sentiment', key=abs, ascending=False)
    top_10_news = data_sorted_by_sentiment[['Date', 'title', 'title_sentiment']].head(10)
    print("\nTop 10 influential")
# Fetch news for debugging purposes

fetch_news('TSLA')
# Call the main function

execute('TSLA', '2025-2-26', sentiment_scaling_factor=10000.0)
#%%
#   BERT SENTIMENT MODEL

import yfinance as yf
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from newsapi import NewsApiClient
from transformers import pipeline
# Initialize sentiment analysis using BERT from transformers

sentiment_analysis = pipeline("sentiment-analysis")
newsapi = NewsApiClient(api_key='ec064ce719114fe78bd3affdd71e5db8')  
def fetch_data(ticker):
    # Fetch 20 years of historical data for the given ticker using yfinance
    t = yf.Ticker(ticker)
    historical_data = t.history(period="20y")
    historical_data.reset_index(inplace=True)
    # Remove timezone information to make it consistent with other datetime objects
    historical_data['Date'] = historical_data['Date'].dt.tz_localize(None)
    return historical_data

def fetch_news(ticker):
    # Fetch news articles for the given ticker for the past month
    all_articles = newsapi.get_everything(q=f"{ticker}",
    from_param='2024-12-07',
    to='2024-12-30',
    language='en',
    sort_by='relevancy')
    news_data = all_articles.get('articles', [])
    if not news_data:
        print(f"No articles found for ticker: {ticker}")
        return pd.DataFrame(columns=['Date', 'title'])
    news_df = pd.DataFrame(news_data)
    # Remove timezone information to make it consistent with other datetime objects
    news_df['Date'] = pd.to_datetime(news_df['publishedAt']).dt.tz_localize(None)
    return news_df[['Date', 'title', 'description', 'content']]

def extract_sentiment(news_df, sentiment_weight=1000.0):
    # Combine title, description, and content for better sentiment analysis
    news_df['combined_text'] = news_df['title'] + ' ' + news_df.get('description', '') + ' ' + news_df.get('content', '')
    # Use BERT sentiment analysis to extract sentiment from the combined text
    news_df['title_sentiment'] = news_df['combined_text'].apply(lambda x: sentiment_weight if sentiment_analysis(x)[0]['label'] == 'POSITIVE' else (-sentiment_weight if sentiment_analysis(x)[0]['label'] == 'NEGATIVE' else 0))
    return news_df[['Date', 'title', 'title_sentiment', 'combined_text']]

def merge_news_with_data(data, news):
    # Merge historical stock data with news data on Date
    merged_data = data.merge(news, on='Date', how='left')
    merged_data.fillna(0, inplace=True)
    # Calculate a weighted sentiment score based on sentiment and volume
    merged_data['weighted_sentiment'] = merged_data['title_sentiment'] * merged_data['Volume']
    return merged_data

def process_data(data, look_back=250):
    # Create a list to hold all the lag features
    lag_columns = []
    # Generate lag features and append them to the list
    for i in range(1, look_back + 1):
        lag_column = data['Close'].shift(i)
        lag_column.name = f"lag_{i}"
        lag_columns.append(lag_column)
    # Combine all lag features into a single DataFrame
    lag_features = pd.concat(lag_columns, axis=1)
    # Combine the original data with the lag features
    data = pd.concat([data, lag_features], axis=1)
    data.dropna(inplace=True)
    # Define columns that are not features for the model
    columns_to_drop = ['Close', 'Date', 'title', 'combined_text']
    # Split data into features and target variable
    X = data.drop(columns_to_drop, axis=1)
    y = data['Close']
    # Normalize feature data and target data
    scaler_x = MinMaxScaler()
    scaler_y = MinMaxScaler()
    X_scaled = scaler_x.fit_transform(X)
    y_scaled = scaler_y.fit_transform(y.values.reshape(-1, 1))
    return pd.DataFrame(X_scaled, columns=X.columns), pd.Series(y_scaled.ravel()), scaler_x, scaler_y

def execute(ticker, target_date, sentiment_weight=100000.0):
    # Function to execute the entire pipeline
    data = fetch_data(ticker)
    news = fetch_news(ticker)
    news = extract_sentiment(news, sentiment_weight)
    data = merge_news_with_data(data, news)
    X, y, scaler_x, scaler_y = process_data(data)
    # Train a random forest regressor with the processed data
    column_names = X.columns.tolist()
    model = RandomForestRegressor(n_estimators=100)
    model.fit(X, y)
    # Predict the price for the target date
    target_data = X.iloc[-1].values.reshape(1, -1)
    pred_scaled = model.predict(target_data)
    pred = scaler_y.inverse_transform([[pred_scaled[0]]])[0, 0]
    print(f"\nThe predicted closing price for {ticker} on {target_date} is {pred}")
    # Provide a trading suggestion based on the prediction
    last_known_price = y.iloc[-1]
    if pred > last_known_price:
        print(f"The model suggests buying a CALL option for {ticker} expiring on {target_date}.")
    elif pred < last_known_price:
        print(f"The model suggests buying a PUT option for {ticker} expiring on {target_date}.")
    else:
        print(f"The model suggests no clear direction for {ticker} on {target_date}.")
    # Display importance of sentiment in the model
    importance = model.feature_importances_
    if 'title_sentiment' in column_names:
        sentiment_index = column_names.index('title_sentiment')
        sentiment_importance = importance[sentiment_index]
        print(f"\nImportance of title_sentiment: {sentiment_importance}")
    # Display top 3 positive news articles
    positive_news = news[news['title_sentiment'] == sentiment_weight].sort_values(by="Date", ascending=False).head(3)
    print("\nTop 3 Positive News Articles (with combined text):")
    for idx, row in positive_news.iterrows():
        print("\nTitle:", row['title'])
        print("Combined Text:", row['combined_text'])
# Call the function with sentiment_weight

execute('TSLA', '2025-12-26', sentiment_weight=100000.0)

NewsAPIException: {'status': 'error', 'code': 'parameterInvalid', 'message': 'You are trying to request results too far in the past. Your plan permits you to request articles as far back as 2024-12-30, but you have requested 2024-11-25. You may need to upgrade to a paid plan.'}

In [3]:
NEWS_API_KEY = os.getenv("NEWS_API_KEY")  # Set your API key in environment variables
if not NEWS_API_KEY:
    raise ValueError("Missing NewsAPI key! Set it using os.environ.")
# Initialize NewsAPI client

newsapi = NewsApiClient(api_key=NEWS_API_KEY)
# Initialize sentiment analysis using BERT

sentiment_analysis = pipeline("sentiment-analysis")
def fetch_data(ticker):
    """Fetch 20 years of historical stock data using yfinance."""
    t = yf.Ticker(ticker)
    historical_data = t.history(period="20y")
    historical_data.reset_index(inplace=True)
    historical_data['Date'] = historical_data['Date'].dt.tz_localize(None)
    return historical_data

def fetch_news(ticker):
    """Fetch recent news articles for the given stock ticker."""
    today = datetime.today().date()
    # Set the max allowed date (from the NewsAPI error message)
    max_allowed_date = datetime(2024, 12, 30).date()
    # Adjust date range dynamically to avoid API errors
    from_date = max(max_allowed_date, today - timedelta(days=7))
    to_date = today
    print(f"Fetching news from {from_date} to {to_date}...")
    all_articles = newsapi.get_everything(
        q=ticker,
        from_param=from_date.strftime('%Y-%m-%d'),
        to=to_date.strftime('%Y-%m-%d'),
        language='en',
        sort_by='relevancy'
    )
    news_data = all_articles.get('articles', [])
    if not news_data:
        print(f"No articles found for {ticker}")
        return pd.DataFrame(columns=['Date', 'title', 'description', 'content'])
    news_df = pd.DataFrame(news_data)
    news_df['Date'] = pd.to_datetime(news_df['publishedAt']).dt.tz_localize(None)
    return news_df[['Date', 'title', 'description', 'content']]

def extract_sentiment(news_df):
    """Extract sentiment scores using BERT sentiment analysis."""
    if news_df.empty:
        return news_df
    news_df['combined_text'] = news_df['title'].fillna('') + ' ' + news_df['description'].fillna('') + ' ' + news_df['content'].fillna('')
    news_df['sentiment_score'] = news_df['combined_text'].apply(lambda x: sentiment_analysis(x[:512])[0]['score'])  # Limit to 512 characters
    news_df['sentiment_label'] = news_df['combined_text'].apply(lambda x: sentiment_analysis(x[:512])[0]['label'])
    # Convert labels into numerical values
    news_df['title_sentiment'] = news_df.apply(
        lambda x: x['sentiment_score'] if x['sentiment_label'] == 'POSITIVE' else 
                  (-x['sentiment_score'] if x['sentiment_label'] == 'NEGATIVE' else 0),
        axis=1
    )
    return news_df[['Date', 'title', 'title_sentiment', 'combined_text']]

def merge_news_with_data(data, news):
    """Merge stock data with news sentiment scores."""
    if news.empty:
        data['title_sentiment'] = 0
        return data
    merged_data = data.merge(news, on='Date', how='left')
    merged_data.fillna({'title_sentiment': 0}, inplace=True)
    # Weight sentiment by volume for impact
    merged_data['weighted_sentiment'] = merged_data['title_sentiment'] * merged_data['Volume']
    return merged_data

def process_data(data, look_back=60):
    """Create lag features and prepare data for model training."""
    for i in range(1, look_back + 1):
        data[f"lag_{i}"] = data['Close'].shift(i)
    data.dropna(inplace=True)
    # Define columns to exclude from training
    columns_to_drop = ['Close', 'Date', 'title', 'combined_text']
    X = data.drop(columns=[col for col in columns_to_drop if col in data.columns])
    y = data['Close']
    # Scale data
    scaler_x = MinMaxScaler()
    scaler_y = MinMaxScaler()
    X_scaled = scaler_x.fit_transform(X)
    y_scaled = scaler_y.fit_transform(y.values.reshape(-1, 1))
    return pd.DataFrame(X_scaled, columns=X.columns), pd.Series(y_scaled.ravel()), scaler_x, scaler_y

def plot_feature_importance(model, X):
    """Plot feature importance of the trained model."""
    importance = model.feature_importances_
    features = X.columns
    sorted_idx = np.argsort(importance)
    plt.figure(figsize=(10, 6))
    plt.barh([features[i] for i in sorted_idx[-10:]], importance[sorted_idx[-10:]])
    plt.xlabel("Feature Importance")
    plt.title("Top 10 Important Features")
    plt.show()

def execute(ticker, target_date):
    """Execute the stock prediction pipeline."""
    look_back = 60
    # Fetch and process stock data
    data = fetch_data(ticker)
    # Fetch and process news sentiment
    news = fetch_news(ticker)
    news = extract_sentiment(news)
    data = merge_news_with_data(data, news)
    # Prepare data for model training
    X, y, scaler_x, scaler_y = process_data(data, look_back)
    # Train model
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X, y)
    # Make prediction
    target_data = X.iloc[-1].values.reshape(1, -1)
    pred_scaled = model.predict(target_data)
    pred = scaler_y.inverse_transform([[pred_scaled[0]]])[0, 0]
    print(f"\nüìà Predicted closing price for {ticker} on {target_date}: **${pred:.2f}**")
    # Generate trading recommendation
    last_known_price = scaler_y.inverse_transform([[y.iloc[-1]]])[0, 0]
    if pred > last_known_price:
        print(f"üìä **Recommendation:** Buy a **CALL** option for {ticker} expiring on {target_date}.")
    elif pred < last_known_price:
        print(f"üìâ **Recommendation:** Buy a **PUT** option for {ticker} expiring on {target_date}.")
    else:
        print(f"‚öñÔ∏è No clear direction for {ticker} on {target_date}.")
    # Plot feature importance
    plot_feature_importance(model, X)
    # Display top 3 positive news articles
    positive_news = news[news['title_sentiment'] > 0].sort_values(by="Date", ascending=False).head(3)
    print("\nüîç **Top 3 Positive News Articles:**")
    for _, row in positive_news.iterrows():
        print(f"\nüìå **Title:** {row['title']}")
        print(f"üìù **Content:** {row['combined_text'][:200]}...")  # Display first 200 characters
# Run the model

execute('TSLA', '2025-12-26')

ValueError: Missing NewsAPI key! Set it using os.environ.

In [4]:
import os
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from newsapi import NewsApiClient
from transformers import pipeline
NEWS_API_KEY = "ec064ce719114fe78bd3affdd71e5db8"  # Replace with your actual API key
# Initialize NewsAPI client

newsapi = NewsApiClient(api_key=NEWS_API_KEY)
# Initialize sentiment analysis using BERT

sentiment_analysis = pipeline("sentiment-analysis")
def fetch_data(ticker):
    """Fetch 20 years of historical stock data using yfinance."""
    t = yf.Ticker(ticker)
    historical_data = t.history(period="20y")
    historical_data.reset_index(inplace=True)
    historical_data['Date'] = historical_data['Date'].dt.tz_localize(None)
    return historical_data

def fetch_news(ticker):
    """Fetch recent news articles for the given stock ticker."""
    today = datetime.today().date()
    # Set the max allowed date (from the NewsAPI error message)
    max_allowed_date = datetime(2024, 12, 30).date()
    # Adjust date range dynamically to avoid API errors
    from_date = max(max_allowed_date, today - timedelta(days=7))
    to_date = today
    print(f"Fetching news from {from_date} to {to_date}...")
    all_articles = newsapi.get_everything(
        q=ticker,
        from_param=from_date.strftime('%Y-%m-%d'),
        to=to_date.strftime('%Y-%m-%d'),
        language='en',
        sort_by='relevancy'
    )
    news_data = all_articles.get('articles', [])
    if not news_data:
        print(f"No articles found for {ticker}")
        return pd.DataFrame(columns=['Date', 'title', 'description', 'content'])
    news_df = pd.DataFrame(news_data)
    news_df['Date'] = pd.to_datetime(news_df['publishedAt']).dt.tz_localize(None)
    return news_df[['Date', 'title', 'description', 'content']]

def extract_sentiment(news_df):
    """Extract sentiment scores using BERT sentiment analysis."""
    if news_df.empty:
        return news_df
    news_df['combined_text'] = news_df['title'].fillna('') + ' ' + news_df['description'].fillna('') + ' ' + news_df['content'].fillna('')
    news_df['sentiment_score'] = news_df['combined_text'].apply(lambda x: sentiment_analysis(x[:512])[0]['score'])  # Limit to 512 characters
    news_df['sentiment_label'] = news_df['combined_text'].apply(lambda x: sentiment_analysis(x[:512])[0]['label'])
    # Convert labels into numerical values
    news_df['title_sentiment'] = news_df.apply(
        lambda x: x['sentiment_score'] if x['sentiment_label'] == 'POSITIVE' else 
                  (-x['sentiment_score'] if x['sentiment_label'] == 'NEGATIVE' else 0),
        axis=1
    )
    return news_df[['Date', 'title', 'title_sentiment', 'combined_text']]

def merge_news_with_data(data, news):
    """Merge stock data with news sentiment scores."""
    if news.empty:
        data['title_sentiment'] = 0
        return data
    merged_data = data.merge(news, on='Date', how='left')
    merged_data.fillna({'title_sentiment': 0}, inplace=True)
    # Weight sentiment by volume for impact
    merged_data['weighted_sentiment'] = merged_data['title_sentiment'] * merged_data['Volume']
    return merged_data

def process_data(data, look_back=60):
    """Create lag features and prepare data for model training."""
    for i in range(1, look_back + 1):
        data[f"lag_{i}"] = data['Close'].shift(i)
    data.dropna(inplace=True)
    # Define columns to exclude from training
    columns_to_drop = ['Close', 'Date', 'title', 'combined_text']
    X = data.drop(columns=[col for col in columns_to_drop if col in data.columns])
    y = data['Close']
    # Scale data
    scaler_x = MinMaxScaler()
    scaler_y = MinMaxScaler()
    X_scaled = scaler_x.fit_transform(X)
    y_scaled = scaler_y.fit_transform(y.values.reshape(-1, 1))
    return pd.DataFrame(X_scaled, columns=X.columns), pd.Series(y_scaled.ravel()), scaler_x, scaler_y

def plot_feature_importance(model, X):
    """Plot feature importance of the trained model."""
    importance = model.feature_importances_
    features = X.columns
    sorted_idx = np.argsort(importance)
    plt.figure(figsize=(10, 6))
    plt.barh([features[i] for i in sorted_idx[-10:]], importance[sorted_idx[-10:]])
    plt.xlabel("Feature Importance")
    plt.title("Top 10 Important Features")
    plt.show()

def execute(ticker, target_date):
    """Execute the stock prediction pipeline."""
    look_back = 60
    # Fetch and process stock data
    data = fetch_data(ticker)
    # Fetch and process news sentiment
    news = fetch_news(ticker)
    news = extract_sentiment(news)
    data = merge_news_with_data(data, news)
    # Prepare data for model training
    X, y, scaler_x, scaler_y = process_data(data, look_back)
    # Train model
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X, y)
    # Make prediction
    target_data = X.iloc[-1].values.reshape(1, -1)
    pred_scaled = model.predict(target_data)
    pred = scaler_y.inverse_transform([[pred_scaled[0]]])[0, 0]
    print(f"\nüìà Predicted closing price for {ticker} on {target_date}: **${pred:.2f}**")
    # Generate trading recommendation
    last_known_price = scaler_y.inverse_transform([[y.iloc[-1]]])[0, 0]
    if pred > last_known_price:
        print(f"üìä **Recommendation:** Buy a **CALL** option for {ticker} expiring on {target_date}.")
    elif pred < last_known_price:
        print(f"üìâ **Recommendation:** Buy a **PUT** option for {ticker} expiring on {target_date}.")
    else:
        print(f"‚öñÔ∏è No clear direction for {ticker} on {target_date}.")
    # Plot feature importance
    plot_feature_importance(model, X)
    # Display top 3 positive news articles
    positive_news = news[news['title_sentiment'] > 0].sort_values(by="Date", ascending=False).head(3)
    print("\nüîç **Top 3 Positive News Articles:**")
    for _, row in positive_news.iterrows():
        print(f"\nüìå **Title:** {row['title']}")
        print(f"üìù **Content:** {row['combined_text'][:200]}...")  # Display first 200 characters
# Run the model

execute('TSLA', '2025-12-26')

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


Fetching news from 2025-01-24 to 2025-01-31...


ValueError: Found array with 0 sample(s) (shape=(0, 68)) while a minimum of 1 is required by MinMaxScaler.

In [5]:
import os
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from newsapi import NewsApiClient
from transformers import pipeline
NEWS_API_KEY = "ec064ce719114fe78bd3affdd71e5db8"  # Replace with your actual API key
# Initialize NewsAPI client

newsapi = NewsApiClient(api_key=NEWS_API_KEY)
# Initialize sentiment analysis using BERT

sentiment_analysis = pipeline("sentiment-analysis")
def fetch_data(ticker):
    """Fetch 20 years of historical stock data using yfinance."""
    t = yf.Ticker(ticker)
    historical_data = t.history(period="20y")
    historical_data.reset_index(inplace=True)
    historical_data['Date'] = historical_data['Date'].dt.tz_localize(None)
    return historical_data

def fetch_news(ticker):
    """Fetch recent news articles for the given stock ticker."""
    today = datetime.today().date()
    # Set the max allowed date (from the NewsAPI error message)
    max_allowed_date = datetime(2024, 12, 30).date()
    # Adjust date range dynamically to avoid API errors
    from_date = max(max_allowed_date, today - timedelta(days=7))
    to_date = today
    print(f"Fetching news from {from_date} to {to_date}...")
    all_articles = newsapi.get_everything(
        q=ticker,
        from_param=from_date.strftime('%Y-%m-%d'),
        to=to_date.strftime('%Y-%m-%d'),
        language='en',
        sort_by='relevancy'
    )
    news_data = all_articles.get('articles', [])
    if not news_data:
        print(f"No articles found for {ticker}")
        return pd.DataFrame(columns=['Date', 'title', 'description', 'content'])
    news_df = pd.DataFrame(news_data)
    news_df['Date'] = pd.to_datetime(news_df['publishedAt']).dt.tz_localize(None)
    return news_df[['Date', 'title', 'description', 'content']]

def extract_sentiment(news_df):
    """Extract sentiment scores using BERT sentiment analysis."""
    if news_df.empty:
        return news_df
    news_df['combined_text'] = news_df['title'].fillna('') + ' ' + news_df['description'].fillna('') + ' ' + news_df['content'].fillna('')
    news_df['sentiment_score'] = news_df['combined_text'].apply(lambda x: sentiment_analysis(x[:512])[0]['score'])  # Limit to 512 characters
    news_df['sentiment_label'] = news_df['combined_text'].apply(lambda x: sentiment_analysis(x[:512])[0]['label'])
    # Convert labels into numerical values
    news_df['title_sentiment'] = news_df.apply(
        lambda x: x['sentiment_score'] if x['sentiment_label'] == 'POSITIVE' else 
                  (-x['sentiment_score'] if x['sentiment_label'] == 'NEGATIVE' else 0),
        axis=1
    )
    return news_df[['Date', 'title', 'title_sentiment', 'combined_text']]

def merge_news_with_data(data, news):
    """Merge stock data with news sentiment scores."""
    if news.empty:
        data['title_sentiment'] = 0
        return data
    merged_data = data.merge(news, on='Date', how='left')
    merged_data.fillna({'title_sentiment': 0}, inplace=True)
    # Weight sentiment by volume for impact
    merged_data['weighted_sentiment'] = merged_data['title_sentiment'] * merged_data['Volume']
    return merged_data

def process_data(data, look_back=60):
    """Create lag features and prepare data for model training."""
    for i in range(1, look_back + 1):
        data[f"lag_{i}"] = data['Close'].shift(i)
    data.dropna(inplace=True)
    # Define columns to exclude from training
    columns_to_drop = ['Close', 'Date', 'title', 'combined_text']
    X = data.drop(columns=[col for col in columns_to_drop if col in data.columns])
    y = data['Close']
    # Scale data
    scaler_x = MinMaxScaler()
    scaler_y = MinMaxScaler()
    X_scaled = scaler_x.fit_transform(X)
    y_scaled = scaler_y.fit_transform(y.values.reshape(-1, 1))
    return pd.DataFrame(X_scaled, columns=X.columns), pd.Series(y_scaled.ravel()), scaler_x, scaler_y

def plot_feature_importance(model, X):
    """Plot feature importance of the trained model."""
    importance = model.feature_importances_
    features = X.columns
    sorted_idx = np.argsort(importance)
    plt.figure(figsize=(10, 6))
    plt.barh([features[i] for i in sorted_idx[-10:]], importance[sorted_idx[-10:]])
    plt.xlabel("Feature Importance")
    plt.title("Top 10 Important Features")
    plt.show()

def execute(ticker, target_date):
    """Execute the stock prediction pipeline."""
    look_back = 60
    # Fetch and process stock data
    data = fetch_data(ticker)
    # Fetch and process news sentiment
    news = fetch_news(ticker)
    news = extract_sentiment(news)
    data = merge_news_with_data(data, news)
    # Prepare data for model training
    X, y, scaler_x, scaler_y = process_data(data, look_back)
    # Train model
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X, y)
    # Make prediction
    target_data = X.iloc[-1].values.reshape(1, -1)
    pred_scaled = model.predict(target_data)
    pred = scaler_y.inverse_transform([[pred_scaled[0]]])[0, 0]
    print(f"\nüìà Predicted closing price for {ticker} on {target_date}: **${pred:.2f}**")
    # Generate trading recommendation
    last_known_price = scaler_y.inverse_transform([[y.iloc[-1]]])[0, 0]
    if pred > last_known_price:
        print(f"üìä **Recommendation:** Buy a **CALL** option for {ticker} expiring on {target_date}.")
    elif pred < last_known_price:
        print(f"üìâ **Recommendation:** Buy a **PUT** option for {ticker} expiring on {target_date}.")
    else:
        print(f"‚öñÔ∏è No clear direction for {ticker} on {target_date}.")
    # Plot feature importance
    plot_feature_importance(model, X)
    # Display top 3 positive news articles
    positive_news = news[news['title_sentiment'] > 0].sort_values(by="Date", ascending=False).head(3)
    print("\nüîç **Top 3 Positive News Articles:**")
    for _, row in positive_news.iterrows():
        print(f"\n **Title:** {row['title']}")
        print(f" **Content:** {row['combined_text'][:200]}...")  # Display first 200 characters
# Run the model

execute('TSLA', '2025-12-26')

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


Fetching news from 2025-01-24 to 2025-01-31...


ValueError: Found array with 0 sample(s) (shape=(0, 68)) while a minimum of 1 is required by MinMaxScaler.

In [6]:
import os
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from newsapi import NewsApiClient
from transformers import pipeline
NEWS_API_KEY = "ec064ce719114fe78bd3affdd71e5db8"  # Replace with your actual API key
# Initialize NewsAPI client

newsapi = NewsApiClient(api_key=NEWS_API_KEY)
# Initialize sentiment analysis using BERT

sentiment_analysis = pipeline("sentiment-analysis")
def fetch_data(ticker):
    """Fetch 20 years of historical stock data using yfinance."""
    t = yf.Ticker(ticker)
    historical_data = t.history(period="20y")
    historical_data.reset_index(inplace=True)
    historical_data['Date'] = historical_data['Date'].dt.tz_localize(None)
    return historical_data

def fetch_news(ticker):
    """Fetch recent news articles for the given stock ticker."""
    today = datetime.today().date()
    # Set the max allowed date (from the NewsAPI error message)
    max_allowed_date = datetime(2024, 12, 30).date()
    # Adjust date range dynamically to avoid API errors
    from_date = max(max_allowed_date, today - timedelta(days=7))
    to_date = today
    print(f"Fetching news from {from_date} to {to_date}...")
    all_articles = newsapi.get_everything(
        q=ticker,
        from_param=from_date.strftime('%Y-%m-%d'),
        to=to_date.strftime('%Y-%m-%d'),
        language='en',
        sort_by='relevancy'
    )
    news_data = all_articles.get('articles', [])
    if not news_data:
        print(f"No articles found for {ticker}")
        return pd.DataFrame(columns=['Date', 'title', 'description', 'content'])
    news_df = pd.DataFrame(news_data)
    news_df['Date'] = pd.to_datetime(news_df['publishedAt']).dt.tz_localize(None)
    return news_df[['Date', 'title', 'description', 'content']]

def extract_sentiment(news_df):
    """Extract sentiment scores using BERT sentiment analysis."""
    if news_df.empty:
        return news_df
    news_df['combined_text'] = news_df['title'].fillna('') + ' ' + news_df['description'].fillna('') + ' ' + news_df['content'].fillna('')
    news_df['sentiment_score'] = news_df['combined_text'].apply(lambda x: sentiment_analysis(x[:512])[0]['score'])  # Limit to 512 characters
    news_df['sentiment_label'] = news_df['combined_text'].apply(lambda x: sentiment_analysis(x[:512])[0]['label'])
    # Convert labels into numerical values
    news_df['title_sentiment'] = news_df.apply(
        lambda x: x['sentiment_score'] if x['sentiment_label'] == 'POSITIVE' else 
                  (-x['sentiment_score'] if x['sentiment_label'] == 'NEGATIVE' else 0),
        axis=1
    )
    return news_df[['Date', 'title', 'title_sentiment', 'combined_text']]

def merge_news_with_data(data, news):
    """Merge stock data with news sentiment scores."""
    if news.empty:
        data['title_sentiment'] = 0
        return data
    merged_data = data.merge(news, on='Date', how='left')
    merged_data.fillna({'title_sentiment': 0}, inplace=True)
    # Weight sentiment by volume for impact
    merged_data['weighted_sentiment'] = merged_data['title_sentiment'] * merged_data['Volume']
    return merged_data

def process_data(data, look_back=60):
    """Create lag features and prepare data for model training."""
    # Ensure dataset has enough rows
    if len(data) < look_back:
        raise ValueError(f"Not enough data to create lag features. Need at least {look_back} rows, but only have {len(data)}.")
    for i in range(1, look_back + 1):
        data[f"lag_{i}"] = data['Close'].shift(i)
    data.dropna(inplace=True)
    # Ensure that we still have data after dropping NaNs
    if data.empty:
        raise ValueError("After applying lag features, no rows remain. Ensure the dataset has sufficient historical data.")
    # Define columns to exclude from training
    columns_to_drop = ['Close', 'Date', 'title', 'combined_text']
    X = data.drop(columns=[col for col in columns_to_drop if col in data.columns], errors='ignore')
    y = data['Close']
    # Ensure X is not empty before scaling
    if X.empty:
        raise ValueError("No valid feature data remaining after preprocessing. Check dataset integrity.")
    # Scale data
    scaler_x = MinMaxScaler()
    scaler_y = MinMaxScaler()
    X_scaled = scaler_x.fit_transform(X)
    y_scaled = scaler_y.fit_transform(y.values.reshape(-1, 1))
    return pd.DataFrame(X_scaled, columns=X.columns), pd.Series(y_scaled.ravel()), scaler_x, scaler_y

def plot_feature_importance(model, X):
    """Plot feature importance of the trained model."""
    importance = model.feature_importances_
    features = X.columns
    sorted_idx = np.argsort(importance)
    plt.figure(figsize=(10, 6))
    plt.barh([features[i] for i in sorted_idx[-10:]], importance[sorted_idx[-10:]])
    plt.xlabel("Feature Importance")
    plt.title("Top 10 Important Features")
    plt.show()

def execute(ticker, target_date):
    """Execute the stock prediction pipeline."""
    look_back = 60
    # Fetch and process stock data
    data = fetch_data(ticker)
    # Fetch and process news sentiment
    news = fetch_news(ticker)
    news = extract_sentiment(news)
    data = merge_news_with_data(data, news)
    # Prepare data for model training
    X, y, scaler_x, scaler_y = process_data(data, look_back)
    # Train model
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X, y)
    # Make prediction
    target_data = X.iloc[-1].values.reshape(1, -1)
    pred_scaled = model.predict(target_data)
    pred = scaler_y.inverse_transform([[pred_scaled[0]]])[0, 0]
    print(f"\nüìà Predicted closing price for {ticker} on {target_date}: **${pred:.2f}**")
    # Generate trading recommendation
    last_known_price = scaler_y.inverse_transform([[y.iloc[-1]]])[0, 0]
    if pred > last_known_price:
        print(f"üìä **Recommendation:** Buy a **CALL** option for {ticker} expiring on {target_date}.")
    elif pred < last_known_price:
        print(f"üìâ **Recommendation:** Buy a **PUT** option for {ticker} expiring on {target_date}.")
    else:
        print(f"‚öñÔ∏è No clear direction for {ticker} on {target_date}.")
    # Plot feature importance
    plot_feature_importance(model, X)
    # Display top 3 positive news articles
    positive_news = news[news['title_sentiment'] > 0].sort_values(by="Date", ascending=False).head(3)
    print("\nüîç **Top 3 Positive News Articles:**")
    for _, row in positive_news.iterrows():
        print(f"\n **Title:** {row['title']}")
        print(f" **Content:** {row['combined_text'][:200]}...")  # Display first 200 characters
# Run the model

execute('TSLA', '2025-12-26')

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


Fetching news from 2025-01-24 to 2025-01-31...


ValueError: After applying lag features, no rows remain. Ensure the dataset has sufficient historical data.

In [7]:
import os
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from newsapi import NewsApiClient
from transformers import pipeline
NEWS_API_KEY = "ec064ce719114fe78bd3affdd71e5db8"  # Replace with your actual API key
# Initialize NewsAPI client

newsapi = NewsApiClient(api_key=NEWS_API_KEY)
# Initialize sentiment analysis using BERT

sentiment_analysis = pipeline("sentiment-analysis")
def fetch_data(ticker):
    """Fetch 20 years of historical stock data using yfinance."""
    t = yf.Ticker(ticker)
    historical_data = t.history(period="20y")
    historical_data.reset_index(inplace=True)
    historical_data['Date'] = historical_data['Date'].dt.tz_localize(None)
    return historical_data

def fetch_news(ticker):
    """Fetch recent news articles for the given stock ticker."""
    today = datetime.today().date()
    # Set the max allowed date (from the NewsAPI error message)
    max_allowed_date = datetime(2024, 12, 30).date()
    # Adjust date range dynamically to avoid API errors
    from_date = max(max_allowed_date, today - timedelta(days=7))
    to_date = today
    print(f"Fetching news from {from_date} to {to_date}...")
    all_articles = newsapi.get_everything(
        q=ticker,
        from_param=from_date.strftime('%Y-%m-%d'),
        to=to_date.strftime('%Y-%m-%d'),
        language='en',
        sort_by='relevancy'
    )
    news_data = all_articles.get('articles', [])
    if not news_data:
        print(f"No articles found for {ticker}")
        return pd.DataFrame(columns=['Date', 'title', 'description', 'content'])
    news_df = pd.DataFrame(news_data)
    news_df['Date'] = pd.to_datetime(news_df['publishedAt']).dt.tz_localize(None)
    return news_df[['Date', 'title', 'description', 'content']]

def extract_sentiment(news_df):
    """Extract sentiment scores using BERT sentiment analysis."""
    if news_df.empty:
        return news_df
    news_df['combined_text'] = news_df['title'].fillna('') + ' ' + news_df['description'].fillna('') + ' ' + news_df['content'].fillna('')
    news_df['sentiment_score'] = news_df['combined_text'].apply(lambda x: sentiment_analysis(x[:512])[0]['score'])  # Limit to 512 characters
    news_df['sentiment_label'] = news_df['combined_text'].apply(lambda x: sentiment_analysis(x[:512])[0]['label'])
    # Convert labels into numerical values
    news_df['title_sentiment'] = news_df.apply(
        lambda x: x['sentiment_score'] if x['sentiment_label'] == 'POSITIVE' else 
                  (-x['sentiment_score'] if x['sentiment_label'] == 'NEGATIVE' else 0),
        axis=1
    )
    return news_df[['Date', 'title', 'title_sentiment', 'combined_text']]

def merge_news_with_data(data, news):
    """Merge stock data with news sentiment scores."""
    if news.empty:
        data['title_sentiment'] = 0
        return data
    merged_data = data.merge(news, on='Date', how='left')
    merged_data.fillna({'title_sentiment': 0}, inplace=True)
    # Weight sentiment by volume for impact
    merged_data['weighted_sentiment'] = merged_data['title_sentiment'] * merged_data['Volume']
    return merged_data

def process_data(data, look_back=60):
    """Create lag features and prepare data for model training."""
    # Ensure dataset has enough rows before applying lags
    if len(data) < look_back + 1:
        print(f"‚ö†Ô∏è Warning: Dataset has only {len(data)} rows. Reducing look_back to {max(1, len(data) - 1)}.")
        look_back = max(1, len(data) - 1)  # Adjust look_back dynamically
    for i in range(1, look_back + 1):
        data[f"lag_{i}"] = data['Close'].shift(i)
    # Drop NaNs created by lag features
    data.dropna(inplace=True)
    # Ensure we still have data after dropping NaNs
    if data.empty:
        raise ValueError("‚ö†Ô∏è Error: After applying lag features, no rows remain. Reduce `look_back` or check dataset.")
    # Define columns to exclude from training
    columns_to_drop = ['Close', 'Date', 'title', 'combined_text']
    X = data.drop(columns=[col for col in columns_to_drop if col in data.columns], errors='ignore')
    y = data['Close']
    # Ensure X is not empty before scaling
    if X.empty:
        raise ValueError("‚ö†Ô∏è Error: No valid feature data remaining after preprocessing. Check dataset integrity.")
    # Scale data
    scaler_x = MinMaxScaler()
    scaler_y = MinMaxScaler()
    X_scaled = scaler_x.fit_transform(X)
    y_scaled = scaler_y.fit_transform(y.values.reshape(-1, 1))
    return pd.DataFrame(X_scaled, columns=X.columns), pd.Series(y_scaled.ravel()), scaler_x, scaler_y

def plot_feature_importance(model, X):
    """Plot feature importance of the trained model."""
    importance = model.feature_importances_
    features = X.columns
    sorted_idx = np.argsort(importance)
    plt.figure(figsize=(10, 6))
    plt.barh([features[i] for i in sorted_idx[-10:]], importance[sorted_idx[-10:]])
    plt.xlabel("Feature Importance")
    plt.title("Top 10 Important Features")
    plt.show()

def execute(ticker, target_date):
    """Execute the stock prediction pipeline."""
    look_back = 60
    # Fetch and process stock data
    data = fetch_data(ticker)
    # Fetch and process news sentiment
    news = fetch_news(ticker)
    news = extract_sentiment(news)
    data = merge_news_with_data(data, news)
    # Prepare data for model training
    X, y, scaler_x, scaler_y = process_data(data, look_back)
    # Train model
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X, y)
    # Make prediction
    target_data = X.iloc[-1].values.reshape(1, -1)
    pred_scaled = model.predict(target_data)
    pred = scaler_y.inverse_transform([[pred_scaled[0]]])[0, 0]
    print(f"\nüìà Predicted closing price for {ticker} on {target_date}: **${pred:.2f}**")
    # Generate trading recommendation
    last_known_price = scaler_y.inverse_transform([[y.iloc[-1]]])[0, 0]
    if pred > last_known_price:
        print(f"üìä **Recommendation:** Buy a **CALL** option for {ticker} expiring on {target_date}.")
    elif pred < last_known_price:
        print(f"üìâ **Recommendation:** Buy a **PUT** option for {ticker} expiring on {target_date}.")
    else:
        print(f"‚öñÔ∏è No clear direction for {ticker} on {target_date}.")
    # Plot feature importance
    plot_feature_importance(model, X)
    # Display top 3 positive news articles
    positive_news = news[news['title_sentiment'] > 0].sort_values(by="Date", ascending=False).head(3)
    print("\nüîç **Top 3 Positive News Articles:**")
    for _, row in positive_news.iterrows():
        print(f"\n **Title:** {row['title']}")
        print(f" **Content:** {row['combined_text'][:200]}...")  # Display first 200 characters
# Run the model

execute('TSLA', '2025-12-26')

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


Fetching news from 2025-01-24 to 2025-01-31...


ValueError: ‚ö†Ô∏è Error: After applying lag features, no rows remain. Reduce `look_back` or check dataset.

In [8]:
pip install xformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting xformers
  Downloading xformers-0.0.29.post1.tar.gz (8.5 MB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m8.5/8.5 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: xformers
  Building wheel for xformers (setup.py) ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m√ó[0m [32mpython setup.py bdist_wheel[0m did not run successfully.
  [31m‚îÇ[0m exit code: [1;36m1[0m
  [31m‚ï∞‚îÄ>[0m [31m[195 lines of output][0m
  [31m   [0m running bdist_wheel
  [31m

Restarted base (Python 3.11.4)

In [1]:
def execute(ticker, target_date, look_back=60):
    """Execute the stock prediction pipeline with debugging information."""
    print(f"\nüöÄ Fetching stock data for {ticker}...")
    data = fetch_data(ticker)
    print(f"‚úÖ Stock data retrieved: {len(data)} rows\n")
    print(f"üì∞ Fetching news for {ticker}...")
    news = fetch_news(ticker)
    print(f"‚úÖ News data retrieved: {len(news)} articles\n")
    print("üîç Extracting sentiment from news articles...")
    news = extract_sentiment(news)
    print(f"‚úÖ Sentiment analysis applied: {news.shape[0]} rows\n")
    print("üîó Merging stock data with news sentiment...")
    data = merge_news_with_data(data, news)
    print(f"‚úÖ Data after merging: {data.shape}\n")
    print("üìä Preview of merged data:")
    print(data.head())  # Show first few rows
    print("\n‚è≥ Processing data with lag features...")
    try:
        X, y, scaler_x, scaler_y = process_data(data, look_back)
    except ValueError as e:
        print(f"\nüö® Error in process_data(): {e}")
        print("üìä Data before failing:")
        print(data.head())  # Show first few rows before error
        return
    print(f"‚úÖ Processed data: {X.shape} features, {y.shape} targets\n")
    print("üìä Preview of processed features:")
    print(X.head())  # Show first few processed rows
    print("üõ† Training model...")
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X, y)
    print("‚úÖ Model training complete!\n")
    print("üìà Predicting future price...")
    target_data = X.iloc[-1].values.reshape(1, -1)
    pred_scaled = model.predict(target_data)
    pred = scaler_y.inverse_transform([[pred_scaled[0]]])[0, 0]
    print(f"\nüìä **Predicted closing price for {ticker} on {target_date}: ${pred:.2f}**\n")
    last_known_price = scaler_y.inverse_transform([[y.iloc[-1]]])[0, 0]
    if pred > last_known_price:
        print(f"üìä **Recommendation:** Buy a **CALL** option for {ticker} expiring on {target_date}.")
    elif pred < last_known_price:
        print(f"üìâ **Recommendation:** Buy a **PUT** option for {ticker} expiring on {target_date}.")
    else:
        print(f"‚öñÔ∏è No clear direction for {ticker} on {target_date}.")
    # Display feature importance
    plot_feature_importance(model, X)
    # Show top 3 positive news articles
    positive_news = news[news['title_sentiment'] > 0].sort_values(by="Date", ascending=False).head(3)
    print("\nüîç **Top 3 Positive News Articles:**")
    for _, row in positive_news.iterrows():
        print(f"\nüìå **Title:** {row['title']}")
        print(f"üìù **Content:** {row['combined_text'][:200]}...")  # Display first 200 characters
# Run with debug mode

execute('TSLA', '2025-12-26')


üöÄ Fetching stock data for TSLA...


NameError: name 'fetch_data' is not defined

In [2]:
import os
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from newsapi import NewsApiClient
from transformers import pipeline
NEWS_API_KEY = "ec064ce719114fe78bd3affdd71e5db8"  # Replace with your actual API key
# Initialize NewsAPI client

newsapi = NewsApiClient(api_key=NEWS_API_KEY)
# Initialize sentiment analysis using BERT

sentiment_analysis = pipeline("sentiment-analysis")
def fetch_data(ticker):
    """Fetch 20 years of historical stock data using yfinance."""
    t = yf.Ticker(ticker)
    historical_data = t.history(period="20y")
    historical_data.reset_index(inplace=True)
    historical_data['Date'] = historical_data['Date'].dt.tz_localize(None)
    return historical_data

def fetch_news(ticker):
    """Fetch recent news articles for the given stock ticker."""
    today = datetime.today().date()
    # Set the max allowed date (from the NewsAPI error message)
    max_allowed_date = datetime(2024, 12, 30).date()
    # Adjust date range dynamically to avoid API errors
    from_date = max(max_allowed_date, today - timedelta(days=7))
    to_date = today
    print(f"Fetching news from {from_date} to {to_date}...")
    all_articles = newsapi.get_everything(
        q=ticker,
        from_param=from_date.strftime('%Y-%m-%d'),
        to=to_date.strftime('%Y-%m-%d'),
        language='en',
        sort_by='relevancy'
    )
    news_data = all_articles.get('articles', [])
    if not news_data:
        print(f"No articles found for {ticker}")
        return pd.DataFrame(columns=['Date', 'title', 'description', 'content'])
    news_df = pd.DataFrame(news_data)
    news_df['Date'] = pd.to_datetime(news_df['publishedAt']).dt.tz_localize(None)
    return news_df[['Date', 'title', 'description', 'content']]

def extract_sentiment(news_df):
    """Extract sentiment scores using BERT sentiment analysis."""
    if news_df.empty:
        return news_df
    news_df['combined_text'] = news_df['title'].fillna('') + ' ' + news_df['description'].fillna('') + ' ' + news_df['content'].fillna('')
    news_df['sentiment_score'] = news_df['combined_text'].apply(lambda x: sentiment_analysis(x[:512])[0]['score'])  # Limit to 512 characters
    news_df['sentiment_label'] = news_df['combined_text'].apply(lambda x: sentiment_analysis(x[:512])[0]['label'])
    # Convert labels into numerical values
    news_df['title_sentiment'] = news_df.apply(
        lambda x: x['sentiment_score'] if x['sentiment_label'] == 'POSITIVE' else 
                  (-x['sentiment_score'] if x['sentiment_label'] == 'NEGATIVE' else 0),
        axis=1
    )
    return news_df[['Date', 'title', 'title_sentiment', 'combined_text']]

def merge_news_with_data(data, news):
    """Merge stock data with news sentiment scores."""
    if news.empty:
        data['title_sentiment'] = 0
        return data
    merged_data = data.merge(news, on='Date', how='left')
    merged_data.fillna({'title_sentiment': 0}, inplace=True)
    # Weight sentiment by volume for impact
    merged_data['weighted_sentiment'] = merged_data['title_sentiment'] * merged_data['Volume']
    return merged_data

def process_data(data, look_back=60):
    """Create lag features and prepare data for model training."""
    # Ensure dataset has enough rows before applying lags
    if len(data) < look_back + 1:
        print(f"‚ö†Ô∏è Warning: Dataset has only {len(data)} rows. Reducing look_back to {max(1, len(data) - 1)}.")
        look_back = max(1, len(data) - 1)  # Adjust look_back dynamically
    # Create lag features
    for i in range(1, look_back + 1):
        data[f"lag_{i}"] = data['Close'].shift(i)
    # Drop NaNs created by lag features
    data.dropna(inplace=True)
    # Ensure we still have data after dropping NaNs
    if data.empty:
        raise ValueError(f"‚ö†Ô∏è Error: After applying lag features, no rows remain. Reduce `look_back` or check dataset. Available rows: {len(data)}")
    # Define columns to exclude from training
    columns_to_drop = ['Close', 'Date', 'title', 'combined_text']
    X = data.drop(columns=[col for col in columns_to_drop if col in data.columns], errors='ignore')
    y = data['Close']
    # Ensure X is not empty before scaling
    if X.empty:
        raise ValueError("‚ö†Ô∏è Error: No valid feature data remaining after preprocessing. Check dataset integrity.")
    # Scale data
    scaler_x = MinMaxScaler()
    scaler_y = MinMaxScaler()
    X_scaled = scaler_x.fit_transform(X)
    y_scaled = scaler_y.fit_transform(y.values.reshape(-1, 1))
    return pd.DataFrame(X_scaled, columns=X.columns), pd.Series(y_scaled.ravel()), scaler_x, scaler_y

def plot_feature_importance(model, X):
    """Plot feature importance of the trained model."""
    importance = model.feature_importances_
    features = X.columns
    sorted_idx = np.argsort(importance)
    plt.figure(figsize=(10, 6))
    plt.barh([features[i] for i in sorted_idx[-10:]], importance[sorted_idx[-10:]])
    plt.xlabel("Feature Importance")
    plt.title("Top 10 Important Features")
    plt.show()

def execute(ticker, target_date):
    """Execute the stock prediction pipeline."""
    look_back = 60
    # Fetch and process stock data
    data = fetch_data(ticker)
    # Fetch and process news sentiment
    news = fetch_news(ticker)
    news = extract_sentiment(news)
    data = merge_news_with_data(data, news)
    # Prepare data for model training
    X, y, scaler_x, scaler_y = process_data(data, look_back)
    # Train model
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X, y)
    # Make prediction
    target_data = X.iloc[-1].values.reshape(1, -1)
    pred_scaled = model.predict(target_data)
    pred = scaler_y.inverse_transform([[pred_scaled[0]]])[0, 0]
    print(f"\nüìà Predicted closing price for {ticker} on {target_date}: **${pred:.2f}**")
    # Generate trading recommendation
    last_known_price = scaler_y.inverse_transform([[y.iloc[-1]]])[0, 0]
    if pred > last_known_price:
        print(f"üìä **Recommendation:** Buy a **CALL** option for {ticker} expiring on {target_date}.")
    elif pred < last_known_price:
        print(f"üìâ **Recommendation:** Buy a **PUT** option for {ticker} expiring on {target_date}.")
    else:
        print(f"‚öñÔ∏è No clear direction for {ticker} on {target_date}.")
    # Plot feature importance
    plot_feature_importance(model, X)
    # Display top 3 positive news articles
    positive_news = news[news['title_sentiment'] > 0].sort_values(by="Date", ascending=False).head(3)
    print("\nüîç **Top 3 Positive News Articles:**")
    for _, row in positive_news.iterrows():
        print(f"\n **Title:** {row['title']}")
        print(f" **Content:** {row['combined_text'][:200]}...")  # Display first 200 characters
# Run the model

execute('TSLA', '2025-12-26')
def execute(ticker, target_date, look_back=60):
    """Execute the stock prediction pipeline with debugging information."""
    print(f"\nüöÄ Fetching stock data for {ticker}...")
    data = fetch_data(ticker)
    print(f"‚úÖ Stock data retrieved: {len(data)} rows\n")
    print(f"üì∞ Fetching news for {ticker}...")
    news = fetch_news(ticker)
    print(f"‚úÖ News data retrieved: {len(news)} articles\n")
    print("üîç Extracting sentiment from news articles...")
    news = extract_sentiment(news)
    print(f"‚úÖ Sentiment analysis applied: {news.shape[0]} rows\n")
    print("üîó Merging stock data with news sentiment...")
    data = merge_news_with_data(data, news)
    print(f"‚úÖ Data after merging: {data.shape}\n")
    print("üìä Preview of merged data:")
    print(data.head())  # Show first few rows
    print("\n‚è≥ Processing data with lag features...")
    try:
        X, y, scaler_x, scaler_y = process_data(data, look_back)
    except ValueError as e:
        print(f"\nüö® Error in process_data(): {e}")
        print("üìä Data before failing:")
        print(data.head())  # Show first few rows before error
        return
    print(f"‚úÖ Processed data: {X.shape} features, {y.shape} targets\n")
    print("üìä Preview of processed features:")
    print(X.head())  # Show first few processed rows
    print("üõ† Training model...")
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X, y)
    print("‚úÖ Model training complete!\n")
    print("üìà Predicting future price...")
    target_data = X.iloc[-1].values.reshape(1, -1)
    pred_scaled = model.predict(target_data)
    pred = scaler_y.inverse_transform([[pred_scaled[0]]])[0, 0]
    print(f"\nüìä **Predicted closing price for {ticker} on {target_date}: ${pred:.2f}**\n")
    last_known_price = scaler_y.inverse_transform([[y.iloc[-1]]])[0, 0]
    if pred > last_known_price:
        print(f"üìä **Recommendation:** Buy a **CALL** option for {ticker} expiring on {target_date}.")
    elif pred < last_known_price:
        print(f"üìâ **Recommendation:** Buy a **PUT** option for {ticker} expiring on {target_date}.")
    else:
        print(f"‚öñÔ∏è No clear direction for {ticker} on {target_date}.")
    # Display feature importance
    plot_feature_importance(model, X)
    # Show top 3 positive news articles
    positive_news = news[news['title_sentiment'] > 0].sort_values(by="Date", ascending=False).head(3)
    print("\nüîç **Top 3 Positive News Articles:**")
    for _, row in positive_news.iterrows():
        print(f"\nüìå **Title:** {row['title']}")
        print(f"üìù **Content:** {row['combined_text'][:200]}...")  # Display first 200 characters
# Run with debug mode

execute('TSLA', '2025-12-26')

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


Fetching news from 2025-01-24 to 2025-01-31...


ValueError: ‚ö†Ô∏è Error: After applying lag features, no rows remain. Reduce `look_back` or check dataset. Available rows: 0