In [5]:
import yfinance as yf
import requests
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import backtrader as bt
import numpy as np

# Step 1: Financial Data Collection
def download_financial_data(ticker, start_date, end_date):
    data = yf.download(ticker, start=start_date, end=end_date)
    return data

# Step 2: Sentiment Data Collection (using dummy sentiment for now)
def get_sentiment_data():
    headlines = [
        "Apple stocks surge after earnings report",
        "New iPhone release boosts Apple shares",
        "Apple faces supply chain challenges",
        "Apple stock price declines amidst market selloff"
    ]
    
    sia = SentimentIntensityAnalyzer()
    sentiment_scores = [sia.polarity_scores(headline)['compound'] for headline in headlines]
    
    # Simulating daily sentiment by creating a DataFrame
    dates = pd.date_range(start='2020-01-01', periods=len(sentiment_scores), freq='D')
    sentiment_data = pd.DataFrame({'Date': dates, 'Sentiment_Score': sentiment_scores})
    sentiment_data.set_index('Date', inplace=True)
    return sentiment_data

# Step 3: Weather Data Collection (using dummy data for now)
def get_weather_data():
    dates = pd.date_range(start='2020-01-01', periods=100, freq='D')
    temperature = np.random.uniform(10, 35, size=(100,))  # Random temperatures
    weather_data = pd.DataFrame({'Date': dates, 'Temperature': temperature})
    weather_data.set_index('Date', inplace=True)
    return weather_data

# Step 4: Data Preprocessing
def preprocess_data(financial_data, sentiment_data, weather_data):
    # Merge all data on Date
    data = financial_data.merge(sentiment_data, left_index=True, right_index=True, how='left')
    data = data.merge(weather_data, left_index=True, right_index=True, how='left')
    
    # Fill missing values (due to different time frames)
    data.fillna(method='ffill', inplace=True)
    
    # Feature Engineering
    data['Close_Lag1'] = data['Close'].shift(1)
    data['Sentiment_Lag1'] = data['Sentiment_Score'].shift(1)
    data['MA_50'] = data['Close'].rolling(window=50).mean()
    
    # Drop rows with NaN values caused by shifting and rolling
    data.dropna(inplace=True)
    
    # Target variable: Predicting if tomorrow's price will be higher than today's
    data['Target'] = (data['Close'].shift(-1) > data['Close']).astype(int)
    
    return data

# Step 5: Model Building
def build_and_evaluate_model(data):
    # Features and target
    X = data[['MA_50', 'Volume', 'Sentiment_Lag1', 'Temperature']]
    y = data['Target']
    
    # Split into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Train a Random Forest model
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    
    # Predict and evaluate
    y_pred = model.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))
    
    return model, X_test, y_test, y_pred

# Step 6: Backtesting with Backtrader
class AnomalyStrategy(bt.Strategy):
    def __init__(self, model, X_test):
        self.model = model
        self.X_test = X_test
        self.dataclose = self.datas[0].close

    def next(self):
        current_date = self.datas[0].datetime.date(0)
        if current_date in self.X_test.index:
            prediction = self.model.predict(self.X_test.loc[[current_date]])[0]
            if prediction == 1:
                self.buy()
            else:
                self.sell()

def backtest_strategy(model, data, X_test):
    # Convert data to Backtrader format
    data_bt = bt.feeds.PandasData(dataname=data)

    # Initialize Backtrader
    cerebro = bt.Cerebro()
    cerebro.addstrategy(AnomalyStrategy, model=model, X_test=X_test)
    cerebro.adddata(data_bt)
    cerebro.run()
    cerebro.plot()

# Step 7: Visualization
def plot_results(data, y_test, y_pred):
    plt.figure(figsize=(10,6))
    plt.plot(data.index, data['Close'], label='Actual Price')
    plt.scatter(data.index[y_test.index], y_pred, color='red', label='Predicted Anomaly', marker='x')
    plt.legend()
    plt.title("Market Anomalies Prediction vs Actual Price")
    plt.show()

# Main Function
if __name__ == "__main__":
    # Step 1: Collect financial data
    financial_data = download_financial_data('AAPL', '2020-01-01', '2023-01-01')
    
    # Step 2: Collect sentiment data
    sentiment_data = get_sentiment_data()
    
    # Step 3: Collect weather data
    weather_data = get_weather_data()
    
    # Step 4: Preprocess the data
    data = preprocess_data(financial_data, sentiment_data, weather_data)
    
    # Step 5: Build and evaluate the model
    model, X_test, y_test, y_pred = build_and_evaluate_model(data)
    
    # Step 6: Backtest the strategy
    backtest_strategy(model, data, X_test)
    
    # Step 7: Plot the results
    plot_results(data, y_test, y_pred)

[*********************100%***********************]  1 of 1 completed
  data = financial_data.merge(sentiment_data, left_index=True, right_index=True, how='left')


TypeError: Cannot join tz-naive with tz-aware DatetimeIndex

In [4]:
import nltk

# Download the VADER lexicon
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/cash/nltk_data...


True