In [8]:
# Initial imports
import os
import requests
import json
import pandas as pd
from dotenv import load_dotenv
from pathlib import Path
import numpy as np
import hvplot.pandas
import matplotlib.pyplot as plt
%matplotlib inline

In [9]:
#Set path and read BAC daily trading df.
ge_csvpath = Path("./Resources/GE.csv")
ge_close = pd.read_csv(
    ge_csvpath, 
    index_col = 'Date',
    parse_dates = True,
    infer_datetime_format = True
)
ge_close

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-04-02,12.932692,13.019231,12.461538,12.615385,12.248620,74970480
2018-04-03,12.615385,12.634615,12.307692,12.625000,12.257956,62176504
2018-04-04,12.326923,12.788462,12.298077,12.769231,12.397993,59459296
2018-04-05,12.778846,13.038462,12.663462,12.913462,12.538031,55667248
2018-04-06,12.807692,12.951923,12.432692,12.557692,12.192603,66004536
...,...,...,...,...,...,...
2021-03-24,12.760000,13.020000,12.500000,12.500000,12.500000,62073500
2021-03-25,12.400000,12.930000,12.230000,12.850000,12.850000,73391900
2021-03-26,12.950000,13.130000,12.710000,12.990000,12.990000,60452800
2021-03-29,12.920000,13.050000,12.780000,12.950000,12.950000,53762900


In [10]:
#Drop columns except close and volume
ge_close.drop(['Open', 'High', 'Low', 'Adj Close'], axis=1, inplace=True)
ge_close.tail()

Unnamed: 0_level_0,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-03-24,12.5,62073500
2021-03-25,12.85,73391900
2021-03-26,12.99,60452800
2021-03-29,12.95,53762900
2021-03-30,13.3,52925100


In [11]:
#API Call
headers = {
    'accept': 'application/json',
    'Authorization': 'Token 168ac68d07d82b3c04e4fcc9fb1db90bdcf380c5',
}
r = requests.get('https://socialsentiment.io/api/v1/stocks/GE/sentiment/daily/?to_date=2021-03-31&from_date=2020-04-19', headers = headers)
x = r.json()

# Convert to DataFrame
ge_activity_df = pd.read_json(json.dumps(x))
ge_activity_df = ge_activity_df.set_index('date')
ge_activity_df = ge_activity_df.drop(columns = ['stock', 'positive_score', 'negative_score', 'avg_7_days', 'avg_14_days', 'avg_30_days'])
ge_activity_df

Unnamed: 0_level_0,score,activity
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-04-19,-3,10
2020-04-20,22,51
2020-04-21,-3,16
2020-04-22,1,17
2020-04-23,-2,38
...,...,...
2021-03-27,23,109
2021-03-28,12,28
2021-03-29,19,249
2021-03-30,11,195


In [12]:
ge_df = pd.concat([ge_close, ge_activity_df],axis=1).dropna()

# Display sample data
ge_df.head(-5)

Unnamed: 0,Close,Volume,score,activity
2020-04-20,6.51,99691600.0,22.0,51.0
2020-04-21,6.48,77453100.0,-3.0,16.0
2020-04-22,6.43,76844100.0,1.0,17.0
2020-04-23,6.52,82796100.0,-2.0,38.0
2020-04-24,6.26,155976800.0,8.0,28.0
...,...,...,...,...
2021-03-17,13.61,78407900.0,16.0,380.0
2021-03-18,13.25,63668400.0,12.0,214.0
2021-03-19,13.22,68380000.0,27.0,243.0
2021-03-22,13.13,52789100.0,-22.0,647.0


In [13]:
ge_price_line = ge_df["Close"].hvplot(
    ylabel='Price',
    width=500,
    height=200
)
ge_price_line

In [14]:
ge_score_bar = ge_df["score"].hvplot.bar(
    ylabel='Social Media Sentiment Score',
    width=500,
    height=200)
ge_score_bar


In [15]:
# Set the variables for short window and long window periods
short_window = 7
long_window = 14

# Generate the short and long window simple moving averages (by 7 and 14 days, respectively)
ge_df["Avg_Activity_1Week"] = ge_df["activity"].rolling(window=short_window).mean()
ge_df["Avg_Activity_2Week"] = ge_df["activity"].rolling(window=long_window).mean()

# Create a column to hold the trading signal
ge_df["Signal"] = 0.0

# Generate the trading signal 0 or 1,
# where 1 is the short-window (SMA7) greater than the long-window (SMA14)
# and 0 is when the condition is not met
ge_df["Signal"][short_window:] = np.where(
    ge_df["Avg_Activity_1Week"][short_window:] > ge_df["Avg_Activity_2Week"][short_window:], 1.0, 0.0
)

# Calculate the points in time when the Signal value changes
# Identify trade entry (1) and exit (-1) points
ge_df["Entry/Exit"] = bac_df["Signal"].diff()

# Review the DataFrame
ge_df.tail()

NameError: name 'bac_df' is not defined

In [None]:
plot = ge_df[["activity", "Avg_Activity_1Week", "Avg_Activity_2Week"]]
plot.hvplot()

In [None]:
# Visualize exit position relative to close price
exit = ge_df[ge_df['Entry/Exit'] == -1.0]['activity'].hvplot.scatter(
    color= 'purple',
    marker = 'v',
    legend=False,
    width=1000,
    height=400)

# Visualize entry position relative to close price
entry = ge_df[ge_df['Entry/Exit'] == 1.0]['activity'].hvplot.scatter(
    color='green',
    marker = '^',
    legend=False,
    width=1000,
    height=400)

# Visualize the close price for the investment
ge_close = ge_df[['activity']].hvplot(
    line_color='lightgray',
    ylabel='Activity',
    width=1000,
    height=400
)

# Visualize moving averages
moving_avgs = ge_df[["Avg_Activity_1Week", "Avg_Activity_2Week"]].hvplot(
    width=1000,
    height=400)

# Overlay the plots
entry_exit_plot = ge_close * moving_avgs * entry * exit
entry_exit_plot.opts(
    title="GE - Social Media Sentiment Trading Algorithm"
)

In [None]:
# Plot closing prices and social media activity for one year timeframe
ge_close_plot = ge_df['Close'].hvplot(
    title = 'BAC Closing Prices',
    ylabel = 'Price in $')

ge_activity_plot = ge_activity_df['activity'].hvplot(
    title = 'BAC Activity',)

ge_close_plot + ge_activity_plot

### Backtest the Trading Strategy

In [None]:
# Set the initial capital
initial_capital = float(100000)

# Set the share size
share_size = ge_df["score"]

# Take shares in amount of score for position where the dual moving average crossover is 1 (SMA7 is greater than SMA30)
ge_df["Shares"] = abs(share_size) * ge_df["Entry/Exit"]

# Multiply share price by entry/exit positions and get the cumulatively sum
ge_df["Portfolio Holdings"] = (
    ge_df["Close"] * ge_df["Shares"].cumsum()
)

# Subtract the initial capital by the portfolio holdings to get the amount of liquid cash in the portfolio
ge_df["Portfolio Cash"] = (
    initial_capital - (ge_df["Close"] * ge_df["Shares"]).cumsum()
)

# Get the total portfolio value by adding the cash amount by the portfolio holdings (or investments)
ge_df["Portfolio Total"] = (
    ge_df["Portfolio Cash"] + ge_df["Portfolio Holdings"]
)

# Calculate the portfolio daily returns
ge_df["Portfolio Daily Returns"] = ge_df["Portfolio Total"].pct_change()

# Calculate the cumulative returns
ge_df["Portfolio Cumulative Returns"] = (
    1 + ge_df["Portfolio Daily Returns"]
).cumprod() - 1

# Print the DataFrame
ge_df.head()

In [None]:
# Visualize exit positions relative to total portfolio value
entry = ge_df[ge_df["Entry/Exit"] == 1.0]["Portfolio Total"].hvplot.scatter(
    color='purple',
    marker='^',
    legend=False, 
    ylabel="Total Portfolio Value", 
    width=1000, 
    height=400
)

# Visualize entry positions relative to total portfolio value
exit = ge_df[ge_df["Entry/Exit"] == -1.0]["Portfolio Total"].hvplot.scatter(
    color='yellow',
    marker='v',
    legend=False, 
    ylabel="Total Portfolio Value", 
    width=1000, 
    height=400
)

# Visualize the total portoflio value for the investment
total_portfolio_value = ge_df[['Portfolio Total']].hvplot(
    line_color='lightgray',
    ylabel='Total Portfolio Value',
    width=1000,
    height=400
)

# Overlay the plots
portfolio_entry_exit_plot = total_portfolio_value * entry * exit
portfolio_entry_exit_plot.opts(
    title="Social Sentiment Trading Algorithm - GE Total Portfolio Value",
    yformatter='%.0f'
)

In [None]:
# Create the list of the metric names
metrics = [
    'Annualized Return',
    'Cumulative Returns',
    'Annual Volatility',
    'Sharpe Ratio',
]

# Create a list that holds the column name
columns = ['Backtest']

# Initialize the DataFrame with index set to evaluation metrics and columns 
portfolio_evaluation_df = pd.DataFrame(index=metrics, columns=columns)

# Calculate the Annualized return metric
portfolio_evaluation_df.loc['Annualized Return'] = (
    ge_df['Portfolio Daily Returns'].mean() * 252
)

# Calculate the Cumulative returns metric
portfolio_evaluation_df.loc['Cumulative Returns'] = ge_df['Portfolio Cumulative Returns'][-1]

# Calculate the Annual volatility metric
portfolio_evaluation_df.loc['Annual Volatility'] = (
    ge_df['Portfolio Daily Returns'].std() * np.sqrt(252)
)

# Calculate the Sharpe ratio
portfolio_evaluation_df.loc['Sharpe Ratio'] = (
    ge_df['Portfolio Daily Returns'].mean() * 252) / (
    ge_df['Portfolio Daily Returns'].std() * np.sqrt(252)
)

# Review the portfolio evaluation DataFrame
portfolio_evaluation_df

In [None]:
# Initialize the trade evaluation DataFrame
trade_evaluation_df = pd.DataFrame(
    columns=[
        'Stock', 
        'Entry Date', 
        'Exit Date', 
        'Shares', 
        'Entry Share Price', 
        'Exit Share Price', 
        'Entry Portfolio Holding', 
        'Exit Portfolio Holding', 
        'Profit/Loss']
)

# Initialize the iterative variables
entry_date = ""
exit_date = ""
entry_portfolio_holding = 0.0
exit_portfolio_holding = 0.0
share_size = 0
entry_share_price = 0.0
exit_share_price = 0.0

# Loop through the signal DataFrame
# If `Entry/Exit` is 1, set entry trade metrics
# Else if `Entry/Exit` is -1, set exit trade metrics and calculate profit,
# Then append the record to the trade evaluation DataFrame
for index, row in ge_df.iterrows():
    if row['Entry/Exit'] == 1:
        entry_date = index
        entry_portfolio_holding = abs(row['Portfolio Holdings'])
        share_size = row['Shares']
        entry_share_price = row['Close']

    elif row['Entry/Exit'] == -1:
        exit_date = index
        exit_portfolio_holding = abs(row['Close'] * row['Shares'])
        exit_share_price = row['Close']
        profit_loss =  entry_portfolio_holding - exit_portfolio_holding
        trade_evaluation_df = trade_evaluation_df.append(
            {
                'Stock': 'BAC',
                'Entry Date': entry_date,
                'Exit Date': exit_date,
                'Shares': share_size,
                'Entry Share Price': entry_share_price,
                'Exit Share Price': exit_share_price,
                'Entry Portfolio Holding': entry_portfolio_holding,
                'Exit Portfolio Holding': exit_portfolio_holding,
                'Profit/Loss': profit_loss
            },
            ignore_index=True)

# Review the DataFrame
trade_evaluation_df

In [None]:
trade_evaluation_df["Profit/Loss"].sum()

In [None]:
##ML models

In [None]:
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from pandas.tseries.offsets import DateOffset
from sklearn.metrics import classification_report

In [16]:
#Split data into training and testing datasets for aapl_df
# Assign a copy of the Close and activity columns to a features DataFrame called X
X = ge_df[['Close', 'activity']].shift().dropna()

# Review the DataFrame
X.head()

Unnamed: 0,Close,activity
2020-04-21,6.51,51.0
2020-04-22,6.48,16.0
2020-04-23,6.43,17.0
2020-04-24,6.52,38.0
2020-04-30,6.26,28.0


In [18]:
# Create the target set selecting the Signal column and assiging it to y
y = ge_df['Signal']

# Review the value counts
y.value_counts()

1.0    117
0.0    116
Name: Signal, dtype: int64

In [19]:
# Select the start of the training period
training_begin = X.index.min()

# Display the training begin date
print(training_begin)

2020-04-21 00:00:00


In [20]:
# Select the ending period for the training data with an offset of 3 months
training_end = X.index.min() + DateOffset(months=3)

# Display the training end date
print(training_end)

2020-07-21 00:00:00


In [21]:
# Generate the X_train and y_train DataFrames
X_train = X.loc[training_begin:training_end] # df.loc[row, column] 
y_train = y.loc[training_begin:training_end]

# Review the X_train DataFrame
X_train.head()


Unnamed: 0,Close,activity
2020-04-21,6.51,51.0
2020-04-22,6.48,16.0
2020-04-23,6.43,17.0
2020-04-24,6.52,38.0
2020-04-30,6.26,28.0


In [22]:
# Generate the X_test and y_test DataFrames
X_test = X.loc[training_end+DateOffset(hours=1):]
y_test = y.loc[training_end+DateOffset(hours=1):]

# Review the X_test DataFrame
X_test.head()

Unnamed: 0,Close,activity
2020-07-22,7.04,301.0
2020-07-23,7.06,37.0
2020-07-24,7.04,50.0
2020-07-27,6.86,49.0
2020-07-28,6.71,71.0


In [23]:
# Scale the features DataFrames

# Create a StandardScaler instance
scaler = StandardScaler()

# Apply the scaler model to fit the X-train data
X_scaler = scaler.fit(X_train)

# Transform the X_train and X_test DataFrames using the X_scaler
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [24]:
# Use the svc classifier model from SKLearn's support vector machine (SVM) learning method to fit the training data and make predictions.

# From SVM, instantiate SVC classifier model instance
svm_model = svm.SVC()
 
# Fit the model to the data using the training data
svm_model = svm_model.fit(X_train_scaled, y_train)

# Use the testing data to make the model predictions
svm_pred = svm_model.predict(X_test_scaled)

# Review the model's predicted values
svm_pred[:10]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [25]:
# Use a classification report to evaluate the model using the predictions and testing data
svm_testing_report = classification_report(y_test, svm_pred)

# Print the classification report
print(svm_testing_report)

              precision    recall  f1-score   support

         0.0       0.44      1.00      0.61        77
         1.0       0.00      0.00      0.00        97

    accuracy                           0.44       174
   macro avg       0.22      0.50      0.31       174
weighted avg       0.20      0.44      0.27       174



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [26]:
#Using Logistic Regression as second machine learning model.

In [27]:
# Import a new classifier from SKLearn
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

# Initiate the model instance
logistic_regression_model = LogisticRegression()
# logistic_regression_model=DecisionTreeClassifier()
# logistic_regression_model=svm.SVC()
# logistic_regression_model=AdaBoostClassifier()

In [28]:
len(X_test_scaled)

174

In [29]:
# Fit the model using the training data
model = logistic_regression_model.fit(X_train_scaled, y_train)

# Use the testing dataset to generate the predictions for the new model
pred = model.predict(X_test_scaled)

# Review the model's predicted values
pred

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0.])

In [30]:
# Use a classification report to evaluate the model using the predictions and testing data
lr_testing_report = classification_report(y_test, pred)

# Print the classification report
print(lr_testing_report)

              precision    recall  f1-score   support

         0.0       0.45      1.00      0.62        77
         1.0       1.00      0.03      0.06        97

    accuracy                           0.46       174
   macro avg       0.73      0.52      0.34       174
weighted avg       0.76      0.46      0.31       174



In [31]:
y_test

2020-07-22    1.0
2020-07-23    1.0
2020-07-24    1.0
2020-07-27    1.0
2020-07-28    1.0
             ... 
2021-03-24    0.0
2021-03-25    0.0
2021-03-26    0.0
2021-03-29    0.0
2021-03-30    0.0
Name: Signal, Length: 174, dtype: float64