In [294]:
#Libraries
import numpy as np 
import pandas as pd 
import yfinance as yf
import matplotlib.pyplot as plt
from datetime import date
import pandas_datareader as data
from pandas_datareader import *
import math
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

#Model
#Models
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

In [295]:
#Scraping Dataset from Yahoo Finance using yfinance
#Getting data for the Apple Stock
aapl = yf.Ticker("AAPL")

# Apple Company Information
# aapl.info

# Apple's Historical Stock Prices (10 Year Period)
aapl.dataset = aapl.history(period="10y")

#YFinance Bug- Dropped columns appears after being dropped so copying into new dataset
dataset = aapl.dataset [['Open', 'Close', 'High', 'Low','Volume']].copy()

# Calculating %K and %D
dataset['14-high'] = dataset['High'].rolling(14).max()
dataset['14-low'] = dataset['Low'].rolling(14).min()
dataset['%K'] = (dataset['Close'] - dataset['14-low'])*100/(dataset['14-high'] - dataset['14-low'])
dataset['%D'] = dataset['%K'].rolling(3).mean()

#Dropping 14-HIGH AND 14-LOW COLUMNS
dataset.drop(['14-high', '14-low'], axis = 1, inplace=True)

def categorise_so(row):  
    if row['%K'] <= 20 and row['%D'] <= 20:
        return 'Buy'
    elif row['%K'] >= 80 and  row['%D']>= 80:
        return 'Sell'
    else:
        return 'Hold'

    
   
dataset['SO Indicator'] = dataset.apply(lambda row: categorise_so(row), axis=1)


delta = dataset['Close'].diff()
up = delta.clip(lower=0)
down = -1*delta.clip(upper=0)
ema_up = up.ewm(com=13, adjust=False).mean()
ema_down = down.ewm(com=13, adjust=False).mean()
rs = ema_up/ema_down
dataset['RSI'] = 100 - (100/(1 + rs))

# Adding Buy/Sell Signals from RSI Indicator
def categorise_rsi(row):  
    if row['RSI'] <= 30:
        return 'Buy'
    elif row['RSI'] >= 70:
        return 'Sell'
    else:
        return 'Hold'
  
dataset['RSI Indicator'] = dataset.apply(lambda row: categorise_rsi(row), axis=1)

def get_sma(prices, rate):
    return prices.rolling(rate).mean()

def get_bollinger_bands(prices, rate=20):
    # SMA for 20 Days (Middle Band)
    sma = get_sma(prices, rate)
    std = prices.rolling(rate).std()

    # Calculating Upper Band
    bollinger_upper = sma + (std * 2 )

    # Calculate Lower Band
    bollinger_lower = sma - (std * 2 )

    #Middle Band
    bollinger_middle = sma
    return bollinger_upper, bollinger_lower, bollinger_middle


closing_prices = dataset['Close']


bollinger_upper, bollinger_lower, bollinger_middle = get_bollinger_bands(closing_prices)

#Adding Bollinger Bands to the Dataset
dataset['Bollinger_Upper'] = bollinger_upper
dataset['Bollinger_Lower'] = bollinger_lower


# Adding Buy/Sell Signals from Bollinger Bands Indicator
def categorise_bollinger(row):  
    if row['Close'] < row['Bollinger_Lower']:
        return 'Buy'
    elif row['Close'] > row['Bollinger_Upper']:
        return 'Sell'
    else:
        return 'Hold'
    
   
dataset['Bollinger Indicator'] = dataset.apply(lambda row: categorise_bollinger(row), axis=1)

# Calculating the MACD Line and the Signal Line
ema12 = dataset['Close'].ewm(span=12, adjust=False).mean()
ema26 = dataset['Close'].ewm(span=26, adjust=False).mean()
macd = ema12 - ema26
signal = macd.ewm(span=9, adjust=False).mean()


#Appeding the MACD and Signal Data to Dataset
dataset['MACD'] = macd
dataset['Signal'] = signal



In [296]:
# Recommender Based on the three indicators
def recommender(row): 
    # If All Share the Same Signal OutPut That Signal (Buy)
    if row['RSI Indicator'] == "Buy" and row['SO Indicator'] == 'Buy' and  row['Bollinger Indicator'] == 'Buy':
        return 'Buy'
     # If Any 2 Indicators Share the Same Signal Output That Signal (buy)
    elif row['RSI Indicator'] == 'Buy' and row['SO Indicator'] == 'Buy':
        return 'Buy'
    elif row['RSI Indicator'] == 'Buy' and row['Bollinger Indicator'] == 'Buy':
        return 'Buy'
    elif row['SO Indicator'] == 'Buy' and row['Bollinger Indicator'] == 'Buy':
        return 'Buy'   
    # If All Share the Same Signal Output That Signal (Sell)
    elif row['RSI Indicator'] == 'Sell' and row['SO Indicator'] == 'Sell' and  row['Bollinger Indicator'] == 'Sell':
        return 'Sell'
    # If Any 2 Indicators Share the Same Signal Output That Signal (Sell)
    elif row['RSI Indicator'] == 'Sell' and row['SO Indicator'] == 'Sell':
        return 'Sell'
    elif row['RSI Indicator'] == 'Sell' and row['Bollinger Indicator'] == 'Sell':
        return 'Sell'
    elif row['SO Indicator'] == 'Sell' and row['Bollinger Indicator'] == 'Sell':
        return 'Sell'    
    # If All Share the Same Signal OutPut That Signal (Hold)
    elif row['RSI Indicator'] == 'Hold' and row['SO Indicator'] == 'Hold' and  row['Bollinger Indicator'] == 'Hold':
        return 'Hold'
    # If Any 2 Indicators Share the Same Signal Output That Signal (Hold)
    elif row['RSI Indicator'] == 'Hold' and row['SO Indicator'] == 'Hold':
        return 'Hold'
    elif row['RSI Indicator'] == 'Hold' and row['Bollinger Indicator'] == 'Hold':
        return 'Hold'
    elif row['SO Indicator'] == 'Hold' and row['Bollinger Indicator'] == 'Hold':
        return 'Hold'      
    else:
        return 'Unclassed'

dataset['Recommender'] = dataset.apply(lambda row: recommender(row), axis=1)

In [297]:
dataset.drop(['RSI Indicator', 'SO Indicator',  'Bollinger Indicator'], axis = 1, inplace=True)

In [298]:
#Information regarding the dataset
dataset.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2518 entries, 2012-04-30 to 2022-04-29
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Open             2518 non-null   float64
 1   Close            2518 non-null   float64
 2   High             2518 non-null   float64
 3   Low              2518 non-null   float64
 4   Volume           2518 non-null   int64  
 5   %K               2505 non-null   float64
 6   %D               2503 non-null   float64
 7   RSI              2517 non-null   float64
 8   Bollinger_Upper  2499 non-null   float64
 9   Bollinger_Lower  2499 non-null   float64
 10  MACD             2518 non-null   float64
 11  Signal           2518 non-null   float64
 12  Recommender      2518 non-null   object 
dtypes: float64(11), int64(1), object(1)
memory usage: 275.4+ KB


In [299]:
#Dropping Null Values
dataset = dataset.dropna()
dataset.isna().sum()

Open               0
Close              0
High               0
Low                0
Volume             0
%K                 0
%D                 0
RSI                0
Bollinger_Upper    0
Bollinger_Lower    0
MACD               0
Signal             0
Recommender        0
dtype: int64

In [300]:
# #Encode Categorical Variables
signal_dict = {'Hold': 0, 'Sell': 1, 'Buy': 2}

dataset['Recommender'] = dataset['Recommender'].map(signal_dict)

dataset.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2499 entries, 2012-05-25 to 2022-04-29
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Open             2499 non-null   float64
 1   Close            2499 non-null   float64
 2   High             2499 non-null   float64
 3   Low              2499 non-null   float64
 4   Volume           2499 non-null   int64  
 5   %K               2499 non-null   float64
 6   %D               2499 non-null   float64
 7   RSI              2499 non-null   float64
 8   Bollinger_Upper  2499 non-null   float64
 9   Bollinger_Lower  2499 non-null   float64
 10  MACD             2499 non-null   float64
 11  Signal           2499 non-null   float64
 12  Recommender      2499 non-null   int64  
dtypes: float64(11), int64(2)
memory usage: 273.3 KB


In [301]:
y=dataset.iloc[:,12:13]
X=dataset.iloc[:,0:12]

In [302]:
X

Unnamed: 0_level_0,Open,Close,High,Low,Volume,%K,%D,RSI,Bollinger_Upper,Bollinger_Lower,MACD,Signal
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2012-05-25,17.265854,17.195517,17.304387,17.078696,328507200,73.840536,79.791649,44.076190,18.186332,16.302006,-0.172952,-0.215631
2012-05-29,17.458816,17.500711,17.553618,17.287868,380508800,92.212800,81.823858,50.208127,18.132156,16.320371,-0.130183,-0.198541
2012-05-30,17.406832,17.711727,17.736804,17.326098,529429600,98.581578,88.211638,53.966478,18.116348,16.327127,-0.078357,-0.174504
2012-05-31,17.759736,17.667688,17.782979,17.475944,491674400,93.644704,94.813027,53.066230,18.068708,16.349539,-0.040374,-0.147678
2012-06-01,17.405605,17.155756,17.512334,17.141384,520987600,65.424760,85.883681,43.898707,17.991827,16.362718,-0.050992,-0.128341
...,...,...,...,...,...,...,...,...,...,...,...,...
2022-04-25,161.119995,162.880005,163.169998,158.460007,96046400,22.278221,14.362295,41.927588,180.981810,159.888189,-1.131611,-0.008625
2022-04-26,162.250000,156.800003,162.339996,156.720001,95623200,0.473104,8.152724,34.919440,181.370947,157.619052,-1.831394,-0.373179
2022-04-27,155.910004,156.570007,159.789993,155.380005,88063200,6.618480,9.789935,34.683271,180.707184,156.043816,-2.377134,-0.773970
2022-04-28,159.250000,163.639999,164.520004,158.929993,130216800,50.365839,19.152474,46.631998,179.335595,156.002403,-2.213630,-1.061902


In [303]:
y

Unnamed: 0_level_0,Recommender
Date,Unnamed: 1_level_1
2012-05-25,0
2012-05-29,0
2012-05-30,0
2012-05-31,0
2012-06-01,0
...,...
2022-04-25,0
2022-04-26,2
2022-04-27,0
2022-04-28,0


In [304]:
#spliting the data into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.3, random_state=42)

In [305]:
print(X_train.shape)
print(y_train.shape)

(1749, 12)
(1749, 1)


In [306]:
print(X_test.shape)
print(y_test.shape)

(750, 12)
(750, 1)


In [307]:
X_train.columns

Index(['Open', 'Close', 'High', 'Low', 'Volume', '%K', '%D', 'RSI',
       'Bollinger_Upper', 'Bollinger_Lower', 'MACD', 'Signal'],
      dtype='object')

In [308]:
X_test.columns

Index(['Open', 'Close', 'High', 'Low', 'Volume', '%K', '%D', 'RSI',
       'Bollinger_Upper', 'Bollinger_Lower', 'MACD', 'Signal'],
      dtype='object')

In [309]:
y_train['Recommender'].value_counts()

0    1406
1     259
2      84
Name: Recommender, dtype: int64

In [310]:
y_test['Recommender'].value_counts()

0    593
1    130
2     27
Name: Recommender, dtype: int64

In [311]:
#Scaling
from sklearn.preprocessing import StandardScaler 
from sklearn.preprocessing import MinMaxScaler
scaler = sscaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [312]:
from imblearn import under_sampling, over_sampling
from imblearn.over_sampling import SMOTE
strategy = {2:1419, 0:450, 1:450}
#we want Buy and Sell rows to be 2500 and Strong Buy and Strong Sell to be 1500
oversample = SMOTE(sampling_strategy=strategy)
Xsmote, ysmote = oversample.fit_resample(X_train, y_train)



In [313]:
print(Xsmote.shape)
print(ysmote.shape)

(4257, 12)
(4257, 1)


In [314]:
ysmote['Recommender'].value_counts()

0    1419
2    1419
1    1419
Name: Recommender, dtype: int64

In [315]:
#Models
from sklearn.linear_model import LogisticRegression
#Model Result Analysis
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, ConfusionMatrixDisplay, precision_score, recall_score, f1_score, classification_report, roc_curve, plot_roc_curve, auc, precision_recall_curve, plot_precision_recall_curve, average_precision_score
from sklearn.model_selection import cross_val_score

#Data Modelling
#Logistic Regresion
lr = LogisticRegression()
lr.fit(Xsmote,ysmote)


#Predictions
y_pred = lr.predict(X_test)



#Precision
test_precision = precision_score(y_test, y_pred,  average='macro')
print("Test Data Precision Score :", test_precision) 
#Recall
test_recall = recall_score(y_test, y_pred,  average='macro')
print("Test Data Recall Score : ", test_recall)
#F1 Score
test_f1 = f1_score(y_test, y_pred,average='macro')
print("Test  Data F1 Score: ", test_f1)

#Storing Results for the Model
list_1 = []
list_2 = []

list_2.append("Logistic Regression")
list_2.append((accuracy_score(y_test, y_pred))*100) 
list_2.append(test_precision)
list_2.append(test_recall)
list_2.append(test_f1)
list_1.append(list_2)
list_2 = []



Test Data Precision Score : 0.7070353463697489
Test Data Recall Score :  0.9081403970667602
Test  Data F1 Score:  0.7708747875009058


  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [316]:
#Data Modelling
#Decision Tree
decision_tree = DecisionTreeClassifier()
decision_tree.fit(Xsmote,ysmote)


#Predictions
y_pred = decision_tree.predict(X_test)


#Confusion Matrix
cm_test = confusion_matrix(y_test, y_pred)  
print("Test Data Confusion Matrix :")
print(cm_test)
#Precision
test_precision = precision_score(y_test, y_pred,  average='macro')
print("Test Data Precision Score :", test_precision) 
#Recall
test_recall = recall_score(y_test, y_pred,  average='macro')
print("Test Data Recall Score : ", test_recall)
#F1 Score
test_f1 = f1_score(y_test, y_pred,average='macro')
print("Test  Data F1 Score: ", test_f1)


list_2.append("Decision Tree")
list_2.append((accuracy_score(y_test, y_pred))*100) 
list_2.append(test_precision)
list_2.append(test_recall)
list_2.append(test_f1)
list_1.append(list_2)
list_2 = []

Test Data Confusion Matrix :
[[558  20  15]
 [ 19 111   0]
 [  8   0  19]]
Test Data Precision Score : 0.7866659758442426
Test Data Recall Score :  0.832842645040509
Test  Data F1 Score:  0.8069646511228136


In [317]:
#Data Modelling
#Random Forest
random_forest = RandomForestClassifier()
random_forest.fit(Xsmote,ysmote)


#Predictions
y_pred = random_forest.predict(X_test)


#Confusion Matrix
cm_test = confusion_matrix(y_test, y_pred)  
print("Test Data Confusion Matrix :")
print(cm_test)
#Precision
test_precision = precision_score(y_test, y_pred,  average='macro')
print("Test Data Precision Score :", test_precision) 
#Recall
test_recall = recall_score(y_test, y_pred,  average='macro')
print("Test Data Recall Score : ", test_recall)
#F1 Score
test_f1 = f1_score(y_test, y_pred,average='macro')
print("Test  Data F1 Score: ", test_f1)


list_2.append("Random Forest")
list_2.append((accuracy_score(y_test, y_pred))*100) 
list_2.append(test_precision)
list_2.append(test_recall)
list_2.append(test_f1)
list_1.append(list_2)
list_2 = []

  random_forest.fit(Xsmote,ysmote)


Test Data Confusion Matrix :
[[567  14  12]
 [ 14 116   0]
 [  4   0  23]]
Test Data Precision Score : 0.8395604395604396
Test Data Recall Score :  0.9001048958328329
Test  Data F1 Score:  0.865630577684907


In [318]:
from sklearn.svm import SVC

#Data Modelling
#Support Vector Machine
svc = SVC()
svc.fit(Xsmote,ysmote)


#Predictions
y_pred = svc.predict(X_test)


#Confusion Matrix
cm_test = confusion_matrix(y_test, y_pred)  
print("Test Data Confusion Matrix :")
print(cm_test)
#Precision
test_precision = precision_score(y_test, y_pred,  average='macro')
print("Test Data Precision Score :", test_precision) 
#Recall
test_recall = recall_score(y_test, y_pred,  average='macro')
print("Test Data Recall Score : ", test_recall)
#F1 Score
test_f1 = f1_score(y_test, y_pred,average='macro')
print("Test  Data F1 Score: ", test_f1)

#Appending Results 
list_2.append("SVM")
list_2.append((accuracy_score(y_test, y_pred))*100) 
list_2.append(test_precision)
list_2.append(test_recall)
list_2.append(test_f1)
list_1.append(list_2)
list_2 = []

  y = column_or_1d(y, warn=True)


Test Data Confusion Matrix :
[[501  58  34]
 [  5 125   0]
 [  1   0  26]]
Test Data Precision Score : 0.7015197076987745
Test Data Recall Score :  0.9231193618489852
Test  Data F1 Score:  0.7691107616876044


In [319]:
#Data Modelling
#K Nearest Neighbours

from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(Xsmote, ysmote)
y_pred = knn.predict(X_test)

#Confusion Matrix
cm_test = confusion_matrix(y_test, y_pred)  
print("Test Data Confusion Matrix :")
print(cm_test)
#Precision
test_precision = precision_score(y_test, y_pred,  average='macro')
print("Test Data Precision Score :", test_precision) 
#Recall
test_recall = recall_score(y_test, y_pred,  average='macro')
print("Test Data Recall Score : ", test_recall)
#F1 Score
test_f1 = f1_score(y_test, y_pred,average='macro')
print("Test  Data F1 Score: ", test_f1)

#Appending Results 
list_2.append("KNN")
list_2.append((accuracy_score(y_test, y_pred))*100) 
list_2.append(test_precision)
list_2.append(test_recall)
list_2.append(test_f1)
list_1.append(list_2)
list_2 = []


Test Data Confusion Matrix :
[[522  40  31]
 [  7 123   0]
 [  2   0  25]]
Test Data Precision Score : 0.7280268819600213
Test Data Recall Score :  0.9174498621941005
Test  Data F1 Score:  0.7902752350053414


  return self._fit(X, y)


In [320]:
#Data Modelling
#Niave Bayes
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(Xsmote, ysmote)
y_pred = gnb.predict(X_test)


#Confusion Matrix
cm_test = confusion_matrix(y_test, y_pred)  
print("Test Data Confusion Matrix :")
print(cm_test)
#Precision
test_precision = precision_score(y_test, y_pred,  average='macro')
print("Test Data Precision Score :", test_precision) 
#Recall
test_recall = recall_score(y_test, y_pred,  average='macro')
print("Test Data Recall Score : ", test_recall)
#F1 Score
test_f1 = f1_score(y_test, y_pred,average='macro')
print("Test  Data F1 Score: ", test_f1)

#Appending Results 
list_2.append("Niave Byaes")
list_2.append((accuracy_score(y_test, y_pred))*100) 
list_2.append(test_precision)
list_2.append(test_recall)
list_2.append(test_f1)
list_1.append(list_2)
list_2 = []

Test Data Confusion Matrix :
[[455  87  51]
 [  5 125   0]
 [  2   0  25]]
Test Data Precision Score : 0.6344728315929905
Test Data Recall Score :  0.8849164596775614
Test  Data F1 Score:  0.6929967623189427


  y = column_or_1d(y, warn=True)


In [321]:
#Data Modelling 
#Neural Network
mlp = MLPClassifier()
mlp.fit(Xsmote,ysmote)
y_pred = mlp.predict(X_test)


#Confusion Matrix
cm_test = confusion_matrix(y_test, y_pred)  
print("Test Data Confusion Matrix :")
print(cm_test)
#Precision
test_precision = precision_score(y_test, y_pred,  average='macro')
print("Test Data Precision Score :", test_precision) 
#Recall
test_recall = recall_score(y_test, y_pred,  average='macro')
print("Test Data Recall Score : ", test_recall)
#F1 Score
test_f1 = f1_score(y_test, y_pred,average='macro')
print("Test  Data F1 Score: ", test_f1)

#Appending Results 
list_2.append("NLP")
list_2.append((accuracy_score(y_test, y_pred))*100) 
list_2.append(test_precision)
list_2.append(test_recall)
list_2.append(test_f1)
list_1.append(list_2)
list_2 = []



  y = column_or_1d(y, warn=True)


Test Data Confusion Matrix :
[[543  25  25]
 [ 11 119   0]
 [  2   0  25]]
Test Data Precision Score : 0.7676691979749534
Test Data Recall Score :  0.9189978364233564
Test  Data F1 Score:  0.8210445002768382




In [322]:
#Displaying all the results from the model
model_results = pd.DataFrame(list_1, columns= ['Model', 'Accuracy', 'Precision', 'Recall', 'F1'])
model_results.sort_values(by= ['F1'], inplace= True, ascending= False)
model_results


Unnamed: 0,Model,Accuracy,Precision,Recall,F1
2,Random Forest,94.133333,0.83956,0.900105,0.865631
6,NLP,91.6,0.767669,0.918998,0.821045
1,Decision Tree,91.733333,0.786666,0.832843,0.806965
4,KNN,89.333333,0.728027,0.91745,0.790275
0,Logistic Regression,87.6,0.707035,0.90814,0.770875
3,SVM,86.933333,0.70152,0.923119,0.769111
5,Niave Byaes,80.666667,0.634473,0.884916,0.692997
