# Setup

In [1]:
%matplotlib notebook
import sys
assert sys.version_info >= (3, 5)

import sklearn
assert sklearn.__version__ >= "0.20"

import numpy as np
import pandas as pd
import seaborn as sn
import os

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification, make_blobs
from matplotlib.colors import ListedColormap

%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

PROJECT_ROOT_DIR = "."

CHAPTER_ID = 'Naive Bayesian'
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)

os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

# Datasets

In [2]:
path = '../processeddata/individual_stocks_5yr/individual_stocks_5yr/'
ticker = 'T' # Change Stock to analyze

p = path + ticker + '_data.csv' 
df = pd.read_csv(p)
df.set_index(['date'],inplace=True)

df

Unnamed: 0_level_0,open,high,low,close,volume,Name
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2013-02-08,35.39,35.525,35.160,35.27,18275496,T
2013-02-11,35.26,35.315,35.060,35.23,13687717,T
2013-02-12,35.25,35.670,35.230,35.60,16958584,T
2013-02-13,35.67,35.810,35.340,35.42,17410587,T
2013-02-14,35.42,35.440,35.030,35.29,26205412,T
...,...,...,...,...,...,...
2018-02-01,38.51,39.290,38.401,39.16,59963826,T
2018-02-02,39.03,39.040,37.810,38.07,46098657,T
2018-02-05,37.81,38.310,36.630,36.63,45595537,T
2018-02-06,36.08,37.070,35.500,36.83,62905300,T


In [3]:
df['Movement'] = 'Up'

prevPrice = 0.0
for index, row in df.iterrows():
    if row['close'] < prevPrice:
        df.loc[index, 'Movement'] = 'Down'
    prevPrice = row['close']

df['Follow'] = 'Yes'

prevMove = 'Yes'
for index, row in df.iterrows():
    if row['Movement'] != prevMove:
        df.loc[index, 'Follow'] = 'No'
    prevMove = row['Movement']
    
df['%Change'] = 0

prevClose = df['close'][0]
for index, row in df.iterrows():
    diff = row['close'] - prevClose
    perc = (diff / prevClose) * 100
    df.loc[index, '%Change'] = perc
    prevClose = row['close']
    

print(df.head())
print(df.tail())

             open    high    low  close    volume Name Movement Follow  \
date                                                                     
2013-02-08  35.39  35.525  35.16  35.27  18275496    T       Up     No   
2013-02-11  35.26  35.315  35.06  35.23  13687717    T     Down     No   
2013-02-12  35.25  35.670  35.23  35.60  16958584    T       Up     No   
2013-02-13  35.67  35.810  35.34  35.42  17410587    T     Down     No   
2013-02-14  35.42  35.440  35.03  35.29  26205412    T     Down    Yes   

             %Change  
date                  
2013-02-08  0.000000  
2013-02-11 -0.113411  
2013-02-12  1.050241  
2013-02-13 -0.505618  
2013-02-14 -0.367024  
             open   high     low  close    volume Name Movement Follow  \
date                                                                     
2018-02-01  38.51  39.29  38.401  39.16  59963826    T       Up    Yes   
2018-02-02  39.03  39.04  37.810  38.07  46098657    T     Down     No   
2018-02-05  37.81  38.31

In [4]:
X, y = df[['%Change',"Follow"]], df[['Movement']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

In [5]:

for i in range(0, len(X_train)):
    if X_train['Follow'][i] == 'Yes':
        X_train['Follow'][i] = 1
    else:
        X_train['Follow'][i] = 0
    if y_train['Movement'][i] == 'Up':
        y_train['Movement'][i] = 1
    else:
        y_train['Movement'][i] = 0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exec(code_obj, self.user_global_ns, self.user_ns)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [18]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB


gnb = GaussianNB()
arr = []
for index, row in y_train.iterrows():
    arr.append(row['Movement'])

gnb.fit(X_train.to_numpy(), arr)        
y_pred = gnb.predict(X_test)



print(y_pred)



[1 0 0 0 0 1 0 0 1 0 1 0 1 0 0 0 1 0 1 1 1 1 1 1 1 0 1 0 0 1 1 0 1 1 1 0 0
 1 1 1 1 0 0 0 0 1 1 1 1 1 0 1 1 0 0 1 0 0 1 1 0 1 0 1 1 1 1 1 0 1 1 1 1 1
 1 1 1 1 0 0 1 0 1 0 1 1 0 1 0 0 0 1 1 1 1 1 1 0 0 1 0 0 1 1 0 0 1 1 1 0 0
 1 0 0 1 1 0 0 1 1 0 1 1 0 1 0 0 1 0 0 1 1 1 0 1 1 1 0 1 1 1 1 0 1 0 0 0 0
 1 0 1 1 1 1 1 1 0 1 0 0 0 1 0 1 1 1 1 0 0 0 1 0 0 0 1 1 0 1 0 0 1 0 1 0 0
 1 0 0 0 0 0 1 0 0 1 1 1 0 1 0 1 1 0 0 1 1 0 0 0 1 0 0 1 0 1 1 1 1 0 0 1 0
 0 1 1 0 1 0 1 1 1 1 0 0 1 1 1 1 1 0 1 1 1 0 0 0 1 1 1 1 1 0 0 1 1 1 1 1 1
 1 1 0 0 1 0 1 0 1 1 1 1 1 0 1 1 0 1 1 0 1 1 1 0 1 1 0 0 0 1 1 1 1 0 0 0 0
 1 1 1 1 0 1 1 1 1 1 0 0 1 1 1 1 0 1 1 1 1 0 1 1 1 1 1 0 0 1 1 0 1 1 0 1 0
 1 1 1 1 0 1 0 0 1 1 1 0 1 1 0 0 0 1 1 0 0 1 0 0 1 0 1 0 1 1 0 0 0 1 1 1 0
 0 1 1 0 1 1 1 1]


In [19]:
# Show confusion matrix
def plot_confusion_matrix(confusion_mat, cln):
    plt.imshow(confusion_mat, interpolation='nearest', cmap=plt.cm.gray)
    plt.title('Confusion matrix')
    plt.colorbar()
    tick_marks = np.arange(cln)
    plt.xticks(tick_marks, tick_marks)
    plt.yticks(tick_marks, tick_marks)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()

In [29]:
print(y_test)
for index, row in y_test.iterrows():
    if row['Movement'] == 'Up':
        y_test.loc[index, 'Movement'] = 1
    else:
        y_test.loc[index, 'Movement'] = 0
        
print(y_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
plot_confusion_matrix(cm, 2)

# Print classification report
result_metrics = classification_report(y_test, y_pred)
print(result_metrics)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


           Movement
date               
2017-11-29        0
2015-04-16        0
2015-07-17        0
2016-10-20        0
2014-12-30        0
...             ...
2016-09-29        0
2016-05-10        0
2013-08-07        0
2016-08-02        0
2017-06-30        0

[378 rows x 1 columns]


ValueError: Classification metrics can't handle a mix of unknown and binary targets

In [23]:
# BY HAND

In [24]:
data = {'Movement': ['Down', 'Up'], 'Yes': [0, 0], 'No': [0, 0]}
freq_table = pd.DataFrame(data, columns=['Movement','Yes','No'])
index = 0
for x in X_train:
    if x == 'Down' and y_train[index] == 'No':
        freq_table['No'][0] += 1
    elif x == 'Down' and y_train[index] == 'Yes':
        freq_table['Yes'][0] += 1
    elif x == 'Up' and y_train[index] == 'No':
        freq_table['No'][1] += 1
    else:
        freq_table['Yes'][1] += 1
    index += 1
    
display(freq_table)

yes = sum(freq_table['Yes'])
no = sum(freq_table['No'])

print("Y\'s: ", yes)
print("N\'s: ", no)

downs = freq_table['Yes'][0] + freq_table['No'][0]
ups = freq_table['Yes'][1] + freq_table['No'][1]

print("Downs: ", downs)
print("Ups: ", ups)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


Unnamed: 0,Movement,Yes,No
0,Down,0,0
1,Up,2,0


Y's:  2
N's:  0
Downs:  0
Ups:  2


In [10]:
import random

predictions = []
for x in X_test:
    r = random.random()
    if x == "Down":
        if r < freq_table['Yes'][0] / (freq_table['Yes'][0] + freq_table['No'][0]):
            predictions.append('Yes')
        else:
            predictions.append('No')
    else:
        if r < freq_table['Yes'][1] / (freq_table['Yes'][1] + freq_table['No'][1]):
            predictions.append('Yes')
        else:
            predictions.append('No')

correct = 0
wrong = 0
index = 0
for p in y_test:
    if p == predictions[index]:
        correct += 1
    else:
        wrong += 1
    index += 1

acc = correct / (correct + wrong) 

print("Correct: ", correct)
print("Wrong: ", wrong)
print("Accuracy: ", acc)

Correct:  0
Wrong:  1
Accuracy:  0.0


In [11]:
def plot_confusion_matrix(confusion_mat, cln):
    plt.imshow(confusion_mat, interpolation='nearest', cmap=plt.cm.gray)
    plt.title('Confusion matrix')
    plt.colorbar()
    tick_marks = np.arange(cln)
    plt.xticks(tick_marks, tick_marks)
    plt.yticks(tick_marks, tick_marks)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()

In [12]:
y_true = []
y_pred = []

for x in y_test:
    if x == 'Yes':
        y_true.append(1)
    else:
        y_true.append(0)

for x in predictions:
    if x == 'Yes':
        y_pred.append(1)
    else:
        y_pred.append(0)

confusion_mat = confusion_matrix(y_true, y_pred)

print(confusion_mat)
plot_confusion_matrix(confusion_mat, 2)

ValueError: Found input variables with inconsistent numbers of samples: [1, 2]