Predicting Stock Market Prices Using HMM

# DATA GATHERING AND CLEANING 

In [1]:
import pandas as pd
import numpy 
from datetime import datetime

def get_stock_data(file_name):
    """scrapes and cleans the data from the given file and creates a dataframe
    
    Args:
        file_name (string) : name of file
    
    Returns:
        df_stock (dataframe) : dataframe containing stock info scraped from file
    """
    df_stock = pd.DataFrame()
    file = open(file_name)
    txt = file.read()
    file.close()
    
    # split text into list, split by new line character
    txt = txt.split('\n')
    # get column titles
    columns = txt[0].split(',')

    for line in txt[1:]:
        temp_dict = dict()
        line = line.strip()
        line_list = line.split(',')

        # if row does not have sufficient column information, pass over
        if len(columns) != len(line_list):
            continue

        # add column's corresponding values to a temporary dictionary   
        for idx in range(len(columns)):
            column_name = columns[idx]
            
            # change all date column info to datetime object
            if column_name == 'Date':
                temp_dict[column_name] = datetime.strptime(line_list[idx], '%Y-%m-%d')
            else:
                temp_dict[column_name] = line_list[idx]

        # append dictionary to dataframe                                                  
        df_stock = df_stock.append(temp_dict, ignore_index=True)
    
    return df_stock

In [2]:
file_name = 'a.us.txt'
df_a_stock = get_stock_data(file_name)
df_a_stock

Unnamed: 0,Close,Date,High,Low,Open,OpenInt,Volume
0,29.702,1999-11-18,33.754,27.002,30.713,0,66277506
1,27.257,1999-11-19,29.027,26.872,28.986,0,16142920
2,29.702,1999-11-22,29.702,27.044,27.886,0,6970266
3,27.002,1999-11-23,29.446,27.002,28.688,0,6332082
4,27.717,1999-11-24,28.309,27.002,27.083,0,5132147
...,...,...,...,...,...,...,...
4516,68.22,2017-11-06,68.45,68.22,68.22,0,995731
4517,68.25,2017-11-07,68.64,68.04,68.32,0,966466
4518,68.11,2017-11-08,68.33,67.771,68.1,0,972616
4519,67.47,2017-11-09,67.98,66.91,67.92,0,1673083


In [3]:
some_stock_files = ['a.us.txt', 'abc.us.txt', 'aktx.us.txt', 'blue.us.txt', 'bro.us.txt', 'by.us.txt',
                    'casi.us.txt', 'cbu.us.txt', 'cxdc.us.txt', 'dhr.us.txt', 'dxyn.us.txt', 'ebay.us.txt',
                    'eei.us.txt', 'eod.us.txt', 'fox.us.txt', 'ftrpr.us.txt', 'fwonk.us.txt']

In [4]:
def get_emission(stock_dataframe):
    """ Calculates the one day difference between stock closing value (today - yesterday)
        and determines emission symbol based on if stock price increased or decreased from previous day
    
    Args:
        stock_dataframe (dataframe) : dataframe containing stock info(close value, date, high, low, open, etc.)
    
    Returns:
        one_day_dif_df(dataframe) : dataframe containing the difference from the previous day's stock value
                                    as well as the related emission symbol (Increasing or Decreasing)
    """
    
    # Subset the initial DF to obtain only relevant columns
    one_day_dif_df = stock_dataframe.copy()
    one_day_dif_df = one_day_dif_df[['Date','Close']]
    
    # Convert CV to numeric for calculations
    one_day_dif_df['Close'] = pd.to_numeric(one_day_dif_df['Close'])
    one_day_dif_df['Yesterday Close'] = one_day_dif_df['Close'].shift()
    
    # Calculate the stock's closing price difference from the previous day
    one_day_dif_df['Close Value Difference'] = round((one_day_dif_df['Close'] - one_day_dif_df['Yesterday Close']),2)
    
    one_day_dif_df['Emission'] = 'NaN'
    row_indexes_inc = one_day_dif_df[one_day_dif_df['Close Value Difference']>=0].index
    row_indexes_dec = one_day_dif_df[one_day_dif_df['Close Value Difference']<0].index
    
    one_day_dif_df.loc[row_indexes_inc,'Emission']='Increasing'
    one_day_dif_df.loc[row_indexes_dec,'Emission']='Decreasing'
    #one_day_dif_df['Emission'] = ['Increasing' if x > 0 else 'Decreasing' for x in one_day_dif_df['Close Value Difference']]
    
    return one_day_dif_df

In [5]:
get_emission(df_a_stock)

Unnamed: 0,Date,Close,Yesterday Close,Close Value Difference,Emission
0,1999-11-18,29.702,,,
1,1999-11-19,27.257,29.702,-2.45,Decreasing
2,1999-11-22,29.702,27.257,2.45,Increasing
3,1999-11-23,27.002,29.702,-2.70,Decreasing
4,1999-11-24,27.717,27.002,0.71,Increasing
...,...,...,...,...,...
4516,2017-11-06,68.220,68.370,-0.15,Decreasing
4517,2017-11-07,68.250,68.220,0.03,Increasing
4518,2017-11-08,68.110,68.250,-0.14,Decreasing
4519,2017-11-09,67.470,68.110,-0.64,Decreasing


# EMP AND TMP MATRIX INITIALIZATION

In [6]:
def create_TPM(n):
    """creates transition probability matrix and initializes to equal random probabilities
    
    Args:
        n (int) : number of possible states
        
    Returns:
        tpm (array of arrays) : n by n transition probability matrix
                                    s1 s2 s3 s4 s5 s6
                                s1
                                s2
                                s3
                                s4
                                s5
                                s6
    """
    tpm = []
    
    for idx in range(n):
        rand_prob = round(1 / n, 2)
        row = []
        
        for idx in range(n):
            row.append(rand_prob)
            
        tpm.append(row)
    return tpm

In [7]:
create_TPM(6)

[[0.17, 0.17, 0.17, 0.17, 0.17, 0.17],
 [0.17, 0.17, 0.17, 0.17, 0.17, 0.17],
 [0.17, 0.17, 0.17, 0.17, 0.17, 0.17],
 [0.17, 0.17, 0.17, 0.17, 0.17, 0.17],
 [0.17, 0.17, 0.17, 0.17, 0.17, 0.17],
 [0.17, 0.17, 0.17, 0.17, 0.17, 0.17]]

In [8]:
def create_EPM(n, m):
    """creates emissions probability matrix and initializes to equal random probabilities
    
    Args:
        n (int) : number of possible states
        m (int) : number of possible observation symbols
        
    Returns:
        epm (array of arrays) : n by m emission probability matrix
                                    I  D
                                s1
                                s2
                                s3
                                s4
                                s5
                                s6
    """
    
    epm = []
    
    for idx in range(n):
        rand_prob = round(1 / m, 2)
        row = []
        
        for idx in range(m):
            row.append(rand_prob)
            
        epm.append(row)
        
    return epm

In [9]:
create_EPM(6, 2)

[[0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5], [0.5, 0.5]]

# FORWARD BACKWARD ALGORITHM

In [10]:
"""
Forward Backward Algorithm: the probability of an observation sequence occurring given the model. 

Forward: probability that we are in a certain state, given we observed a certain sequence of historical observations
Backward: probability that we will see a certain sequence of future observations, given we are in a certain state

"""
def forward(tpm, epm, init, observations):
    """probability that we are in a certain state, given we observed a certain sequence of historical observations
    
    Args:
        tpm (array of arrays) : transition probability matrix
        epm (array of arrays) : emission probability matrix
        pi (array) : initial distribution (probability of being in each state at the start)
        observations (array) : array of emission symbols observed
    
    Returns:
        frwd (int) : probability of being in a certain state
    """
    
    
    """
    Initialization visualization: 
        S1 S2 S3 S4 S5 S6
    T1  .2 .1 .1 .2 .1 .1  (PI)
        * EPM - probability of emitting whatever our first observation is 
    
    """ 
    
    #Create probability matrix forward[N,T] - initialize with 0s
    NUM_STATES = 6
    NUM_OBSERVATIONS = len(observations)
    OBSERVATIONS = observations["Emissions"]
    alpha = []
    
    for N in range(NUM_STATES):
        row = []
        
        for T in range(NUM_OBSERVATIONS):
            if OBSERVATIONS[T] == "Increasing":
                row.append(pi[N] * EPM[N][0])
            if OBSERVATIONS[T] == "Decreasing":
                row.append(pi[N] * EPM[N][1])
            
        tpm.append(row)
    return tpm
        
    
    
    # initialization step
    for each state s from 1 to N do:
        forward[s,1] = pi * epm at observation 1
    
    # recursion
     for reach time step t from 2 to # do
        for each state s from 1 to N do:
                        # prob of getting to each previous state through all possible paths 
                        # TIMES the transition probability from previous S to current S
                        # TIMES the emission probability of emitting O at current S
           forward[s,t] = sum of forward[s`, t-1] * tpm[]
    
                                         
                                         
                                         
    
    b = # emissions probability at time t
    
    prob_sum = 0 
    hidden_states = [s1, s2, s3, s4, s5, s6]
    
    for state in hidden_states:
        ai = # probability of transitioning from state to current state
        alpha = forward(tpm, epm, init) # for t - 1
        
        prob_sum += ai * alpha
    
    return b * prob_sum

def backward(tpm, epm, pi, observations):
    """probability that we will see a certain sequence of future observations, given we are in a certain state
    
    Args:
        tpm (array of arrays) : transition probability matrix
        epm (array of arrays) : emission probability matrix
        init () : initial distribution (probability of being in each state at the start)
        observations (array) : array of emission symbols observed
    
    Returns:
        backwd (int) : probability of being in a certain state
    """
    prob_sum = 0
    hidden_states = [s1, s2, s3, s4, s5, s6]
    
    for state in hidden_states:
        beta = backward(tpm, epm, pi, observations) # for t + 1
        b = # emissions probability at time t + 1
        a_ij = # transition probability from current state to hidden state
        
        prob_sum += beta * b * a_ij
    
    return prob_sum

def frwd_backwd(tpm, epm, init, observations):
    """the probability of an observation sequence occurring given the model
    
    Args:
        tpm (array of arrays) : transition probability matrix
        epm (array of arrays) : emission probability matrix
        init () : initial distribution (probability of being in each state at the start)
        observations (array) : array of emission symbols observed
    
    Returns:
        frwd_backwd (int) : probability of an observation sequence occurring
    """
    forward = forward(tpm, epm, pi, observations)
    backward = backward(tpm, epm, pi, observations)
    
    return forward * backward

SyntaxError: invalid syntax (<ipython-input-10-dca93f636dcf>, line 20)