# Ch4. Algorithm Trading

## predictor.py

In [2]:
# -*- coding: utf-8 -*-
from __future__ import division

import os,sys,datetime
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import pandas as pd
import pprint
import statsmodels.tsa.stattools as ts
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.lda import LDA
from sklearn.metrics import confusion_matrix
from sklearn.qda import QDA
from sklearn.svm import LinearSVC, SVC

parentPath = os.path.abspath("..")
if parentPath not in sys.path:
    sys.path.insert(0, parentPath)

from common import *


def make_dataset(df, time_lags=5):
    df_lag = pd.DataFrame(index=df.index)
    df_lag["Close"] = df["Close"]
    df_lag["Volume"] = df["Volume"]

    df_lag["Close_Lag%s" % str(time_lags)] = df["Close"].shift(time_lags)
    df_lag["Close_Lag%s_Change" % str(time_lags)] = df_lag["Close_Lag%s" % str(time_lags)].pct_change()*100.0

    df_lag["Volume_Lag%s" % str(time_lags)] = df["Volume"].shift(time_lags)
    df_lag["Volume_Lag%s_Change" % str(time_lags)] = df_lag["Volume_Lag%s" % str(time_lags)].pct_change()*100.0

    df_lag["Close_Direction"] = np.sign(df_lag["Close_Lag%s_Change" % str(time_lags)])
    df_lag["Volume_Direction"] = np.sign(df_lag["Volume_Lag%s_Change" % str(time_lags)])

    return df_lag.dropna(how='any')


def split_dataset(df,input_column_array,output_column,spllit_ratio):
    split_date = get_date_by_percent(df.index[0],df.index[df.shape[0]-1],spllit_ratio)

    input_data = df[input_column_array]
    output_data = df[output_column]

    # Create training and test sets
    X_train = input_data[input_data.index < split_date]
    X_test = input_data[input_data.index >= split_date]
    Y_train = output_data[output_data.index < split_date]
    Y_test = output_data[output_data.index >= split_date]

    return X_train,X_test,Y_train,Y_test


def get_date_by_percent(start_date,end_date,percent):
    days = (end_date - start_date).days
    target_days = np.trunc(days * percent)
    target_date = start_date + datetime.timedelta(days=target_days)
    #print days, target_days,target_date
    return target_date


def do_logistic_regression(x_train,y_train):
    classifier = LogisticRegression()
    classifier.fit(x_train, y_train)
    return classifier


def do_random_forest(x_train,y_train):
    classifier = RandomForestClassifier()
    classifier.fit(x_train, y_train)
    return classifier


def do_svm(x_train,y_train):
    classifier = SVC()
    classifier.fit(x_train, y_train)
    return classifier


def test_predictor(classifier,x_test,y_test):
    pred = classifier.predict(x_test)

    hit_count = 0
    total_count = len(y_test)
    for index in range(total_count):
        if (pred[index]) == (y_test[index]):
            hit_count = hit_count + 1
    
    hit_ratio = hit_count/total_count
    score = classifier.score(x_test, y_test)
    #print "hit_count=%s, total=%s, hit_ratio = %s" % (hit_count,total_count,hit_ratio)

    return hit_ratio, score
    # Output the hit-rate and the confusion matrix for each model
    
    #print("%s\n" % confusion_matrix(pred, y_test))



if __name__ == "__main__":
    # Calculate and output the CADF test on the residuals

    avg_hit_ratio = 0    
    for time_lags in range(1,6):
        print "- Time Lags=%s" % (time_lags)

        for company in ['samsung','hanmi']:
            df_company = load_stock_data('%s.data'%(company))

            df_dataset = make_dataset(df_company,time_lags)
            X_train,X_test,Y_train,Y_test = split_dataset(df_dataset,["Close_Lag%s"%(time_lags),"Volume_Lag%s"%(time_lags)],"Close_Direction",0.75)
            #print X_test

            lr_classifier = do_logistic_regression(X_train,Y_train)
            lr_hit_ratio, lr_score = test_predictor(lr_classifier,X_test,Y_test)

            rf_classifier = do_random_forest(X_train,Y_train)
            rf_hit_ratio, rf_score = test_predictor(rf_classifier,X_test,Y_test)

            svm_classifier = do_svm(X_train,Y_train)
            svm_hit_ratio, svm_score = test_predictor(rf_classifier,X_test,Y_test)

            print "%s : Hit Ratio - Logistic Regreesion=%0.2f, RandomForest=%0.2f, SVM=%0.2f" % (company,lr_hit_ratio,rf_hit_ratio,svm_hit_ratio)


SyntaxError: invalid syntax (<ipython-input-2-15f152c748f1>, line 108)

## stationarity_test.py

In [3]:
# -*- coding: utf-8 -*-

import os,sys,datetime
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import pandas as pd
import pprint
import statsmodels.tsa.stattools as ts

parentPath = os.path.abspath("..")
if parentPath not in sys.path:
    sys.path.insert(0, parentPath)

from common import *




def get_hurst_exponent(df,lags_count=100):
    lags = range(2, lags_count)
    ts = np.log(df)

    tau = [np.sqrt(np.std(np.subtract(ts[lag:], ts[:-lag]))) for lag in lags]
    poly = np.polyfit(np.log(lags), np.log(tau), 1)

    result = poly[0]*2.0

    return result


def get_half_life(df):
    price = pd.Series(df)  
    lagged_price = price.shift(1).fillna(method="bfill")  
    delta = price - lagged_price  
    beta = np.polyfit(lagged_price, delta, 1)[0] 
    half_life = (-1*np.log(2)/beta) 

    return half_life

def random_walk(seed=1000, mu = 0.0, sigma = 1, length=1000):
    """ this function creates a series of independent, identically distributed values
    with the form of a random walk. Where the best prediction of the next value is the present
    value plus some random variable with mean and variance finite 
    We distinguish two types of random walks: (1) random walk without drift (i.e., no constant
    or intercept term) and (2) random walk with drift (i.e., a constant term is present).  
    The random walk model is an example of what is known in the literature as a unit root process.
    RWM without drift: Yt = YtÃ¢ÂÂ1 + ut
    RWM with drift: Yt = ÃÂ´ + YtÃ¢ÂÂ1 + ut
    """
    
    ts = []
    for i in range(length):
        if i == 0:
            ts.append(seed)
        else:    
            ts.append(mu + ts[i-1] + random.gauss(0, sigma))

    return ts


def draw_moving_average(df):
    df.plot(style='k--')
    pd.rolling_mean(df, 20).plot(style='k')

    plt.show()


def do_mean_reversion(df,window_size,index):
    df_ma = pd.rolling_mean(df,window_size)
    df_std = pd.rolling_std(df,window_size)

    diff = df.loc[index,0] - df_ma.loc[index,0]
    print diff


if __name__ == "__main__":
    """
    start = datetime.datetime(2012, 1, 1)
    end = datetime.datetime(2013, 1, 1)

    arex = web.DataReader("AREX", "yahoo", start, end)
    wll = web.DataReader("WLL", "yahoo", start, end)

    df = pd.DataFrame(index=arex.index)
    df["AREX"] = arex["Adj Close"]
    df["WLL"] = wll["Adj Close"]

    # Plot the two time series
    plot_price_series(df, "AREX", "WLL")

    # Display a scatter plot of the two time series
    plot_scatter_series(df, "AREX", "WLL")

    # Calculate optimal hedge ratio "beta"
    res = ols(y=df['WLL'], x=df["AREX"])
    beta_hr = res.beta.x

    # Calculate the residuals of the linear combination
    df["res"] = df["WLL"] - beta_hr*df["AREX"]

    # Plot the residuals
    plot_residuals(df)
    """
    # Calculate and output the CADF test on the residuals
    df_samsung = load_stock_data('samsung.data')
    df_hanmi = load_stock_data('hanmi.data')
    #adf_result = ts.adfuller(df_samsung["Close"])
    #pprint.pprint(adf_result)
    """
    hurst_samsung = get_hurst_exponent(df_samsung['Close'])
    hurst_hanmi = get_hurst_exponent(df_hanmi['Close'])
    print "Hurst Exponent : Samsung=%s, Hanmi=%s" % (hurst_samsung,hurst_hanmi)

    half_life_samsung = get_half_life(df_samsung['Close'])
    half_life_hanmi = get_half_life(df_hanmi['Close'])
    print "Half_life : Samsung=%s, Hanmi=%s" % (half_life_samsung,half_life_hanmi)
    """
    #print df_samsung['Close']
    #draw_moving_average(df_samsung['Close'])
    do_mean_reversion(df_samsung['Close'],10,100)

SyntaxError: Missing parentheses in call to 'print'. Did you mean print(diff)? (<ipython-input-3-7e84789a890e>, line 74)