<img src="http://hilpisch.com/tpq_logo.png" alt="The Python Quants" width="35%" align="right" border="0"><br>

# Machine Learning for Finance

## Basic Statistical Methods

Dr Yves J Hilpisch | The Python Quants GmbH

<a href="http://tpq.io" target="_blank">http://tpq.io</a> | <a href="http://twitter.com/dyjh" target="_blank">@dyjh</a> | <a href="mailto:ai@tpq.io">ai@tpq.io</a>

## Descriptive Statistics

In [None]:
import numpy as np
import pandas as pd
from pylab import plt, mpl

In [None]:
plt.style.use('seaborn')
mpl.rcParams['font.family'] = 'serif'
%config InlineBackend.figure_format = 'svg'

In [None]:
url = 'https://certificate.tpq.io/mlfin.csv'

In [None]:
# Read the CSV using its first column as a (date-parsed) index, discard the
# 'BTC=' column, and keep only the rows without missing values.
raw = pd.read_csv(url, index_col=0, parse_dates=True)
raw = raw.drop(columns='BTC=').dropna()

In [None]:
# Log returns: log of each value over the previous row's value. The first
# row has no predecessor and is therefore NaN.
rets = np.log(raw.div(raw.shift(1)))

In [None]:
rets.describe()

In [None]:
# Use string aggregator names: passing NumPy callables such as np.min to
# aggregate() is deprecated in recent pandas releases, and the string form
# gives stable row labels ('min', 'mean', 'median', 'max').
rets.aggregate(['min', 'mean', 'median', 'max'])

In [None]:
rets.iloc[:, :4].hist(bins=35);

In [None]:
sym = 'AAPL.O'

In [None]:
rets[sym].hist(bins=50);

In [None]:
# Exponentiate the running sum of log returns and plot the resulting series.
np.exp(rets[sym].cumsum()).plot();

In [None]:
mean = rets[sym].mean()
mean

In [None]:
std = rets[sym].std()
std

In [None]:
tail = mean + 2 * std
tail

In [None]:
tail_ = mean - 2 * std
tail_

In [None]:
# Count the observations whose log return exceeds the upper threshold.
# Comparing the unsorted Series directly avoids sorting and then relying on
# index re-alignment of the boolean mask; NaN entries compare as False.
no = int((rets[sym] > tail).sum())
no

In [None]:
# Sort the returns ascending and drop the NaN left by the shift in the
# log-return calculation; otherwise it is sorted to the end and pushes the
# marker below one position away from the largest `no` returns.
sorted_rets = rets[sym].dropna().sort_values().values
plt.bar(np.arange(len(sorted_rets)), np.exp(sorted_rets.cumsum()));
# Red line marks where the `no` returns above the upper threshold begin.
plt.axvline(len(sorted_rets) - no, c='r', lw=1);

## Approximation

In [None]:
def f(x):
    """Return sin(x) + 0.5 * x, evaluated element-wise for array input."""
    trend = 0.5 * x
    return np.sin(x) + trend

In [None]:
def create_plot(x, y, styles, labels, axlabels):
    """Plot several data series in one 10 x 6 inch figure with a legend.

    Parameters
    ----------
    x, y : sequences of array-likes
        x[i] and y[i] hold the coordinates of the i-th series.
    styles : sequence of str
        Matplotlib format strings, one per series.
    labels : sequence of str
        Legend labels, one per series.
    axlabels : sequence of str
        Labels for the x-axis (index 0) and the y-axis (index 1).
    """
    plt.figure(figsize=(10, 6))
    for i, xi in enumerate(x):
        plt.plot(xi, y[i], styles[i], label=labels[i])
    # Axis labels do not depend on the series, so set them once instead of
    # on every loop iteration.
    plt.xlabel(axlabels[0])
    plt.ylabel(axlabels[1])
    plt.legend(loc=0)

In [None]:
x = np.linspace(-2 * np.pi, 2 * np.pi, 50)  

In [None]:
create_plot([x], [f(x)], ['b'], ['f(x)'], ['x', 'f(x)'])

### Regression

#### Monomials as Basis Functions

In [None]:
# Degree-1 least-squares polynomial fit of f on x; full=True makes polyfit
# return diagnostic information in addition to the coefficients.
res = np.polyfit(x, f(x), deg=1, full=True)  

In [None]:
res  

In [None]:
ry = np.polyval(res[0], x)  

In [None]:
create_plot([x, x], [f(x), ry], ['b', 'r.'],
            ['f(x)', 'regression'], ['x', 'f(x)'])

In [None]:
reg = np.polyfit(x, f(x), deg=5)
ry = np.polyval(reg, x)

In [None]:
create_plot([x, x], [f(x), ry], ['b', 'r.'],
            ['f(x)', 'regression'], ['x', 'f(x)'])

In [None]:
reg = np.polyfit(x, f(x), 7)
ry = np.polyval(reg, x)

In [None]:
np.allclose(f(x), ry)  

In [None]:
np.mean((f(x) - ry) ** 2)  

In [None]:
create_plot([x, x], [f(x), ry], ['b', 'r.'],
            ['f(x)', 'regression'], ['x', 'f(x)'])

#### Individual Basis Functions

In [None]:
# Basis functions stored row-wise: constant, x, x**2, x**3.
matrix = np.array([np.ones(len(x)), x, x ** 2, x ** 3])

In [None]:
# Least-squares solution for the basis-function weights; the transposed
# matrix has one row per sample point. Element [0] of lstsq's result is the
# coefficient vector.
reg = np.linalg.lstsq(matrix.T, f(x), rcond=None)[0]  

In [None]:
reg.round(4)  

In [None]:
ry = np.dot(reg, matrix)  

In [None]:
create_plot([x, x], [f(x), ry], ['b', 'r.'],
            ['f(x)', 'regression'], ['x', 'f(x)'])

In [None]:
matrix[3, :] = np.sin(x)  

In [None]:
reg = np.linalg.lstsq(matrix.T, f(x), rcond=None)[0]

In [None]:
reg.round(4)  

In [None]:
ry = np.dot(reg, matrix)

In [None]:
np.allclose(f(x), ry)  

In [None]:
np.mean((f(x) - ry) ** 2)  

In [None]:
create_plot([x, x], [f(x), ry], ['b', 'r.'],
            ['f(x)', 'regression'], ['x', 'f(x)'])

#### Noisy Data

In [None]:
# Perturb both the sample locations and the function values with Gaussian
# noise. The x-noise is drawn first, then the y-noise.
xn = (np.linspace(-2 * np.pi, 2 * np.pi, 50)
      + 0.15 * np.random.standard_normal(50))
yn = f(xn) + 0.25 * np.random.standard_normal(xn.size)

In [None]:
# Fit a degree-7 polynomial to the noisy sample and evaluate it at the
# noisy sample points xn.
reg = np.polyfit(xn, yn, 7)
ry = np.polyval(reg, xn)

In [None]:
# `ry` holds the fitted values at the noisy sample points `xn`, so it has to
# be drawn against `xn` rather than against the regular grid `x`.
create_plot([x, xn], [f(x), ry], ['b', 'r.'],
            ['f(x)', 'regression'], ['x', 'f(x)'])

#### Unsorted Data

In [None]:
# 50 uniform random sample points in [-2*pi, 2*pi), left in draw order.
xu = np.random.rand(50) * 4 * np.pi - 2 * np.pi  
yu = f(xu)

In [None]:
print(xu[:10].round(2))  
print(yu[:10].round(2))  

In [None]:
reg = np.polyfit(xu, yu, 5)
ry = np.polyval(reg, xu)

In [None]:
create_plot([xu, xu], [yu, ry], ['b.', 'ro'],
            ['f(x)', 'regression'], ['x', 'f(x)'])

#### Multiple Dimensions

In [None]:
def fm(p):
    """Evaluate sin(x) + 0.25 * x + sqrt(y) + 0.05 * y ** 2 for p = (x, y).

    `p` is unpacked into exactly two components, which may be scalars or
    arrays; the result follows NumPy element-wise semantics.
    """
    x, y = p
    x_part = np.sin(x) + 0.25 * x
    return x_part + np.sqrt(y) + 0.05 * y ** 2

In [None]:
# 20 evenly spaced points per axis on [0, 10].
x = np.linspace(0, 10, 20)
y = np.linspace(0, 10, 20)
# 2D coordinate arrays built from the two 1D axes.
X, Y = np.meshgrid(x, y)  

In [None]:
# Function values on the grid.
Z = fm((X, Y))
# Flatten the grid coordinates to 1D arrays (this rebinds x and y).
x = X.flatten()  
y = Y.flatten()  

In [None]:
from mpl_toolkits.mplot3d import Axes3D  

In [None]:
# Surface plot of Z over the (X, Y) grid with a colorbar for the values.
fig, ax = plt.subplots(figsize=(10, 6), subplot_kw={'projection': '3d'})
surf = ax.plot_surface(X, Y, Z,
                       cmap='coolwarm', rstride=2, cstride=2,
                       linewidth=0.5, antialiased=True)
ax.set(xlabel='x', ylabel='y', zlabel='f(x, y)')
fig.colorbar(surf, shrink=0.5, aspect=5);

In [None]:
# Design matrix with one row per grid point and one column per basis
# function: 1, x, y, x**2, y**2, sin(x), sqrt(y).
matrix = np.column_stack([
    np.ones(len(x)),
    x,
    y,
    x ** 2,
    y ** 2,
    np.sin(x),
    np.sqrt(y),
])

In [None]:
reg = np.linalg.lstsq(matrix, fm((x, y)), rcond=None)[0]

In [None]:
# Fitted values arranged on the grid; taking the shape from X keeps this in
# step with the meshgrid resolution instead of hard-coding 20 x 20.
RZ = np.dot(matrix, reg).reshape(X.shape)

In [None]:
# Overlay the regression wireframe (RZ) on the original surface (Z).
fig, ax = plt.subplots(subplot_kw={"projection": "3d"}, figsize=(10, 6))
surf1 = ax.plot_surface(X, Y, Z, rstride=2, cstride=2,
                        cmap=mpl.cm.coolwarm, linewidth=0.5,
                        antialiased=True)
surf2 = ax.plot_wireframe(X, Y, RZ, rstride=2, cstride=2,
                          label='regression')
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_zlabel('f(x, y)')
ax.legend()
# The colorbar must describe the surface drawn on this figure (surf1), not
# the `surf` artist left over from the earlier figure.
fig.colorbar(surf1, shrink=0.5, aspect=5, ax=ax);

### Interpolation

In [None]:
import scipy.interpolate as spi  

In [None]:
x = np.linspace(-2 * np.pi, 2 * np.pi, 25)

In [None]:
def f(x):
    """Example function for the interpolation section: 0.5 * x + sin(x)."""
    return 0.5 * x + np.sin(x)

In [None]:
# Spline representation of degree k=1 (piecewise linear) through (x, f(x)).
ipo = spi.splrep(x, f(x), k=1)  

In [None]:
iy = spi.splev(x, ipo)  

In [None]:
np.allclose(f(x), iy)  

In [None]:
create_plot([x, x], [f(x), iy], ['b', 'ro'],
            ['f(x)', 'interpolation'], ['x', 'f(x)'])

In [None]:
xd = np.linspace(1.0, 3.0, 50)  
iyd = spi.splev(xd, ipo)

In [None]:
create_plot([xd, xd], [f(xd), iyd], ['b', 'ro'],
            ['f(x)', 'interpolation'], ['x', 'f(x)'])

In [None]:
# Repeat with a degree-3 (cubic) spline and evaluate it on the finer grid xd.
ipo = spi.splrep(x, f(x), k=3)  
iyd = spi.splev(xd, ipo)  

In [None]:
np.allclose(f(xd), iyd)  

In [None]:
np.mean((f(xd) - iyd) ** 2)  

In [None]:
create_plot([xd, xd], [f(xd), iyd], ['b', 'ro'],
            ['f(x)', 'interpolation'], ['x', 'f(x)'])

## Pattern Frequencies

In [None]:
# NOTE: `f` is rebound to an integer here and no longer refers to the
# example function defined earlier in the notebook.
f = 10  # number of columns in the random 0/1 matrix generated below
n = 25000  # number of rows

In [None]:
np.random.seed(100)

In [None]:
# n x f matrix of random integers drawn from {0, 1}; show the first 4 rows.
x = np.random.randint(0, 2, (n, f))
x[:4]

In [None]:
y = np.random.randint(0, 2, n)
y[:4]

In [None]:
2 ** f

In [None]:
# Column names 'f0', 'f1', ... — one per column of the random 0/1 matrix.
fcols = [f'f{idx}' for idx in range(f)]
fcols

In [None]:
# Combine the 0/1 matrix and the 0/1 vector y into one frame; y goes into
# the column named 'l'.
data = pd.DataFrame(x, columns=fcols).assign(l=y)

In [None]:
data.info()

In [None]:
# Group the rows by every column, i.e. by each distinct combination of the
# f-columns together with the 'l' value.
grouped = data.groupby(list(data.columns))

In [None]:
# Count the rows in each group and move the innermost group level ('l')
# into the columns; combinations that never occur are filled with 0.
freq = grouped['l'].size().unstack(fill_value=0)

In [None]:
# Total number of occurrences of each pattern across both 'l' values.
freq['sum'] = freq[[0, 1]].sum(axis=1)

In [None]:
freq.head(10)

In [None]:
freq['sum'].describe().astype(int)

<img src='http://hilpisch.com/tpq_logo.png' width="35%" align="right">

<br><br><a href="http://tpq.io" target="_blank">http://tpq.io</a> | <a href="http://twitter.com/dyjh" target="_blank">@dyjh</a> | <a href="mailto:ai@tpq.io">ai@tpq.io</a>