In [None]:
# Download the course bootstrap script as init.py (quietly, bypassing caches), then import it,
# call `init.init` without forcing a re-download, and call `init.get_weblink()`.
# NOTE(review): presumably this is what provides the `local/` package and data files used
# below — confirm in init.py.
!wget --no-cache -O init.py -q https://raw.githubusercontent.com/rramosp/20192.ai4eng/master/init.py
import init; init.init(force_download=False); init.get_weblink()

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import local.lib.timeseries as ts
%matplotlib inline

# The data

In [2]:
# Read the EUR/COP quotes; the "Date" column is parsed into timestamps and becomes the
# row index (it is removed from the columns in the same step).
d = pd.read_csv("local/data/eurcop.csv")
d.index = pd.to_datetime(d.pop("Date"))
d.head()

In [3]:
# Plot the frame's columns against the date index on one wide figure.
d.plot(figsize=(15,3))

In [4]:
# Same plot restricted to the "Rate" column.
d[["Rate"]].plot(figsize=(15,3))

In [241]:
# From here on only the "Rate" column is used; note that this rebinds `d` to the
# single-column frame.
d = d[["Rate"]]
d.head(10)

# A predictive model

### First create a time series dataset with look back

In [421]:
# Build the look-back dataset with the project helper: 4 input time steps of "Rate", no lag.
# NOTE(review): presumably each row of `dt` holds the 4 preceding "Rate" values as feature
# columns plus the current "Rate" as target — confirm against local.lib.timeseries.
dt = ts.timeseries_as_many2one(d, columns=["Rate"], nb_timesteps_in=4, timelag=0)
dt.head()

### Split dataset for train and for test

In [425]:
# Chronological split by index label: everything up to and including 2008 is used for
# training, everything from 2009 onwards for testing.
trds = dt.loc[:"2008"]
tsds = dt.loc["2009":]
print (dt.shape, trds.shape, tsds.shape)

# Draw both segments of the target series on one axis to visualise the split.
fig, ax = plt.subplots(figsize=(15,3))
ax.plot(trds.index.values, trds.Rate.values, color="black", lw=2, label="train", alpha=.5)
ax.plot(tsds.index.values, tsds.Rate.values, color="red", lw=2, label="test", alpha=.5)
ax.grid();
ax.legend();

### Create `X` and `y` matrices for train and test

In [426]:
# Features are every column except "Rate"; the "Rate" column itself is the target.
Xtr = trds.drop(columns="Rate").values
ytr = trds.Rate.values
Xts = tsds.drop(columns="Rate").values
yts = tsds.Rate.values

In [427]:
# First five rows of the training frame, for comparison with the matrices printed next.
trds[:5]

In [428]:
# First ten training feature rows and their targets.
print (Xtr[:10])
print (ytr[:10])

In [429]:
# First five rows of the test frame.
tsds[:5]

In [430]:
# First ten test feature rows, and the first twenty test targets.
print (Xts[:10])
print (yts[:20])

### convert target into classification task for TREND PREDICTION (1 up, 0 down)

In [431]:
# Turn the target into an up/down label: 1 when the target rate is above the last feature
# column (presumably the most recent look-back value — confirm against the dataset helper),
# 0 otherwise.
# The labels are derived from the untouched `Rate` columns of `trds`/`tsds` rather than from
# `ytr`/`yts` themselves, so re-running this cell yields the same labels instead of comparing
# already-binarised 0/1 values against rates.
ytr = (trds.Rate.values>Xtr[:,-1]).astype(int)
yts = (tsds.Rate.values>Xts[:,-1]).astype(int)
print (ytr[:20])
print (yts[:20])

### inspect target class distributions

In [432]:
# Percentage of "up" (1) labels in each split; the mean of a 0/1 vector is the share of ones.
print ("1's in train %.2f%s"%(np.mean(ytr)*100, "%"))
print ("1's in test  %.2f%s"%(np.mean(yts)*100, "%"))

### train a predictive model

In [436]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
# Small random forest used as the trend classifier. `random_state` is fixed so the fitted
# model, and every accuracy / PnL figure derived from it further down, is the same on each
# fresh run; without it each run trains a different forest.
estimator = RandomForestClassifier(n_estimators=5, max_depth=30, random_state=0)
# Alternative estimators to experiment with:
#estimator = DecisionTreeClassifier(max_depth=2)
#estimator = LogisticRegression()
#estimator = Pipeline((("pca", PCA(n_components=2)), ("estimator", estimator)))
estimator.fit(Xtr,ytr);

### get predictive accuracy in train and test

In [437]:
# Score of the fitted estimator on the data it was trained on and on the held-out test split.
print ("train accuracy %.2f"%estimator.score(Xtr,ytr))
print ("test accuracy  %.2f"%estimator.score(Xts,yts))


### inspect confusion matrix

In [438]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Confusion matrix of the test-set predictions, drawn as an annotated heatmap with integer
# cell counts; the y axis is labelled as the true class and the x axis as the predicted class.
cm = confusion_matrix(yts, estimator.predict(Xts))
sns.heatmap(cm,annot=True,cbar=False, fmt="d")
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.title('Confusion Matrix')


# A strategy

- if model predicts 1 (price up) we buy 10 EUR today and sell them tomorrow
- if model predicts 0 (price down) we sell 10 EUR today and buy them tomorrow

In [439]:
def trade(d, date_close, op, qty):
    """Profit or loss of a one-step trade that is closed on `date_close`.

    The position is opened at the `Rate` of the row that precedes `date_close`
    in `d` and closed at the `Rate` of the `date_close` row.

    Parameters
    ----------
    d : DataFrame with a `Rate` column; rows up to `date_close` are selected
        with `d.loc[:date_close]`.
    date_close : index label of the row on which the position is closed.
    op : "buy" (open by buying, close by selling) or "sell" (the opposite).
    qty : non-negative quantity traded.

    Returns
    -------
    `(close - open) * qty` for "buy" and `(open - close) * qty` for "sell".

    Raises
    ------
    AssertionError
        If `op` is not "buy"/"sell" or `qty` is negative.
    IndexError
        If `d` has no row before `date_close`, so there is no opening rate.
    """
    assert op in ["buy", "sell"]
    assert qty>=0

    # Select the column first so only one Series is sliced, then look at its tail.
    rates_until_close = d.loc[:date_close].Rate
    if len(rates_until_close) < 2:
        # Same exception type pandas raised here before, but with an actionable message.
        raise IndexError(
            "cannot close a trade on %s: there is no earlier row to open it at" % (date_close,)
        )

    open_rate = rates_until_close.iloc[-2]
    close_rate = d.loc[date_close].Rate

    r = (open_rate-close_rate)*qty
    if op=="buy":
        # Buying first and selling later earns the opposite of selling first.
        r = -r
    return r

example: a **buy** operation on 2011-01-03 closed (with a sell operation) on 2011-01-04

In [440]:
# PnL of a 100-unit "buy" closed on 2011-01-04, opened at the preceding row of `tsds`.
trade(tsds, "2011-01-04", "buy", 100)

In [441]:
# Same operation closed one day later, on 2011-01-05.
trade(tsds, "2011-01-05", "buy", 100)

In [442]:
# Rows behind the two example trades: the first five test rows from 2011-01-02 onwards.
tsds.loc["2011-01-02":].head(5)

In [443]:
# Test-set labels as they currently stand in the kernel.
yts

In [444]:
def compute_pnl(d, y, predictions, qty=10):
    """Per-row profit and loss of trading on `predictions` over the rows of `d`.

    For every row of `d` except the first, a trade is closed on that row via
    `trade`: a "sell" when the row's prediction equals 0, a "buy" otherwise.
    The first row is skipped because `trade` needs a preceding row to open
    the position at.

    Parameters
    ----------
    d : DataFrame passed through to `trade` (needs a `Rate` column).
    y : true labels, one per row of `d`.
    predictions : predicted labels, one per row of `d`.
    qty : quantity traded on each operation.

    Returns
    -------
    DataFrame indexed by `d.index[1:]` with columns `pnl`, `prediction`, `y`.

    Raises
    ------
    ValueError
        If `y` or `predictions` does not have one entry per row of `d`.
    """
    # Fail early and clearly: zip() below would otherwise truncate silently and the
    # mismatch would only surface as a shape error while assembling the frame.
    if len(predictions) != len(d) or len(y) != len(d):
        raise ValueError(
            "d, y and predictions must have the same length, got %d, %d and %d"
            % (len(d), len(y), len(predictions))
        )

    dates = d.index[1:]
    results = [
        trade(d, date, "sell" if prediction==0 else "buy", qty)
        for date, prediction in zip(dates, predictions[1:])
    ]

    pnl = pd.DataFrame(np.array(results), index=dates, columns=["pnl"])
    pnl["prediction"]=predictions[1:]
    pnl["y"]=y[1:]
    return pnl

In [445]:
# Predict on the test features and compute the per-row PnL of acting on those predictions
# (default quantity of `compute_pnl`).
preds = estimator.predict(Xts)
pnl = compute_pnl(tsds, yts, preds)


In [446]:
# Per-row PnL over time, with the summed PnL in the title; the y axis is fixed to
# +/-5000 so that a few large moves do not flatten the rest of the curve.
ax = pnl.pnl.plot()
ax.set_title("TOTAL PNL %.2f COP"%pnl.pnl.sum())
ax.set_ylabel("PNL")
ax.grid();
ax.set_ylim(-5000,5000);

In [447]:
def plot_pnlhist(pnl_series, label="", max_abs=50000, bins=30):
    """Draw a histogram of PnL values on the current axes.

    The title reports the total over *all* values, while the histogram itself
    only shows values whose magnitude is below `max_abs`, so that extreme
    outliers do not stretch the x axis.

    Parameters
    ----------
    pnl_series : Series of PnL values (its `.values` are used).
    label : text inserted in the title.
    max_abs : values with `abs(value) >= max_abs` are left out of the histogram;
        pass None to plot everything.
    bins : number of histogram bins.
    """
    values = pnl_series.values
    # The total is taken before clipping on purpose: it summarises every trade.
    total = np.sum(values)
    shown = values if max_abs is None else values[np.abs(values)<max_abs]
    plt.hist(shown, bins=bins);
    plt.title("PNL for %s, total %.2f COP"%(label, total))

In [448]:
# PnL distribution split by the real outcome (top row) and by the model's prediction
# (bottom row). Both splits use the columns stored in `pnl` itself, so the panels stay
# consistent with the PnL values even if `preds` is recomputed later in the kernel.
plt.figure(figsize=(12,8))
plt.subplot(221); plot_pnlhist(pnl[pnl.y==1].pnl, "REAL = 1 (up)"); plt.grid();
plt.subplot(222); plot_pnlhist(pnl[pnl.y==0].pnl, "REAL = 0 (down)"); plt.grid();
plt.subplot(223); plot_pnlhist(pnl[pnl.prediction==1].pnl, "PREDS = 1 (up)"); plt.grid();
plt.subplot(224); plot_pnlhist(pnl[pnl.prediction==0].pnl, "PREDS = 0 (down)"); plt.grid();