In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import datetime
from quantopian.pipeline.classifiers.morningstar import Sector
from quantopian.pipeline import Pipeline
from quantopian.pipeline.data.builtin import USEquityPricing
from quantopian.research import run_pipeline
from quantopian.pipeline.data import morningstar, Fundamentals
from quantopian.pipeline.factors import CustomFactor,AverageDollarVolume,SimpleMovingAverage, ExponentialWeightedMovingAverage, EWMA
from quantopian.pipeline.filters.morningstar import IsPrimaryShare
from quantopian.pipeline.factors import AverageDollarVolume
from quantopian.pipeline.factors.morningstar import MarketCap
from quantopian.pipeline.experimental import QTradableStocksUS
from statsmodels.tsa.stattools import coint
from scipy import stats as stats
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
import scipy
import statsmodels.api as sm

In [None]:
def make_pipe(start,end,min_market_cap=5e10):
    """Run a pipeline reporting latest close price and sector for large tradable stocks.

    The screen keeps assets that pass ``QTradableStocksUS()`` and whose
    ``MarketCap()`` factor is strictly greater than ``min_market_cap``.

    Parameters
    ----------
    start, end
        Date range, forwarded unchanged to ``run_pipeline``.
    min_market_cap : number, optional
        Lower bound (exclusive) applied to the ``MarketCap()`` factor.
        Defaults to 5e10, the threshold this notebook originally hard-coded.

    Returns
    -------
    The object returned by ``run_pipeline``, with two columns: ``'price'``
    (``USEquityPricing.close.latest``) and ``'Sector'`` (``Sector()``).
    """
    large_cap = MarketCap() > min_market_cap
    universe = QTradableStocksUS() & large_cap

    pipe = Pipeline(screen=universe)
    pipe.add(USEquityPricing.close.latest, 'price')
    pipe.add(Sector(), "Sector")

    return run_pipeline(pipe, start, end)

In [None]:
# Lookup table from the integer codes found in the pipeline's 'Sector' column
# to a human-readable sector name.
MORNINGSTAR_SECTOR_CODES = {
    -1: 'Misc',
    101: 'Basic Materials',
    102: 'Consumer Cyclical',
    103: 'Financial Services',
    104: 'Real Estate',
    205: 'Consumer Defensive',
    206: 'Healthcare',
    207: 'Utilities',
    308: 'Communication Services',
    309: 'Energy',
    310: 'Industrials',
    311: 'Technology',
}

In [None]:
# Sample window measured back from 2019-05-05: it starts 18 days earlier
# (2019-04-17) and ends 2 days earlier (2019-05-03).
start = datetime.datetime(2019, 5, 5) - datetime.timedelta(days=18)
end = datetime.datetime(2019, 5, 5) - datetime.timedelta(days=2)

# Pull price and sector for the screened universe over that window.
result = make_pipe(start, end)
result.head()

In [None]:
# Sector code to keep. 311 is 'Technology' in MORNINGSTAR_SECTOR_CODES;
# switch to 206 ('Healthcare') or 310 ('Industrials') to study another sector.
TARGET_SECTOR = 311

# First date present in the pipeline output (first level of its index).
date = result.index[0][0]

# Select every asset whose sector on that date matches, using one boolean
# mask instead of a per-asset chained lookup.
sector_on_date = result.loc[date]['Sector']
companies = sector_on_date[sector_on_date == TARGET_SECTOR].index.tolist()

In [None]:
# Display the assets selected for the chosen sector.
companies

In [None]:
# Minute-frequency prices for the selected assets over the same window as the
# pipeline run; later cells read one column per asset from this frame.
prices = get_pricing(
    companies,
    start_date=start,
    end_date=end,
    frequency='minute',
    fields='price',
)

In [None]:
# For every unordered pair of selected assets, record element [0] of the
# coint() result and the Pearson correlation of their price series.
coints = []
corrs = []
n_companies = len(companies)
for c1 in range(n_companies):
    first = prices[companies[c1]]
    # Start at c1 + 1 so each pair is visited once and self-pairs never occur.
    for c2 in range(c1 + 1, n_companies):
        second = prices[companies[c2]]
        coints.append(coint(first, second)[0])
        corrs.append(np.corrcoef(first, second)[0][1])
        print(c1,c2,coints[-1])

In [None]:
# Average of the pairwise cointegration values (element [0] of each coint()
# result) followed by the average pairwise correlation.
print(np.mean(coints),np.mean(corrs))

In [None]:
def backtest(prices,max_pos=1,num_factors=1,initial_cash=1e6,lkbk=500,warmup=60,verbose=True):
    """Bar-by-bar backtest that trades the extremes of PCA-residual z-scores.

    On every bar from index ``warmup`` onward the function:

    1. calls ``run_pca`` on the previous ``lkbk`` bars (the current bar is
       excluded from the window) and takes the last ``Zscore`` value of each
       instrument's residual series;
    2. books the PnL of the positions opened on the previous bar, marked at
       the current bar's prices;
    3. replaces those positions: the ``max_pos`` instruments with the lowest
       z-score are bought and the ``max_pos`` with the highest are sold
       short, all entered at the current bar's prices.

    Each position holds ``round((initial_cash / n_instruments) / entry_price)``
    shares, so the per-position budget is divided by the number of
    instruments in ``prices``, not by the number of open positions.

    ``run_pca`` and ``Zscore`` are defined in later cells of this notebook and
    must have been executed before this function is called.

    Parameters
    ----------
    prices : DataFrame-like
        One row per bar and one column per instrument; ``prices.index``
        supplies the returned timestamps.
    max_pos : int
        Number of instruments on each side (long and short).
    num_factors : int
        Number of principal components passed to ``run_pca``.
    initial_cash : float
        Notional used for position sizing (see above).
    lkbk : int
        Maximum number of past bars in the PCA window.
    warmup : int
        Number of initial bars skipped before trading starts. Defaults to 60,
        the value this function originally hard-coded.
    verbose : bool
        When true (default), print the bar index and running total PnL on
        every processed bar.

    Returns
    -------
    (pnls, dates) : tuple of lists
        Per-bar PnL and the matching entries of ``prices.index``. Both lists
        are empty when ``max_pos`` exceeds half the number of instruments (a
        message is printed) or when there are no bars past the warm-up.
    """
    pr = np.asarray(prices.T)  # rows are instruments, columns are bars
    n_inst = pr.shape[0]

    if 2 * max_pos > n_inst:
        print('max_pos too large!')
        # Return the same shape as the success path so that callers that
        # unpack ``pnls, dates`` do not fail on a bare None.
        return [], []

    entry = {}  # instrument index -> signed entry price (negative means short)
    pnls = []
    dates = []
    total = 0

    for i, pri in enumerate(pr.T):
        if i < warmup:
            continue

        resids, _ = run_pca(pr[:, max(0, i - lkbk):i], num_factors, log_prices=True)
        zs = [Zscore(resids[inst])[-1] for inst in range(len(pri))]

        order = np.argsort(zs)
        idx_long = order[:max_pos]
        # Slice from an explicit start: ``order[-max_pos:]`` would select
        # every instrument when max_pos is 0.
        idx_short = order[len(order) - max_pos:]

        # Mark the previous bar's positions to the current prices. The share
        # count carries the sign of the entry price, so shorts profit when
        # the price falls.
        pnl = 0
        for idx in entry:
            wgt = np.round((initial_cash / len(pri)) / entry[idx])
            pnl += (pri[idx] - np.abs(entry[idx])) * wgt
        pnls.append(pnl)
        dates.append(prices.index[i])
        total += pnl

        # Close everything and open the new long/short book at current prices.
        entry = {}
        for idx in idx_long:
            entry[idx] = pri[idx]
        for idx in idx_short:
            entry[idx] = -pri[idx]

        if verbose:
            print(i, total)
    return pnls, dates

In [None]:
def Zscore(X):
    """Standardise ``X``: subtract its mean and divide by its standard deviation.

    Both statistics come from ``np.mean`` / ``np.std`` over the whole input,
    and the result is returned as a NumPy array.
    """
    centered = X - np.mean(X)
    spread = np.std(X)
    return np.array(centered / spread)

In [None]:
def run_pca(pr,components=1,log_prices=True):
    """Regress every price series on PCA-derived factors and return the residuals.

    Parameters
    ----------
    pr : 2-D array
        One row per instrument, one column per observation.
    components : int
        Number of principal components to keep.
    log_prices : bool
        When true the PCA is fitted on ``np.log(pr.T)``, otherwise on ``pr.T``.

    Returns
    -------
    (resids, factors)
        ``resids`` is a list with one OLS residual array per row of ``pr``;
        ``factors`` is the regressor matrix (projected prices with a constant
        column added by ``sm.add_constant``).
    """
    observations = pr.T
    fit_input = np.log(observations) if log_prices else observations
    loadings = PCA(n_components=components).fit(fit_input).components_.T

    # NOTE(review): with log_prices=True the loadings are estimated on log
    # prices, but the projection and the regressions below use raw prices.
    # Confirm this is intended.
    factors = sm.add_constant(observations.dot(loadings))

    resids = [sm.OLS(series.T, factors).fit().resid for series in pr]
    return resids, factors

In [None]:
# Overlay every instrument's price path, rebased to start at zero and scaled
# by its own standard deviation so the series are visually comparable.
for series in np.asarray(prices.T):
    rebased = series - series[0]
    plt.plot(rebased / np.std(series))

In [None]:
# Run the strategy: two longs and two shorts per bar, two PCA factors,
# and a 400-bar lookback window.
pnls, dates = backtest(
    prices,
    max_pos=2,
    num_factors=2,
    initial_cash=1e6,
    lkbk=400,
)

In [None]:
# Cumulative PnL of the backtest, plotted against the position in the list.
plt.plot(np.cumsum(pnls));

In [None]:
# Same cumulative PnL curve, plotted against the timestamps returned by backtest.
plt.plot(dates,np.cumsum(pnls));