In [None]:
import numpy as np
import statsmodels.api as sm
import math
import pandas as pd
from quantopian.research import run_pipeline, returns, get_pricing
from quantopian.pipeline import Pipeline
from quantopian.pipeline.data.builtin import USEquityPricing
from quantopian.pipeline.factors import CustomFactor, Returns, PercentChange, SimpleMovingAverage, BusinessDaysSincePreviousEvent, BusinessDaysUntilNextEvent, VWAP
from quantopian.pipeline.filters import QTradableStocksUS
from quantopian.pipeline.data import Fundamentals
from quantopian.pipeline.data import factset
from quantopian.pipeline import factors, filters, classifiers
import alphalens
from scipy.stats.mstats import winsorize
from sklearn import preprocessing
from scipy.stats.mstats import gmean
from zipline.utils.numpy_utils import (
    repeat_first_axis,
    repeat_last_axis,
)
from quantopian.pipeline.data.factset import RBICSFocus
import quantopian.pipeline.data.factset.estimates as fe
from quantopian.pipeline.data.factset.ownership import Form3AggregatedTrades
from quantopian.pipeline.data.factset.ownership import Form4and5AggregatedTrades

from quantopian.pipeline.domain import US_EQUITIES



In [None]:
# Passed as both the lower and the upper `limits` entry to winsorize() inside preprocess().
WIN_LIMIT = 0
# NOTE(review): QL is assigned again (to 75) further down in this cell and is not read
# by any live code visible in this notebook -- confirm whether it is still needed.
QL = 66

def signalize(df):
    """Convert values into rank-based scores strictly between 0 and 1.

    Each non-null value becomes ``(rank - 0.5) / count`` where ``rank`` comes
    from ``df.rank()`` and ``count`` is the number of non-null entries.
    Entries that are NaN after scoring are replaced by the mean score.

    :param df: pandas object exposing ``rank``, ``count``, ``mean`` and ``replace``.
    :return: object of the same shape holding the scores.
    """
    ranks = df.rank()
    n_valid = df.count()
    scores = (ranks - 0.5) / n_valid
    # Missing inputs have no rank; give them the average score.
    fill_value = scores.mean()
    return scores.replace(np.nan, fill_value)

def preprocess(a):
    """Clean, centre, winsorize and standardise an array of factor values.

    Steps, in order: cast to float64 (a copy, so the caller's array is not
    modified), turn +/-inf into NaN, subtract the NaN-ignoring mean, replace
    remaining NaNs via ``np.nan_to_num``, winsorize both tails with
    ``WIN_LIMIT`` and finally apply ``preprocessing.scale``.

    :param a: array-like supporting ``astype``.
    :return: whatever ``preprocessing.scale`` returns for the cleaned values.
    """
    values = a.astype(np.float64)
    infinite_mask = np.isinf(values)
    values[infinite_mask] = np.nan

    centred = values - np.nanmean(values)
    centred = np.nan_to_num(centred)

    clipped = winsorize(centred, limits=[WIN_LIMIT, WIN_LIMIT])
    return preprocessing.scale(clipped)


class Previous(CustomFactor):
    """Emit the first row of the input window.

    Per the original author's note, this yields the input's value
    ``window_length`` trading days ago. The class declares neither ``inputs``
    nor ``window_length``, so both must be supplied when it is instantiated.
    """

    def compute(self, today, assets, out, inputs):
        first_row = inputs[0]
        out[:] = first_row


# Start / end dates handed to run_pipeline() and get_pricing() below.
sd, ed = '2014-01-04', '2017-05-01'

# Aggregated insider-trade datasets, one slice per trailing window.
# Every slice uses False as its first argument; the second argument takes the
# values 1, 7, 30 and 90.
# NOTE(review): presumably (derivative_holdings, days_aggregated) -- confirm
# against the FactSet ownership dataset documentation.
(insider_txns_form3_1d,
 insider_txns_form3_7d,
 insider_txns_form3_30d,
 insider_txns_form3_90d) = (
    Form3AggregatedTrades.slice(False, days) for days in (1, 7, 30, 90)
)

(insider_txns_form4and5_1d,
 insider_txns_form4and5_7d,
 insider_txns_form4and5_30d,
 insider_txns_form4and5_90d) = (
    Form4and5AggregatedTrades.slice(False, days) for days in (1, 7, 30, 90)
)

# Form 3: number of unique filers per window.
filers_form3_1d = insider_txns_form3_1d.num_unique_filers
filers_form3_7d = insider_txns_form3_7d.num_unique_filers
filers_form3_30d = insider_txns_form3_30d.num_unique_filers
filers_form3_90d = insider_txns_form3_90d.num_unique_filers

# Forms 4 and 5: number of unique buyers per window.
buyers_form4and5_1d = insider_txns_form4and5_1d.num_unique_buyers
buyers_form4and5_7d = insider_txns_form4and5_7d.num_unique_buyers
buyers_form4and5_30d = insider_txns_form4and5_30d.num_unique_buyers
buyers_form4and5_90d = insider_txns_form4and5_90d.num_unique_buyers

# Forms 4 and 5: number of unique sellers per window.
sellers_form4and5_1d = insider_txns_form4and5_1d.num_unique_sellers
sellers_form4and5_7d = insider_txns_form4and5_7d.num_unique_sellers
sellers_form4and5_30d = insider_txns_form4and5_30d.num_unique_sellers
sellers_form4and5_90d = insider_txns_form4and5_90d.num_unique_sellers

# Only referenced by the commented-out factor experiments further down.
ML, QL = 25, 75

# class Factor(CustomFactor):
#     inputs=[filers_form3_30d,buyers_form4and5_30d,sellers_form4and5_30d]
#     window_length = ML*6 
#     window_safe=True
#     def compute(self, today, assets, out, b1, b2, s2):  
#         b1=np.nan_to_num(b1)
#         b2=np.nan_to_num(b2)
#         s2=np.nan_to_num(s2)
#         nb = b1 + b2 - s2
#         arr = [nb[-1],nb[-ML*1],nb[-ML*2],nb[-ML*3],nb[-ML*4],nb[-ML*5],nb[-ML*6]]
#         mean = np.nanmean(arr,axis=0)
#         out[:] = mean  
        
# class Factor(CustomFactor):
#     inputs=[filers_form3_30d,buyers_form4and5_30d,sellers_form4and5_30d]
#     window_length = 1 
#     def compute(self, today, assets, out, b1, b2, s2):  
#         b1=np.nan_to_num(b1)
#         b2=np.nan_to_num(b2)
#         s2=np.nan_to_num(s2)
#         nb = b1 + b2 - s2
#         out[:] = nb[-1]  

        
# class AvgDlyTxnMean(CustomFactor):
#     window_safe = True
#     def compute(self, today, assets, out, b1, b2, s2):  
#         b1=np.nan_to_num(b1)
#         b2=np.nan_to_num(b2)
#         s2=np.nan_to_num(s2)
#         nb = b1 + b2 - s2
#         mean = np.nansum(nb, axis=0)
#         out[:] = mean  

        
# class AvgDlyTxnMean_f3(CustomFactor):
#     window_safe = True
#     def compute(self, today, assets, out, b1):  
#         b1=np.nan_to_num(b1)
#         mean = np.nanmean(b1, axis=0)
#         out[:] = mean  
        
        
# class AvgDlyTxnRatio(CustomFactor):
#     window_safe = True
#     def compute(self, today, assets, out, b1, b2, s2):  
#         b1=np.nan_to_num(b1)
#         b2=np.nan_to_num(b2)
#         s2=np.nan_to_num(s2)
#         rt1 = (b1 + b2 - s2)
#         rt2 = (b1 + b2 + s2)
#         mean = np.nanmean(rt1, axis=0)/np.nanmean(rt2, axis=0)
#         out[:] = mean  
        
        
# class Factor(CustomFactor):
#     inputs=[filers_form3_30d,buyers_form4and5_30d,sellers_form4and5_30d]
#     window_length = ML*6 
#     window_safe=True
#     def compute(self, today, assets, out, b1, b2, s2):  
#         b1=np.nan_to_num(b1)
#         b2=np.nan_to_num(b2)
#         s2=np.nan_to_num(s2)
#         nb = b1 + b2 - s2
#         arr = [nb[-1],nb[-ML*1],nb[-ML*2],nb[-ML*3],nb[-ML*4],nb[-ML*5],nb[-ML*6]]
#         arr = np.diff(arr, axis=0)/arr[1:]
#         mean = np.nanmean(arr,axis=0)
#         out[:] = mean  
        
        
# NOTE(review): WL is not read by any code visible in this notebook -- confirm it is still needed.
WL = 7
        
class TxnsCount(CustomFactor):
    """Most recent non-zero value of the input over the last three rows.

    NaNs are converted with ``np.nan_to_num`` before the lookup. For each
    asset the newest row is used; where it is zero the second-newest row is
    used; where that is also zero the third-newest row is used (which may
    itself be zero).

    No default ``inputs`` are declared, so the column to inspect must be
    passed at instantiation (e.g. ``TxnsCount(inputs=[filers_form3_7d])``).
    """
    window_length = 3
    window_safe = True

    def compute(self, today, assets, out, b):
        counts = np.nan_to_num(b)
        newest, second_newest, third_newest = counts[-1], counts[-2], counts[-3]

        # Step back one row at a time wherever the current pick is still zero.
        pick = np.where(newest == 0, second_newest, newest)
        pick = np.where(pick == 0, third_newest, pick)

        out[:] = pick

        
# class Form5Buyers(CustomFactor):
#     inputs=[buyers_form4and5_1d]
#     window_length = 3
#     window_safe = True
#     def compute(self, today, assets, out, b):  
#         b=np.nan_to_num(b)
#         mean = b[-1] or b[-2] or b[-3]

        
# class Form5Sellers(CustomFactor):
#     inputs=[sellers_form4and5_1d]
#     window_length = 3
#     window_safe = True
#     def compute(self, today, assets, out, b):  
#         b = np.nan_to_num(b)
#         mean = b[-1] or b[-2] or b[-3]
        
        
# Universe filter used as the first term of the pipeline screen.
base_universe = QTradableStocksUS()
# earnings_ann_days_prev = BusinessDaysSincePreviousEvent(inputs=[fe.Actuals.slice('SALES', 'qf', 0).asof_date])
# guidance_ann_days_prev = BusinessDaysUntilNextEvent(inputs=[fe.Guidance.slice('SALES', 'qf', 1).asof_date])

# Business days since the previous event, keyed on the asof_date of the
# 7-day Form 3 and Form 4/5 slices respectively.
f3_prev = BusinessDaysSincePreviousEvent(
    inputs=[insider_txns_form3_7d.asof_date],
)
f5_prev = BusinessDaysSincePreviousEvent(
    inputs=[insider_txns_form4and5_7d.asof_date],
)


class Asof(CustomFactor):
    """Pass-through factor that outputs the last row of its single input.

    No default ``inputs`` are declared; the column is supplied at
    instantiation (the pipeline below passes ``asof_date`` columns).
    """
    window_safe = True
    window_length = 1

    def compute(self, today, assets, out, b):
        latest_row = b[-1]
        out[:] = latest_row

# vwap_1d = VWAP(inputs=[USEquityPricing.close, USEquityPricing.volume], window_length=1) 
# vwap_7d = VWAP(inputs=[USEquityPricing.close, USEquityPricing.volume], window_length=7) 
# vwap_30d = VWAP(inputs=[USEquityPricing.close, USEquityPricing.volume], window_length=30) 
# vwap_90d = VWAP(inputs=[USEquityPricing.close, USEquityPricing.volume], window_length=90) 

# f1 = AvgDlyTxnMean(inputs=[filers_form3_1d,buyers_form4and5_1d,sellers_form4and5_1d], window_length=5)
# f2 = AvgDlyTxnMean(inputs=[filers_form3_1d,buyers_form4and5_1d,sellers_form4and5_1d], window_length=10)
# f3 = AvgDlyTxnMean(inputs=[filers_form3_1d,buyers_form4and5_1d,sellers_form4and5_1d], window_length=15)
# f4 = AvgDlyTxnMean(inputs=[filers_form3_1d,buyers_form4and5_1d,sellers_form4and5_1d], window_length=20)
# f5 = AvgDlyTxnMean(inputs=[filers_form3_1d,buyers_form4and5_1d,sellers_form4and5_1d], window_length=30)
# f6 = AvgDlyTxnMean(inputs=[filers_form3_1d,buyers_form4and5_1d,sellers_form4and5_1d], window_length=45)
# f7 = AvgDlyTxnMean(inputs=[filers_form3_1d,buyers_form4and5_1d,sellers_form4and5_1d], window_length=70)
# f8 = AvgDlyTxnMean(inputs=[filers_form3_1d,buyers_form4and5_1d,sellers_form4and5_1d], window_length=90) #W (opposite)

# f1 = AvgDlyTxnRatio(inputs=[filers_form3_1d,buyers_form4and5_1d,sellers_form4and5_1d], window_length=5)
# f2 = AvgDlyTxnRatio(inputs=[filers_form3_1d,buyers_form4and5_1d,sellers_form4and5_1d], window_length=10)
# f3 = AvgDlyTxnRatio(inputs=[filers_form3_1d,buyers_form4and5_1d,sellers_form4and5_1d], window_length=15)
# f4 = AvgDlyTxnRatio(inputs=[filers_form3_1d,buyers_form4and5_1d,sellers_form4and5_1d], window_length=20)
# f5 = AvgDlyTxnRatio(inputs=[filers_form3_1d,buyers_form4and5_1d,sellers_form4and5_1d], window_length=30)
# f6 = AvgDlyTxnRatio(inputs=[filers_form3_1d,buyers_form4and5_1d,sellers_form4and5_1d], window_length=45)
# f7 = AvgDlyTxnRatio(inputs=[filers_form3_1d,buyers_form4and5_1d,sellers_form4and5_1d], window_length=70)
# f8 = AvgDlyTxnRatio(inputs=[filers_form3_1d,buyers_form4and5_1d,sellers_form4and5_1d], window_length=90) #W (opposite)


# f1 = AvgDlyTxnMean_f3(inputs=[filers_form3_1d], window_length=5)
# f2 = AvgDlyTxnMean_f3(inputs=[filers_form3_1d], window_length=10)
# f3 = AvgDlyTxnMean_f3(inputs=[filers_form3_1d], window_length=15)
# f4 = AvgDlyTxnMean_f3(inputs=[filers_form3_1d], window_length=20)
# f5 = AvgDlyTxnMean_f3(inputs=[filers_form3_1d], window_length=30)
# f6 = AvgDlyTxnMean_f3(inputs=[filers_form3_1d], window_length=45)
# f7 = AvgDlyTxnMean_f3(inputs=[filers_form3_1d], window_length=70)
# f8 = AvgDlyTxnMean_f3(inputs=[filers_form3_1d], window_length=90) 

# Output columns: days since the last filing, the filing as-of dates, and the
# most recent non-zero filer / buyer / seller counts from the 7-day slices.
pipeline_columns = {
    'f3_prev_days': f3_prev,
    'f5_prev_days': f5_prev,
    'f3_asof': Asof(inputs=[insider_txns_form3_7d.asof_date]),
    'f5_asof': Asof(inputs=[insider_txns_form4and5_7d.asof_date]),
    'f3_buyers': TxnsCount(inputs=[filers_form3_7d]),
    'f5_buyers': TxnsCount(inputs=[buyers_form4and5_7d]),
    'f5_sellers': TxnsCount(inputs=[sellers_form4and5_7d]),
}

# Keep an asset only when at least one of the 90-day columns has a value.
has_insider_data = (
    filers_form3_90d.latest.notnull()
    | buyers_form4and5_90d.latest.notnull()
    | sellers_form4and5_90d.latest.notnull()
)

pipe = Pipeline(
    columns=pipeline_columns,
    screen=base_universe & has_insider_data,
)

# Original author's note: 2017-12-19 is the actual last date available.
output = run_pipeline(pipe, sd, ed)

# Convert the as-of columns produced by Asof into pandas datetimes.
for asof_col in ('f5_asof', 'f3_asof'):
    output[asof_col] = pd.to_datetime(output[asof_col])

In [None]:
# Quick look at the first ten rows of the Form 4/5 columns.
preview_cols = ['f5_sellers', 'f5_buyers', 'f5_prev_days']
print(output[preview_cols].head(10))

In [None]:
# Factor 1: net Form 4/5 buyers (buyers minus sellers) divided by the number
# of business days since the previous Form 4/5 event, then rank-scored.
net_buyers = output['f5_buyers'] - output['f5_sellers']
output['alpha1'] = signalize(net_buyers / output['f5_prev_days'])

# output['alpha1'] = output['f1']
# output['alpha2'] = output['f2']
# output['alpha3'] = output['f3']
# output['alpha4'] = output['f4']
# output['alpha5'] = output['f5']
# output['alpha6'] = output['f6']
# output['alpha7'] = output['f7']
# output['alpha8'] = output['f8']

In [None]:
# First ten rank-scored values of the factor.
print(output['alpha1'].head(10))

In [None]:
# Demean each alpha within every value of index level 0 (the first level of
# the pipeline output's MultiIndex).
alphas = pd.DataFrame()

for name in ['alpha1']:  # ,'alpha2','alpha3','alpha4','alpha5','alpha6','alpha7','alpha8']:
    level0_mean = output[name].mean(level=0)
    alphas[name] = output[name].sub(level0_mean, level=0)
#     alphas[name] = output[name]


In [None]:
# Every asset present in the second level of the pipeline output's index.
assets = output.index.levels[1].unique()
# Forward returns for the last factor dates need prices *after* `ed`, so pull
# one extra month of closes. The original call stopped at `ed`, which left
# the final forward-return windows empty despite this stated intent.
pricing_end = pd.Timestamp(ed) + pd.DateOffset(months=1)
pricing = get_pricing(assets, start_date=sd, end_date=pricing_end, fields='close_price')


In [None]:
# 5-row forward return: the percentage change over 5 rows, shifted back 5
# rows so each date holds the return over the following 5 rows.
forward_5 = pricing.pct_change(periods=5).shift(-5)

# Remove each date's cross-sectional mean, then reshape to a long Series.
row_mean = forward_5.mean(axis=1)
returns = forward_5.sub(row_mean, axis=0).stack()
#print returns

# Align returns with the pipeline output and drop any row with a missing value.
data = pd.concat([returns.rename('returns'), output], axis=1).dropna()

# print data.head()

In [None]:
#factors = ['alpha3','alpha4']
fname = 'alpha1'

# Keep rows whose signal is not exactly zero. reset_index() returns a new
# frame, avoiding an in-place mutation of the filtered selection.
df = data[data[fname] != 0].reset_index()
# df['alpha'] = signalize(df[fname])
# df['alpha'] = df['alpha'].sub(df['alpha'].mean(level=0), level=0)

# The unnamed index levels come back as 'level_0' / 'level_1'.
df = df.rename(columns={'level_0': 'dates', 'level_1': 'sid'})

# Regress returns (scaled by 100) on the selected factor without an
# intercept. The regressor follows `fname` rather than a hard-coded name.
est = sm.OLS(
   100*df[['returns']],
   df[[fname]])

# Standard errors clustered by date.
est2 = est.fit(cov_type='cluster', cov_kwds={'groups': df['dates']})

print(est2.summary())


In [None]:
fname = 'alpha1'

# Restore the (dates, sid) MultiIndex and pull out the factor column.
d = df.set_index(['dates', 'sid'])
net_alpha = d[fname]

# Bucket the factor into 5 quantiles and compute 5- and 10-period forward returns.
factor_data = alphalens.utils.get_clean_factor_and_forward_returns(
    net_alpha,
    pricing,
    quantiles=5,
    periods=(5, 10),
)

# alphalens.tears.create_returns_tear_sheet(factor_data)
# alphalens.tears.create_full_tear_sheet(factor_data)
alphalens.tears.create_summary_tear_sheet(factor_data)

In [None]:
# print df['f2'].head(100)
# Dump the factor series that was passed to alphalens.
print(net_alpha)
# print signalize(df[fname]).head(100)

In [None]:
# `data` has no 'f1' column (the pipeline no longer emits f1..f8), so the
# original lookup raised KeyError. Show the factor selected by `fname` instead.
print(data[fname].head(100))

In [None]:
# `df` has no 'f8' column (the pipeline no longer emits f1..f8), so the
# original lookup raised KeyError. Show the factor selected by `fname` instead.
print(df[fname].head(100))

In [None]:
# `df` was flattened with reset_index(), so its index has no `.levels`;
# count the distinct assets from the 'sid' column instead.
print(len(df['sid'].unique()))

In [None]:
# MultiIndex.levels still lists assets whose rows were removed by dropna();
# count the assets actually present in the remaining rows.
print(len(data.index.get_level_values(1).unique()))

In [None]:
# Scratch check of short-circuit semantics: NaN is truthy, so `np.nan and 1.0`
# yields 1.0 and the trailing `or 2.0` is never reached; the cell evaluates to 1.0.
np.nan and 1.0 or 2.0

In [None]:
# Scratch check of short-circuit semantics: 0.0 is falsy, so the chain returns
# the first truthy operand; the cell evaluates to 1.0.
0.0 or 1.0 or 2.0