# Japan challenge submission template

+ ライブラリとドメインを import 


In [None]:
from quantopian.pipeline import Pipeline, CustomFactor
from quantopian.pipeline.data import EquityPricing, factset
from quantopian.pipeline.factors import Returns, SimpleMovingAverage
from quantopian.pipeline.domain import (
    AT_EQUITIES, # Austria
    AU_EQUITIES, # Australia
    BE_EQUITIES, # Belgium
    BR_EQUITIES, # Brazil
    CA_EQUITIES, # Canada
    CH_EQUITIES, # Switzerland
    CN_EQUITIES, # China
    DE_EQUITIES, # Germany
    DK_EQUITIES, # Denmark
    ES_EQUITIES, # Spain
    FI_EQUITIES, # Finland
    FR_EQUITIES, # France
    GB_EQUITIES, # Great Britain
    HK_EQUITIES, # Hong Kong
    IE_EQUITIES, # Ireland
    IN_EQUITIES, # India
    IT_EQUITIES, # Italy
    JP_EQUITIES, # Japan
    KR_EQUITIES, # South Korea
    NL_EQUITIES, # Netherlands
    NO_EQUITIES, # Norway
    NZ_EQUITIES, # New Zealand
    PT_EQUITIES, # Portugal
    SE_EQUITIES, # Sweden
    SG_EQUITIES, # Singapore
    US_EQUITIES, # United States
)
from quantopian.research import run_pipeline

import pandas as pd
import numpy as np

import time

import matplotlib.pyplot as plt
import seaborn as sns

import empyrical as ep
import alphalens as al
import pyfolio as pf

Helper functions

ヘルパー関数。ここは一切触る必要なし。


In [None]:
def evaluate_factor(factor,
                    domain,
                    start_date,
                    end_date,
                    factor_screen=None,
                    quantiles=5,
                    returns_lengths=(1, 5, 10)):
    """Run ``factor`` through a Pipeline and pair it with forward returns.

    Parameters
    ----------
    factor : quantopian.pipeline.factors.Factor
        Factor whose scores are evaluated.
    domain : quantopian.pipeline.domain.Domain
        Domain the factor and the returns are computed on.
    start_date : str or pd.Timestamp
        First date of the evaluation period.
    end_date : str or pd.Timestamp
        Last date of the evaluation period.
    factor_screen : quantopian.pipeline.filters.Filter, optional
        Filter selecting the assets ``factor`` is evaluated on.
        Defaults to ``factor.notnull()``.
    quantiles : int, optional
        Number of quantile buckets. Default is 5.
    returns_lengths : sequence[int]
        One output column named ``'<N>D'`` is produced per entry ``N``;
        ``N`` is the number of sessions the returns column is shifted
        back by. Default is ``(1, 5, 10)``.

    Returns
    -------
    merged : pd.DataFrame
        A (date, asset)-indexed DataFrame with the columns:
            'factor'
                Values produced by ``factor``.
            'factor_quantile'
                Quantile bucket of the factor value on that date.
            '<N>D' (one per entry of ``returns_lengths``)
                ``Returns(window_length=2)`` shifted back by ``N``
                observations. These are single-day returns observed N
                sessions later, not cumulative N-day returns.
        Rows containing a NaN in any column are dropped.
    """
    calendar = domain.calendar

    # Map each input date onto a trading session of the domain's calendar
    # (per the original note: the same day when given before the open,
    # otherwise the next session).
    start_ts = pd.Timestamp(start_date, tz='UTC')
    start_date = calendar.minute_to_session_label(start_ts)
    end_ts = pd.Timestamp(end_date, tz='UTC')
    end_date = calendar.minute_to_session_label(end_ts)

    if factor_screen is None:
        factor_screen = factor.notnull()

    # --- Factor pipeline -------------------------------------------------
    # Two columns per asset: the raw factor score and the quantile bucket
    # the score falls into among the screened assets.
    factor_columns = {
        'factor': factor,
        'factor_quantile': factor.quantiles(quantiles, mask=factor_screen),
    }
    factor_pipe = Pipeline(
        factor_columns,
        screen=factor_screen,
        domain=domain,
    )
    factor_results = run_pipeline(factor_pipe, start_date, end_date, chunksize=250)

    # --- Returns pipeline ------------------------------------------------
    # One column per requested horizon. Every column holds the same
    # two-observation return: the horizons only differ in how far the
    # column is shifted back further down, so nothing here is cumulative.
    column_order = ['{}D'.format(length) for length in returns_lengths]
    returns_cols = {
        colname: Returns(window_length=2) for colname in column_order
    }
    returns_pipe = Pipeline(returns_cols, domain=domain)

    # The returns window has to extend past the factor window by the
    # longest horizon so that the back-shifted values still cover it.
    returns_start_date = start_date
    returns_end_date = end_date + calendar.day * max(returns_lengths)
    raw_returns = run_pipeline(returns_pipe, returns_start_date, returns_end_date, chunksize=500)

    # --- Align returns with factor values --------------------------------
    # Shift the '1D' column back by one observation, '5D' by five, etc.
    shifted_returns = {
        name: backshift_returns_series(raw_returns[name], length)
        for name, length in zip(column_order, returns_lengths)
    }

    # Re-index the shifted columns onto the factor results' index.
    merged_returns = pd.DataFrame(
        data=shifted_returns,
        index=factor_results.index,
        columns=column_order,
    )

    # Factor columns on the left, shifted returns on the right.
    merged = pd.concat([factor_results, merged_returns], axis=1)
    merged.index.set_names(['date', 'asset'], inplace=True)

    # Keep only rows that have every column populated.
    merged = merged.dropna(how='any')

    # Add a Business Day Offset to the DateTimeIndex
    merged.index.levels[0].freq = pd.tseries.offsets.BDay()

    # Layout of the result: index (date, asset); columns 'factor',
    # 'factor_quantile', then one shifted returns column per horizon
    # ('1D', '2D', ...).
    return merged

def backshift_returns_series(series, N):
    """Shift a multi-indexed series backwards by N observations in the first level.

    This can be used to convert backward-looking returns into a
    forward-returns series: the value stored at the k-th date of the first
    index level is re-labelled with the (k - N)-th date.

    Parameters
    ----------
    series : pd.Series
        Series indexed by a two-level MultiIndex. The index is assumed to be
        sorted by its first level (the code relies on ``searchsorted`` and
        declares ``sortorder=1`` on the result).
    N : int
        Number of first-level positions to shift back by. Must be >= 0.

    Returns
    -------
    pd.Series
        Series without the observations of the first N dates; the remaining
        values are unchanged and carry the date N positions earlier. Empty
        when no observation lies beyond the first N dates.

    Raises
    ------
    ValueError
        If ``N`` is negative.
    AssertionError
        If the first retained observation does not sit on the N-th date
        level.
    """
    if N < 0:
        raise ValueError("N must be non-negative, got {!r}".format(N))

    ix = series.index
    dates, sids = ix.levels

    # pandas renamed ``MultiIndex.labels`` to ``MultiIndex.codes`` and later
    # removed the old name; use whichever this pandas version provides.
    codes_name = 'codes' if hasattr(ix, 'codes') else 'labels'
    date_labels, sid_labels = (
        np.asarray(level_codes, dtype=np.int64)
        for level_codes in getattr(ix, codes_name)
    )

    # Output data will remove the first M rows, where M is the index of the
    # last record with one of the first N dates.
    cutoff = date_labels.searchsorted(N)
    if cutoff == len(date_labels):
        # Nothing lies beyond the first N dates: return an empty series
        # instead of failing on the indexing below.
        return pd.Series(data=series.values[:0], index=ix[:0])

    # Output date labels will contain all but the last N dates. Slice by
    # explicit length because ``dates[:-0]`` would be empty for N == 0.
    new_dates = dates[:len(dates) - N]
    new_date_labels = date_labels[cutoff:] - N
    new_sid_labels = sid_labels[cutoff:]
    new_values = series.values[cutoff:]
    assert new_date_labels[0] == 0, (
        "first retained observation is not on the N-th date level"
    )

    index_kwargs = {codes_name: [new_date_labels, new_sid_labels]}
    new_index = pd.MultiIndex(
        levels=[new_dates, sids],
        sortorder=1,
        names=ix.names,
        **index_kwargs
    )
    return pd.Series(data=new_values, index=new_index)

結果を算出する関数群。これも触る必要はない。


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

import empyrical as ep
import alphalens as al
import pyfolio as pf

def compute_turnover(df):
    """Return, per date, the summed absolute day-over-day change of ``df``.

    ``df`` is expected to be a (date, asset)-indexed series of factor
    values, e.g.::

        date                       asset
        2015-06-01 00:00:00+00:00  Equity(... [4118])   -0.000231
                                   Equity(... [7458])   -0.000093

    Steps: drop missing values, pivot the assets into columns (one row per
    date), drop dates with no values at all, treat remaining gaps as 0,
    take the row-to-row difference, and sum its absolute values across the
    assets of each row. The first date has no predecessor, so its
    differences are all NaN and its sum comes out as 0.
    """
    # One row per date, one column per asset.
    by_date = df.dropna().unstack()
    by_date = by_date.dropna(how='all').fillna(0)

    # Absolute change of every asset's value versus the previous row.
    abs_change = by_date.diff().abs()

    # Collapse the asset columns: one number per date.
    return abs_change.sum(axis=1)

def get_max_median_position_concentration(expos):
    """Return per-date percentiles of the factor values in ``expos``.

    Parameters
    ----------
    expos : pd.Series
        Daily factor results indexed by (date, asset), e.g.::

            date                       asset
            2015-06-01 00:00:00+00:00  Equity(... [4118])   -0.000231
                                       Equity(... [7458])   -0.000093

    Returns
    -------
    pd.DataFrame
        One row per value of the first index level (date) and one column
        per requested quantile (0.05, 0.25, 0.5, 0.75, 0.95), e.g.::

                                           0.05      0.25      0.50      0.75      0.95
            date
            2015-06-01 00:00:00+00:00 -0.002378 -0.000809  0.000150  0.000846  0.001991

    Notes
    -----
    Despite the function name, no maximum and no separate long/short
    statistics are computed; the previously unused ``longs`` / ``shorts``
    selections have been removed.
    """
    # Quantiles are taken over all assets of a date, long and short alike;
    # unstack() then moves the quantile level into the columns.
    per_date = expos.groupby(level=0)
    return per_date.quantile([.05, .25, .5, .75, .95]).unstack()

def compute_factor_stats(factor_data_total, periods=range(1, 15)):
    """Collect the statistics shown on the overview tear sheet.

    Parameters
    ----------
    factor_data_total : pd.DataFrame
        (date, asset)-indexed frame with a ``factor`` column and the
        forward-returns columns, in the form accepted by
        ``al.performance.factor_returns``.
    periods : iterable of int, optional
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    dict
        'factor_data_total'
            The input frame, passed through unchanged.
        'portfolio_returns_total'
            Output of ``al.performance.factor_returns`` with its columns
            renamed from labels such as ``'3D'`` to the integer ``3`` and
            each column shifted forward by that many rows.
        'delay_sharpes_total'
            ``ep.sharpe_ratio`` applied to every shifted column.
        'turnover'
            Result of ``compute_turnover`` on the factor column.
        'n_holdings'
            Number of factor values per date.
        'perc_holdings'
            Result of ``get_max_median_position_concentration`` on the
            factor column.
    """
    # Daily factor portfolio returns, one column per horizon label
    # ('1D', '2D', ...); see
    # http://quantopian.github.io/alphalens/alphalens.html?highlight=performance#alphalens.performance.factor_returns
    portfolio_returns_total = al.performance.factor_returns(factor_data_total)

    def _label_to_days(label):
        # '10D' -> 10: strip the trailing unit character.
        return int(label[:-1])

    portfolio_returns_total.columns = portfolio_returns_total.columns.map(_label_to_days)

    # Move each column forward by its own delay, so that a row holds the
    # profit and loss realised that many days after the portfolio was
    # formed.
    for delay in portfolio_returns_total.columns:
        delayed_column = portfolio_returns_total[delay].shift(delay)
        portfolio_returns_total[delay] = delayed_column

    # Sharpe ratio of every shifted column.
    delay_sharpes_total = portfolio_returns_total.apply(ep.sharpe_ratio)

    factor = factor_data_total.factor

    stats = {
        'factor_data_total': factor_data_total,
        'portfolio_returns_total': portfolio_returns_total,
        'delay_sharpes_total': delay_sharpes_total,
        'turnover': compute_turnover(factor),
        'n_holdings': factor.groupby(level=0).count(),
        'perc_holdings': get_max_median_position_concentration(factor),
    }
    return stats

def plot_overview_tear_sheet(factor_data, periods=range(1, 15)):
    """Plot a four-panel overview of a factor and return the figure and stats.

    Parameters
    ----------
    factor_data : pd.DataFrame
        Frame with a ``factor`` column and an index level named ``'date'``,
        as produced by ``evaluate_factor``. NOTE: the ``factor`` column is
        overwritten in place with its normalized values, so the caller's
        frame is modified.
    periods : iterable of int, optional
        Forwarded to ``compute_factor_stats``.

    Returns
    -------
    (fig, factor_stats)
        The matplotlib figure and the dict returned by
        ``compute_factor_stats``.
    """
    # We assume portfolio weights: divide each factor score by the per-date
    # sum of absolute factor scores, so absolute weights sum to 1 per date.
    factor_data['factor'] = factor_data.factor.div(factor_data.abs().groupby(level='date').sum()['factor'])
    
    fig = plt.figure(figsize=(16, 16))
    gs = plt.GridSpec(3, 4)
    ax1 = plt.subplot(gs[0:2, 0:2])
    
    factor_stats = compute_factor_stats(factor_data, periods=periods)
    
    # Plot 1
    # Bar chart of the Sharpe ratio for each delay.
    pd.DataFrame({'total': factor_stats['delay_sharpes_total']}).plot.bar(ax=ax1)
    ax1.set(xlabel='delay', ylabel='IR')

    # Plot 2
    # Line chart of cumulative returns for delays 1 through 4
    # (range(1, 5) excludes 5).
    ax2a = plt.subplot(gs[0:2, 2:4])
    delay_cum_rets_total = factor_stats['portfolio_returns_total'][list(range(1, 5))].apply(ep.cum_returns)
    delay_cum_rets_total.plot(ax=ax2a)
    ax2a.set(title='Total returns', ylabel='Cumulative returns')
    
    # Plot 3, left axis: number of holdings per day.
    ax6 = plt.subplot(gs[-1, 0:2])
    factor_stats['n_holdings'].plot(color='b', ax=ax6)
    ax6.set_ylabel('# holdings', color='b')
    ax6.tick_params(axis='y', labelcolor='b')

    # Plot 3, right axis: turnover, i.e. how much the factor values change
    # from one day to the next.
    # Original author's note (unverified): a smaller change presumably means
    # the factor is more stable across market conditions.
    ax62 = ax6.twinx()
    factor_stats['turnover'].plot(color='r', ax=ax62)
    ax62.set_ylabel('turnover', color='r')
    ax62.tick_params(axis='y', labelcolor='r')
    
    # Plot 4: 'perc_holdings', the per-date 5/25/50/75/95 percentiles of the
    # factor values.
    # Original author's note: no trading simulation is run here; the intended
    # portfolio buys the names in the top quantile and sells those in the
    # bottom quantile.
    ax7 = plt.subplot(gs[-1, 2:4])
    factor_stats['perc_holdings'].plot(ax=ax7)
    ax7.set(ylabel='Long/short perc holdings')
    
    gs.tight_layout(fig)
    
    return fig, factor_stats

Universe definition

日本株用のユニバースを作っているところ。ここもさわらなくていい。

ユニバースに入っている銘柄：

+ 【過去2週間の最小出来高✕終値】のトップ30％（最小出来高が0より大きい銘柄のみ）
+ 最小出来高 ✕ 終値を算出することで、株価が極端に低いために出来高だけが大きくなっている超低位株をユニバースから外している
+ Japan（JP_EQUITIES）: Tokyo Stock Exchange, JASDAQ, Osaka Exchange, Nagoya Stock Exchange, Fukuoka Stock Exchange, Sapporo Securities Exchange



In [None]:
# Custom factor: the smallest traded volume within the trailing window
# (10 rows, described in this notebook as the last two weeks).
class MinVolume(CustomFactor):
    """Minimum of ``EquityPricing.volume`` over a 10-row window."""

    inputs = [EquityPricing.volume]
    window_length = 10

    def compute(self, today, asset_ids, out, values):
        # Column-wise minimum over the window. This is a plain minimum,
        # so NaNs are not skipped.
        out[:] = values.min(axis=0)

# Universe: assets whose (close price * minimum volume) falls between the
# 70th and 100th percentile, ranked only among assets whose minimum volume
# is positive. Multiplying by price rules out penny stocks that trade in
# huge volume.
price = EquityPricing.close.latest
volume_min = MinVolume()
univ_filter = (price * volume_min).percentile_between(
    70, 100, mask=(volume_min > 0)
)

Enter your alpha factor here. Make sure to delete the following cell before making your submission!

In [None]:
# Our alpha factor is the latest value of the FactSet Fundamentals
# `roe_af` field.
alpha_factor = factset.Fundamentals.roe_af.latest

# Winsorize within the universe, clipping at the 5th and 95th percentiles.
alpha_winsorized = alpha_factor.winsorize(min_percentile=0.05,
                                          max_percentile=0.95,
                                          mask=univ_filter)

# Zscore to get long and short (positive and negative) alphas to use as weights
alpha_zscore = alpha_winsorized.zscore()

In [None]:
# Evaluate the z-scored alpha on Japanese equities to get Alphalens-formatted
# data; range(1, 15) requests the shifted returns columns '1D' through '14D'.
al_data = evaluate_factor(
    factor=alpha_zscore,
    domain=JP_EQUITIES,
    start_date='2015-06-1',
    end_date='2016-06-1',
    factor_screen=univ_filter,
    returns_lengths=range(1, 15),
)

In [None]:
# Draw the overview tear sheet and keep the figure and the stats dict.
# Note that plot_overview_tear_sheet overwrites al_data['factor'] in place.
# The trailing semicolon presumably suppresses the cell's echoed output.
fig, factor_stats = plot_overview_tear_sheet(al_data);

In [None]:
# List the entries of the stats dict returned by plot_overview_tear_sheet.
factor_stats.keys()

In [None]:
# First rows of the factor data frame that was passed into the tear sheet.
factor_stats['factor_data_total'].head()

In [None]:
# Sharpe ratio per delay (one entry per shifted returns column).
factor_stats['delay_sharpes_total']

In [None]:
# First rows of the per-date turnover series from compute_turnover.
factor_stats['turnover'].head()

In [None]:
# First 20 rows of the delay-shifted portfolio returns; the leading rows of
# each column are NaN because column i was shifted forward by i rows.
factor_stats['portfolio_returns_total'].head(20)

In [None]:
# First rows of the per-date count of factor values.
factor_stats['n_holdings'].head()

In [None]:
# First rows of the per-date factor-value percentiles (5/25/50/75/95).
factor_stats["perc_holdings"].head()

In [None]:

# Flatten the (date, asset) index into ordinary columns for easier filtering.
x = factor_stats['factor_data_total'].reset_index()
x.head()

In [None]:
# The `sid` attribute of the asset in the first row.
x["asset"][0].sid

In [None]:
# Look up an asset by its numeric id.
# NOTE(review): `sid` is neither defined nor imported anywhere in this file;
# confirm it is provided by the notebook environment.
sid(1178883450164305)

In [None]:
# First rows belonging to the same asset as the first row of `x`.
x[x["asset"] == x["asset"][0]].head()

In [None]:
# All rows for the asset returned by the sid() lookup.
# NOTE(review): depends on `sid` being available in the environment.
x[x["asset"] == sid(1178883450164305)]