In [1]:
import sys
import os
cwd = os.path.dirname(os.getcwd())
sys.path.append(cwd)
import numpy as np
import tushare as ts
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import analyst as al
import assistant as at
import messenger as ms
import datetime
import trader as tr
import random
import tensorflow as tf



In [2]:
sns.set()

In [3]:
# 买盘与卖盘比例
def buy_rate(df):
    result = df['type'].value_counts()['买盘'] / len(df)
    return result

In [4]:
# 根据条件筛选DF
def _subset(df, cls='volume', percent=0.05, top=True):
    if top == True:
        sig = df[cls].quantile(1-percent)
        df = df.loc[df[cls]>sig, :]
    if top == False:
        sig = df[cls].quantile(percent)
        df = df.loc[df[cls]<sig, :]
    return df

In [5]:
# 集中度
def central_tendency(df, cls='volume', percent=0.05):
    sub = _subset(df, cls, percent)
    result = sub[cls].sum() / df[cls].sum()
    return result

In [6]:
# 资金流向
def net_flow(df):
    in_amount = df.loc[df['type']=='买盘', :]['amount'].sum()
    out_amount = df.loc[df['type']=='卖盘', :]['amount'].sum()
    net_amount = in_amount - out_amount
    return net_amount

In [7]:
# 价格变化
def act_price_change(df):
    price_list = df['price'].tolist()
    price_change = price_list[0] - price_list[-1]
    return price_change    

In [8]:
# 资金导致的价格变动
def theo_price_change(df):
    result = net_flow(df) / df['volume'].sum() / 100
    return result

In [9]:
# 资金振幅
def max_price_change(df):
    price_list = df['price'].tolist()
    result = max(price_list) - min(price_list)
    return result

In [18]:
# 获的characteristics
def get_characteristics(df, top_percent=0.05, bottom_percent=0.5, cls='volume'):
    dic = {}
    dic['act_price_change'] = act_price_change(df)
    dic['theo_price_change'] = theo_price_change(df)
    dic['resistence'] = dic['theo_price_change'] - dic['act_price_change']
    dic['buy_rate'] = buy_rate(df)
    dic['central_tendency'] = central_tendency(df, cls, top_percent)
    dic['net_flow'] = net_flow(df)
    dic['close'] = df['price'].tolist()[0]
    
    df_top = _subset(df, cls, top_percent)
    dic['top_buy_rate'] = buy_rate(df_top)
    dic['top_net_flow'] = net_flow(df_top)
    
    df_bottom = _subset(df, cls, bottom_percent, False)
    dic['bottom_buy_rate'] = buy_rate(df_bottom)
    dic['bottom_net_flow'] = net_flow(df_bottom)
    
    return dic

In [19]:
def get_dataframe(code, start_date='', days=30, top_percent=0.05, bottom_percent=0.5, cls='volume', multi_threads=20):
    days_list = at.opening_days(days=days, start_date=start_date, multi_threads=multi_threads)
    lis = []
    for i in days_list:
        try:
            dicc = {}
            dicc['date'] = i
            dicc['code'] = code
            df = ms.get_tick_data(code, i)
            dic = get_characteristics(df, top_percent, bottom_percent, cls)
            dic.update(dicc)
            lis.append(dic)
        except:
            pass
    frame = pd.DataFrame(lis)
    frame = frame[[
        'code', 'date', 'close', 'act_price_change', 'theo_price_change', 'resistence', 'buy_rate', 'central_tendency', 
        'net_flow', 'top_buy_rate', 'top_net_flow', 'bottom_buy_rate', 'bottom_net_flow'
    ]]
    return frame

In [34]:
def to_same_scale(val, lis):
    mean_val = np.mean(lis)
    min_val = min(lis)
    max_val = max(lis)
    result = (val - mean_val) / (max_val - min_val)
    return result

In [52]:
# 把DF分成若干等份
def split_frame(df, length):
    df_list = []
    for i in range(0, len(df)-length):
        df_list.append(df.loc[i: i+length, :])
    return df_list

In [59]:
# 计算特征得分
def get_score(df, col, thread=0):
    score = len(df[df[col]>thread]) / len(df)
    return score

In [55]:
# 提取特征函数
def get_features(df):
    dic = {}
    dic['emotion'] = get_score(df, 'smoothed difference')
    dic['cash'] = get_score(df, 'smoothed actual')
    dic['close'] = np.average(df['close'])
    dic['volume'] = np.max(df['volume'])
    return dic

In [24]:
dff = get_dataframe('601318', '2017-11-15', 300)

In [91]:
dfff = get_dataframe('000625', '2017-11-15', 300)

In [103]:
t.update('600313')
dffff = get_dataframe('600313', '2017-11-15', 300)

||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||  100%


In [152]:
dfc = pd.DataFrame.copy(dfff)

In [153]:
dfc['net_flow'] = dfc['net_flow'].apply(lambda x: to_same_scale(x, dfc['net_flow'].tolist()))
dfc['top_net_flow'] = dfc['top_net_flow'].apply(lambda x: to_same_scale(x, dfc['net_flow'].tolist()))
dfc['bottom_net_flow'] = dfc['bottom_net_flow'].apply(lambda x: to_same_scale(x, dfc['bottom_net_flow'].tolist()))
dfc['resistence'] = dfc['resistence'].apply(lambda x: to_same_scale(x, dfc['resistence'].tolist()))

In [154]:
dfc_list = split_frame(dfc, 5)

In [155]:
up_thread = 0.01
feature_list = []
for i in dfc_list:
    _dict = {}
    _dict['code'] = i['code'].tolist()[-1]
    _dict['date'] = i['date'].tolist()[-1]
    close_list = i['close'].tolist()
    _dict['close'] = close_list[-1]
    _dict['up'] = int(((close_list[-1] - close_list[0]) / close_list[0]) > up_thread)
    _dict['resistence'] = get_score(i, 'resistence')
    _dict['buy_rate'] = get_score(i, 'buy_rate', 0.5)
    _dict['central_tendency'] = get_score(i, 'central_tendency', 0.4)
    _dict['net_flow'] = get_score(i, 'net_flow')
    _dict['top_buy_rate'] = get_score(i, 'top_buy_rate', 0.5)
    _dict['top_net_flow'] = get_score(i, 'top_net_flow')
    _dict['bottom_buy_rate'] = get_score(i, 'bottom_buy_rate', 0.5)
    _dict['bottom_net_flow'] = get_score(i, 'bottom_net_flow')
    feature_list.append(_dict)

In [156]:
feature_df = pd.DataFrame(feature_list)

In [157]:
x_col = ['resistence', 'buy_rate', 'central_tendency', 'net_flow', 'top_buy_rate', 'top_net_flow', 'bottom_buy_rate', 'bottom_net_flow']
y_col = 'up'
x_train = feature_df.loc[:200, x_col]
y_train = feature_df.loc[:200, y_col]
x_test = feature_df.loc[200:, x_col]
y_test = feature_df.loc[200:, y_col]

In [158]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
clf.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [159]:
print(clf.score(x_train, y_train))
print(clf.score(x_test, y_test))

0.791044776119
0.795698924731


In [160]:
feature_df['up'].value_counts()[1]/len(feature_df)

0.28668941979522183