In [4]:
import pandas as pd
import numpy as np
import itertools

# 2. 시각화
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns


# 3. 유틸
# tqdm 패키지는 반복문에 대해 얼마나 진척되었는지를 가시적으로 확인할 수 있도록 도와줍니다.
# https://github.com/tqdm/tqdm 사용법은 정말 간단합니다.
from tqdm.auto import tqdm


# 4. 설정
# 경고가 나와서, 출력이 많아지지 않기 위해 ignore를 설정해주었습니다.
import warnings
warnings.filterwarnings('ignore')


# 5. stats models
# 시계열 모델을 위한 ARIMA를 임포트 해주었습니다.
from statsmodels.tsa.arima_model import ARIMA
from pmdarima.arima import auto_arima

In [21]:
test_x = pd.read_csv("test_x_df.csv")


In [5]:
train_x = pd.read_csv("train_x_df.csv")
train_y = pd.read_csv("train_y_df.csv")

In [6]:
def make_mfi(my_df):
    period = 14
    result = []

    for sample_id in my_df['sample_id'].unique().tolist():
      df = my_df[my_df['sample_id'] == sample_id]

      df['typical_price'] = (df['close'] + df['high'] + df['low']) / 3
      df['money_flow'] = df['typical_price'] * df['volume']
      df['price_diff'] = df.groupby(['sample_id'])['typical_price'].diff(1).shift(-1)
      df['pf'] = np.where(df['price_diff'] > 0, df['money_flow'], 0)
      df['nf'] = np.where(df['price_diff'] < 0, df['money_flow'], 0)

      # 상승분의 14일 평균을 구해줍니다.
      # 하락분의 14일 평균을 구해줍니다.
      df["pmf"] = df["pf"].rolling(window=period, min_periods=period).sum()
      df["nmf"] = df["nf"].rolling(window=period, min_periods=period).sum()

      MFI = df['pmf'] / (df['pmf'] + df['nmf'])
      df['mfi'] = MFI
      df['mfi'] = df['mfi'].shift(1)

      result.append(df)
      
    output = pd.concat(result, axis=0)

    return output

In [22]:
test_x = make_mfi(test_x)

In [None]:
test_x

In [8]:
train_x = make_mfi(train_x[train_x['sample_id'] < 760])
train_y = train_y[train_y['sample_id'] < 760]

In [43]:
train_x

Unnamed: 0,sample_id,time,coin_index,open,high,low,close,volume,quote_av,trades,tb_base_av,tb_quote_av,typical_price,money_flow,price_diff,pf,nf,pmf,nmf,mfi
0,0,0,0,0.993147,0.993546,0.992857,0.992966,1379.478027,3778.584961,11.240029,329.655548,903.091614,0.993123,1369.991487,-0.000133,0.000000,1369.991487,,,
1,0,1,0,0.993256,0.993546,0.992712,0.992712,3438.807373,9419.426758,11.602611,1363.999268,3737.512695,0.992990,3414.701802,0.001039,3414.701802,0.000000,,,
2,0,2,0,0.992748,0.994815,0.992458,0.994815,3714.949463,10173.972656,19.579407,1222.802856,3350.688721,0.994030,3692.769439,0.000761,3692.769439,0.000000,,,
3,0,3,0,0.994779,0.995286,0.994090,0.994996,2430.264648,6666.315430,15.591008,520.159546,1426.920776,0.994791,2417.605282,-0.000363,0.000000,2417.605282,,,
4,0,4,0,0.994561,0.994779,0.993727,0.994779,3062.139404,8395.172852,15.228427,2166.334473,5939.279785,0.994428,3045.078194,0.000520,3045.078194,0.000000,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
551995,399,1375,6,1.006870,1.008103,1.006341,1.006518,21.015982,1201.546631,2.060948,11.966638,684.212280,1.006987,21.162826,-0.002936,0.000000,21.162826,114.031268,149.229067,0.482678
551996,399,1376,6,1.006518,1.006518,1.002642,1.002995,29.148287,1661.261963,2.871235,9.344916,532.389954,1.004051,29.266378,-0.003347,0.000000,29.266378,114.031268,176.229006,0.433150
551997,399,1377,6,1.002995,1.002995,0.996829,1.002290,88.265602,5008.012695,10.551348,34.899464,1979.980103,1.000705,88.327796,0.000528,88.327796,0.000000,202.359065,173.961499,0.392859
551998,399,1378,6,1.002114,1.003347,1.000176,1.000176,28.551916,1624.406128,3.434913,13.287203,755.970886,1.001233,28.587120,-0.004932,0.000000,28.587120,202.359065,182.177143,0.537731


In [24]:
TEST_SAMPLE_ID_LIST = test_x["sample_id"].unique().tolist()

In [9]:
TRAIN_SAMPLE_ID_LIST = train_x["sample_id"].unique().tolist()

In [10]:
def get_typical_price(df,sample_id):    
    return df[df["sample_id"] == sample_id]['typical_price'].values

In [11]:
def get_mfi(df,sample_id):
  return df[df["sample_id"] == sample_id]['mfi'].values

In [12]:
# AIC 값이 최소인 p,d,q 값 직접 구하기
def model_fit(df, sample_id_list):
    result = []

    for sample_id in tqdm(sample_id_list):

        price_x = get_typical_price(df, sample_id)
        mfi_series = get_mfi(df, sample_id)

        # 2. ARIMA
        # 1) 모델 정의
        ARIMA_MODEL = {}
        ARIMA_MODEL_FIT = {}

        # 2) AR 모델 적용
        try:
            ARIMA_MODEL = ARIMA(price_x, order = (5,1,1))
            ARIMA_MODEL_FIT = ARIMA_MODEL.fit(trend = 'nc', full_output = True, disp = True)

        # 3) 수렴하지 않을 경우 p d q 를 1, 1, 0으로 사용
        except:
            ARIMA_MODEL = ARIMA(price_x, order = (1,1,0))
            ARIMA_MODEL_FIT = ARIMA_MODEL.fit(trend = 'nc', full_output = True, disp = True)

        # 4) ARIMA 예측
        ARIMA_FORECAST  = ARIMA_MODEL_FIT.predict(1,120, typ='levels')

        # 3. 데이처 처리
        # 1) 최대 부분인 인덱스를 찾는데 해당 시점에 매도를 진행합니다.
        sell_time = np.argmax(ARIMA_FORECAST)

        # 2) 최대값을 찾습니다.
        max_val = np.max(ARIMA_FORECAST)

        mfi_last_val = mfi_series[1379]

        # 4. 투자 전략
        buy_quantity = 0

        # 1) typical_price가 1.1 이상이면 투자합니다.
        if  max_val > 1:
            buy_quantity = 1

        # if mfi_last_val > 0.7:
        #     buy_quantity = 0
            
        # if mfi_last_val < 0.2:
        #     buy_quantity = 1

        # 5. 결과
        result_list = [
                        sample_id,
                        buy_quantity,
                        sell_time,
                        max_val,
                    ]

        result.append(result_list)

    return result

In [13]:
# result = model_fit(test_x, TEST_SAMPLE_ID_LIST)
result = model_fit(train_x, TRAIN_SAMPLE_ID_LIST)

100%|██████████| 760/760 [06:27<00:00,  1.96it/s]


In [14]:
submit_columns = [
                  "sample_id", 
                  "buy_quantity", 
                  "sell_time",
                  "max_val"
                  ]

submit = pd.DataFrame(data=result, columns=submit_columns)

In [15]:
#투자할 sample_id 갯수 확인
submit[submit["buy_quantity"] == 1].shape[0]  

456

In [28]:
submit.buy_quantity.value_counts()

0    731
1     29
Name: buy_quantity, dtype: int64

In [17]:
sample_id = submit[submit["buy_quantity"] == 1]['sample_id']
sample_id

0        0
1        1
2        2
3        3
4        4
      ... 
755    755
756    756
757    757
758    758
759    759
Name: sample_id, Length: 456, dtype: int64

In [18]:
for i in sample_id:
    dfy = train_y[train_y['sample_id'] == i]
    sell_time = submit[submit['sample_id'] == i]['sell_time'].values[0]
    print(dfy[dfy['time'] == sell_time]['close'].values[0], end=' ')
    val = submit[submit['sample_id'] == i]['max_val'].values[0]
    print(val)

1.0088469982147217 1.0097849075114167
1.0005611181259155 1.0470076342228418
0.9998448491096495 1.0114892632651848
1.015200138092041 1.0502568824795357
0.9852941036224364 1.2198848844762473
0.9973055124282836 1.0172897974650066
1.0019501447677612 1.0092321956730397
0.9937760233879088 1.0017716213056902
0.9985129833221436 1.0062921444574993
1.002584457397461 1.016767420518018
1.006462335586548 1.005882862222097
1.0139777660369873 1.0172126360850502
1.0035873651504517 1.0660667791926175
1.0031946897506714 1.0272566516988653
1.00019109249115 1.0093605978013835
1.0009727478027344 1.0184824466705322
0.999977171421051 1.0095606825493406
0.9977954626083374 1.007828874929586
1.0001671314239502 1.0289630095163982
1.0037580728530884 1.009050268157268
0.9868937730789183 1.0287486374688855
1.0009417533874512 1.1659874825535426
0.998242735862732 1.0014343628024724
0.9818181991577148 1.181508283997616
0.9953882098197936 1.033817502777302
1.0008950233459473 1.0276854866316079
0.9903112649917604 1.0009

In [19]:
def df2d_to_answer(df_2d):
    # valid_y_df로부터
    # open 가격 정보가 포함된
    # [샘플 수, 120분] 크기의 
    # 2차원 array를 반환하는 함수
    feature_size = df_2d.iloc[:,2:].shape[1]
    time_size = len(df_2d.time.value_counts())
    sample_size = len(df_2d.sample_id.value_counts())
    sample_index = df_2d.sample_id.value_counts().index
    array_2d = df_2d.open.values.reshape([sample_size, time_size])
    sample_index = list(sample_index)
    return array_2d, sample_index


def COIN(y_df, submission, df2d_to_answer = df2d_to_answer):
    # 2차원 데이터프레임에서 open 시점 데이터만 추출하여 array로 복원
    # sample_id정보를 index에 저장
    y_array, index = df2d_to_answer(y_df)

    
    # index 기준으로 submission을 다시 선택
    submission = submission.set_index(submission.columns[0])
    submission = submission.iloc[index, :]    
    
    # 초기 투자 비용은 10000 달러
    total_momey      = 10000 # dolors
    total_momey_list = []
    
    # 가장 처음 sample_id값
    start_index = submission.index[0]
    for row_idx in submission.index:
        sell_time  = submission.loc[row_idx, 'sell_time']
        buy_price  = y_array[row_idx - start_index, 0]
        sell_price = y_array[row_idx - start_index, sell_time]
        buy_quantity = submission.loc[row_idx, 'buy_quantity'] * total_momey
        residual = total_momey - buy_quantity
        ratio = sell_price / buy_price
        total_momey = buy_quantity * ratio * 0.9995 * 0.9995 + residual        
        total_momey_list.append(total_momey)
        
    return total_momey, total_momey_list

In [20]:
total_money, total_momey_list = COIN(train_y, submit)

In [21]:
total_money

5025.8024118951935

In [23]:
total_momey_list

[10092.11634282645,
 10008.127292207366,
 9900.0999782314,
 10168.124147650953,
 10161.103837159251,
 10161.103837159251,
 10138.279528284489,
 10138.279528284489,
 10149.95007167222,
 10174.956075336531,
 10164.783663000215,
 10164.783663000215,
 10161.334649406785,
 10142.752355121567,
 10107.674538451774,
 10036.91433664188,
 9949.792717418946,
 9977.959235881306,
 9719.37285649186,
 9731.813789120433,
 9710.927638040243,
 9727.379085175347,
 9727.379085175347,
 9727.379085175347,
 9730.63252768466,
 9694.642012672703,
 9896.00481553652,
 9819.113739343113,
 9819.113739343113,
 9819.113739343113,
 9899.797625683204,
 9895.481398070031,
 9895.481398070031,
 9850.7549262916,
 9814.164778687025,
 9723.811376097698,
 9725.059455197696,
 9750.73764226815,
 9762.974713577032,
 9762.974713577032,
 9762.974713577032,
 9762.974713577032,
 9762.974713577032,
 9762.974713577032,
 9746.449849879451,
 9746.449849879451,
 9646.604267706463,
 9646.604267706463,
 9646.604267706463,
 9188.7712562706

In [38]:
FILE_NAME = "/0617_AUTO_ARIMA_MFI_20UNDER_MAXVAL_110_SUBMIT.csv"

In [39]:
SUBMIT_PATH = "./data"
RESULT_PATH = SUBMIT_PATH + FILE_NAME

submit.to_csv(RESULT_PATH, index=False)

In [None]:
def plot_series(x_series, y_series):
    #입력 series와 출력 series를 연속적으로 연결하여 시각적으로 보여주는 코드 입니다.
    plt.plot(x_series, label = 'input_series')
    plt.plot(np.arange(len(x_series), len(x_series)+len(y_series)),
             y_series, label = 'output_series')
    plt.axhline(1, c = 'red')
    plt.legend()