In [1]:
import pandas as pd
import numpy as np
import itertools

# 2. 시각화
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns


# 3. 유틸
# tqdm 패키지는 반복문에 대해 얼마나 진척되었는지를 가시적으로 확인할 수 있도록 도와줍니다.
# https://github.com/tqdm/tqdm 사용법은 정말 간단합니다.
from tqdm.auto import tqdm


# 4. 설정
# 경고가 나와서, 출력이 많아지지 않기 위해 ignore를 설정해주었습니다.
import warnings
warnings.filterwarnings('ignore')


# 5. stats models
# 시계열 모델을 위한 ARIMA를 임포트 해주었습니다.
from statsmodels.tsa.arima_model import ARIMA
from pmdarima.arima import auto_arima

In [2]:
DATA_PATH = "./data"

In [2]:
train_x = pd.read_csv("train_x_df.csv")
train_y = pd.read_csv("train_y_df.csv")
# test_x = pd.read_csv("test_x_df.csv")

In [3]:
def make_mfi(my_df):
    period = 14
    result = []

    for sample_id in my_df['sample_id'].unique().tolist():
      df = my_df[my_df['sample_id'] == sample_id]

      df['typical_price'] = (df['close'] + df['high'] + df['low']) / 3
      df['money_flow'] = df['typical_price'] * df['volume']
      df['price_diff'] = df.groupby(['sample_id'])['typical_price'].diff(1).shift(-1)
      df['pf'] = np.where(df['price_diff'] > 0, df['money_flow'], 0)
      df['nf'] = np.where(df['price_diff'] < 0, df['money_flow'], 0)

      # 상승분의 14일 평균을 구해줍니다.
      # 하락분의 14일 평균을 구해줍니다.
      df["pmf"] = df["pf"].rolling(window=period, min_periods=period).sum()
      df["nmf"] = df["nf"].rolling(window=period, min_periods=period).sum()

      MFI = df['pmf'] / (df['pmf'] + df['nmf'])
      df['mfi'] = MFI
      df['mfi'] = df['mfi'].shift(1)

      result.append(df)
      
    output = pd.concat(result, axis=0)

    return output

In [4]:
train_x = make_mfi(train_x[train_x['sample_id'] < 320])
train_y = make_mfi(train_y[train_y['sample_id'] < 320])
# test_x = make_mfi(test_x)

In [5]:
train_x
# test_x

Unnamed: 0,sample_id,time,coin_index,open,high,low,close,volume,quote_av,trades,tb_base_av,tb_quote_av,typical_price,money_flow,price_diff,pf,nf,pmf,nmf,mfi
0,0,0,0,0.993147,0.993546,0.992857,0.992966,1.379478e+03,3778.584961,11.240029,3.296555e+02,903.091614,0.993123,1.369991e+03,-0.000133,0.000000e+00,1.369991e+03,,,
1,0,1,0,0.993256,0.993546,0.992712,0.992712,3.438807e+03,9419.426758,11.602611,1.363999e+03,3737.512695,0.992990,3.414702e+03,0.001039,3.414702e+03,0.000000e+00,,,
2,0,2,0,0.992748,0.994815,0.992458,0.994815,3.714949e+03,10173.972656,19.579407,1.222803e+03,3350.688721,0.994030,3.692769e+03,0.000761,3.692769e+03,0.000000e+00,,,
3,0,3,0,0.994779,0.995286,0.994090,0.994996,2.430265e+03,6666.315430,15.591008,5.201595e+02,1426.920776,0.994791,2.417605e+03,-0.000363,0.000000e+00,2.417605e+03,,,
4,0,4,0,0.994561,0.994779,0.993727,0.994779,3.062139e+03,8395.172852,15.228427,2.166334e+03,5939.279785,0.994428,3.045078e+03,0.000520,3.045078e+03,0.000000e+00,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
441595,319,1375,7,0.995770,1.002737,0.995521,1.000498,1.536502e+07,617587.375000,2488.181152,1.170627e+07,470406.187500,0.999585,1.535865e+07,-0.000249,0.000000e+00,1.535865e+07,1.197525e+08,3.570702e+07,0.856653
441596,319,1376,7,1.000249,1.000249,0.997761,1.000000,4.870637e+06,195647.390625,845.981567,1.909104e+06,76699.218750,0.999336,4.867405e+06,0.001161,4.867405e+06,0.000000e+00,1.246199e+08,3.439480e+07,0.770313
441597,319,1377,7,0.999751,1.001493,0.999502,1.000498,2.086322e+06,83860.093750,348.345367,2.029246e+06,81565.171875,1.000498,2.087361e+06,0.000415,2.087361e+06,0.000000e+00,1.244979e+08,3.439480e+07,0.783701
441598,319,1378,7,1.000995,1.002488,0.999502,1.000746,5.720814e+06,230307.265625,796.217957,2.625001e+06,105717.906250,1.000912,5.726033e+06,-0.001410,0.000000e+00,5.726033e+06,1.137117e+08,4.012083e+07,0.783534


In [6]:
TRAIN_SAMPLE_ID_LIST = train_x["sample_id"].unique().tolist()
# TEST_SAMPLE_ID_LIST = test_x["sample_id"].unique().tolist()

In [7]:
def get_typical_price(df,sample_id):    
    return df[df["sample_id"] == sample_id]['typical_price'].values

In [9]:
def get_mfi(df,sample_id):
  return df[df["sample_id"] == sample_id]['mfi'].values

In [16]:
# AIC 값이 최소인 p,d,q 값 직접 구하기
def model_fit(df, sample_id_list):
    result = []

    for sample_id in tqdm(sample_id_list):

        price_x = get_typical_price(df, sample_id)
        # 2. ARIMA
        # 1) 모델 정의
        ARIMA_MODEL = {}
        ARIMA_MODEL_FIT = {}

        # 2) AR 모델 적용
        try:
            ARIMA_MODEL = ARIMA(price_x, order = (4,1,1))
            ARIMA_MODEL_FIT = ARIMA_MODEL.fit(trend = 'nc', full_output = True, disp = True)

        # 3) 수렴하지 않을 경우 p d q 를 1, 1, 0으로 사용
        except:
            ARIMA_MODEL = ARIMA(price_x, order = (1,1,0))
            ARIMA_MODEL_FIT = ARIMA_MODEL.fit(trend = 'nc', full_output = True, disp = True)

        # 4) ARIMA 예측
        ARIMA_FORECAST  = ARIMA_MODEL_FIT.predict(1,120, typ='levels')

        # 3. 데이처 처리
        # 1) 최대 부분인 인덱스를 찾는데 해당 시점에 매도를 진행합니다.
        sell_time = np.argmax(ARIMA_FORECAST)

        # 2) 최대값을 찾습니다.
        max_val = np.max(ARIMA_FORECAST)

        # mfi_last_val = mfi_series[1379]

        # 4. 투자 전략
        buy_quantity = 0

        # 1) typical_price가 1.15 이상이면 투자합니다.
        # if  max_val > 1.1:
        #     buy_quantity = 1

        # if mfi_last_val > 0.7:
        #     buy_quantity = 0
            
        # # 3) 만약 mfi의 값이 65 보다 크면, 초과매수 상태로 판단하여 투자하지 않습니다.
        # if mfi_last_val < 0.3  :
        #     buy_quantity = 1

        # 5. 결과
        # result_list = [
        #                 sample_id,
        #                 buy_quantity,
        #                 sell_time,
        #             ]

        # result.append(result_list)

        result.append(ARIMA_FORECAST)
    return result

In [28]:
# result = model_fit(train_x, TRAIN_SAMPLE_ID_LIST)
val_list = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0, 2.1, 2.2, 2.3]
for val in val_list:
    result = model_fit(test_x, TEST_SAMPLE_ID_LIST, val)

100%|██████████| 50/50 [09:21<00:00, 11.23s/it]


In [10]:
# auto_arima를 이용한 p,d,q 값 구하기
def model_fit(df, sample_id_list):
  result = []
  for sample_id in tqdm(sample_id_list):
    price_x = get_typical_price(df, sample_id)
    mfi_series = get_mfi(df, sample_id)

    auto_arima_model = auto_arima(price_x, start_p=0, start_q=0, max_p=5, max_q=5, seasonal=False, 
                              d=1,
                              trace=True,
                              error_action='ignore',
                              suppress_warnings=True,
                              stepwise=False)
    
    # 2. ARIMA
    # 1) 모델 정의
    ARIMA_MODEL = {}
    ARIMA_MODEL_FIT = {}

    # 2) AR 모델 적용
    try:
        ARIMA_MODEL = ARIMA(price_x, order = auto_arima_model.to_dict()['order'])
        ARIMA_MODEL_FIT = ARIMA_MODEL.fit(trend = 'nc', full_output = True, disp = True)

    # 3) 수렴하지 않을 경우 p d q 를 1, 1, 0으로 사용
    except:
        ARIMA_MODEL = ARIMA(price_x, order = (1, 1, 0))
        ARIMA_MODEL_FIT = ARIMA_MODEL.fit(trend = 'nc', full_output = True, disp = True)

    # 4) ARIMA 예측
    ARIMA_FORECAST  = ARIMA_MODEL_FIT.predict(1,120, typ='levels')

    # 3. 데이처 처리
    # 1) 최대 부분인 인덱스를 찾는데 해당 시점에 매도를 진행합니다.
    sell_time = np.argmax(ARIMA_FORECAST)

    # 2) 최대값을 찾습니다.
    max_val = np.max(ARIMA_FORECAST)
    
    mfi_last_val = mfi_series[1379]

    # 4. 투자 전략
    buy_quantity = 0

    # 1) typical_price가 1.15 이상이면 투자합니다.
    if  max_val > 1.15:
        buy_quantity = 1

    # mfi값이 0.7이상이면 투자를 하지 않는다.
    if mfi_last_val > 0.7:
        buy_quantity = 0
        
    # 3) 만약 mfi의 0.3보다 작으면 투자한다.
    if mfi_last_val < 0.3:
        buy_quantity = 1

    # 5. 결과
    result_list = [
                    sample_id,
                    buy_quantity,
                    sell_time
                  ]

    result.append(result_list)
  return result

In [11]:
# result = model_fit(train_x, TRAIN_SAMPLE_ID_LIST)
result = model_fit(test_x, TEST_SAMPLE_ID_LIST)

 intercept   : AIC=-14595.984, Time=0.75 sec
 ARIMA(0,1,4)(0,0,0)[0] intercept   : AIC=-14594.764, Time=1.02 sec
 ARIMA(0,1,5)(0,0,0)[0] intercept   : AIC=-14596.926, Time=1.38 sec
 ARIMA(1,1,0)(0,0,0)[0] intercept   : AIC=-14578.977, Time=0.26 sec
 ARIMA(1,1,1)(0,0,0)[0] intercept   : AIC=-14597.401, Time=0.51 sec
 ARIMA(1,1,2)(0,0,0)[0] intercept   : AIC=-14596.118, Time=0.48 sec
 ARIMA(1,1,3)(0,0,0)[0] intercept   : AIC=-14488.075, Time=1.00 sec
 ARIMA(1,1,4)(0,0,0)[0] intercept   : AIC=-14590.774, Time=0.40 sec
 ARIMA(2,1,0)(0,0,0)[0] intercept   : AIC=-14594.703, Time=0.35 sec
 ARIMA(2,1,1)(0,0,0)[0] intercept   : AIC=-14595.414, Time=0.87 sec
 ARIMA(2,1,2)(0,0,0)[0] intercept   : AIC=-14488.074, Time=0.44 sec
 ARIMA(2,1,3)(0,0,0)[0] intercept   : AIC=-14486.074, Time=0.99 sec
 ARIMA(3,1,0)(0,0,0)[0] intercept   : AIC=-14596.534, Time=0.47 sec
 ARIMA(3,1,1)(0,0,0)[0] intercept   : AIC=-14594.826, Time=0.59 sec
 ARIMA(3,1,2)(0,0,0)[0] intercept   : AIC=-14486.074, Time=0.89 sec
 AR

In [12]:
submit_columns = [
                  "sample_id", 
                  "buy_quantity", 
                  "sell_time"
                  ]

submit = pd.DataFrame(data=result, columns=submit_columns)

In [13]:
submit

Unnamed: 0,sample_id,buy_quantity,sell_time
0,7929,0,116
1,7930,0,47
2,7931,0,32
3,7932,0,21
4,7933,0,2
...,...,...,...
755,8684,0,5
756,8685,1,55
757,8686,0,23
758,8687,0,0


In [14]:
#투자할 sample_id 갯수 확인
submit[submit["buy_quantity"] == 1].shape[0]  

109

In [15]:
submit.buy_quantity.value_counts()

0    651
1    109
Name: buy_quantity, dtype: int64

In [12]:
def df2d_to_answer(df_2d):
    # valid_y_df로부터
    # open 가격 정보가 포함된
    # [샘플 수, 120분] 크기의 
    # 2차원 array를 반환하는 함수
    feature_size = df_2d.iloc[:,2:].shape[1]
    time_size = len(df_2d.time.value_counts())
    sample_size = len(df_2d.sample_id.value_counts())
    sample_index = df_2d.sample_id.value_counts().index
    array_2d = df_2d.open.values.reshape([sample_size, time_size])
    sample_index = list(sample_index)
    return array_2d, sample_index


def COIN(y_df, submission, df2d_to_answer = df2d_to_answer):
    # 2차원 데이터프레임에서 open 시점 데이터만 추출하여 array로 복원
    # sample_id정보를 index에 저장
    y_array, index = df2d_to_answer(y_df)

    
    # index 기준으로 submission을 다시 선택
    submission = submission.set_index(submission.columns[0])
    submission = submission.iloc[index, :]    
    
    # 초기 투자 비용은 10000 달러
    total_momey      = 10000 # dolors
    total_momey_list = []
    
    # 가장 처음 sample_id값
    start_index = submission.index[0]
    for row_idx in submission.index:
        sell_time  = submission.loc[row_idx, 'sell_time']
        buy_price  = y_array[row_idx - start_index, 0]
        sell_price = y_array[row_idx - start_index, sell_time]
        buy_quantity = submission.loc[row_idx, 'buy_quantity'] * total_momey
        residual = total_momey - buy_quantity
        ratio = sell_price / buy_price
        total_momey = buy_quantity * ratio * 0.9995 * 0.9995 + residual        
        total_momey_list.append(total_momey)
        
    return total_momey, total_momey_list

In [17]:
# train_y = train_y[train_y['sample_id'] < 50]

NameError: name 'train_y' is not defined

In [50]:
total_momey, total_momey_list = COIN(train_y,
                                     submit)

In [18]:
result = model_fit(train_x, TRAIN_SAMPLE_ID_LIST)
result
# val_list = [1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0, 2.1, 2.2, 2.3, 2.4, 2.5]
# my_res = []
# for i in range(len(val_list)):
#     result = model_fit(train_x, TRAIN_SAMPLE_ID_LIST, val_list[i])
#     submit_columns = [
#                   "sample_id", 
#                   "buy_quantity", 
#                   "sell_time"
#                   ]

#     submit = pd.DataFrame(data=result, columns=submit_columns)
#     total_money, total_momey_list = COIN(train_y,
#                                      submit)
#     my_res.append([val_list[i], total_money])
#     print(val_list[i], total_money)

100%|██████████| 320/320 [01:50<00:00,  2.89it/s]


6, 1.18596224, 1.18632486, 1.18700383, 1.18765201,
        1.18749905, 1.18761072, 1.1877386 , 1.18859977, 1.18892768,
        1.18978521, 1.18970248, 1.18992091, 1.1907584 , 1.19132546,
        1.19264247, 1.19358668, 1.1908855 , 1.19102038, 1.19087387,
        1.19152679, 1.192419  , 1.19276912, 1.19342211, 1.19284651,
        1.19140042, 1.19104229, 1.19150029, 1.19036073, 1.1899855 ]),
 array([0.9885929 , 0.98744065, 0.9875403 , 0.98819237, 0.98713293,
        0.98752302, 0.98796293, 0.98728804, 0.98453976, 0.98424156,
        0.98540689, 0.98626435, 0.98575732, 0.98783279, 0.98720774,
        0.98603917, 0.9852632 , 0.98751322, 0.98756346, 0.98722098,
        0.98587095, 0.98783414, 0.98841486, 0.99114151, 0.9900772 ,
        0.98791928, 0.9878227 , 0.9853225 , 0.98588915, 0.98363979,
        0.98360682, 0.98433599, 0.98381578, 0.98585428, 0.98533641,
        0.98476753, 0.98524235, 0.98651342, 0.98717856, 0.98626841,
        0.98715872, 0.98766631, 0.98721615, 0.98639577, 0.98582

In [30]:
val_list = [1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0, 2.1, 2.2, 2.3, 2.4, 2.5]
total_money_list = []
for val in val_list:
    my_result = []
    for i in range(len(result)):
        sell_time = np.argmax(result[i])

        # 2) 최대값을 찾습니다.
        max_val = np.max(result[i])
        print(max_val)

        # 4. 투자 전략
        buy_quantity = 0

        # 1) typical_price가 1.15 이상이면 투자합니다.
        if  max_val > val:
            buy_quantity = 1

        # 5. 결과
        result_list = [
                        i,
                        buy_quantity,
                        sell_time,
                    ]
        my_result.append(result_list)

    submit_columns = [
                "sample_id", 
                "buy_quantity", 
                "sell_time"
                ]

    submit = pd.DataFrame(data=my_result, columns=submit_columns)
    total_money, total_momey_list = COIN(train_y,
                                    submit)
    total_money_list.append([val, total_money])

In [31]:
total_money_list

[[1.1, 8532.067367766338],
 [1.2, 9589.83450572615],
 [1.3, 9990.0025],
 [1.4, 9990.0025],
 [1.5, 9990.0025],
 [1.6, 9990.0025],
 [1.7, 9990.0025],
 [1.8, 9990.0025],
 [1.9, 10000.0],
 [2.0, 10000.0],
 [2.1, 10000.0],
 [2.2, 10000.0],
 [2.3, 10000.0],
 [2.4, 10000.0],
 [2.5, 10000.0]]

In [51]:
print(total_momey)
print(total_momey_list)

10020.847077868026
[10092.11634282645, 10092.11634282645, 10092.11634282645, 10092.11634282645, 10092.11634282645, 10092.11634282645, 10089.02524577841, 10089.02524577841, 10091.797852589892, 10091.797852589892, 10091.797852589892, 10091.797852589892, 10250.533136582384, 10089.692593550026, 10089.692593550026, 10089.692593550026, 10089.692593550026, 10089.692593550026, 10089.692593550026, 10089.692593550026, 10089.692593550026, 10089.692593550026, 10089.692593550026, 10089.692593550026, 10089.692593550026, 10089.692593550026, 10089.692593550026, 10089.692593550026, 10089.692593550026, 10089.692593550026, 10089.692593550026, 10079.605423379626, 10079.605423379626, 10079.605423379626, 10096.765397020912, 10096.765397020912, 10223.423277233, 10223.423277233, 10223.423277233, 10223.423277233, 10213.202409811587, 10213.202409811587, 10213.202409811587, 10213.202409811587, 10213.202409811587, 10213.202409811587, 10213.202409811587, 10020.847077868026, 10020.847077868026, 10020.847077868026]


In [18]:
FILE_NAME = "/0617_AUTO_ARIMA_MFI_30UNDER_MAXVAL_115_SUBMIT.csv"

In [19]:
SUBMIT_PATH = "./data"
RESULT_PATH = SUBMIT_PATH + FILE_NAME

submit.to_csv(RESULT_PATH, index=False)