In [1]:
!pip install statsmodels==0.12.2

Collecting statsmodels==0.12.2
[?25l  Downloading https://files.pythonhosted.org/packages/da/69/8eef30a6237c54f3c0b524140e2975f4b1eea3489b45eb3339574fc8acee/statsmodels-0.12.2-cp37-cp37m-manylinux1_x86_64.whl (9.5MB)
[K     |████████████████████████████████| 9.5MB 7.7MB/s 
Installing collected packages: statsmodels
  Found existing installation: statsmodels 0.10.2
    Uninstalling statsmodels-0.10.2:
      Successfully uninstalled statsmodels-0.10.2
Successfully installed statsmodels-0.12.2


In [2]:
# 1. 기본
# 데이터를 다루기 위한 pandas와 numpy를 import 해주었습니다.
import pandas as pd
import numpy as np


# 2. 시각화
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns


# 3. 유틸
# tqdm 패키지는 반복문에 대해 얼마나 진척되었는지를 가시적으로 확인할 수 있도록 도와줍니다.
# https://github.com/tqdm/tqdm 사용법은 정말 간단합니다.
from tqdm.auto import tqdm


# 4. 설정
# 경고가 나와서, 출력이 많아지지 않기 위해 ignore를 설정해주었습니다.
import warnings
warnings.filterwarnings('ignore')


# 5. stats models
# 시계열 모델을 위한 ARIMA를 임포트 해주었습니다.
from statsmodels.tsa.arima_model import ARIMA


# 6. 구글 드라이브
# colab이 구글 드라이와 연결되어 있기 때문에 임포트해주었습니다.
from google.colab import drive

In [3]:
# 1) 구글 드라이브 경로, 2) 데이터 경로, 3) 전체 경로를 상수로 정해주고 사용했습니다.
GOOGLE_DRIVE_PATH = "/content/drive"
DATA_PATH = "/content/drive/MyDrive/dacon/season2_2"

In [4]:
 drive.mount(GOOGLE_DRIVE_PATH)

Mounted at /content/drive


In [6]:
# train_x = pd.read_csv(DATA_PATH  + "/train_x_df.csv")
# train_y = pd.read_csv(DATA_PATH  + "/train_y_df.csv")
test_x = pd.read_csv(DATA_PATH  + "/test_x_df.csv")
# test_x = pd.read_csv(DATA_PATH  + "/small_test.csv")

In [7]:
def make_mfi(my_df):
    period = 14
    result = []

    for sample_id in my_df['sample_id'].unique().tolist():
      df = my_df[my_df['sample_id'] == sample_id]

      df['typical_price'] = (df['close'] + df['high'] + df['low']) / 3
      df['money_flow'] = df['typical_price'] * df['volume']
      df['price_diff'] = df.groupby(['sample_id'])['typical_price'].diff(1).shift(-1)
      df['pf'] = np.where(df['price_diff'] > 0, df['money_flow'], 0)
      df['nf'] = np.where(df['price_diff'] < 0, df['money_flow'], 0)

      # 상승분의 14일 평균을 구해줍니다.
      # 하락분의 14일 평균을 구해줍니다.
      df["pmf"] = df["pf"].rolling(window=period, min_periods=period).sum()
      df["nmf"] = df["nf"].rolling(window=period, min_periods=period).sum()

      MFI = df['pmf'] / (df['pmf'] + df['nmf'])
      df['mfi'] = MFI
      df['mfi'] = df['mfi'].shift(1)

      result.append(df)
      
    output = pd.concat(result, axis=0)

    return output


      # test_x.loc[test_x['sample_id'] == sample_id, 'typical_price'] = (df['close'] + df['high'] + df['low']) / 3
      # test_x.loc[test_x['sample_id'] == sample_id, 'money_flow'] = df['typical_price'] * df['volume']
      # test_x.loc[test_x['sample_id'] == sample_id, 'price_diff'] = df.groupby(['sample_id'])['typical_price'].diff(1).shift(-1)
      # test_x.loc[test_x['sample_id'] == sample_id, 'pf'] = np.where(df['price_diff'] > 0, df['money_flow'], 0)
      # test_x.loc[test_x['sample_id'] == sample_id, 'nf'] = np.where(df['price_diff'] < 0, df['money_flow'], 0)

      # df = test_x[test_x['sample_id'] == sample_id]
      # print(MFI)

In [7]:
train_x = train_x[train_x['sample_id'] < 2000]
train_x = make_mfi(train_x)
train_x = train_x.fillna(0)

In [8]:
test_x = make_mfi(test_x)
test_x = test_x.fillna(0)

In [9]:
test_x

Unnamed: 0,sample_id,time,coin_index,open,high,low,close,volume,quote_av,trades,tb_base_av,tb_quote_av,typical_price,money_flow,price_diff,pf,nf,pmf,nmf,mfi
0,7929,0,1,0.941549,0.941773,0.940431,0.941176,0.169500,85.595390,0.089465,0.108427,54.745537,0.941127,0.159521,-0.000155,0.00000,0.159521,0.000000,0.000000,0.000000
1,7929,1,1,0.941586,0.941736,0.940282,0.940897,0.237560,119.955322,0.119287,0.105256,53.145061,0.940971,0.223538,-0.000485,0.00000,0.223538,0.000000,0.000000,0.000000
2,7929,2,1,0.941270,0.941586,0.938940,0.940934,0.231588,116.848953,0.149109,0.163196,82.363953,0.940487,0.217805,-0.000242,0.00000,0.217805,0.000000,0.000000,0.000000
3,7929,3,1,0.940971,0.941363,0.939052,0.940319,0.238199,120.162766,0.201297,0.157981,79.687485,0.940244,0.223965,-0.000168,0.00000,0.223965,0.000000,0.000000,0.000000
4,7929,4,1,0.940077,0.940561,0.939760,0.939909,0.082302,41.511501,0.055916,0.009733,4.908966,0.940077,0.077370,0.000801,0.07737,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1048795,8688,1375,5,1.005905,1.007405,1.005392,1.006122,32184.292969,23957.617188,66.215317,4388.184082,3269.062256,1.006306,32387.253916,-0.001239,0.00000,32387.253916,815101.896991,441397.594525,0.649240
1048796,8688,1376,5,1.006243,1.006554,1.004040,1.004608,46183.425781,34342.394531,93.241982,8526.371094,6343.348145,1.005067,46417.456964,-0.001396,0.00000,46417.456964,815101.896991,380956.938350,0.648708
1048797,8688,1377,5,1.004608,1.005284,1.002865,1.002865,55310.468750,41101.046875,97.295982,16837.339844,12520.099609,1.003671,55513.516250,-0.002680,0.00000,55513.516250,815101.896991,297428.568114,0.681490
1048798,8688,1378,5,1.002865,1.002905,1.000013,1.000054,114685.742188,85007.203125,127.025307,35383.035156,26229.859375,1.000991,114799.389690,-0.002676,0.00000,114799.389690,702086.136427,412227.957804,0.732656


In [None]:
train_x

In [10]:
# TRAIN_SAMPLE_ID_LIST = train_x["sample_id"].unique().tolist()
TEST_SAMPLE_ID_LIST = test_x["sample_id"].unique().tolist()

In [11]:
def get_typical_price(df,sample_id):    
    return df[df["sample_id"] == sample_id]['typical_price'].values

In [12]:
def get_mfi(df,sample_id):
  return df[df["sample_id"] == sample_id]['mfi'].values

In [None]:
# ARIMA의 (p,d,q) 값이 (4,0,1)에서 수렴하지 않을 경우
# (1,1,0)로 변경하여 다시 학습 및 추론
# result = np.zeros([300,120])
result = []
for sample_id in tqdm(TEST_SAMPLE_ID_LIST):
    price_x = get_typical_price(test_x, sample_id)
    mfi_series = get_mfi(test_x, sample_id)

    # 2. ARIMA
    # 1) 모델 정의
    ARIMA_MODEL = {}
    ARIMA_MODEL_FIT = {}

    # 2) AR 모델 적용
    try:
      ARIMA_MODEL = ARIMA(price_x, order = (4,0,1))
      ARIMA_MODEL_FIT = ARIMA_MODEL.fit(trend = 'nc', full_output = True, disp = True)

    # 3) 수렴하지 않을 경우 p d q 를 1, 1, 0으로 사용
    except:
      ARIMA_MODEL = ARIMA(price_x, order = (1,1,0))
      ARIMA_MODEL_FIT = ARIMA_MODEL.fit(trend = 'nc', full_output = True, disp = True)

    # 4) ARIMA 예측
    ARIMA_FORECAST  = ARIMA_MODEL_FIT.predict(1,120, typ='levels')
    # result[sample_id,:] = ARIMA_FORECAST

    # 3. 데이처 처리
    # 1) 최대 부분인 인덱스를 찾는데 해당 시점에 매도를 진행합니다.
    sell_time = np.argmax(ARIMA_FORECAST)

    # 2) 최대값을 찾습니다.
    max_val = np.max(ARIMA_FORECAST)
    
    mfi_last_val = mfi_series[1379]

    # 4. 투자 전략
    buy_quantity = 0

    # 1) typical_price가 1.15 이상이면 투자합니다.
    if  max_val > 1.15:
        buy_quantity = 1

    # 3) 만약 mfi의 값이 65 보다 크면, 초과매수 상태로 판단하여 투자하지 않습니다.
    if mfi_last_val < 0.3  :
      buy_quantity = 1
    # if mfi_last_val > 0.65 and sell_time < 50:
    # if mfi_last_val > 0.65:
    #     buy_quantity = 0

    # 5. 결과
    result_list = [
                   sample_id,
                   buy_quantity,
                   sell_time
                  ]

    result.append(result_list)

In [23]:
# 1. 학습 결과를 데이터 프레임으로 만듭니다.

submit_columns = [
                  "sample_id", 
                  "buy_quantity", 
                  "sell_time"
                  ]


submit = pd.DataFrame(data=result, columns=submit_columns)

In [None]:
submit.head(40)

In [24]:
# 3. 투자 개수 확인

submit[submit["buy_quantity"] == 1].shape[0]  

113

In [25]:
submit.buy_quantity.value_counts()

0    647
1    113
Name: buy_quantity, dtype: int64

In [15]:
# 4. sell_time 50미만에서 구매하는 개수 확인
cond1 = (submit["buy_quantity"] == 1)
cond2 = (submit["sell_time"] < 50)

submit[cond1 & cond2].shape[0]

54

In [17]:
def df2d_to_answer(df_2d):
    # valid_y_df로부터
    # open 가격 정보가 포함된
    # [샘플 수, 120분] 크기의 
    # 2차원 array를 반환하는 함수
    feature_size = df_2d.iloc[:,2:].shape[1]
    time_size = len(df_2d.time.value_counts())
    sample_size = len(df_2d.sample_id.value_counts())
    sample_index = df_2d.sample_id.value_counts().index
    array_2d = df_2d.open.values.reshape([sample_size, time_size])
    sample_index = list(sample_index)
    return array_2d, sample_index


def COIN(y_df, submission, df2d_to_answer = df2d_to_answer):
    # 2차원 데이터프레임에서 open 시점 데이터만 추출하여 array로 복원
    # sample_id정보를 index에 저장
    y_array, index = df2d_to_answer(y_df)

    
    # index 기준으로 submission을 다시 선택
    submission = submission.set_index(submission.columns[0])
    submission = submission.iloc[index, :]    
    
    # 초기 투자 비용은 10000 달러
    total_momey      = 10000 # dolors
    total_momey_list = []
    
    # 가장 처음 sample_id값
    start_index = submission.index[0]
    for row_idx in submission.index:
        sell_time  = submission.loc[row_idx, 'sell_time']
        buy_price  = y_array[row_idx - start_index, 0]
        sell_price = y_array[row_idx - start_index, sell_time]
        buy_quantity = submission.loc[row_idx, 'buy_quantity'] * total_momey
        residual = total_momey - buy_quantity
        ratio = sell_price / buy_price
        total_momey = buy_quantity * ratio * 0.9995 * 0.9995 + residual        
        total_momey_list.append(total_momey)
        
    return total_momey, total_momey_list

In [17]:
train_y = train_y[train_y['sample_id'] < 2000]

In [18]:
total_momey, total_momey_list = COIN(train_y,
                                     submit)

In [19]:
# 투자 후 금액
print(total_momey)
print(total_momey_list)

9741.54832152288
[10000.0, 10000.0, 10000.0, 10000.0, 10000.0, 10000.0, 10000.0, 10000.0, 10000.0, 10001.706023252867, 10001.706023252867, 10001.706023252867, 10001.706023252867, 10001.706023252867, 10001.706023252867, 10001.706023252867, 10001.706023252867, 10001.706023252867, 10001.706023252867, 10001.706023252867, 10001.706023252867, 10001.706023252867, 10001.706023252867, 10001.706023252867, 10001.706023252867, 10001.706023252867, 10001.706023252867, 10001.706023252867, 10001.706023252867, 10001.706023252867, 10001.706023252867, 10001.706023252867, 10001.706023252867, 10001.706023252867, 10001.706023252867, 10001.706023252867, 10001.706023252867, 10001.706023252867, 10001.706023252867, 10001.706023252867, 10001.706023252867, 10001.706023252867, 10001.706023252867, 10001.706023252867, 10001.706023252867, 10001.706023252867, 10001.706023252867, 10001.706023252867, 10001.706023252867, 10133.11705787449, 10133.11705787449, 10133.11705787449, 10133.11705787449, 10133.11705787449, 10133.

In [26]:
# 파일의 이름을 지정해줍니다.
FILE_NAME = "/0616_ARIMA_MFI_30UNDER_MAXVAL_115_SUBMIT.csv"

In [27]:
# 제출경로에 파일을 생성해줍니다.
SUBMIT_PATH = "/content/drive/MyDrive/dacon/season2_2"
RESULT_PATH = SUBMIT_PATH + FILE_NAME

submit.to_csv(RESULT_PATH, index=False)