In [4]:
import pandas as pd
from datetime import datetime
from datetime import timedelta
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import matplotlib.pyplot as plt
import FinanceDataReader as fdr

## Cosine similarity를 통해 비슷한 패턴 찾기

In [99]:
# Read the base price file and the additional one, then stack them row-wise.
train1 = pd.read_csv('./train.csv')
train2 = pd.read_csv('./train_additional.csv')

# Dates arrive as yyyymmdd and are re-rendered as 'YYYY-MM-DD' text; every 'A'
# character is stripped from the code column (presumably a leading exchange
# prefix -- confirm) so the codes can be handed to FinanceDataReader later on.
train = pd.concat([train1, train2]).assign(
    **{
        "일자": lambda frame: pd.to_datetime(frame["일자"], format="%Y%m%d").dt.strftime("%Y-%m-%d"),
        '종목코드': lambda frame: frame['종목코드'].str.replace('A', ''),
    }
)

# Distinct codes, in order of first appearance.
codes = train['종목코드'].unique()

In [100]:
class PatternFinder():
    """Find earlier price windows of one stock that resemble a reference window.

    Typical use: `set_stock(code)` to load the history, `search(start, end)` to
    get the start positions of look-alike windows, then `plot_pattern` /
    `stat_prediction` to average the `Change` values that followed them.
    """

    def __init__(self, period=15):
        """Store `period`, the number of rows kept free after each candidate window."""
        self.period = period

    def set_stock(self, code: str):
        """Load the history for `code` through FinanceDataReader and cache its columns.

        The date range is fixed to 2000-01-01 .. 2023-07-28. The 'Close' and
        'Change' columns are kept on the instance; the full frame is returned.
        """
        self.code = code
        self.data = fdr.DataReader(code, '2000-01-01', '2023-07-28')
        self.close = self.data['Close']
        self.change = self.data['Change']
        return self.data

    def search(self, start_date, end_date, threshold = 0.98):
        """Return the windows whose min-max scaled shape matches the reference window.

        Parameters
        ----------
        start_date, end_date : slice bounds applied to `self.close` to pick the
            reference window.
        threshold : only similarities strictly greater than this are returned.

        Returns
        -------
        pandas.Series indexed by the integer start position of each matching
        window, holding its cosine similarity, sorted in descending order.
        Windows that cannot be scaled (flat, empty or containing NaN) never match.
        """
        base = self.close[start_date:end_date]
        self.base_norm = (base - base.min()) / (base.max() - base.min())
        self.base = base

        window_size = len(base)
        self.window_size = window_size
        # Stop early enough that every candidate window is still followed by at
        # least `self.period` rows.
        moving_cnt = len(self.data) - window_size - self.period - 1

        cos_sims = self.__cosine_sims(moving_cnt, window_size)
        return cos_sims[cos_sims > threshold]

    def __cosine_sims (self, moving_cnt, window_size):
        """Cosine similarity between the scaled reference window and each candidate.

        Candidates start at positions 0 .. moving_cnt-1 and are `window_size`
        rows long. The result is a float Series indexed by start position and
        sorted in descending order (NaN entries last).
        """
        # Plain arrays avoid building a pandas Series for every window; the
        # arithmetic is the same as on the Series.
        closes = np.asarray(self.close, dtype=float)
        base_vec = np.asarray(self.base_norm, dtype=float)
        # The reference vector never changes, so its length is computed once.
        base_len = np.sqrt(np.dot(base_vec, base_vec))

        def window_similarity(window):
            if window.size == 0:
                return np.nan
            lowest = window.min()
            span = window.max() - lowest
            # A flat window (span == 0) or one containing NaN has no defined shape.
            if not span > 0:
                return np.nan
            scaled = (window - lowest) / span
            return np.dot(base_vec, scaled) / (base_len * np.sqrt(np.dot(scaled, scaled)))

        sim_list = [
            window_similarity(closes[start:start + window_size])
            for start in range(moving_cnt)
        ]
        return pd.Series(sim_list, dtype=float).sort_values(ascending=False)

    def plot_pattern(self, idx, period=15):
        """Mean of `self.change` over the `period` rows right after the window at `idx`.

        Despite the name nothing is drawn. `self.period` is updated to `period`.
        Requires a prior `search` call, which sets `self.window_size`.
        """
        self.period = period
        after_window = idx + self.window_size
        preds = self.change.iloc[after_window:after_window + period]
        return preds.mean()

    def stat_prediction(self,result,period=15):
        """Array of follow-up means, one per key of `result`, in `result`'s order.

        `result` is expected to be keyed by window start position, as returned
        by `search`. Each entry is the mean of `self.change` over the `period`
        rows after that window.
        """
        mean_list = []
        for idx in result.keys():
            after_window = idx + self.window_size
            mean_list.append(self.change.iloc[after_window:after_window + period].mean())
        return np.array(mean_list)
        

In [101]:
# Create a PatternFinder with the default period (15).
p = PatternFinder()

In [102]:
# Maps each stock code to the average follow-up `Change` after its look-alike
# windows (0 when nothing clears the similarity threshold).
# NOTE: the name `dict` shadows the built-in type for the rest of the session;
# it is kept only because the following cell reads `dict.items()`.
dict = {}

for code in tqdm(codes):
    p = PatternFinder()
    p.set_stock(code)
    # Reference window: the last trading weeks of the loaded history.
    result = p.search('2023-07-03', '2023-07-28')
    similarity = result.index

    if len(similarity) > 0:
        # Accumulate under a name that does not hide the built-in `sum`.
        total_return = 0
        for i in similarity:
            total_return += p.plot_pattern(i)
        dict[code] = total_return / len(similarity)
    else:
        dict[code] = 0

dict

100%|██████████| 2000/2000 [22:01<00:00,  1.51it/s] 


{'060310': 0.0045614531971518695,
 '095570': 0.0013737314049773039,
 '006840': 0,
 '054620': -0.0027498764347176612,
 '265520': 0,
 '211270': 0.00022559698943762675,
 '027410': 0.001938312109420378,
 '282330': 0,
 '126600': 0.002979659249662473,
 '138930': 0,
 '001460': -0.0016005261994667271,
 '013720': -0.00048168958445916974,
 '001040': 0,
 '079160': 0,
 '035760': 0,
 '311690': 0,
 '000120': 0,
 '011150': 0.00333727713121938,
 '097950': 0,
 '051500': -0.001061511798845698,
 '058820': 0,
 '023460': -0.022402936576269142,
 '056730': 0,
 '083660': 0,
 '000590': 0,
 '012030': 0,
 '016610': 0,
 '005830': 0,
 '000990': 0,
 '139130': 0,
 '060900': 0.00170600021132346,
 '025440': 0.0029750096422414976,
 '001530': 0,
 '000210': 0,
 '001880': 0.0007821925014358418,
 '375500': 0,
 '068790': 0.0019919839208606734,
 '007340': 0,
 '004840': 0,
 '241520': -0.0036029825849673105,
 '065150': 0.002514371835993708,
 '155660': 0.00425755255969625,
 '069730': 0.0008317646021455892,
 '180400': 0,
 '01794

In [103]:
# (code, average follow-up return) pairs collected by the search loop.
lst = [*dict.items()]

# The '순위' column first carries the average return; after sorting from the
# highest return down it is overwritten with the 1-based rank (ties keep their
# sorted order because of method='first').
by_return = (
    pd.DataFrame(lst, columns=['종목코드', '순위'])
    .assign(**{'종목코드': lambda table: 'A' + table['종목코드']})  # restore the 'A' prefix
    .sort_values(by='순위', ascending=False)
)
cos_sim = by_return.assign(
    **{'순위': by_return['순위'].rank(ascending=False, method='first').astype(int)}
)

cos_sim

Unnamed: 0,종목코드,순위
140,A000680,1
1200,A053980,2
210,A071970,3
1099,A203400,4
424,A131220,5
...,...,...
963,A297090,1996
1232,A332570,1997
1989,A263920,1998
21,A023460,1999


## Moving average를 통해 상승세 / 하락세에 있는 종목 구분하기

In [104]:
# Reload both price files for the moving-average section; unlike the first
# load, the stock codes are left as they are in the files.
train1 = pd.read_csv('./train.csv')
train2 = pd.read_csv('./train_additional.csv')

# Stack the two files and re-render the yyyymmdd dates as 'YYYY-MM-DD' text.
train = pd.concat([train1, train2]).assign(
    **{"일자": lambda frame: pd.to_datetime(frame["일자"], format="%Y%m%d").dt.strftime("%Y-%m-%d")}
)
train

Unnamed: 0,일자,종목코드,종목명,거래량,시가,고가,저가,종가
0,2021-06-01,A060310,3S,166690,2890,2970,2885,2920
1,2021-06-01,A095570,AJ네트웍스,63836,5860,5940,5750,5780
2,2021-06-01,A006840,AK홀딩스,103691,35500,35600,34150,34400
3,2021-06-01,A054620,APS,462544,14600,14950,13800,14950
4,2021-06-01,A265520,AP시스템,131987,29150,29150,28800,29050
...,...,...,...,...,...,...,...,...
83995,2023-07-28,A001080,만호제강,12964,35550,36000,34700,36000
83996,2023-07-28,A104700,한국철강,72644,5780,6030,5780,6030
83997,2023-07-28,A045100,한양이엔지,59562,16230,16390,15970,16330
83998,2023-07-28,A000020,동화약품,86169,9870,10080,9700,9800


In [110]:
codes = train['종목코드'].unique()
columns = ['종목코드','MA5','MA20','MA60','MA120','MA240','new_price_5', 'new_price_20', 'new_price_60','new_price_120', 'new_price_240' ]

# Moving-average window -> position from the end used as the comparison point.
# `iloc[-k]` is the k-th row counted from the end, so e.g. MA5 is compared with
# its value two rows before the latest one. A stock with fewer than k rows
# raises IndexError here.
ma_lookback = {5: 3, 20: 10, 60: 15, 120: 20, 240: 30}

rows = []
# One pass over the frame: exact-match groups replace a `str.contains` scan per
# code. sort=False yields the groups in order of first appearance, i.e. the
# same order as `codes`.
for code, prices in tqdm(train.groupby('종목코드', sort=False), total=len(codes)):
    close = prices['종가']

    latest_ma = []
    ma_change = []
    for window, back in ma_lookback.items():
        ma = close.rolling(window=window).mean()
        newest, earlier = ma.iloc[-1], ma.iloc[-back]
        latest_ma.append(newest)
        # Relative change of the moving average between the two points.
        ma_change.append((newest - earlier) / earlier)

    rows.append([code, *latest_ma, *ma_change])

df = pd.DataFrame(rows, columns=columns)

df

100%|██████████| 2000/2000 [05:32<00:00,  6.02it/s]


Unnamed: 0,종목코드,MA5,MA20,MA60,MA120,MA240,new_price_5,new_price_20,new_price_60,new_price_120,new_price_240
0,A060310,2458.0,2687.50,2670.416667,2453.283333,2469.725000,-0.018371,-0.051108,0.035447,0.028041,-0.010533
1,A095570,4194.0,4294.00,4422.583333,4761.958333,5675.437500,-0.004982,-0.027407,-0.029213,-0.058289,-0.055510
2,A006840,19834.0,19838.50,19606.500000,18811.833333,17067.375000,-0.026409,0.017881,0.002224,0.023230,0.026082
3,A054620,8300.0,8944.00,10146.500000,11727.166667,12135.958333,-0.023300,-0.064435,-0.079780,-0.061094,0.003051
4,A265520,19712.0,20445.50,20753.500000,20773.166667,19536.333333,-0.020473,-0.020809,-0.007690,0.010671,0.021773
...,...,...,...,...,...,...,...,...,...,...,...
1995,A189980,2515.0,2717.50,2849.833333,2793.666667,2757.750000,-0.045178,-0.052145,0.007038,-0.008825,-0.009377
1996,A000540,2953.0,3015.75,3171.000000,3289.750000,3259.500000,0.001356,-0.023792,-0.023181,-0.021551,-0.007800
1997,A003280,2106.0,2319.25,1791.983333,1603.041667,1572.016667,-0.048351,0.039533,0.149850,0.093082,0.020989
1998,A037440,7118.0,7815.00,8728.000000,9184.583333,9473.416667,-0.039147,-0.076405,-0.042895,-0.048468,-0.004218


In [111]:
# Rename the relative-change columns ('new_price_*' -> '수익률_MA*').
df.columns=['종목코드','MA5','MA20','MA60','MA120','MA240','수익률_MA5','수익률_MA20','수익률_MA60','수익률_MA120','수익률_MA240']

# Set aside stocks whose 5-day average is more than double, or less than half,
# their 240-day average; they take no part in the long/short selection.
ma_ratio = df['MA5'] / df['MA240']
filtered = df[(ma_ratio > 2) | (ma_ratio < 0.50)]
filtered_df = df.drop(filtered.index)

# Long candidates: 20/60/120-day averages all rising, 240-day average not
# falling by more than 1%.
rising = (
    (filtered_df['수익률_MA240'] > -0.01)
    & (filtered_df['수익률_MA120'] > 0)
    & (filtered_df['수익률_MA60'] > 0)
    & (filtered_df['수익률_MA20'] > 0)
)
long = filtered_df[rising].sort_values('수익률_MA120', ascending=False)

# Short candidates: the mirror image of the long rule.
falling = (
    (filtered_df['수익률_MA240'] < 0.01)
    & (filtered_df['수익률_MA120'] < 0)
    & (filtered_df['수익률_MA60'] < 0)
    & (filtered_df['수익률_MA20'] < 0)
)
short = filtered_df[falling].sort_values('수익률_MA120', ascending=False)

# Everything that is neither long nor short goes in between the two groups.
long_short = pd.concat([long, short])
merged_df = df.merge(long_short, on=['종목코드'], how='outer', indicator=True)
missing_in_longshort = merged_df[merged_df['_merge'] == 'left_only']

# Final order: long, then the rest, then short; the rank is the row position.
result = pd.concat([long, missing_in_longshort, short])
MA_result = pd.DataFrame(result['종목코드'])
MA_result['순위'] = range(1, len(MA_result) + 1)


In [112]:
# Display the moving-average based ranking (stock code and rank).
MA_result

Unnamed: 0,종목코드,순위
612,A140410,1
959,A004920,2
785,A011230,3
1758,A053610,4
1023,A013310,5
...,...,...
1007,A069920,1996
22,A056730,1997
1052,A238090,1998
1365,A083470,1999


## 두개의 모델 반영하여 순위 정렬하기

In [114]:
a = MA_result   # ranking from the moving-average model
b = cos_sim     # ranking from the pattern-similarity model

l=200     # size of the top slice of each ranking checked for agreement
s=200     # size of the bottom slice of each ranking checked for agreement
num = 2   # number of models; each contributes l/num longs and s/num shorts

key = '종목코드'
n_long_each = int(l/num)
n_short_each = int(s/num)

# --- long side -------------------------------------------------------------
# Stocks inside the top `l` of BOTH rankings come first (the duplicated rows are
# the ones taken from `b`), followed by the top l/num of each ranking.
top_ma = a.iloc[:l]
top_cos = b.iloc[:l]
top_both = pd.concat([top_ma, top_cos])
duplicated_rows_L = top_both[top_both.duplicated(subset=[key], keep='first')]

long = pd.concat([duplicated_rows_L, a.iloc[:n_long_each], b.iloc[:n_long_each]])
long = long.drop_duplicates(subset=[key], keep='first')

# --- short side ------------------------------------------------------------
# Mirror image: the bottom s/num of each ranking, with the stocks inside the
# bottom `s` of BOTH rankings placed last. Offsets come from the frame lengths
# rather than a fixed universe size.
bottom_ma = a.iloc[max(len(a) - s, 0):]
bottom_cos = b.iloc[max(len(b) - s, 0):]
bottom_both = pd.concat([bottom_ma, bottom_cos])
duplicated_rows_S = bottom_both[bottom_both.duplicated(subset=[key], keep='first')]

short = pd.concat([
    a.iloc[max(len(a) - n_short_each, 0):],
    b.iloc[max(len(b) - n_short_each, 0):],
    duplicated_rows_S,
])
short = short.drop_duplicates(subset=[key], keep='last')

long = long.assign(**{'L/S': 'long'})
short = short.assign(**{'L/S': 'short'})

# A stock selected on both sides stays on the long side.
long_short = pd.concat([long, short])
long_short = long_short.drop_duplicates(subset=[key], keep='first')

# Everything not selected keeps its moving-average order. (An outer merge would
# re-sort these rows by stock code, making the middle ranks arbitrary.)
missing_in_longshort = a[~a[key].isin(long_short[key])]

ordered_codes = pd.concat([
    long_short.loc[long_short['L/S'] == 'long', key],
    missing_in_longshort[key],
    long_short.loc[long_short['L/S'] == 'short', key],
])

# Final rank is simply the position in the assembled order.
result = pd.DataFrame(
    {'순위': range(1, len(ordered_codes) + 1)},
    index=pd.Index(ordered_codes, name=key),
)

result.to_csv('submission_private.csv')