In [11]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse, mean_absolute_error as mae, r2_score as r2

import torch
from torch import nn, optim
from torchinfo import summary

from tqdm import tqdm

import requests
import json
from datetime import datetime
import os

In [13]:
# Run on the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

# Base URL of the data server that the later cells query.
BASE_URL = 'http://192.168.1.222:8999'

# Connectivity check: print the body served at the server root.
# The timeout keeps this cell from blocking forever when the server is unreachable
# (requests waits indefinitely by default).
print(requests.get(f'{BASE_URL}/', timeout=10).text)

cuda:0
Hello Server



In [17]:
def list_files(directory):
    """Return the names of the regular files located directly in `directory`.

    Sub-directories are skipped; names are returned without the directory
    prefix, in the order produced by ``os.listdir``.
    """
    return [
        entry
        for entry in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, entry))
    ]

def extract_before_dash(files):
    """Split file names into the untouched names and their leading prefixes.

    Returns a pair ``(original_files, modified_files)``:
    ``original_files`` is a fresh list with the names as given, and
    ``modified_files`` holds, for each name, the text before its first '-'
    (or the whole name when it contains no '-').
    """
    originals = list(files)
    # partition() yields the full string as its first item when '-' is absent,
    # which covers both the "has dash" and "no dash" cases in one expression.
    prefixes = [name.partition('-')[0] for name in originals]
    return originals, prefixes

# Directory whose files are listed (one saved .pth file per ticker, per the output below)
directory_path = './models_lstm/2024-08-04_20-58/'
file_list = list_files(directory_path)

# Get the original file names and the processed list (text before the first '-');
# `modified_files` is what the preprocessing loop later iterates over as stock codes.
original_files, modified_files = extract_before_dash(file_list)

# Show the results (the first label reads "original file name list",
# the second "processed string list")
print("元のファイル名リスト:", original_files)
print("加工後の文字列リスト:", modified_files)
print(file_list)


元のファイル名リスト: ['EMN--1178.87-0.00.pth', 'BA--309.84-0.00.pth', 'NKE--101.39-0.00.pth', 'TJX--36.71-0.00.pth', 'HII--1912.09-0.01.pth', 'L--15699.50-0.02.pth', 'CRWD--401.89-0.02.pth', 'PPG--50.53-0.00.pth', 'NCLH--0.08-0.00.pth', 'PSX--2.80-0.00.pth', 'CE--275.74-0.00.pth', 'SBUX--10.65-0.00.pth', 'YUM--174.77-0.00.pth', 'ELV--1548.66-0.00.pth', 'SNPS--1362.12-0.01.pth', 'STLD--2280.12-0.01.pth', 'FOXA--544.34-0.00.pth', 'RMD--178.08-0.00.pth', 'PAYX--571.04-0.00.pth', 'ECL--507.73-0.00.pth', 'NOW--197.83-0.00.pth', 'ADM--195.18-0.00.pth', 'BKNG--113.66-0.00.pth', 'WBD--24.46-0.00.pth', 'ISRG--1041.87-0.02.pth', 'SYF--675.94-0.00.pth', 'AFL--6006.80-0.02.pth', 'RF--2686.43-0.01.pth', 'CSX--1567.88-0.00.pth', 'NVR--6850.06-0.01.pth', 'PPL--93.30-0.00.pth', 'EBAY--186.65-0.00.pth', 'HOLX--4364.99-0.01.pth', 'FI--7456.21-0.00.pth', 'BK--479.99-0.00.pth', 'AMCR--2714.99-0.02.pth', 'HLT--441.49-0.00.pth', 'MCHP--2686.14-0.01.pth', 'C--5913.84-0.01.pth', 'AMP--8227.12-0.02.pth', 'CRL--435.29-0

In [15]:
def norm(column):
    """Scale `column` by its L2 norm.

    Parameters:
        column: array-like of numbers (e.g. a pandas Series or NumPy array).

    Returns:
        A pair ``(scaled_column, norm_value)``. When the norm is 0 the column
        is returned unscaled (dividing would produce NaN/inf), still as the
        first item of the pair so that callers can always use ``norm(x)[0]``.
    """
    norm_value = np.linalg.norm(column)
    if norm_value == 0:
        # Keep the return shape identical to the non-zero case; previously the
        # bare column was returned here, so ``norm(x)[0]`` picked its first element.
        return column, norm_value
    return column / norm_value, norm_value

In [16]:
# Per-ticker preprocessing: download the raw rows, scale them, aggregate them to
# one row per calendar day, save the result as CSV and split it chronologically.
for stock_code in modified_files:
    print(stock_code)

    # Fetch this ticker's rows from the data server as JSON and load them into a frame.
    df = pd.DataFrame(requests.get(f'{BASE_URL}/ml_data/{stock_code}').json())
    # df.info() prints its report itself and returns None, so this also prints "None".
    print(df.info())
    
    # Parse the date strings and make them the index (resample below needs a datetime index).
    df['date'] = pd.to_datetime(df['date'])
    df.set_index('date', inplace=True)
    
    path = f'./ml_data/{stock_code}.csv'
    # df.to_csv(f'./ml_data/{stock_code}_pre.csv', index=False, encoding='utf-8')

    # 'date' is already the index at this point, so only 'time' is actually filtered out here.
    # `columns` is only printed; nothing else in the visible part of this cell reads it.
    exclude_columns = ['date', 'time']
    columns = [col for col in df.columns if col not in exclude_columns]
    print(columns)
    
    # Normalize columns individually before collecting them in a dict.
    # NY_Dow, SP_500, value and vix are scaled via norm(); the content_*/headline_*
    # columns are passed through unchanged at this stage.
    # norm(...)[0] takes the scaled column — assumes norm returns a (scaled, norm)
    # pair for every input; TODO confirm for all-zero columns.
    ny_dow = norm(df['NY_Dow'])[0]
    sp_500 = norm(df['SP_500'])[0]
    content_concern = df['content_concern']
    content_despair = df['content_despair']
    content_excitement = df['content_excitement']
    content_optimism = df['content_optimism']
    content_stability = df['content_stability']
    headline_concern = df['headline_concern']
    headline_despair = df['headline_despair']
    headline_excitement = df['headline_excitement']
    headline_optimism = df['headline_optimism']
    headline_stability = df['headline_stability']
    value = norm(df['value'])[0]
    vix = norm(df['vix'])[0]

    # Gather the prepared columns in a dict (this also drops the 'time' column)
    data_dict = {
        'NY_Dow': ny_dow,
        'SP_500': sp_500,
        'content_concern': content_concern,
        'content_despair': content_despair,
        'content_excitement': content_excitement,
        'content_optimism': content_optimism,
        'content_stability': content_stability,
        'headline_concern': headline_concern,
        'headline_despair': headline_despair,
        'headline_excitement': headline_excitement,
        'headline_optimism': headline_optimism,
        'headline_stability': headline_stability,
        'vix': vix,
        'value': value
    }

    # Convert the dict to a DataFrame (it keeps the date index of the source columns)
    data = pd.DataFrame(data_dict)

    # Columns whose values are summed over all rows of the same day
    sum_columns = ['content_concern', 'content_despair', 'content_excitement', 
                    'content_optimism', 'content_stability', 'headline_concern',
                    'headline_despair', 'headline_excitement', 'headline_optimism',
                    'headline_stability']
    
    # Aggregate the summed columns per calendar day
    summed_data = data[sum_columns].resample('D').sum()

    # All remaining columns take the daily maximum
    max_columns = [col for col in data.columns if col not in sum_columns]
    max_data = data[max_columns].resample('D').max()

    # Join both daily aggregates side by side on the shared date index
    data = pd.concat([summed_data, max_data], axis=1)
    
    # Rebuild the frame in the original column order; the daily-summed content_*/headline_*
    # columns are scaled by their norm now, while the columns already scaled above
    # (NY_Dow, SP_500, vix, value) are taken as they are.
    data_dict = {
            'NY_Dow': data['NY_Dow'],
            'SP_500': data['SP_500'],
            'content_concern': norm(data['content_concern'])[0],
            'content_despair': norm(data['content_despair'])[0],
            'content_excitement': norm(data['content_excitement'])[0],
            'content_optimism': norm(data['content_optimism'])[0],
            'content_stability': norm(data['content_stability'])[0],
            'headline_concern': norm(data['headline_concern'])[0],
            'headline_despair': norm(data['headline_despair'])[0],
            'headline_excitement': norm(data['headline_excitement'])[0],
            'headline_optimism': norm(data['headline_optimism'])[0],
            'headline_stability': norm(data['headline_stability'])[0],
            'vix': data['vix'],
            'value': data['value']
    }
    
    # Drop every row that contains a missing value — presumably the calendar days
    # without source rows, where the daily max is NaN; TODO confirm.
    data = pd.DataFrame(data_dict).dropna()
    
    # index=False: the date index is NOT written to the CSV file.
    data.to_csv(path, index=False, encoding='utf-8')
    
    # shuffle=False keeps row order: the last 10% of the rows become the test set.
    df_train, df_test = train_test_split(data, test_size=0.1, shuffle=False)
    
    # Set window_size and adjust it when the test data is too small
    window_size = 5
    n_train = len(df_train) - window_size
    n_test = len(df_test) - window_size

    # Fall back to a window of 1 when the test split cannot fill a 5-row window.
    # (The printed message says: test data too small, setting window_size to 1.)
    if n_test <= 0:
        print("テストデータが小さすぎるため、window_sizeを1に設定します。")
        window_size = 1
        n_train = len(df_train) - window_size
        n_test = len(df_test) - window_size

EMN
EMN
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 236 entries, 0 to 235
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   NY_Dow               236 non-null    float64
 1   SP_500               236 non-null    float64
 2   content_concern      236 non-null    float64
 3   content_despair      236 non-null    float64
 4   content_excitement   236 non-null    float64
 5   content_optimism     236 non-null    float64
 6   content_stability    236 non-null    float64
 7   date                 236 non-null    object 
 8   headline_concern     236 non-null    float64
 9   headline_despair     236 non-null    float64
 10  headline_excitement  236 non-null    float64
 11  headline_optimism    236 non-null    float64
 12  headline_stability   236 non-null    float64
 13  time                 236 non-null    object 
 14  value                236 non-null    float64
 15  vix                  236 non-nul

KeyboardInterrupt: 

In [None]:
# Forecast 20 steps ahead (original note: 20 days) by feeding each prediction
# back into the input window.
# NOTE(review): `test`, `net` and `feature_size` are not defined in any visible cell —
# presumably they come from a training cell; confirm this runs on a fresh kernel.
future_predictions = []
last_window = test[-1]  # use the last window of the test data

for _ in range(20):
    # Shape the window as a single-sample batch: (1, window_size, feature_size).
    x = torch.tensor(last_window).reshape(1, window_size, feature_size).to(device)
    y = net(x)
    # .item() requires the model output to hold exactly one element.
    future_predictions.append(y.item())
    
    # Extract the scalar value before assigning it (same number as y.item() above)
    new_point = y.cpu().detach().numpy().item()
    # Shift the rows up by one. np.roll wraps around, so the new last row is the
    # former first row; only its column 12 is overwritten below, the other
    # features of that row keep the oldest step's values.
    # assumes last_window is a 2-D NumPy array (window_size, feature_size) — TODO confirm
    last_window = np.roll(last_window, -1, axis=0)
    # Original note: assumes `ave_tmp` is the predicted quantity (no such column appears
    # in the visible cells). NOTE(review): in the frame built by the preprocessing loop,
    # column 12 is 'vix' and 'value' is column 13 — confirm which column the model predicts.
    last_window[-1, 12] = new_point

In [None]:
import pandas as pd
import os

# Rank the prediction CSV files by how much their values changed between the
# earliest and the latest date, and print the three files with the largest ratio.

# (file name, mean last/first ratio) for every processed CSV file
file_ratios = []

# Directory that holds the prediction CSV files
directory_path = './predictions/'

# os.listdir returns names in arbitrary order; sorting makes the processing order
# (and therefore tie-breaking in the ranking below) deterministic.
csv_files = sorted(f for f in os.listdir(directory_path) if f.endswith('.csv'))

for csv_file in csv_files:
    csv_file_path = os.path.join(directory_path, csv_file)

    df = pd.read_csv(csv_file_path)

    # Index by the parsed date and order the rows chronologically, so that the
    # first/last rows really are the oldest/newest days even if the file is not
    # stored in date order. A stable sort keeps the file order among equal dates.
    df['Date'] = pd.to_datetime(df['Date'])
    df = df.set_index('Date').sort_index(kind='mergesort')

    first_day_value = df.iloc[0]   # values on the oldest day
    last_day_value = df.iloc[-1]   # values on the newest day

    # Per-column ratio newest/oldest. A zero starting value yields +/-inf (or NaN
    # for 0/0); those columns are dropped instead of distorting the mean.
    ratio = (
        (last_day_value / first_day_value)
        .replace([float('inf'), -float('inf')], float('nan'))
        .dropna()
    )

    # One number per file: the mean over the remaining columns
    # (NaN when no column produced a usable ratio).
    mean_ratio = ratio.mean()

    file_ratios.append((csv_file, mean_ratio))

# NaN compares False against every number, so leaving NaN entries in the list
# would make the sorted order unreliable; rank only files with a valid ratio.
rankable = [entry for entry in file_ratios if not pd.isna(entry[1])]

# Sort by ratio in descending order and keep the top three files
top_files = sorted(rankable, key=lambda x: x[1], reverse=True)[:3]

print("Top 3 files with the largest positive ratios:")
for file_name, ratio in top_files:
    print(f"{file_name}: {ratio}")

print("All files processed.")
