In [11]:
import yfinance as yf
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from statsmodels.tsa.arima.model import ARIMA

# Set the ticker symbol and date range
symbol = "TQQQ"
start_date = "2018-01-01"
end_date = "2021-12-31"

# Retrieve the historical data from Yahoo Finance
tqqq_data = yf.download(symbol, start=start_date, end=end_date)

# 将日期列设置为索引
#tqqq_data['Date'] = pd.to_datetime(tqqq_data['Date'])
#tqqq_data.set_index('Date', inplace=True)

# 计算每日收盘价的涨跌
tqqq_data['Price_Change'] = tqqq_data['Close'].diff()
tqqq_data['Price_Change'] = np.where(tqqq_data['Price_Change'] > 0, 1, 0)

# 使用过去三年的数据
start_date = pd.Timestamp(tqqq_data.index.max()) - pd.DateOffset(years=3)
tqqq_data = tqqq_data[start_date:]

# 特征选择
X = tqqq_data[['Open', 'High', 'Low', 'Volume']].values
y = tqqq_data['Price_Change'].values

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 线性回归模型
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
lr_pred = lr_model.predict(X_test)
lr_pred = np.where(lr_pred > 0.5, 1, 0)

# 逻辑回归模型
logreg_model = LogisticRegression(random_state=42)
logreg_model.fit(X_train, y_train)
logreg_pred = logreg_model.predict(X_test)

# 随机森林模型
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)

# 时间序列模型 (ARIMA)
ts_model = ARIMA(y_train, order=(1, 0, 0))
ts_model_fit = ts_model.fit()
ts_pred = ts_model_fit.predict(start=len(y_train), end=len(y_train) + len(y_test) - 1)

# 计算准确率
lr_accuracy = accuracy_score(y_test, lr_pred)
logreg_accuracy = accuracy_score(y_test, logreg_pred)
rf_accuracy = accuracy_score(y_test, rf_pred)
ts_accuracy = accuracy_score(y_test, np.where(ts_pred > 0, 1, 0))

print("Linear Regression Accuracy: {:.2f}%".format(lr_accuracy * 100))
print("Logistic Regression Accuracy: {:.2f}%".format(logreg_accuracy * 100))
print("Random Forest Accuracy: {:.2f}%".format(rf_accuracy * 100))
print("Time Series (ARIMA) Accuracy: {:.2f}%".format(ts_accuracy * 100))

[*********************100%***********************]  1 of 1 completed
Linear Regression Accuracy: 71.05%
Logistic Regression Accuracy: 62.50%
Random Forest Accuracy: 55.26%
Time Series (ARIMA) Accuracy: 62.50%


In [7]:
tqqq_data

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-01-02,11.741667,12.178333,11.657500,12.153333,11.995692,45867600
2018-01-03,12.202500,12.555000,12.200833,12.509167,12.346910,42612000
2018-01-04,12.655833,12.700833,12.538333,12.582500,12.419292,35012400
2018-01-05,12.758333,12.990000,12.692500,12.962500,12.794362,41190000
2018-01-08,12.960833,13.133333,12.948333,13.106667,12.936659,36126000
...,...,...,...,...,...,...
2021-12-23,81.084999,83.644997,80.985001,82.739998,81.835701,65654200
2021-12-27,83.544998,86.764999,83.500000,86.695000,85.747482,62981600
2021-12-28,87.379997,87.489998,85.014999,85.675003,84.738632,72412600
2021-12-29,85.654999,86.495003,83.974998,85.540001,84.605103,57747400


# More feature

In [17]:
import yfinance as yf
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from statsmodels.tsa.arima.model import ARIMA

# 从yfinance API获取特斯拉数据
tsla = yf.Ticker("TSLA")
df = tsla.history(period="3y")

# 计算5日均线
df['5_day_mean'] = df['Close'].rolling(5).mean()

# 计算特斯拉的5日均线
df['TSLA_5_day_mean'] = df['Close'].shift(1).rolling(5).mean()

# 计算每日收盘价的涨跌
df['Price_Change'] = df['Close'].diff()
df['Price_Change'] = np.where(df['Price_Change'] > 0, 1, 0)
df.dropna(inplace=True)

# 使用过去三年的数据
start_date = pd.Timestamp(df.index.max()) - pd.DateOffset(years=3)
df = df[start_date:]


# 特征选择
X = df[['Open', 'High', 'Low', 'Volume', '5_day_mean', 'TSLA_5_day_mean']].values
y = df['Price_Change'].values


# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 线性回归模型
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
lr_pred = lr_model.predict(X_test)
lr_pred = np.where(lr_pred > 0.5, 1, 0)

# 随机森林模型
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)

# 时间序列模型 (ARIMA)
ts_model = ARIMA(y_train, order=(1, 0, 0))
ts_model_fit = ts_model.fit()
ts_pred = ts_model_fit.predict(start=len(y_train), end=len(y_train) + len(y_test) - 1)

# 逻辑回归模型
logreg_model = LogisticRegression()
logreg_model.fit(X_train, y_train)
logreg_pred = logreg_model.predict(X_test)

# 计算准确率
lr_accuracy = accuracy_score(y_test, lr_pred)
rf_accuracy = accuracy_score(y_test, rf_pred)
ts_accuracy = accuracy_score(y_test, np.where(ts_pred > 0, 1, 0))
logreg_accuracy = accuracy_score(y_test, logreg_pred)

print("Linear Regression Accuracy: {:.2f}%".format(lr_accuracy * 100))
print("Random Forest Accuracy: {:.2f}%".format(rf_accuracy * 100))
print("Time Series (ARIMA) Accuracy: {:.2f}%".format(ts_accuracy * 100))
print("Logistic Regression Accuracy: {:.2f}%".format(logreg_accuracy * 100))


Linear Regression Accuracy: 76.82%
Random Forest Accuracy: 60.26%
Time Series (ARIMA) Accuracy: 60.93%
Logistic Regression Accuracy: 60.93%


# more features

In [25]:
import yfinance as yf
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from statsmodels.tsa.arima.model import ARIMA

# 获取股票数据
tqqq = yf.Ticker("TQQQ")
df_tqqq = tqqq.history(period="3y")

# 获取特斯拉股票数据
tesla = yf.Ticker("TSLA")
df_tesla = tesla.history(period="3y")

# 获取美国10年期国债数据
us_treasury = yf.Ticker("^TNX")
df_us_treasury = us_treasury.history(period="3y")

# 获取黄金价格数据
gold = yf.Ticker("GC=F")
df_gold = gold.history(period="3y")

# 计算五日均线
df_tqqq['MA_5'] = df_tqqq['Close'].rolling(window=5).mean()
df_tesla['MA_5'] = df_tesla['Close'].rolling(window=5).mean()
df_us_treasury['MA_5'] = df_us_treasury['Close'].rolling(window=5).mean()
df_gold['MA_5'] = df_gold['Close'].rolling(window=5).mean()

# 合并数据
df = pd.concat([df_tqqq[['Open', 'High', 'Low', 'Volume','Close', 'MA_5']],
               df_tesla['MA_5'],
               df_us_treasury['MA_5'],
               df_gold['MA_5']], axis=1)
df.dropna(inplace=True)

# 计算涨跌标签
df['Price_Change'] = np.where(df['Close'].diff() > 0, 1, 0)

# 特征选择
X = df.iloc[:, :-1].values
y = df['Price_Change'].values

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 线性回归模型
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
lr_pred = np.where(lr_model.predict(X_test) > 0.5, 1, 0)

# 逻辑回归模型
logreg_model = LogisticRegression()
logreg_model.fit(X_train, y_train)
logreg_pred = logreg_model.predict(X_test)

# 随机森林模型
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)

# 时间序列模型 (ARIMA)
ts_model = ARIMA(y_train, order=(1, 0, 0))
ts_model_fit = ts_model.fit()
ts_pred = np.where(ts_model_fit.predict(start=len(y_train), end=len(y_train) + len(y_test) - 1) > 0, 1, 0)

# 计算准确率
lr_accuracy = accuracy_score(y_test, lr_pred)
logreg_accuracy = accuracy_score(y_test, logreg_pred)
rf_accuracy = accuracy_score(y_test, rf_pred)
ts_accuracy = accuracy_score(y_test, ts_pred)


print("Linear Regression Accuracy: {:.2f}%".format(lr_accuracy * 100))
print("Random Forest Accuracy: {:.2f}%".format(rf_accuracy * 100))
print("Time Series (ARIMA) Accuracy: {:.2f}%".format(ts_accuracy * 100))
print("Logistic Regression Accuracy: {:.2f}%".format(logreg_accuracy * 100))

Linear Regression Accuracy: 83.44%
Random Forest Accuracy: 58.94%
Time Series (ARIMA) Accuracy: 53.64%
Logistic Regression Accuracy: 53.64%


In [27]:
X

array([[1.54641213e+01, 1.61713071e+01, 1.51352549e+01, ...,
        4.79462669e+01, 6.04999995e-01, 1.71305999e+03],
       [1.66114419e+01, 1.67128207e+01, 1.62306495e+01, ...,
        4.86448006e+01, 6.11000001e-01, 1.71512000e+03],
       [1.68685999e+01, 1.68710731e+01, 1.54814282e+01, ...,
        4.97434669e+01, 6.18800008e-01, 1.72158000e+03],
       ...,
       [2.73500004e+01, 2.76200008e+01, 2.68099995e+01, ...,
        1.85053995e+02, 3.48400002e+00, 2.01068000e+03],
       [2.80699997e+01, 2.81800003e+01, 2.72000008e+01, ...,
        1.84557996e+02, 3.51160002e+00, 2.01120000e+03],
       [2.68700008e+01, 2.73349991e+01, 2.68400002e+01, ...,
        1.84803976e+02, 3.55280004e+00, 2.01000000e+03]])

In [189]:
import yfinance as yf
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from statsmodels.tsa.arima.model import ARIMA
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, Flatten, MaxPooling1D
from tensorflow.keras.optimizers import Adam

#year='5y'
year='3y'
# 从yfinance API获取股票、国债和黄金数据
tqqq = yf.Ticker('TQQQ')
tqqq_df = tqqq.history(period=year)
bond = yf.Ticker('^TNX')
bond_df = bond.history(period=year)
gold = yf.Ticker('GC=F')
gold_df = gold.history(period=year)
tesla = yf.Ticker('TSLA')
tesla_df = tesla.history(period=year)

# 计算五日均线
tqqq_df['5_day_avg'] = tqqq_df['Close'].rolling(window=5).mean()
bond_df['5_day_avg'] = bond_df['Close'].rolling(window=5).mean()
gold_df['5_day_avg'] = gold_df['Close'].rolling(window=5).mean()
tesla_df['5_day_avg'] = tesla_df['Close'].rolling(window=5).mean()

# 合并所有数据
df = pd.DataFrame(index=tqqq_df.index)
df['Close']=tqqq_df['Close']
df['Open'] = tqqq_df['Open']
df['High'] = tqqq_df['High']
df['Low'] = tqqq_df['Low']
df['Volume'] = tqqq_df['Volume']
df['TQQQ_5_day_avg'] = tqqq_df['5_day_avg']
df['Bond_5_day_avg'] = bond_df['5_day_avg']
df['Gold_5_day_avg'] = gold_df['5_day_avg']
df['Tesla_5_day_avg'] = tesla_df['5_day_avg']

# 去掉有缺失值的行
df.dropna(inplace=True)
o_df=df.copy()
df=df.iloc[:-1]  # remove today.  should remove this line (if at night), just now because it is in the trading time
# 计算涨跌情况
df['Price_Change'] = np.where(df['Close'].diff() > 0, 1, 0)

# 用昨天的数据预测今天的收盘价涨跌情况
df_close=df[['Close','Open', 'High', 'Low', 'Volume', 'TQQQ_5_day_avg', 'Bond_5_day_avg', 'Gold_5_day_avg', 'Tesla_5_day_avg']].iloc[:-1]
X_df=df[['Open', 'Volume', 'TQQQ_5_day_avg', 'Bond_5_day_avg', 'Gold_5_day_avg', 'Tesla_5_day_avg']].iloc[:-1]
o_X_df=o_df[['Open', 'Volume', 'TQQQ_5_day_avg', 'Bond_5_day_avg', 'Gold_5_day_avg', 'Tesla_5_day_avg']].iloc[:-1]
y_df=df['Price_Change'].iloc[1:]
#X = df[['Open', 'High', 'Low', 'Volume', 'TQQQ_5_day_avg', 'Bond_5_day_avg', 'Gold_5_day_avg', 'Tesla_5_day_avg']].iloc[:-1].values
X = df[['Open', 'Volume', 'TQQQ_5_day_avg', 'Bond_5_day_avg', 'Gold_5_day_avg', 'Tesla_5_day_avg']].iloc[:-1].values
y = df['Price_Change'].iloc[1:].values


# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 线性回归模型
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
lr_pred = np.where(lr_model.predict(X_test) > 0.5, 1, 0)

# 逻辑回归模型
logreg_model = LogisticRegression()
logreg_model.fit(X_train, y_train)
logreg_pred = logreg_model.predict(X_test)

# 随机森林模型
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
'''
# 时间序列模型 (ARIMA)
ts_model = ARIMA(y_train, order=(1, 0, 0))
ts_model_fit = ts_model.fit()
ts_pred = np.where(ts_model_fit.predict(start=len(y_train), end=len(y_train) + len(y_test) - 1) > 0, 1, 0)
'''
# 计算准确率
lr_accuracy = accuracy_score(y_test, lr_pred)
logreg_accuracy = accuracy_score(y_test, logreg_pred)
rf_accuracy = accuracy_score(y_test, rf_pred)
#ts_accuracy = accuracy_score(y_test, ts_pred)


print("Linear Regression Accuracy: {:.2f}%".format(lr_accuracy * 100))
print("Random Forest Accuracy: {:.2f}%".format(rf_accuracy * 100))
#print("Time Series (ARIMA) Accuracy: {:.3f}%".format(ts_accuracy * 100))
print("Logistic Regression Accuracy: {:.3f}%".format(logreg_accuracy * 100))
print('--------')
#t_df=X_df.iloc[-1,:]
time=X_df.index[-1]

date_string = time.strftime('%Y-%m-%d %H:%M:%S')
date_string =date_string[:10]
print('Train Data Last Date: ', date_string)
print('--------')
keys=['lr_accuracy', 'rf_accuracy' ,'logreg_accuracy']
values=[lr_accuracy, rf_accuracy, logreg_accuracy]
model_l=[lr_model,rf_model,ts_model,logreg_model]
name_model_l=['lr_model','rf_model','logreg_model']
my_dict = dict(zip(keys, model_l))
X_input=o_X_df.iloc[-1:, :]
choose(keys,values,model_l,my_dict,X_input)

X_input

Linear Regression Accuracy: 68.00%
Random Forest Accuracy: 52.00%
Logistic Regression Accuracy: 59.333%
--------
Train Data Last Date:  2023-04-17
--------
The highest accuracy is lr_accuracy
The value is  0.68
Choosing model:  LinearRegression()
Input Date:  2023-04-18
---------
Close price will go up after 2023-04-18


Unnamed: 0_level_0,Open,Volume,TQQQ_5_day_avg,Bond_5_day_avg,Gold_5_day_avg,Tesla_5_day_avg
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-04-18 00:00:00-04:00,28.07,106045600,27.25,3.5116,2011.2,184.557996


In [185]:
def max_index(lst):
    """Returns the index of the maximum element in the given list"""
    max_val = max(lst)
    max_idx = lst.index(max_val)
    return max_idx

def choose(keys,values,model_l,my_dict,X_input):
    i=max_index(values)
    model=model_l[i]
    print('The highest accuracy is', keys[i] )
    print('The value is ',values[i])
    print('Choosing model: ',model)
    X_today=X_input.values
    if model==lr_model:
        
        #print(df['Close'][-1])
        
        lr_pred = np.where(lr_model.predict(X_today) > 0.5, 1, 0)
        time=X_input.index[0]
        
        date_string = time.strftime('%Y-%m-%d %H:%M:%S')
        date_string =date_string[:10]
        print('Input Date: ', date_string)
        print('---------')
        #print(logreg_pred)
        if lr_pred[0]>0:
            print('Close price will go up after', date_string)
        else:
            print('will go down after', date_string)
    else:
        model_pred = model.predict(X_today)
        if model_pred[0]>0:
            print('Close price will go up after', date_string)
        else:
            print('will go down after', date_string)

# predict using today's stats

In [175]:
# 从yfinance API获取股票、国债和黄金数据
tqqq = yf.Ticker('TQQQ')
tqqq_df = tqqq.history(period='3y')
bond = yf.Ticker('^TNX')
bond_df = bond.history(period='3y')
gold = yf.Ticker('GC=F')
gold_df = gold.history(period='3y')
tesla = yf.Ticker('TSLA')
tesla_df = tesla.history(period='3y')

# 计算五日均线
tqqq_df['5_day_avg'] = tqqq_df['Close'].rolling(window=5).mean()
bond_df['5_day_avg'] = bond_df['Close'].rolling(window=5).mean()
gold_df['5_day_avg'] = gold_df['Close'].rolling(window=5).mean()
tesla_df['5_day_avg'] = tesla_df['Close'].rolling(window=5).mean()

# 合并所有数据
df = pd.DataFrame(index=tqqq_df.index)
df['Close']=tqqq_df['Close']
df['Open'] = tqqq_df['Open']
df['High'] = tqqq_df['High']
df['Low'] = tqqq_df['Low']
df['Volume'] = tqqq_df['Volume']
df['TQQQ_5_day_avg'] = tqqq_df['5_day_avg']
df['Bond_5_day_avg'] = bond_df['5_day_avg']
df['Gold_5_day_avg'] = gold_df['5_day_avg']
df['Tesla_5_day_avg'] = tesla_df['5_day_avg']

# 去掉有缺失值的行
df.dropna(inplace=True)
#df=df.iloc[:-1]  # remove today.  should remove, just now because it is in the trading time
# 计算涨跌情况
df['Price_Change'] = np.where(df['Close'].diff() > 0, 1, 0)

# 用昨天的数据预测今天的收盘价涨跌情况
df_close=df[['Close','Open', 'High', 'Low', 'Volume', 'TQQQ_5_day_avg', 'Bond_5_day_avg', 'Gold_5_day_avg', 'Tesla_5_day_avg']].iloc[:-1]
X_df=df[['Open', 'High', 'Low', 'Volume', 'TQQQ_5_day_avg', 'Bond_5_day_avg', 'Gold_5_day_avg', 'Tesla_5_day_avg']].iloc[:-1]
y_df=df['Price_Change'].iloc[1:]
X = df[['Open', 'High', 'Low', 'Volume', 'TQQQ_5_day_avg', 'Bond_5_day_avg', 'Gold_5_day_avg', 'Tesla_5_day_avg']].iloc[:-1].values
y = df['Price_Change'].iloc[1:].values


# 逻辑回归模型
#logreg_model = LogisticRegression()
#logreg_model.fit(X_train, y_train)
#logreg_model.fit(X, y)

X_test
X_input=o_X_df.iloc[-1:, :]
#print(X_input)
#  X_today is today's stats, used to predict tomorrow's close price
X_today=X_input.values
#print(df['Close'][-1])

logreg_pred = logreg_model.predict(X_today)
time=X_input.index[0]

date_string = time.strftime('%Y-%m-%d %H:%M:%S')
date_string =date_string[:10]
print('Input Date: ', date_string)
print('---------')
#print(logreg_pred)
if logreg_pred>0:
    print('Close price will go up after', date_string)
else:
    print('will go down after', date_string)

Input Date:  2023-04-18
---------
Close price will go up after 2023-04-18


In [137]:
type(X_input.index[0])

pandas._libs.tslibs.timestamps.Timestamp