In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.model_selection import GridSearchCV

In [None]:
# Load data
stock_df = pd.read_csv('test_SP2.csv')
sentiment_df = pd.read_csv('output.csv')
sentiment_df.rename(columns={'time': 'Date'}, inplace=True)


In [None]:
pd.set_option('display.max_rows', None) 
sentiment_df.sort_values('Date', inplace=True)


In [None]:
sentiment_df['Date'] = pd.to_datetime(sentiment_df['Date'], format='%Y%m%d')

sentiment_df['WeekDay'] = sentiment_df['Date'].dt.weekday

# Identify cells representing Saturday or Sunday
weekend_mask = (sentiment_df['WeekDay'] == 5) | (sentiment_df['WeekDay'] == 6)

# Update 'WeekDay' and 'Date' columns
sentiment_df.loc[weekend_mask, 'WeekDay'] = 4  # Update Saturday to Friday (offset of 1)
sentiment_df.loc[weekend_mask, 'Date'] = sentiment_df.loc[weekend_mask, 'Date'] - pd.DateOffset(days=2)  # Update Sunday to Friday (offset of 2)

sentiment_df['WeekDay']=sentiment_df['Date'].dt.strftime('%A')

#刪除0like貼文
sentiment_df = sentiment_df[sentiment_df['like'] != 0.0]
sentiment_df = sentiment_df.sort_index()

sentiment_df

Unnamed: 0,Date,tag,content,like,donate,comment,sentiment_score,WeekDay
0,2023-05-05,"['加權指數', '宏達電', '友達', '群創', '台積電']",別說只發布社團討論，這邊我們也可以討論，大同，目前我還沒有，但是我很想要🥹可能我的生技做完後...,139.0,3.0,16.0,0.001224031,Friday
1,2023-05-05,"['加權指數', '宏達電', '友達', '群創', '台積電']",目前從台指觀察來看，每次爆量就會被打到原點，也就是說只要量縮，就會避免台股的殺盤，原以為昨天...,197.0,2.0,22.0,1.190485e-05,Friday
2,2023-05-05,"['加權指數', '宏碁', '明基材', '飛宏', '富邦媒']",各位午安今天不少股高位股出現不好的技術線樣貌，操做上都要很謹慎直接看圖，也歡迎貼上你懷疑不太...,70.0,0.0,20.0,0.6971134,Friday
3,2023-05-05,"['加權指數', '台積電', '宏碁', '欣興', '南電']",各位晚安我以3037 欣興舉例 剛站上均線 vs 六均多排站上均線不夠，六均依序多排才更重要...,53.0,0.0,11.0,0.2138782,Friday
4,2023-05-05,"['加權指數', '宏達電', '友達', '群創', '台積電']",經我評估之後，這個選股邏輯可以發布沒關係，下半年的選股方向要找到這些條件才可以做長遠的規劃。...,176.0,3.0,21.0,8.878923e-07,Friday
5,2023-05-05,"['加權指數', '艾笛森', '明基材', '飛宏', '森崴能源']",2023.05.05【櫃買指數持續走強，加權指數橫盤震盪】今天加權指數與櫃買指數分別上漲 0...,65.0,5.0,13.0,0.0001541535,Friday
7,2023-05-05,['加權指數'],加權指數05/05(五)收盤15626，漲跌17點什麼！今天收盤竟然是15626就是那個吉米...,74.0,25.0,33.0,0.8114062,Friday
8,2023-05-05,"['加權指數', '台積電', '聯發科', '創意', '國巨']",5/5 盤後心得祝大家週末愉快!指數30分線繼續橫盤，日線上反彈到20MA/50MA這一帶，...,150.0,48.0,40.0,1.267919e-09,Friday
9,2023-05-05,"['加權指數', '國泰永續高股息', '國泰費城半導體', '台積電', '華新科']",各位午安近來指數沒行情小股也不好整体給人意興闌珊之感今天幾檔強股，都是之前壓力挺大之個股，如...,93.0,0.0,25.0,0.03984263,Friday
11,2023-05-05,['加權指數'],車商說買德國車，新車交期要等一年！德國聯邦統計局說「三月汽車部門訂單近腰斬」德國3月工廠訂單...,44.0,1.0,27.0,0.2577322,Friday


In [None]:
#輿情like加權平均
sentiment_df['Sentiment'] = sentiment_df['like'] * sentiment_df['sentiment_score']

#One hot encoding
from sklearn.preprocessing import OneHotEncoder
sentiment_df = sentiment_df.reset_index()
onehotencoder = OneHotEncoder()
data_str_ohe = onehotencoder.fit_transform(sentiment_df[['WeekDay']]).toarray()
sentiment_df = pd.concat([sentiment_df, pd.DataFrame(data_str_ohe, columns=onehotencoder.get_feature_names_out(['WeekDay']))], axis=1)

#dropping_columns
sentiment_df.drop(['tag', 'content', 'like', 'donate', 'comment', 'sentiment_score', 'WeekDay'], axis=1, inplace=True)

In [None]:
#Group by 'Date' and calculate the mean sentiment
sentiment_df = sentiment_df.groupby('Date', as_index=False).mean()
sentiment_df.drop(['index'], axis=1, inplace=True)


In [None]:
stock_df['Date'] = pd.to_datetime(stock_df['Date'], format='%Y-%m-%d')
sentiment_df['Date'] = pd.to_datetime(sentiment_df['Date'], format='%Y%m%d')

In [None]:
merged_df = sentiment_df.merge(stock_df[['Date', 'RSI', 'DIF', 'MACD', 'Pillar', 'Open', 'Close']], on='Date', how='left')

In [None]:
merged_df['RESULT'] = np.where(merged_df['Close'] > merged_df['Open'], '+', np.where(merged_df['Close'] < merged_df['Open'], '-', 'x'))
merged_df['RESULT'] = merged_df['RESULT'].shift(-1)
merged_df.drop(['Close'], axis=1, inplace=True)
merged_df = merged_df[merged_df['RESULT'].notnull()]

merged_df

Unnamed: 0,Date,Sentiment,WeekDay_Friday,WeekDay_Monday,WeekDay_Thursday,WeekDay_Tuesday,WeekDay_Wednesday,RSI,DIF,MACD,Pillar,Open,RESULT
0,2023-05-05,12.054398,1.0,0.0,0.0,0.0,0.0,82.142877,0.384616,0.194166,0.19045,117.699997,+
1,2023-05-08,17.040261,0.0,1.0,0.0,0.0,0.0,76.744203,0.489886,0.25331,0.236576,118.5,-
2,2023-05-10,29.792792,0.0,0.0,0.0,0.0,1.0,60.714383,0.582292,0.373729,0.208564,118.800003,-
3,2023-05-11,182.499729,0.0,0.0,1.0,0.0,0.0,50.909146,0.545993,0.408181,0.137811,118.25,+
4,2023-05-12,34.125324,1.0,0.0,0.0,0.0,0.0,39.285617,0.48341,0.423227,0.060183,117.150002,+
5,2023-05-16,28.813217,0.0,0.0,0.0,1.0,0.0,45.161251,0.492931,0.43807,0.05486,117.900002,+
6,2023-05-18,29.302555,0.0,0.0,1.0,0.0,0.0,93.396125,0.969594,0.585174,0.38442,121.800003,-
7,2023-05-23,81.6778,0.0,0.0,0.0,1.0,0.0,92.857078,1.453091,0.961254,0.491837,122.300003,-
8,2023-05-24,7.16146,0.0,0.0,0.0,0.0,1.0,68.420982,1.464407,1.061885,0.402523,121.650002,+
9,2023-05-26,31.284278,1.0,0.0,0.0,0.0,0.0,82.857114,1.870172,1.305201,0.56497,125.0,x


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [None]:
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical

# Step 2: Split the data into features (X) and labels (Y)
X = merged_df[['Sentiment', 'WeekDay_Friday', 'WeekDay_Monday', 'WeekDay_Thursday',	'WeekDay_Tuesday',	'WeekDay_Wednesday',	'RSI', 'DIF',	'MACD',	'Pillar',	'Open']]
Y = merged_df['RESULT']

# Step 3: Split the data into train and test sets based on the date
train_end_date = pd.to_datetime('2023-05-15')
train_df = merged_df[merged_df['Date'] >= train_end_date]
test_df = merged_df[merged_df['Date'] < train_end_date]

X_train = train_df[['Sentiment', 'WeekDay_Friday', 'WeekDay_Monday', 'WeekDay_Thursday',	'WeekDay_Tuesday',	'WeekDay_Wednesday',	'RSI', 'DIF',	'MACD',	'Pillar',	'Open']]
Y_train = train_df['RESULT']
X_test = test_df[['Sentiment', 'WeekDay_Friday', 'WeekDay_Monday', 'WeekDay_Thursday',	'WeekDay_Tuesday',	'WeekDay_Wednesday',	'RSI', 'DIF',	'MACD',	'Pillar',	'Open']]
Y_test = test_df['RESULT']

# Step 4: Normalize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Step 5: Reshape the features
X_train = X_train.reshape(X_train.shape[0], 1, X_train.shape[1])
X_test = X_test.reshape(X_test.shape[0], 1, X_test.shape[1])

# Step 6: Encode the categorical labels
label_encoder = LabelEncoder()
Y_train = label_encoder.fit_transform(Y_train)
Y_test = label_encoder.transform(Y_test)

# Step 7: Perform one-hot encoding on the categorical labels
num_classes = len(label_encoder.classes_)
Y_train = to_categorical(Y_train, num_classes=num_classes)
Y_test = to_categorical(Y_test, num_classes=num_classes)


# Step 8: Build and train the LSTM model
model = Sequential()
model.add(LSTM(units=64, input_shape=(1, X_train.shape[2])))
model.add(Dense(units=Y_train.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, Y_train, epochs=10, batch_size=32)

# Step 9: Get predictions on the test set
class_names = label_encoder.classes_
y_pred_prob = model.predict(X_test)
y_pred_labels = [class_names[label] for label in y_pred_prob.argmax(axis=1)]
Y_test_labels = [class_names[label] for label in Y_test.argmax(axis=1)]
loss, accuracy = model.evaluate(X_test, Y_test)
print('Test loss:', loss, 'Test accuracy:', accuracy)

print(y_pred_labels,'\n', Y_test_labels)
# Step 10: Generate Classification Report
report = classification_report(Y_test_labels, y_pred_labels)
print('Classification Report:\n', report)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test loss: 1.0320031642913818 Test accuracy: 0.20000000298023224
['-', '+', '+', '-', '+'] 
 ['+', '-', '-', '+', '+']
Classification Report:
               precision    recall  f1-score   support

           +       0.33      0.33      0.33         3
           -       0.00      0.00      0.00         2

    accuracy                           0.20         5
   macro avg       0.17      0.17      0.17         5
weighted avg       0.20      0.20      0.20         5



In [None]:
#調參: learning rate
import tensorflow as tf

best_acc = 0
best_lr = 0
for i in range(1,101):
  cur_lr = i / 1000
  optimizer = tf.keras.optimizers.Adam(learning_rate=cur_lr)
  model = Sequential()
  model.add(LSTM(units=64, input_shape=(1, X_train.shape[2])))
  model.add(Dense(units=Y_train.shape[1], activation='softmax'))
  model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
  model.fit(X_train, Y_train, epochs=10, batch_size=32)

  y_pred_prob = model.predict(X_test)
  loss, cur_acc = model.evaluate(X_test, Y_test)
  if cur_acc > best_acc:
    best_acc = cur_acc
    best_lr = cur_lr
  
print(best_lr, best_acc)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10








Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10








Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
E

In [None]:
#調參: Units (learning_rate=0.001)
best_acc = 0
best_units = 0
for i in range(10,101):
  optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
  model = Sequential()
  model.add(LSTM(units=i, input_shape=(1, X_train.shape[2])))
  model.add(Dense(units=Y_train.shape[1], activation='softmax'))
  model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
  model.fit(X_train, Y_train, epochs=10, batch_size=32)

  y_pred_prob = model.predict(X_test)
  loss, cur_acc = model.evaluate(X_test, Y_test)
  if cur_acc > best_acc:
    best_acc = cur_acc
    best_units = i
  
print(best_acc, best_units)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
E

In [None]:
#調參: Units (learning_rate=0.003)
best_acc = 0
best_units = 0
for i in range(10,101):
  optimizer = tf.keras.optimizers.Adam(learning_rate=0.003)
  model = Sequential()
  model.add(LSTM(units=i, input_shape=(1, X_train.shape[2])))
  model.add(Dense(units=Y_train.shape[1], activation='softmax'))
  model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
  model.fit(X_train, Y_train, epochs=10, batch_size=32)

  y_pred_prob = model.predict(X_test)
  loss, cur_acc = model.evaluate(X_test, Y_test)
  if cur_acc > best_acc:
    best_acc = cur_acc
    best_units = i
  
print(best_acc, best_units)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
E