In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler

In [2]:
# Load datasets
combined_news = pd.read_csv('/content/drive/MyDrive/GoogleGirlsScript/286Dailynews/Combined_News_DJIA.csv')
reddit_news = pd.read_csv('/content/drive/MyDrive/GoogleGirlsScript/286Dailynews/RedditNews.csv')
djia_data = pd.read_csv('/content/drive/MyDrive/GoogleGirlsScript/286Dailynews/upload_DJIA_table.csv')


In [3]:
# Combine top news headlines into a single column
combined_news['Combined_News'] = combined_news.iloc[:, 2:].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)
combined_news = combined_news[['Date', 'Label', 'Combined_News']]

In [4]:
# Merge with DJIA data
djia_data['Date'] = pd.to_datetime(djia_data['Date'])
combined_news['Date'] = pd.to_datetime(combined_news['Date'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combined_news['Date'] = pd.to_datetime(combined_news['Date'])


In [5]:
data = pd.merge(combined_news, djia_data, on='Date')

In [6]:
# Feature extraction from text data
vectorizer = TfidfVectorizer(max_features=5000)
news_tfidf = vectorizer.fit_transform(data['Combined_News']).toarray()

In [7]:
# Combine with stock data
stock_features = data[['Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close']].values
scaler = StandardScaler()
stock_features_scaled = scaler.fit_transform(stock_features)

In [8]:
X = np.hstack((news_tfidf, stock_features_scaled))
y = data['Label'].values

In [9]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.regularizers import l2



In [11]:
# Define the model with regularization and dropout
model = Sequential()
model.add(Dense(512, input_dim=X_train.shape[1], activation='relu', kernel_regularizer=l2(0.01)))
model.add(Dropout(0.5))
model.add(Dense(256, activation='relu', kernel_regularizer=l2(0.01)))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

In [12]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [13]:
# Train the model with early stopping
from tensorflow.keras.callbacks import EarlyStopping

In [14]:
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

In [15]:
history = model.fit(X_train, y_train, epochs=100, batch_size=64, validation_data=(X_test, y_test), callbacks=[early_stopping])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100


In [16]:
# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy*100:.2f}%')

Test Accuracy: 57.29%


In [17]:
# Predict and calculate additional metrics if necessary
from sklearn.metrics import classification_report
y_pred = (model.predict(X_test) > 0.5).astype("int32")
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.01      0.01       171
           1       0.57      1.00      0.73       227

    accuracy                           0.57       398
   macro avg       0.79      0.50      0.37       398
weighted avg       0.76      0.57      0.42       398



In [19]:
model.save('stock_prediction_model.h5')
