In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load dataset
data = pd.read_csv('Stock_data.csv')

# Display first few rows of the dataset
print(data.head())

# Check for missing values
print(data.isnull().sum())

# Drop missing values if any
data = data.dropna()

# Encode sentiment labels (assuming they are in a column named 'Sentiment')
le = LabelEncoder()
data['Sentiment'] = le.fit_transform(data['Sentiment'])

# Split dataset into features and labels
X = data['Text']
y = data['Sentiment']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Tokenization
tokenizer = Tokenizer(num_words=5000)  # Consider top 5000 words
tokenizer.fit_on_texts(X_train)

# Convert texts to sequences
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad sequences to ensure uniform input size
max_length = max(len(x) for x in X_train_seq)
X_train_pad = pad_sequences(X_train_seq, maxlen=max_length, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_length, padding='post')

# Display shapes of data
print(X_train_pad.shape, y_train.shape)
print(X_test_pad.shape, y_test.shape)


                                                Text  Sentiment
0  Kickers on my watchlist XIDE TIT SOQ PNK CPW B...          1
1  user: AAP MOVIE. 55% return for the FEA/GEED i...          1
2  user I'd be afraid to short AMZN - they are lo...          1
3                                  MNTA Over 12.00            1
4                                   OI  Over 21.37            1
Text         0
Sentiment    0
dtype: int64
(4632, 34) (4632,)
(1159, 34) (1159,)


In [13]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# Define LSTM model
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128, input_length=max_length))  # Embedding layer
model.add(LSTM(128, return_sequences=True))  # LSTM layer
model.add(Dropout(0.5))  # Dropout to prevent overfitting
model.add(LSTM(64))  # Second LSTM layer
model.add(Dropout(0.5))  # Dropout
model.add(Dense(1, activation='sigmoid'))  # Output layer for binary classification

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Display model summary
model.summary()




In [14]:
# Train the model
history = model.fit(X_train_pad, y_train, epochs=10, batch_size=64, validation_split=0.2)


Epoch 1/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 67ms/step - accuracy: 0.6284 - loss: 0.6606 - val_accuracy: 0.6268 - val_loss: 0.6536
Epoch 2/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 61ms/step - accuracy: 0.7146 - loss: 0.5622 - val_accuracy: 0.7745 - val_loss: 0.4920
Epoch 3/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 75ms/step - accuracy: 0.8898 - loss: 0.3168 - val_accuracy: 0.7756 - val_loss: 0.4849
Epoch 4/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 71ms/step - accuracy: 0.9278 - loss: 0.2154 - val_accuracy: 0.7433 - val_loss: 0.5643
Epoch 5/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 71ms/step - accuracy: 0.9486 - loss: 0.1725 - val_accuracy: 0.7594 - val_loss: 0.6393
Epoch 6/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 62ms/step - accuracy: 0.9671 - loss: 0.1213 - val_accuracy: 0.7530 - val_loss: 0.8493
Epoch 7/10
[1m58/58[0m [32m━━━━

In [15]:
# Evaluate the model
loss, accuracy = model.evaluate(X_test_pad, y_test)
print(f'Test Accuracy: {accuracy * 100:.2f}%')


[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.7407 - loss: 0.9643
Test Accuracy: 74.63%


In [1]:
def predict_sentiment(text):
    sequence = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(sequence, maxlen=max_length, padding='post')
    prediction = model.predict(padded)
    # Assuming binary classification: 0 for Negative, 1 for Positive
    if prediction[0] > 0.5:
        print("Sentiment: Positive")
    else:
        print("Sentiment: Negative")

# Example usage
new_text = "The stock market is  bad today."
predict_sentiment(new_text)


NameError: name 'tokenizer' is not defined

In [17]:
model.save('sentiment_model.h5')





In [18]:
test_texts = [
    "The stock market is performing well today.",
    "The stock market is performing bad today.",
    "I'm very happy with the profits I've made.",
    "I'm disappointed with the losses this quarter.",
    "It's a great time to invest in stocks!"
    
]

for text in test_texts:
    print(f'Text: "{text}"')
    predict_sentiment(text)


Text: "The stock market is performing well today."
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
Sentiment: Positive
Text: "The stock market is performing bad today."
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
Sentiment: Positive
Text: "I'm very happy with the profits I've made."
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step
Sentiment: Positive
Text: "I'm disappointed with the losses this quarter."
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
Sentiment: Negative
Text: "It's a great time to invest in stocks!"
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
Sentiment: Positive
