# In [None]:
import numpy as np
import pandas as pd
import nltk
import re
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# The CSV is read without a header row. Only the `target` and `text` columns
# are used by the rest of the script, so the other four are never parsed.
dataset_path = "D:/sentiment140/training.1600000.processed.noemoticon.csv"
df = pd.read_csv(
    dataset_path,
    encoding='latin-1',
    header=None,
    names=["target", "id", "date", "flag", "user", "text"],
    usecols=["target", "text"],
)

# Down-sample first so the label mapping only touches the rows that are kept;
# the fixed seed keeps the selection reproducible.
df = df.sample(250000, random_state=42)

# Binary label: a raw value of 4 becomes 1, every other value becomes 0.
# `assign` builds a new frame, avoiding chained-assignment warnings.
df = df.assign(target=(df["target"] == 4).astype("int64"))

# Resources needed by nltk.word_tokenize and the stop-word filter.
# Recent NLTK releases load the sentence tokenizer from 'punkt_tab', while
# older ones use 'punkt'; fetching both keeps word_tokenize working on either.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# A set gives O(1) membership tests when filtering tokens.
stop_words = set(nltk.corpus.stopwords.words('english'))

# Compiled once: clean_text is applied to every row of the dataset.
_URL_PATTERN = re.compile(r'http\S+|www\S+')
_APOSTROPHE_PATTERN = re.compile(r"['’]")
_NON_LETTER_PATTERN = re.compile(r'[^a-z]+')


def clean_text(text: str) -> str:
    """Normalise raw text into a space-joined string of lowercase word tokens.

    Steps, in order:
      1. Lowercase the text, so URL matching is case-insensitive.
      2. Remove URLs (anything starting with ``http`` or ``www``).
      3. Delete apostrophes so contractions stay one token ("don't" -> "dont").
      4. Replace every other run of non-letter characters (digits,
         punctuation, tabs, newlines) with a single space, so words that were
         only separated by punctuation are not fused ("omg...so" -> "omg so").
      5. Tokenise with ``nltk.word_tokenize`` and drop tokens found in the
         module-level ``stop_words`` set.

    Args:
        text: Raw input string.

    Returns:
        The remaining tokens joined by single spaces; an empty string when
        nothing survives the filtering.
    """
    lowered = text.lower()
    without_urls = _URL_PATTERN.sub(' ', lowered)
    without_apostrophes = _APOSTROPHE_PATTERN.sub('', without_urls)
    letters_only = _NON_LETTER_PATTERN.sub(' ', without_apostrophes)

    tokens = nltk.word_tokenize(letters_only)
    # Only a-z and spaces reach the tokenizer, so every token is alphabetic
    # already and no separate isalpha() filter is needed.
    return " ".join(word for word in tokens if word not in stop_words)

df["clean_text"] = df["text"].apply(clean_text)
labels = df["target"].values

# Choose the train/test rows *before* building the vocabulary. Splitting row
# positions with the same test_size/random_state picks the same rows that
# splitting the padded matrix directly would pick.
train_idx, test_idx = train_test_split(
    np.arange(len(df)), test_size=0.2, random_state=42
)

# Fit the vocabulary on the training rows only. Held-out rows are then encoded
# the same way unseen text is at inference time: words that are not in the
# vocabulary are dropped by texts_to_sequences.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df["clean_text"].iloc[train_idx])
sequences = tokenizer.texts_to_sequences(df["clean_text"])
word_index = tokenizer.word_index

# Every sequence is zero-padded at the end (or truncated) to this fixed length.
max_length = 100
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

X_train, X_test = padded_sequences[train_idx], padded_sequences[test_idx]
y_train, y_test = labels[train_idx], labels[test_idx]

# One embedding row per word index, plus one for index 0 (the padding value).
vocab_size = 1 + len(word_index)
embedding_dim = 100

# Stacked recurrent classifier: a bidirectional LSTM that emits the full
# sequence, a second LSTM that reduces it to one vector, then a small dense
# head ending in a single sigmoid unit.
# NOTE(review): the Embedding layer applies no mask, so the recurrent layers
# also step over padded positions - consider mask_zero=True after confirming
# the backend accepts fully padded rows.
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim))
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Dropout(0.3))
model.add(LSTM(32))
model.add(Dropout(0.3))
model.add(Dense(16, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Binary cross-entropy pairs with the single sigmoid output unit.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# The held-out split is passed as validation data, so its loss and accuracy
# are reported after every epoch.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=256,
    epochs=10,
    verbose=1,
    validation_data=(X_test, y_test),
)

def predict_sentiment(text):
    """Classify a single piece of text as "Positive" or "Negative".

    The text goes through the same steps as the training data: clean_text,
    the fitted tokenizer, and post-padding to max_length.

    Args:
        text: Raw input string.

    Returns:
        "Positive" when the model's output is above 0.5, otherwise "Negative".
    """
    cleaned = clean_text(text)
    encoded = tokenizer.texts_to_sequences([cleaned])
    model_input = pad_sequences(encoded, maxlen=max_length, padding='post')

    # predict returns one row per input; take the single score of the only row.
    score = model.predict(model_input)[0, 0]
    if score > 0.5:
        return "Positive"
    return "Negative"

# Quick manual check of the trained model on two hand-written sentences.
for example in ("I love this phone!", "I hate this movie!"):
    print(predict_sentiment(example))
