In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, SimpleRNN, LSTM, Dense

In [24]:
# Step 1: Data Preprocessing
# Read the raw tweet export into `df`, decoding the file as latin-1.
df = pd.read_csv(
    filepath_or_buffer="twitter_data.csv",
    encoding="latin-1",
)

In [25]:
# Keep only rows that have a 'tweet_text' value and store that column as `str`.
# `df` is rebound to the result of one chained expression instead of being
# mutated with `inplace=True`, so the cell gives the same result when re-run.
df = (
    df
    .dropna(subset=['tweet_text'])
    .assign(tweet_text=lambda frame: frame['tweet_text'].astype(str))
)


In [34]:
# Define labels
# The label column must be created *before* it is inspected or split on.
# An earlier run of this notebook printed `Unique label values: [nan]`, i.e.
# exact-match lookup of the short keys below matched nothing in the raw column,
# and the old range check could not notice (comparisons with NaN are False).
LABEL_COLUMN = 'is_there_an_emotion_directed_at_a_brand_or_product'
label_map = {'positive': 0, 'negative': 1, 'neutral': 2, 'no_idea': 3}


def _encode_emotion(raw):
    """Translate one raw annotation into its class id from `label_map`.

    Matching is case-insensitive and keyword based, so the short keys of
    `label_map` and longer phrases that contain them resolve to the same id.

    Returns:
        The integer class id, or None when the value is missing or unrecognised.
    """
    if not isinstance(raw, str):
        return None
    text = raw.strip().lower()
    if text in label_map:
        return label_map[text]
    # NOTE(review): the phrases below are assumptions about the raw annotation
    # text (e.g. "Positive emotion", "No emotion toward brand or product",
    # "I can't tell") -- confirm with df[LABEL_COLUMN].value_counts().
    if 'positive' in text:
        return label_map['positive']
    if 'negative' in text:
        return label_map['negative']
    if 'no emotion' in text or 'neutral' in text:
        return label_map['neutral']
    if "can't tell" in text or 'no idea' in text:
        return label_map['no_idea']
    return None


df = df.assign(label=df[LABEL_COLUMN].map(_encode_emotion))

# Rows whose annotation could not be mapped cannot be used for supervised
# training; report them instead of silently carrying NaN labels forward.
unmapped = df['label'].isna()
if unmapped.any():
    print(
        "Warning: dropping", int(unmapped.sum()),
        "rows with unrecognised labels:", df.loc[unmapped, LABEL_COLUMN].unique(),
    )
df = df.loc[~unmapped].assign(label=lambda frame: frame['label'].astype(int))

if df.empty:
    raise ValueError(
        "No rows with a recognised label; check the values in " + repr(LABEL_COLUMN)
    )

# Print unique label values to check for anomalies
print("Unique label values:", df['label'].unique())

# Ensure that label values are within the expected range [0, 3]
if df['label'].min() < 0 or df['label'].max() >= 4:
    raise ValueError("Label values outside the valid range [0, 3]")

# Split the data into training and testing sets (done once, after labels exist)
X_train, X_test, y_train, y_test = train_test_split(df['tweet_text'], df['label'], test_size=0.2, random_state=42)


In [35]:
# Tokenization and padding
# The tokenizer is fitted on the training tweets only; the same fitted
# tokenizer then encodes both splits.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# The longest training sequence sets the common length used for both splits.
max_len = max(map(len, X_train_seq))
X_train_padded, X_test_padded = (
    pad_sequences(seqs, maxlen=max_len) for seqs in (X_train_seq, X_test_seq)
)


In [28]:
# Define the model
# Embedding -> LSTM -> 4-way softmax, declared as a single layer list.
model = Sequential([
    Embedding(
        input_dim=len(tokenizer.word_index) + 1,
        output_dim=100,
        input_length=max_len,
    ),
    LSTM(units=128),
    Dense(units=4, activation='softmax'),
])


In [38]:
# Compile the model
# Adam optimizer, sparse categorical cross-entropy loss, accuracy as the metric.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)


In [None]:
# Train the model
# No cell in this notebook calls `fit`, so evaluating straight after `compile`
# would score randomly initialised weights. Train on the padded training split first.
# NOTE(review): epochs and batch_size are untuned starting values -- adjust as needed.
model.fit(X_train_padded, y_train, epochs=5, batch_size=32)

# Evaluate the model on the held-out test split
loss, accuracy = model.evaluate(X_test_padded, y_test)
print("Test Accuracy:", accuracy)