In [1]:
import pandas as pd
import re
import string
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


In [2]:
# Load the data
data = pd.read_csv('/content/tweet_emotions.csv')

In [3]:
# Drop rows with missing values
data.dropna(subset=['sentiment', 'content'], inplace=True)

In [4]:
# Define a function to clean the text
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)  # Remove mentions
    text = re.sub(r'#\w+', '', text)  # Remove hashtags
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = ' '.join(word for word in text.split() if word not in stopwords.words('english'))  # Remove stopwords
    return text


In [6]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [7]:
# Apply the text cleaning function to the content column
data['cleaned_content'] = data['content'].apply(clean_text)

In [8]:
# Encode sentiment labels to numerical values
label_encoder = LabelEncoder()
data['sentiment_encoded'] = label_encoder.fit_transform(data['sentiment'])

In [9]:
# Display the first few rows of the cleaned data
print(data.head())

     tweet_id   sentiment                                            content  \
0  1956967341       empty  @tiffanylue i know  i was listenin to bad habi...   
1  1956967666     sadness  Layin n bed with a headache  ughhhh...waitin o...   
2  1956967696     sadness                Funeral ceremony...gloomy friday...   
3  1956967789  enthusiasm               wants to hang out with friends SOON!   
4  1956968416     neutral  @dannycastillo We want to trade with someone w...   

                                     cleaned_content  sentiment_encoded  
0  know listenin bad habit earlier started freaki...                  2  
1             layin n bed headache ughhhhwaitin call                 10  
2                      funeral ceremonygloomy friday                 10  
3                            wants hang friends soon                  3  
4             want trade someone houston tickets one                  8  


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Limit to top 5000 features for performance




In [11]:
# Fit and transform the cleaned text data to TF-IDF features
X_tfidf = tfidf_vectorizer.fit_transform(data['cleaned_content'])

In [12]:
# Display the shape of the TF-IDF feature matrix
print(X_tfidf.shape)

(40000, 5000)


In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, data['sentiment_encoded'], test_size=0.2, random_state=42)


In [14]:
# Initialize the Logistic Regression model
log_reg = LogisticRegression(max_iter=1000)


In [15]:
# Train the model
log_reg.fit(X_train, y_train)


In [16]:
# Make predictions on the test set
y_pred = log_reg.predict(X_test)


In [17]:
# Evaluate the model
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

              precision    recall  f1-score   support

       anger       0.00      0.00      0.00        19
     boredom       0.00      0.00      0.00        31
       empty       0.33      0.01      0.01       162
  enthusiasm       0.00      0.00      0.00       163
         fun       0.11      0.01      0.03       338
   happiness       0.34      0.37      0.35      1028
        hate       0.51      0.16      0.24       268
        love       0.51      0.37      0.43       762
     neutral       0.33      0.57      0.42      1740
      relief       0.37      0.02      0.04       352
     sadness       0.33      0.24      0.28      1046
    surprise       0.34      0.05      0.09       425
       worry       0.33      0.48      0.39      1666

    accuracy                           0.35      8000
   macro avg       0.27      0.18      0.18      8000
weighted avg       0.34      0.35      0.31      8000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [18]:
#LSTM MODEL
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Tokenize the text data
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(data['cleaned_content'])
X_tokenized = tokenizer.texts_to_sequences(data['cleaned_content'])

# Pad the sequences to have the same length
X_padded = pad_sequences(X_tokenized, maxlen=100)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_padded, data['sentiment_encoded'], test_size=0.2, random_state=42)

# Build the LSTM model
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128, input_length=100))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(len(label_encoder.classes_), activation='softmax'))

# Compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_test, y_test), verbose=1)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f'Test Accuracy: {accuracy}')

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test Accuracy: 0.3266249895095825


In [23]:
#Evaluating and Comparing the models
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_model(y_test, y_pred):
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    return accuracy, precision, recall, f1

