In [1]:
!pip install nltk



In [3]:
# Download resources and libraries
import pandas as pd
import nltk
import re
import string
import nltk

nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Read and display CSV
df = pd.read_csv('/content/tweet_emotions.csv')
df.head()

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...


In [4]:
# Data exploration

df.shape

(40000, 3)

In [5]:
df.isna().sum()

Unnamed: 0,0
tweet_id,0
sentiment,0
content,0


In [6]:
df.duplicated().sum()

np.int64(0)

In [7]:
x = df['content']
y = df['sentiment']

In [8]:
#Text Preprocessing

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

def preprocess_text(X):
    tokens = word_tokenize(X)

    tokens = [word.lower() for word in tokens if word.isalpha()]  # remove punctuation

    tokens = [word for word in tokens if word not in stopwords.words('english')]

    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return " ".join(tokens)

In [9]:
# Apply Preprocessing

processed_tweets = [preprocess_text(tweet) for tweet in x]

print(processed_tweets[:3])

['tiffanylue know listenin bad habit earlier started freakin part', 'layin n bed headache ughhhh waitin call', 'funeral ceremony gloomy friday']


In [10]:
# Encode Labels

from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
y_encoded[:10]

array([ 2, 10, 10,  3,  8, 12, 10, 12, 10, 10])

Traditional machine learning model

In [13]:
# Vectorization + ML Model

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

vectorizer = TfidfVectorizer()
X_vectorized = vectorizer.fit_transform(processed_tweets)

In [14]:
# Train-test split

X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y_encoded, test_size=0.2, random_state=42)

In [15]:
log_reg = LogisticRegression(max_iter=1000, multi_class='multinomial')
log_reg.fit(X_train, y_train)




In [16]:
# Predict
y_pred = log_reg.predict(X_test)


In [17]:
# Evaluate
print(accuracy_score(y_test, y_pred))

0.344875


Deep Learning - Simple RNN

In [32]:
# Deep Learning
!pip install tensorflow



In [4]:
# Import deep learning libraries
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense

In [5]:
df = pd.read_csv('/content/tweet_emotions.csv')

In [6]:
x = df['content']
y = df['sentiment']

In [7]:
# split train test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [8]:
# Function to clean text
def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    # Replace newlines with spaces
    text = text.replace('\n', ' ')
    # Remove characters that are not letters or spaces
    text = re.sub(r'[^a-z\s]', '', text)
    return text

In [9]:
# Apply cleaning to the text data
X_train_cleaned = X_train.apply(clean_text)
X_test_cleaned = X_test.apply(clean_text)

In [10]:
# Tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train_cleaned)

# Convert text to sequences of integers
X_train_sequences = tokenizer.texts_to_sequences(X_train_cleaned)
X_test_sequences = tokenizer.texts_to_sequences(X_test_cleaned)

# Pad sequences to a maximum length of 300
max_length = 300
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_length, padding='post', truncating='post')
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_length, padding='post', truncating='post')

print("Shape of X_train_padded:", X_train_padded.shape)
print("Shape of X_test_padded:", X_test_padded.shape)

Shape of X_train_padded: (32000, 300)
Shape of X_test_padded: (8000, 300)


In [11]:
# Encode the target variable
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# Encode the target variable
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Convert to one-hot encoding
y_train_categorical = to_categorical(y_train_encoded)
y_test_categorical = to_categorical(y_test_encoded)

print("Shape of y_train_categorical:", y_train_categorical.shape)
print("Shape of y_test_categorical:", y_test_categorical.shape)

Shape of y_train_categorical: (32000, 13)
Shape of y_test_categorical: (8000, 13)


In [13]:
# Get vocabulary size from tokenizer
vocab_size = len(tokenizer.word_index) + 1

model = Sequential()
model.add(Embedding(vocab_size, 10, input_shape=(max_length,)))
model.add(SimpleRNN(50))
model.add(Dense(50, activation='relu'))
model.add(Dense(y_train_categorical.shape[1], activation='softmax'))

  super().__init__(**kwargs)


In [14]:
model.summary()

In [15]:
model.compile(loss = 'binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [17]:
model.fit(X_train_padded, y_train_categorical, epochs=20, batch_size=128)

Epoch 1/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 80ms/step - accuracy: 0.1553 - loss: 0.3437
Epoch 2/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 97ms/step - accuracy: 0.2138 - loss: 0.2365
Epoch 3/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 101ms/step - accuracy: 0.2140 - loss: 0.2361
Epoch 4/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 85ms/step - accuracy: 0.2133 - loss: 0.2363
Epoch 5/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 85ms/step - accuracy: 0.2147 - loss: 0.2368
Epoch 6/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 87ms/step - accuracy: 0.2098 - loss: 0.2365
Epoch 7/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 84ms/step - accuracy: 0.2119 - loss: 0.2368
Epoch 8/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 84ms/step - accuracy: 0.2142 - loss: 0.2363
Epoch 9/20
[1m250/250

<keras.src.callbacks.history.History at 0x79480dbee420>

In [18]:
model.evaluate(X_test_padded,y_test_categorical)

[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 16ms/step - accuracy: 0.2107 - loss: 0.2360


[0.23663999140262604, 0.2082500010728836]

Deep Learning - LSTM

In [19]:
# LSTM

from tensorflow.keras.layers import LSTM

In [20]:
model1 = Sequential()
model1.add(Embedding(vocab_size, 10, input_shape=(max_length,)))
model1.add(LSTM(50,return_sequences = True))
model1.add(LSTM(128,dropout = 0.2))
model1.add(Dense(50, activation='relu'))
model1.add(Dense(y_train_categorical.shape[1], activation='softmax'))
model1.summary()


  super().__init__(**kwargs)


In [21]:
model1.compile(loss='binary_crossentropy',optimizer='adam',metrics = ['accuracy'])

In [23]:
model1.fit(X_test_padded,y_test_categorical,batch_size=256,epochs =5)

Epoch 1/5
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 2s/step - accuracy: 0.1738 - loss: 0.5114
Epoch 2/5
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 2s/step - accuracy: 0.2210 - loss: 0.2392
Epoch 3/5
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 2s/step - accuracy: 0.2059 - loss: 0.2374
Epoch 4/5
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 2s/step - accuracy: 0.2102 - loss: 0.2355
Epoch 5/5
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 2s/step - accuracy: 0.2080 - loss: 0.2363


<keras.src.callbacks.history.History at 0x79480c05cf80>

In [24]:
model1.evaluate(X_test_padded,y_test_categorical)

[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 158ms/step - accuracy: 0.2191 - loss: 0.2360


[0.2365790754556656, 0.2175000011920929]

In [None]:
#Traditional ML Modelshowing higher accuracy