# Problem Statement:
You are a Data Scientist in a big firm. You have to develop a deep learning model to perform sentiment analysis on a dataset of tweets related to various candidates.

# Tasks to be Performed:

### Data Loading and Preprocessing:

● Load the tweet data from a CSV file.

● Filter out the relevant columns: 'candidate', 'sentiment', and 'text'.

● Preprocess the text data by removing stop words, punctuation, converting to lowercase, and other cleaning steps.

## Text Vectorization:

Convert the preprocessed text data into numerical format using tokenization and padding, so that it can be fed into a deep learning model.

### Model Development:

Develop a deep learning model using TensorFlow and Keras. The model includes an Embedding layer, a SpatialDropout1D layer to prevent overfitting, an LSTM layer for sequence data processing, and a Dense layer for output. It aims to classify the sentiment of each tweet into one of the three categories.

### Model Training and Evaluation:
● Train the model on the processed text data, using categorical cross-entropy as the loss function, and accuracy as the evaluation metric.

● Use a validation split to evaluate the model's performance and prevent overfitting.

# 1. Load Libraries

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import re
import string

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SpatialDropout1D, LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam


import warnings
warnings.filterwarnings("ignore")

  if not hasattr(np, "object"):


In [2]:
nltk.download('stopwords', quiet=True)
stop_words = set(stopwords.words('english'))
nltk.download('punkt', quiet=True)

True

# 2. Load Data

In [3]:
df = pd.read_csv(r"Downloads\Tweets.csv")
df.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [4]:
#Select Required Columns
df_filtered = df[['name', 'airline_sentiment', 'text']].copy()
df_filtered.columns = ['candidate', 'sentiment', 'text']
df_filtered['candidate'] = df_filtered['candidate'].astype(str)


print("Filtered columns:", df_filtered.columns.tolist())
print("Dataset shape:", df_filtered.shape)
print("\nSentiment distribution:")
print(df_filtered['sentiment'].value_counts())

Filtered columns: ['candidate', 'sentiment', 'text']
Dataset shape: (14640, 3)

Sentiment distribution:
sentiment
negative    9178
neutral     3099
positive    2363
Name: count, dtype: int64


In [5]:
df_filtered.head()

Unnamed: 0,candidate,sentiment,text
0,cairdin,neutral,@VirginAmerica What @dhepburn said.
1,jnardino,positive,@VirginAmerica plus you've added commercials t...
2,yvonnalynn,neutral,@VirginAmerica I didn't today... Must mean I n...
3,jnardino,negative,@VirginAmerica it's really aggressive to blast...
4,jnardino,negative,@VirginAmerica and it's a really big bad thing...


# 3. Preprocessing

In [6]:
# Drop Missing Values
df_filtered = df_filtered.dropna(subset=['text', 'sentiment'])
df_filtered.head()

Unnamed: 0,candidate,sentiment,text
0,cairdin,neutral,@VirginAmerica What @dhepburn said.
1,jnardino,positive,@VirginAmerica plus you've added commercials t...
2,yvonnalynn,neutral,@VirginAmerica I didn't today... Must mean I n...
3,jnardino,negative,@VirginAmerica it's really aggressive to blast...
4,jnardino,negative,@VirginAmerica and it's a really big bad thing...


In [7]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)
    text = re.sub(r"\d+", "", text)
    text = text.translate(str.maketrans("", "", string.punctuation))
    text = " ".join([word for word in text.split() if word not in stop_words])
    return text

In [8]:
df_filtered['clean_text'] = df_filtered['text'].apply(clean_text)
df_filtered.head()

Unnamed: 0,candidate,sentiment,text,clean_text
0,cairdin,neutral,@VirginAmerica What @dhepburn said.,virginamerica dhepburn said
1,jnardino,positive,@VirginAmerica plus you've added commercials t...,virginamerica plus youve added commercials exp...
2,yvonnalynn,neutral,@VirginAmerica I didn't today... Must mean I n...,virginamerica didnt today must mean need take ...
3,jnardino,negative,@VirginAmerica it's really aggressive to blast...,virginamerica really aggressive blast obnoxiou...
4,jnardino,negative,@VirginAmerica and it's a really big bad thing...,virginamerica really big bad thing


In [9]:
# Label Encoding
label_encoder = LabelEncoder()
df_filtered['label'] = label_encoder.fit_transform(df_filtered['sentiment'])
labels = to_categorical(df_filtered['label'])
print("Classes:", label_encoder.classes_)

Classes: ['negative' 'neutral' 'positive']


# 4. Train Test Split

In [10]:
X_train, X_test, y_train, y_test = train_test_split(df_filtered['clean_text'], labels, test_size=0.2, random_state=42)

# 5. Tokenization & Padding

In [11]:
max_features = 10000
maxlen = 100

tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=maxlen)
X_test_pad = pad_sequences(X_test_seq, maxlen=maxlen)

print(f"Training sequences shape: {X_train_pad.shape}")
print(f"Vocabulary size: {len(tokenizer.word_index)}")

Training sequences shape: (11712, 100)
Vocabulary size: 12100


# 6. Hyperparameter Tuning

In [12]:
embed_dim = 128
lstm_out = 128

def build_model(learning_rate):
    model = Sequential([
    Embedding(max_features, embed_dim, input_length=maxlen),
    SpatialDropout1D(0.4),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(3, activation='softmax')])
    
    optimizer = Adam(learning_rate=learning_rate)
    
    model.compile(
        optimizer=optimizer,
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )
    
    return model

In [13]:
learning_rates = [0.05, 0.01, 0.001]
batch_sizes = [128, 64, 32]
epochs_list = [20, 10]

In [14]:
results = []

for lr in learning_rates:
    for batch in batch_sizes:
        for epochs in epochs_list:
            
            print(f"Training with LR={lr}, Batch={batch}, Epochs={epochs}")
            
            model = build_model(lr)
            early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

            
            model.fit(X_train_pad, y_train, epochs=epochs,
                    batch_size=batch, validation_split=0.2,
                    callbacks=[early_stopping], verbose=0)
            
            _, test_acc = model.evaluate(X_test_pad, y_test, verbose=0)
            
            results.append({
                "learning_rate": lr,
                "batch_size": batch,
                "epochs": epochs,
                "test_accuracy": test_acc})

Training with LR=0.05, Batch=128, Epochs=20
Training with LR=0.05, Batch=128, Epochs=10
Training with LR=0.05, Batch=64, Epochs=20
Training with LR=0.05, Batch=64, Epochs=10
Training with LR=0.05, Batch=32, Epochs=20
Training with LR=0.05, Batch=32, Epochs=10
Training with LR=0.01, Batch=128, Epochs=20
Training with LR=0.01, Batch=128, Epochs=10
Training with LR=0.01, Batch=64, Epochs=20
Training with LR=0.01, Batch=64, Epochs=10
Training with LR=0.01, Batch=32, Epochs=20
Training with LR=0.01, Batch=32, Epochs=10
Training with LR=0.001, Batch=128, Epochs=20
Training with LR=0.001, Batch=128, Epochs=10
Training with LR=0.001, Batch=64, Epochs=20
Training with LR=0.001, Batch=64, Epochs=10
Training with LR=0.001, Batch=32, Epochs=20
Training with LR=0.001, Batch=32, Epochs=10


In [15]:
results_df = pd.DataFrame(results)
results_df.sort_values(by="test_accuracy", ascending=False)

Unnamed: 0,learning_rate,batch_size,epochs,test_accuracy
16,0.001,32,20,0.795424
11,0.01,32,10,0.793033
9,0.01,64,10,0.789617
10,0.01,32,20,0.789617
6,0.01,128,20,0.789276
13,0.001,128,10,0.788934
12,0.001,128,20,0.788934
17,0.001,32,10,0.786885
15,0.001,64,10,0.785519
7,0.01,128,10,0.784153


Lets Select the Best Combination for our Model. learning_rate 0.001, batch_size 32, epochs 20

# 6. Build LSTM Model

In [17]:
embed_dim = 128
lstm_out = 128

model = Sequential([
    Embedding(max_features, embed_dim, input_length=maxlen),
    SpatialDropout1D(0.4),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(3, activation='softmax')
])

# 7. Compile The Model

In [18]:
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

model.summary()

# 8. Train the Model

In [19]:
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model.fit(X_train_pad, y_train, 
                   epochs=20, 
                   batch_size=32,
                   validation_split=0.2,
                   callbacks=[early_stopping],
                   verbose=1)

Epoch 1/20
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m119s[0m 374ms/step - accuracy: 0.7009 - loss: 0.7239 - val_accuracy: 0.7584 - val_loss: 0.6048
Epoch 2/20
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m101s[0m 344ms/step - accuracy: 0.8197 - loss: 0.4626 - val_accuracy: 0.7810 - val_loss: 0.5549
Epoch 3/20
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m101s[0m 344ms/step - accuracy: 0.8754 - loss: 0.3362 - val_accuracy: 0.7776 - val_loss: 0.5707
Epoch 4/20
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m101s[0m 344ms/step - accuracy: 0.9090 - loss: 0.2525 - val_accuracy: 0.7704 - val_loss: 0.6581
Epoch 5/20
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m101s[0m 343ms/step - accuracy: 0.9298 - loss: 0.1969 - val_accuracy: 0.7610 - val_loss: 0.7174


# 9. Evaluation

In [26]:
test_loss, test_acc = model.evaluate(X_test_pad, y_test, verbose=0)
print(f"\nTest Accuracy: {test_acc:.4f}")
print(f"Test Loss: {test_loss:.4f}")


Test Accuracy: 0.7838
Test Loss: 0.5331


# 10. Predict Sentiment for Test Set

In [27]:
# Predictions on test set
y_pred = model.predict(X_test_pad)
y_pred_classes = np.argmax(y_pred, axis=1)
y_true_classes = np.argmax(y_test, axis=1)

[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 132ms/step


In [28]:
from sklearn.metrics import classification_report, confusion_matrix

print(classification_report(y_true_classes, y_pred_classes, target_names=label_encoder.classes_))

              precision    recall  f1-score   support

    negative       0.83      0.92      0.87      1889
     neutral       0.65      0.38      0.48       580
    positive       0.69      0.72      0.70       459

    accuracy                           0.78      2928
   macro avg       0.72      0.67      0.69      2928
weighted avg       0.77      0.78      0.77      2928



# 11. Predict Sentiment for a New Tweet

In [29]:
sample_texts = [
    "great flight on time comfortable seats",
    "delayed again terrible service",
    "average experience nothing special"
]

In [31]:
sample_clean = [clean_text(text) for text in sample_texts]
sample_seq = tokenizer.texts_to_sequences(sample_clean)
sample_pad = pad_sequences(sample_seq, maxlen=maxlen)
sample_pred = model.predict(sample_pad)
sample_results = label_encoder.inverse_transform(np.argmax(sample_pred, axis=1))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 209ms/step


In [32]:
for text, pred in zip(sample_texts, sample_results):
    print(f"'{text}' -> {pred}")

'great flight on time comfortable seats' -> positive
'delayed again terrible service' -> negative
'average experience nothing special' -> negative
