# Multi-class Text Sentiment Classification

## Objective
This notebook demonstrates the following:
- Building a multi-class sentiment classification model using neural networks.
- Performing exploratory data analysis (EDA) on a sentiment dataset.
- Preprocessing text data.
- Using pre-trained embeddings (GloVe and Word2Vec).
- Evaluating the model using accuracy, precision, recall, and F1 score.
- Providing an interactive prompt for user input to predict sentiments.

## 1. Load Libraries and Data
Import all the libraries needed for data analysis, preprocessing, and model building, and download the NLTK resources used during preprocessing.


In [None]:
# General Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from wordcloud import WordCloud

# NLP Libraries
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# TensorFlow/Keras Libraries
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler, ModelCheckpoint
from tensorflow.keras.layers import Bidirectional, Input
from tensorflow.keras.optimizers import Adam

# Word Embeddings
from gensim.models import KeyedVectors
# Word Embeddings
from gensim.models import KeyedVectors

# Initialize NLTK Resources
# - punkt / punkt_tab: tokenizer data used by word_tokenize. Newer NLTK releases
#   look up 'punkt_tab' instead of 'punkt'; requesting both keeps the notebook
#   runnable on old and new versions.
# - wordnet: dictionary used by WordNetLemmatizer.
# - stopwords: word lists used by stopwords.words('english').
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('stopwords')


## 2. Load and Explore Data
Load the dataset and explore its structure.

In [None]:
def load_data():
    """Read the training, test and validation CSV files.

    Returns:
        A ``(train, test, validation)`` tuple of DataFrames, read in that
        order from the ``dataset_twitter`` directory.
    """
    csv_paths = (
        '../Sentimental Analysis/dataset_twitter/training.csv',
        '../Sentimental Analysis/dataset_twitter/test.csv',
        '../Sentimental Analysis/dataset_twitter/validation.csv',
    )
    train_df, test_df, validation_df = (pd.read_csv(path) for path in csv_paths)
    return train_df, test_df, validation_df

# Read the three dataset splits into module-level DataFrames used by every later cell.
train_data, test_data, validation_data = load_data()

# Display a sample of the training data
train_data.head()


## 3. Data Preprocessing
 - Cleaning the text data.
 - Removing rows whose label is `Irrelevant`.
 - Mapping sentiment labels to numerical values.

In [None]:
# Initialize Lemmatizer and Stopwords
# Both objects are module-level so preprocess_text can reuse them for every row.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))  # set => O(1) membership tests per token

# Contraction mapping
# Keys are contraction suffixes that preprocess_text uses as regex patterns
# (each followed by a word boundary); values are the replacement words.
# The dict is iterated in insertion order, so "n't" is handled before the
# shorter "'t" entry, which then only catches leftovers.
CONTRACTIONS = {
    "n't": "not",
    "'re": "are",
    "'s": "is",
    "'d": "would",
    "'ll": "will",
    "'t": "not",
    "'ve": "have",
    "'m": "am"
}


In [None]:
# Function to preprocess text
def preprocess_text(text):
    """Normalize a raw text into a space-separated string of lemmatized tokens.

    Steps: lowercase, expand contraction suffixes, strip URLs / e-mail
    addresses / non-ASCII characters / digits / punctuation, collapse
    whitespace, tokenize, drop stop words and lemmatize.

    Args:
        text: The raw text. Non-string values are converted with ``str``;
            missing values (``None`` or float NaN) yield an empty string.

    Returns:
        The cleaned text as a single string (possibly empty).
    """
    # Missing values carry no text; without this guard str() would turn them
    # into the literal tokens "none" / "nan".
    if text is None or (isinstance(text, float) and np.isnan(text)):
        return ""
    if not isinstance(text, str):
        text = str(text)

    # Convert text to lowercase
    text = text.lower()

    # Expand contractions. The replacement is prefixed with a space so that
    # "don't" becomes "do not" rather than the fused token "donot"; surplus
    # whitespace is collapsed further down.
    # NOTE(review): NLTK's English stop-word list appears to include negations
    # such as "not", so the expanded word is dropped again below — confirm
    # whether negations should be kept for sentiment classification.
    for contraction, expanded in CONTRACTIONS.items():
        text = re.sub(rf"{contraction}\b", f" {expanded}", text)

    # Remove URLs, emails, non-ASCII characters, numbers, punctuation, and extra whitespaces
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)
    text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()

    # Tokenize, lemmatize, and remove stopwords
    tokenized = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokenized if word not in stop_words]

    return " ".join(tokens)

# Apply preprocessing
train_data['clean_text'] = train_data['text'].apply(preprocess_text)
test_data['clean_text'] = test_data['text'].apply(preprocess_text)
validation_data['clean_text'] = validation_data['text'].apply(preprocess_text)

# Remove irrelevant labels from every split. The test split is filtered too:
# 'Irrelevant' has no entry in the label -> class-id mapping, so leaving such
# rows in would produce NaN targets at evaluation time. `.copy()` detaches the
# filtered frames from the originals so later column assignments do not
# trigger pandas' SettingWithCopyWarning.
train_data = train_data[train_data['label'] != 'Irrelevant'].copy()
test_data = test_data[test_data['label'] != 'Irrelevant'].copy()
validation_data = validation_data[validation_data['label'] != 'Irrelevant'].copy()

In [None]:
# Map sentiment label names to integer class ids (Negative=0, Neutral=1, Positive=2)
# and store them in a new 'emotion' column on each split.
# NOTE(review): pandas' Series.map yields NaN for any label that is not a key of
# labels_dict — confirm every split contains only these three labels at this point.
labels_dict = {"Positive": 2, "Neutral": 1, "Negative": 0}
train_data['emotion'] = train_data['label'].map(labels_dict)
test_data['emotion'] = test_data['label'].map(labels_dict)
validation_data['emotion'] = validation_data['label'].map(labels_dict)

In [None]:
# Inspect the training frame after cleaning and label mapping.
train_data

## 4. Exploratory Data Analysis (EDA)
Visualize the data distribution and explore frequent words.

In [None]:
from collections import Counter

def eda(data):
    """Perform basic exploratory data analysis.

    Prints the number of records and the per-label counts, then plots the 25
    most frequent words of ``data['clean_text']`` as a bar chart and renders a
    word cloud of the same text.

    Args:
        data: DataFrame with a 'label' column and a 'clean_text' column of
            strings.

    Returns:
        None. If the cleaned text contains no words, the plots are skipped.
    """
    print(f"Total records: {len(data)}")
    print(data['label'].value_counts())

    # Word Frequency Analysis
    all_text = " ".join(data['clean_text'])
    word_freq = Counter(all_text.split()).most_common(25)

    # With no words, unpacking zip(*word_freq) below would raise ValueError and
    # there is nothing to draw, so stop after the textual summary.
    if not word_freq:
        print("No words found in 'clean_text'; skipping plots.")
        return

    words, counts = zip(*word_freq)

    # Plot Word Frequencies
    plt.figure(figsize=(12, 6))
    plt.bar(words, counts, color='gray')
    plt.title('Top 25 Most Frequent Words')
    plt.xticks(rotation=90)
    plt.xlabel('Words')
    plt.ylabel('Frequency')
    plt.grid(True, which='both', linestyle='--', linewidth=0.5, color='grey', alpha=0.7)
    plt.tight_layout()
    plt.show()

    # Word Cloud
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_text)
    plt.figure(figsize=(10, 6))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.title("Word Cloud of Text Data")
    plt.show()

# Apply EDA
# Runs on the training split only, after preprocessing and label filtering.
eda(train_data)


## 5. Build and Train Neural Network


### 5.1 TF-IDF Features
Generate TF-IDF features from the cleaned text data.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def compute_tfidf(train_texts, test_texts, val_texts, max_features=10000):
    """Turn the three text splits into dense TF-IDF feature matrices.

    The vectorizer (unigrams and bigrams, English stop words removed) is
    fitted on the training texts only and then applied to the test and
    validation texts, so no vocabulary or IDF statistics leak from them.

    Args:
        train_texts: Iterable of training documents (strings).
        test_texts: Iterable of test documents.
        val_texts: Iterable of validation documents.
        max_features: Maximum vocabulary size passed to ``TfidfVectorizer``.
            The argument used to be ignored in favour of a hard-coded 10000;
            it is now honoured, and its default is 10000 so calls that omit
            it produce the same features as before.

    Returns:
        ``(X_train_tfidf, X_test_tfidf, X_val_tfidf)`` as dense arrays.
    """
    vectorizer = TfidfVectorizer(max_features=max_features, stop_words='english', ngram_range=(1, 2))
    # .toarray() densifies the sparse matrices because the dense Keras model
    # below is fed plain arrays; memory grows with rows * max_features.
    X_train_tfidf = vectorizer.fit_transform(train_texts).toarray()
    X_test_tfidf = vectorizer.transform(test_texts).toarray()
    X_val_tfidf = vectorizer.transform(val_texts).toarray()
    return X_train_tfidf, X_test_tfidf, X_val_tfidf

# Apply TF-IDF
# The cleaned text column of each split is the input document collection.
train_texts = train_data['clean_text']
test_texts = test_data['clean_text']
val_texts = validation_data['clean_text']

# Called without max_features, so the function's own vocabulary size applies.
X_train_tfidf, X_test_tfidf, X_val_tfidf = compute_tfidf(train_texts, test_texts, val_texts)


In [None]:
# Extract the integer class ids (the 'emotion' column) of each split as arrays;
# these are the targets passed to model.fit / evaluate_model.
train_labels = train_data['emotion'].values
validation_labels = validation_data['emotion'].values
test_labels = test_data['emotion'].values


### 5.2 Build and Train Dense Model

In [None]:
def build_dense_model(input_dim, num_classes):
    """Create and compile a feed-forward classifier.

    Architecture: three ReLU hidden layers (512, 256, 128 units), each
    followed by 50% dropout, and a softmax output layer.

    Args:
        input_dim: Number of input features per sample.
        num_classes: Number of output classes.

    Returns:
        A compiled Keras ``Sequential`` model (Adam optimizer, sparse
        categorical cross-entropy loss, accuracy metric).
    """
    network = Sequential()
    network.add(Input(shape=(input_dim,)))

    # Hidden stack: every dense layer is paired with a dropout layer.
    for units in (512, 256, 128):
        network.add(Dense(units, activation='relu'))
        network.add(Dropout(0.5))

    network.add(Dense(num_classes, activation='softmax'))

    # Sparse loss: targets are integer class ids, not one-hot vectors.
    network.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return network

# Build and Train Model
# The input width is the number of TF-IDF columns; one output unit per sentiment class.
input_dim = X_train_tfidf.shape[1]
num_classes = len(labels_dict)
model = build_dense_model(input_dim, num_classes)

# Train with the validation split monitored each epoch. Early stopping halts
# training once val_loss has not improved for 3 consecutive epochs.
history = model.fit(
    X_train_tfidf, train_labels,
    validation_data=(X_val_tfidf, validation_labels),
    epochs=5,
    batch_size=32,
    callbacks=[EarlyStopping(monitor='val_loss', patience=3)],
    verbose=1
)


### 5.3 Evaluate the Model 
Evaluate performance using accuracy, precision, recall, and F1-score.


In [None]:
def evaluate_model(model, X_test, y_test, class_names=("Negative", "Neutral", "Positive")):
    """Print accuracy, macro precision/recall/F1 and a per-class report.

    Args:
        model: Trained model whose ``predict`` returns one row of class
            probabilities per sample.
        X_test: Feature matrix accepted by ``model.predict``.
        y_test: True integer class ids.
        class_names: Display name for each class id, in id order (index 0 is
            the name of class 0, and so on). The default follows the notebook's
            label encoding: Negative=0, Neutral=1, Positive=2.

    Returns:
        None. All results are printed.
    """
    predictions = model.predict(X_test)
    predicted_classes = np.argmax(predictions, axis=1)

    accuracy = accuracy_score(y_test, predicted_classes)
    # zero_division=0 matches the detailed report below and avoids warnings
    # when a class is never predicted.
    precision = precision_score(y_test, predicted_classes, average="macro", zero_division=0)
    recall = recall_score(y_test, predicted_classes, average="macro", zero_division=0)
    f1 = f1_score(y_test, predicted_classes, average="macro", zero_division=0)

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

    # Detailed Report
    # target_names are matched to class ids in ascending order, so the names
    # must be listed as id 0, 1, 2. Passing `labels` explicitly pins that
    # pairing even if some class is absent from y_test / the predictions.
    print("Classification Report:")
    print(classification_report(
        y_test,
        predicted_classes,
        labels=list(range(len(class_names))),
        target_names=list(class_names),
        zero_division=0,
    ))

# Evaluate
# Scores the dense TF-IDF model on the held-out test split.
evaluate_model(model, X_test_tfidf, test_labels)


### 5.4 Using GloVe Pre-trained Embeddings

In [None]:
def load_glove_embeddings(embedding_file, embedding_dim):
    """Load word vectors from a GloVe-style text file.

    Each line is expected to hold a token followed by ``embedding_dim``
    whitespace-separated numbers.

    Args:
        embedding_file: Path to the UTF-8 encoded vectors file.
        embedding_dim: Number of vector components per line.

    Returns:
        Dict mapping each token to a float32 NumPy vector of length
        ``embedding_dim``. Blank lines are ignored; lines that do not contain
        ``embedding_dim`` numeric fields are skipped and counted.
    """
    embeddings_index = {}
    skipped = 0
    with open(embedding_file, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
            # Split from the right so exactly `embedding_dim` fields form the
            # vector; whatever remains on the left is the token, which may
            # itself contain spaces in some vector files.
            values = line.rsplit(None, embedding_dim)
            if len(values) != embedding_dim + 1:
                skipped += 1
                continue
            word = values[0]
            try:
                vector = np.asarray(values[1:], dtype='float32')
            except ValueError:
                # Non-numeric component: treat the line as malformed.
                skipped += 1
                continue
            embeddings_index[word] = vector
    print(f"Loaded {len(embeddings_index)} word vectors.")
    if skipped:
        print(f"Skipped {skipped} malformed line(s).")
    return embeddings_index


In [None]:
# Load GloVe Embeddings
# embedding_dim is set to 50 alongside the "50d" vectors file; it is also the
# Embedding layer's output size further down.
embedding_dim = 50
glove_file = "../Sentimental Analysis/glove/glove.6B.50d.txt"
embeddings_index = load_glove_embeddings(glove_file, embedding_dim)

def create_embedding_matrix(tokenizer, embeddings_index, embedding_dim):
    """Build the weight matrix for an Embedding layer from pre-trained vectors.

    Row ``i`` holds the vector of the word whose tokenizer index is ``i``;
    words without a pre-trained vector (and the padding row 0) stay all-zero.

    Args:
        tokenizer: Fitted tokenizer exposing ``word_index`` (word -> index)
            and, optionally, ``num_words``.
        embeddings_index: Dict mapping words to vectors of length
            ``embedding_dim``.
        embedding_dim: Size of each vector.

    Returns:
        Array of shape ``(num_rows, embedding_dim)``. ``num_rows`` is
        ``tokenizer.num_words`` when that is set, otherwise
        ``len(word_index) + 1``.
    """
    word_index = tokenizer.word_index

    # A tokenizer limited with num_words only ever emits indices below that
    # limit, and the Embedding layer is created with input_dim equal to it.
    # The matrix must therefore have exactly num_words rows; sizing it from
    # the full word_index makes the layer reject the weights (shape mismatch).
    num_words = getattr(tokenizer, 'num_words', None)
    num_rows = num_words if num_words else len(word_index) + 1

    embedding_matrix = np.zeros((num_rows, embedding_dim))
    for word, i in word_index.items():
        if i >= num_rows:
            # Outside the kept vocabulary; never produced by the tokenizer.
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix


In [None]:
# Tokenize and Create Embedding Matrix
max_len = 50        # sequence length used by every pad_sequences call below
vocab_size = 10000  # tokenizer num_words and Embedding input_dim

# Tokenizer
# Fitted on the training text only; words outside the kept vocabulary are
# represented by the "<OOV>" token.
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(train_data['clean_text'])

embedding_matrix = create_embedding_matrix(tokenizer, embeddings_index, embedding_dim)

In [None]:
def build_lstm_model_with_glove(vocab_size, embedding_dim, input_length, num_classes, embedding_matrix):
    """Create and compile an LSTM classifier on top of frozen pre-trained embeddings.

    Despite the name, the function only depends on the supplied matrix, so it
    works with any pre-trained vectors (the notebook also uses it for
    Word2Vec).

    Args:
        vocab_size: Number of rows of the embedding table (``input_dim``).
        embedding_dim: Size of each embedding vector.
        input_length: Length of the padded input sequences.
        num_classes: Number of output classes.
        embedding_matrix: Initial embedding weights.

    Returns:
        A compiled Keras ``Sequential`` model (Adam optimizer, sparse
        categorical cross-entropy loss, accuracy metric).
    """
    # The embedding weights are loaded from the matrix and kept fixed.
    frozen_embedding = Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        input_length=input_length,
        trainable=False,
    )

    network = Sequential([
        frozen_embedding,
        LSTM(128, return_sequences=False),
        Dropout(0.5),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(num_classes, activation='softmax'),
    ])
    network.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return network


In [None]:
# Build LSTM Model with GloVe
model_glove = build_lstm_model_with_glove(vocab_size, embedding_dim, max_len, num_classes, embedding_matrix)

# Texts are converted to token-id sequences and padded/truncated at the end
# ('post') to max_len, for both the training and the validation split.
# Early stopping halts training once val_loss has not improved for 3 epochs.
history_glove = model_glove.fit(
    pad_sequences(tokenizer.texts_to_sequences(train_data['clean_text']), maxlen=max_len, padding='post', truncating='post'),
    train_labels,
    validation_data=(pad_sequences(tokenizer.texts_to_sequences(validation_data['clean_text']), maxlen=max_len, padding='post', truncating='post'), validation_labels),
    epochs=5,
    batch_size=32,
    callbacks=[EarlyStopping(monitor='val_loss', patience=3)],
    verbose=1
)

### 5.5 Using Word2Vec Pre-trained Embeddings

In [None]:
# Load Word2Vec Pre-trained Embeddings
# The path is relative to the notebook's working directory; binary=True selects
# the binary word2vec file format.
word2vec_file = "GoogleNews-vectors-negative300.bin"
word2vec = KeyedVectors.load_word2vec_format(word2vec_file, binary=True)

# %%
def create_word2vec_embedding_matrix(tokenizer, word2vec):
    """Build the weight matrix for an Embedding layer from Word2Vec vectors.

    Row ``i`` holds the vector of the word whose tokenizer index is ``i``;
    words missing from ``word2vec`` (and the padding row 0) stay all-zero.

    Args:
        tokenizer: Fitted tokenizer exposing ``word_index`` (word -> index)
            and, optionally, ``num_words``.
        word2vec: Keyed vectors supporting ``word in word2vec``,
            ``word2vec[word]`` and ``vector_size``.

    Returns:
        Array of shape ``(num_rows, word2vec.vector_size)``. ``num_rows`` is
        ``tokenizer.num_words`` when that is set, otherwise
        ``len(word_index) + 1``.
    """
    word_index = tokenizer.word_index
    embedding_dim = word2vec.vector_size

    # A tokenizer limited with num_words only ever emits indices below that
    # limit, and the Embedding layer is created with input_dim equal to it.
    # The matrix must therefore have exactly num_words rows; sizing it from
    # the full word_index makes the layer reject the weights (shape mismatch).
    num_words = getattr(tokenizer, 'num_words', None)
    num_rows = num_words if num_words else len(word_index) + 1

    embedding_matrix = np.zeros((num_rows, embedding_dim))
    for word, i in word_index.items():
        if i >= num_rows:
            # Outside the kept vocabulary; never produced by the tokenizer.
            continue
        if word in word2vec:
            embedding_matrix[i] = word2vec[word]
    return embedding_matrix

# Create Word2Vec Embedding Matrix
# Reuses the tokenizer fitted on the training text, so row indices line up
# with the token ids fed to the model.
embedding_matrix_word2vec = create_word2vec_embedding_matrix(tokenizer, word2vec)

In [None]:
# Build LSTM Model with Word2Vec
# Same architecture builder as the GloVe model; only the embedding size and
# the weight matrix differ.
model_word2vec = build_lstm_model_with_glove(vocab_size, word2vec.vector_size, max_len, num_classes, embedding_matrix_word2vec)

# Inputs are token-id sequences padded/truncated at the end ('post') to max_len.
# Early stopping halts training once val_loss has not improved for 3 epochs.
history_word2vec = model_word2vec.fit(
    pad_sequences(tokenizer.texts_to_sequences(train_data['clean_text']), maxlen=max_len, padding='post', truncating='post'),
    train_labels,
    validation_data=(pad_sequences(tokenizer.texts_to_sequences(validation_data['clean_text']), maxlen=max_len, padding='post', truncating='post'), validation_labels),
    epochs=5,
    batch_size=32,
    callbacks=[EarlyStopping(monitor='val_loss', patience=3)],
    verbose=1
)

## 6. Save and Load the Best Model
Save the best-performing model during training and load it for future predictions.


In [None]:
# Define a checkpoint to save the best model
# save_best_only=True: the file is only overwritten when val_loss improves.
checkpoint_path = "best_dense_model.h5"
checkpoint = ModelCheckpoint(filepath=checkpoint_path, monitor='val_loss', save_best_only=True, verbose=1)

# Add the checkpoint to callbacks
# Early stopping also restores the best weights into the in-memory model.
callbacks = [
    EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True),
    checkpoint
]

# Train the dense model with the checkpoint
# NOTE: this rebinds `model` and `history`, replacing the dense model trained earlier.
model = build_dense_model(input_dim, num_classes)

history = model.fit(
    X_train_tfidf, train_labels,
    validation_data=(X_val_tfidf, validation_labels),
    epochs=10,
    batch_size=32,
    callbacks=callbacks,
    verbose=1
)

### 6.1 Load the Saved Model


In [None]:
# Load the best saved model
# This is the dense network trained on TF-IDF features, read back from the
# checkpoint file written during training.
best_model = load_model(checkpoint_path)
print("Best model loaded successfully.")

## 7. User Interaction for Predictions
Allow users to input custom text for predictions using the saved model.


In [None]:
def predict_user_text(text, model, tokenizer, max_len):
    """Predict the sentiment label of a single piece of text.

    The text is cleaned with ``preprocess_text``, converted to a token-id
    sequence, padded/truncated at the end to ``max_len`` and passed to
    ``model.predict``.

    Args:
        text: Raw input text.
        model: Model that accepts padded token-id sequences of length ``max_len``.
        tokenizer: Fitted tokenizer used to convert text to token ids.
        max_len: Length of the padded sequence.

    Returns:
        The key of ``labels_dict`` whose class id has the highest predicted
        probability.
    """
    cleaned = preprocess_text(text)
    padded = pad_sequences(
        tokenizer.texts_to_sequences([cleaned]),
        maxlen=max_len,
        padding='post',
        truncating='post',
    )

    class_probabilities = model.predict(padded)
    best_class_id = np.argmax(class_probabilities, axis=1)[0]

    # Invert name -> id into id -> name to translate the winning class id.
    id_to_name = {class_id: name for name, class_id in labels_dict.items()}
    return id_to_name[best_class_id]

In [None]:
# Example usage
# predict_user_text feeds padded token-id sequences of length max_len to the
# model, so it needs one of the sequence (LSTM) models trained above.
# `best_model` is the dense network trained on TF-IDF feature vectors; its
# input width differs from max_len, so passing it here fails at predict time.
while True:
    user_input = input("Enter a sentence for sentiment prediction (or type 'exit' to quit): ")
    # Tolerate surrounding whitespace around the exit command.
    if user_input.strip().lower() == 'exit':
        break
    # Nothing to classify on an empty line; ask again.
    if not user_input.strip():
        continue
    sentiment = predict_user_text(user_input, model_glove, tokenizer, max_len)
    print(f"Predicted Sentiment: {sentiment}")