# Sentiment Analysis
---

## Loading necessary libraries.

In [None]:
%pip install nltk
%pip install scikit-learn
%pip install numpy
%pip install python-utils
%pip install gensim

In [None]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from nltk.corpus import twitter_samples
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
import numpy as np
import re
import nltk
import string
import pandas as pd
from nltk.corpus import twitter_samples, stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

##  Traditional Sentiment Analysis approach


### Preprocessing the dataset



Tokenization after normalization

In [None]:
# Fetch the NLTK resources used below: the sample tweet corpus and the stopword lists
for resource in ('twitter_samples', 'stopwords'):
    nltk.download(resource)

# List the files shipped with the twitter_samples corpus
print(twitter_samples.fileids())

In [None]:
def process_tweet(tweet):
    """
    Clean, tokenize, and stem a single tweet.

    The tweet is stripped of stock tickers, a leading "RT" marker, hyperlinks,
    and '#' characters, then tokenized (lower-cased, @handles removed,
    elongated character runs shortened). English stopwords and punctuation
    tokens are dropped and every remaining token is Porter-stemmed.

    Input:
        tweet: a string containing a tweet.
    Output:
        tweets_clean: a list of words containing the processed tweet.

    """
    stemmer = PorterStemmer()
    # A set gives constant-time membership tests; the corpus reader returns a
    # list, which would otherwise be scanned linearly for every token.
    stopwords_english = set(stopwords.words('english'))
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?://[^\s\n\r]+', '', tweet)
    # keep hashtag words, only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    # `string.punctuation` is a str, so the `in` test below is a substring
    # check: it drops single punctuation marks as well as runs such as "()".
    tweets_clean = [
        stemmer.stem(word)
        for word in tweet_tokens
        if word not in stopwords_english and word not in string.punctuation
    ]

    return tweets_clean

In [None]:
# Pull the raw tweet texts for each polarity out of the NLTK corpus
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')

# Positive tweets come first, negative tweets second
all_tweets = [*positive_tweets, *negative_tweets]

# Matching labels in the same order: 1 for positive, 0 for negative
positive_labels = [1 for _ in positive_tweets]
negative_labels = [0 for _ in negative_tweets]

all_labels = [*positive_labels, *negative_labels]


In [None]:
# Load the emotion dataset. The CSV file must be located in the same directory as this notebook.
emotion_data = pd.read_csv('combined_emotion.csv')

# Keep only the two emotions used as sentiment classes: 'joy' is positive, 'sad' is negative
positive_emotions = emotion_data.loc[emotion_data['emotion'] == 'joy']
negative_emotions = emotion_data.loc[emotion_data['emotion'] == 'sad']

# Stack them with the positive rows first, then preview the result
all_emotions = pd.concat([positive_emotions, negative_emotions])
print(all_emotions.head())

# Labels follow the same positive-then-negative order as `all_emotions`
pos_emotion_labels = [1] * len(positive_emotions)
neg_emotion_labels = [0] * len(negative_emotions)

all_emotion_labels = pos_emotion_labels + neg_emotion_labels


Create DataFrame for visualization

In [None]:
def create_dataframe(tweets, labels):
    """
    Create DataFrame to visualize the dataset.

    Input:
        tweets: a list containing tweet texts.
        labels: a list containing the label of each tweet
                (1 for positive, 0 for negative), in the same order as `tweets`.
    Output:
        A DataFrame with 2 columns, 'tweets' and 'labels'.
    Raises:
        ValueError: if `tweets` and `labels` do not have the same length.

    """
    if len(tweets) != len(labels):
        # Report both lengths so a mismatch can be diagnosed from the traceback alone
        raise ValueError(
            "tweets and labels must have the same length "
            f"(got {len(tweets)} tweets and {len(labels)} labels)"
        )

    return pd.DataFrame({'tweets': tweets, 'labels': labels})

In [None]:
# Create DataFrame for tweets and labels
df = create_dataframe(all_tweets, all_labels)
# Preview the first rows to sanity-check the tweet/label pairing
print(df.head())

In [None]:
# Emotion counterpart of `df`: one row per sentence with its 1/0 label.
# `all_emotions` and `all_emotion_labels` are both ordered joy rows first, then sad rows, so they line up.
emotion_df = pd.DataFrame({'sentence': all_emotions['sentence'].tolist(), 'label': all_emotion_labels})
print(emotion_df.head())


In [None]:
# Token lists for the Word2Vec model: one cleaned, stemmed token list per tweet,
# in the same order as `all_tweets`
cleaned_tweets = [process_tweet(raw_tweet) for raw_tweet in all_tweets]

In [None]:
# Shared preprocessing tools for the emotion sentences.
# Running this cell may take over a minute.
stopwords_english = stopwords.words('english')
stemmer = PorterStemmer()

def process_text(text):
    """
    Tokenize and stem a sentence from the emotion dataset.

    Unlike `process_tweet`, no ticker/retweet/URL/hashtag cleanup is applied:
    the text is only tokenized (lower-cased, @handles stripped, elongated
    character runs shortened), filtered for English stopwords and punctuation,
    and Porter-stemmed.

    Input:
        text: a string containing one sentence.
    Output:
        a list of stemmed tokens.
    """
    tokens = TweetTokenizer(
        preserve_case=False, strip_handles=True, reduce_len=True
    ).tokenize(text)

    return [
        stemmer.stem(token)
        for token in tokens
        if not (token in stopwords_english or token in string.punctuation)
    ]

# Tokenize every sentence and keep the token lists both as a column and as a plain list
emotion_df['cleaned_text'] = emotion_df['sentence'].apply(process_text)
cleaned_emotions = emotion_df['cleaned_text'].tolist()


In [None]:
# Word Embeddings using Word2Vec model
# Trains 20-dimensional vectors on the cleaned tweet tokens; tokens seen fewer
# than 5 times (min_count=5) get no vector.
# NOTE(review): gensim documents that multi-threaded training (workers=4 here) is not
# exactly reproducible between runs, so downstream metrics may vary slightly - confirm if that matters.
word2vec_model = Word2Vec(cleaned_tweets, vector_size=20,
                          window=5, min_count=5, workers=4)

# Keyed word -> vector lookup of the trained model
word_embeddings = word2vec_model.wv

In [None]:
# Separate Word2Vec model trained only on the emotion sentences, with the same
# hyperparameters as the tweet model. Its vector space is independent of `word2vec_model`.
word2vec_emotion_model = Word2Vec(cleaned_emotions, vector_size=20,
                          window=5, min_count=5, workers=4)

# Keyed word -> vector lookup of the emotion model
emotion_embeddings = word2vec_emotion_model.wv

In [None]:
# Example using word embeddings
# Looks up the vector of the (already stemmed) token 'listen' in the tweet model;
# presumably raises KeyError if the token did not reach min_count - verify against gensim.
print(word_embeddings['listen'])

In [None]:
# Same lookup in the emotion model. This only works while `emotion_embeddings` still
# refers to the KeyedVectors; a later cell rebinds the name to a numpy array.
print(emotion_embeddings['listen'])

### Create tweet embeddings

Since each tweet consists of multiple words, convert each tweet into a single vector representation by averaging the Word2Vec embeddings of all the words in the tweet. This averaged vector will represent the tweet in a fixed-dimensional space, suitable for input into a classifier.

In [None]:
def get_embedding(tweet_tokens, word2vec_model):
    """
    Average the word vectors of a token list into one fixed-size vector.

    Input:
        tweet_tokens: a list of tokens from a processed tweet.
        word2vec_model: a trained Word2Vec model; its `wv` attribute supplies
                        the word vectors and `vector_size` their dimension.
    Output:
        tweet_embedding: a numpy array holding the element-wise mean of the
                        vectors of all tokens known to the model. When no
                        token is known (or the list is empty), a zero vector
                        of length `vector_size` is returned instead.

    """
    # Tokens missing from the vocabulary are skipped
    known_vectors = [
        word2vec_model.wv[token]
        for token in tweet_tokens
        if token in word2vec_model.wv
    ]

    if not known_vectors:
        return np.zeros(word2vec_model.vector_size)

    return np.mean(known_vectors, axis=0)

In [None]:
# Example of using the get_embedding function on a single tweet
tweet1 = df['tweets'][4]
tweet1_cleaned = process_tweet(tweet1)
print(tweet1_cleaned)
# Embed the cleaned token list: passing the raw string would make get_embedding
# iterate over single characters instead of words.
tweet1_embedding = get_embedding(tweet1_cleaned, word2vec_model)
print(tweet1_embedding)

Prepare and split the data into training and testing sets

In [None]:
# Labels as a numpy array, aligned index-for-index with `cleaned_tweets`
labels = np.array(all_labels)

# One averaged Word2Vec vector per tweet
tweet_embeddings = np.array(
    [get_embedding(tokens, word2vec_model) for tokens in cleaned_tweets])

# Hold out 20% of the tweets for testing; the fixed random_state keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    tweet_embeddings, labels, test_size=0.2, random_state=42)


In [None]:
# Convert labels to a numpy array
emotion_labels = np.array(all_emotion_labels)

# Generate embeddings for all emotions in the dataset
# NOTE: this rebinds `emotion_embeddings`, which until now held the Word2Vec
# KeyedVectors (`word2vec_emotion_model.wv`), to a numpy array of sentence vectors.
# Re-running the earlier `emotion_embeddings['listen']` cell after this one will no
# longer perform a word lookup.
emotion_embeddings = np.array([get_embedding(
    emotion, word2vec_emotion_model) for emotion in cleaned_emotions])

# Split the dataset into training and test sets (80% train, 20% test)
x_em_train, x_em_test, y_em_train, y_em_test = train_test_split(
    emotion_embeddings, emotion_labels, test_size=0.2, random_state=42)

### Train Classifier

Train a classifier using Logistic Regression as a baseline; it works well for binary classification tasks like sentiment analysis.

In [None]:
# Initialize and train the logistic regression classifier
# (tweet data: features are the averaged tweet embeddings in x_train)
clf = LogisticRegression(random_state=42)
clf.fit(x_train, y_train)

# Predict on the test set
y_pred = clf.predict(x_test)

In [None]:
# Initialize and train the logistic regression classifier on the emotion data
em_clf = LogisticRegression(random_state=42)
em_clf.fit(x_em_train, y_em_train)

# Predict on the emotion test set with the emotion classifier itself.
# (Using the tweet classifier `clf` here would score a model that never saw
# emotion data and was fitted on a different embedding space.)
y_em_pred = em_clf.predict(x_em_test)

### Evaluate the model

In [None]:
# Calculate accuracy of the tweet logistic-regression baseline on its held-out split
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy: ", accuracy)

# Detailed classification report (per-class precision, recall and F1)
print(classification_report(y_test, y_pred))

In [None]:
# Calculate accuracy of the emotion predictions (`y_em_pred`) on the emotion held-out split
em_accuracy = accuracy_score(y_em_test, y_em_pred)
print("Test Accuracy: ", em_accuracy)

# Detailed classification report (per-class precision, recall and F1)
print(classification_report(y_em_test, y_em_pred))

Example prediction by using the trained model

In [None]:
def predict_tweet(tweet, word2vec_model, clf):
    """
    Classify the sentiment of one raw tweet.

    The tweet is cleaned with `process_tweet`, averaged into a single vector
    with `get_embedding`, and passed to the classifier as a one-row batch.

    Input:
        tweet: raw text to predict sentiment.
        word2vec_model: a trained Word2Vec model containing word embeddings.
        clf: a trained classifier for sentiment predictions.
    Output:
        "Positive" when the classifier predicts label 1, otherwise "Negative".

    """
    tokens = process_tweet(tweet)
    features = [get_embedding(tokens, word2vec_model)]  # batch of one sample
    prediction = clf.predict(features)

    if prediction == 1:
        return "Positive"
    return "Negative"

In [None]:
def predict_emotion(sentence, word2vec_emotion_model, em_clf):
    """
    Classify the sentiment of one raw sentence with the emotion pipeline.

    The sentence is tokenized with `process_text`, averaged into a single
    vector with `get_embedding`, and passed to the classifier as a one-row batch.

    Input:
        sentence: raw text to predict sentiment.
        word2vec_emotion_model: a trained Word2Vec model containing word embeddings.
        em_clf: a trained classifier for sentiment predictions.
    Output:
        "Positive" when the classifier predicts label 1, otherwise "Negative".

    """
    tokens = process_text(sentence)
    features = [get_embedding(tokens, word2vec_emotion_model)]  # batch of one sample
    prediction = em_clf.predict(features)

    if prediction == 1:
        return "Positive"
    return "Negative"

In [None]:
# Example prediction with the tweet pipeline (tweet Word2Vec model + tweet classifier)
new_tweet = "I like to study NLP <3"
print("Sentiment: ", predict_tweet(new_tweet, word2vec_model, clf))

In [None]:
# Same sentence through the emotion pipeline (emotion Word2Vec model + emotion classifier)
new_sentence = "I like to study NLP <3"
print("Sentiment: ", predict_emotion(new_sentence, word2vec_emotion_model, em_clf))

## Multi-Layer Perceptrons

Multi-Layer Perceptrons (MLPs) can enhance the predictive power of a sentiment analysis model by allowing it to capture more complex patterns in the data.

### Outline for building a Multi-Layer Perceptron for Sentiment Analysis

#### 1. Data Preparation: 
- Use the embeddings generated for each tweet as input features for the MLP.

#### 2. Model Architecture:
- Design a simple MLP with several fully connected (dense) layers, an activation function (ReLU) for non-linearity, and dropout layers to prevent overfitting.

- Use a final output layer with a sigmoid activation function for binary classification.

#### 3. Training and Evaluation:
- Train the MLP on the training set, validate on the test set, and evaluate performance using accuracy and a classification report.

#### 4. Hyperparameter Tuning:
- Experiment with the number of layers, number of neurons, dropout rates, and learning rate to optimize performance.

### Import necessary libraries

In [None]:
%pip install torch

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

### Implement the Multi-Layer Perceptron (MLP) class in PyTorch

In [None]:
class SentimentMLP(nn.Module):
    """Two-hidden-layer feed-forward network for binary sentiment classification."""

    def __init__(self, embedding_dim: int,
                 hidden_dim1: int,
                 hidden_dim2: int,
                 dropout: float):
        """
        Build the Multi-Layer Perceptron (MLP) for sentiment analysis.

        Parameters:
            embedding_dim (int): dimension of the input features (tweet embedding size).
            hidden_dim1 (int): Number of neurons in the first hidden layer.
            hidden_dim2 (int): Number of neurons in the second hidden layer.
            dropout (float): Dropout rate applied after each hidden layer
                             to prevent overfitting.

        """
        super().__init__()

        # Layers in execution order; each hidden layer is Linear -> ReLU -> Dropout
        layers = [
            nn.Linear(embedding_dim, hidden_dim1),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim1, hidden_dim2),
            nn.ReLU(),
            nn.Dropout(dropout),
            # Single output unit squashed by a sigmoid
            nn.Linear(hidden_dim2, 1),
            nn.Sigmoid(),
        ]
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Run `x` through the stacked layers and return the sigmoid outputs."""
        probabilities = self.network(x)
        return probabilities

### Prepare the data

In [None]:
X = np.array(tweet_embeddings)  # tweet_embeddings generated from Word2Vec
y = np.array(labels)  # labels for the tweets (1 for positive, 0 for negative)

# Convert data to PyTorch tensors
# Labels are float32 as well because they are later compared against sigmoid outputs by the loss
X = torch.tensor(X, dtype=torch.float32)
y = torch.tensor(y, dtype=torch.float32)

# Split the data into training and test datasets
# NOTE: `y_train` / `y_test` are rebound here from the numpy arrays of the
# logistic-regression split to tensors; re-running the earlier logistic-regression
# cells after this one will pick up the tensors.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# `emotion_embeddings` is the numpy array of sentence vectors at this point
# (the name was rebound from the KeyedVectors in the emotion split cell)
em_X = np.array(emotion_embeddings)
em_y = np.array(emotion_labels)

# Convert data to PyTorch tensors
em_X = torch.tensor(em_X, dtype=torch.float32)
em_y = torch.tensor(em_y, dtype=torch.float32)

# Split the data into training and test datasets
# NOTE: `y_em_train` / `y_em_test` are rebound here from numpy arrays to tensors.
X_em_train, X_em_test, y_em_train, y_em_test = train_test_split(em_X, em_y, test_size=0.2, random_state=42)

In [None]:
# Initialize the parameters for the model
embedding_dim = X_train.shape[1]  # input width = length of one tweet embedding
hidden_dim1 = 128  # neurons in the first hidden layer
hidden_dim2 = 64  # neurons in the second hidden layer
dropout_rate = 0.4  # dropout applied after each hidden layer

Initialize the model

In [None]:
# MLP for the tweet data, built from the hyperparameters defined in the previous cell
mlp_model = SentimentMLP(
    embedding_dim=embedding_dim,
    hidden_dim1=hidden_dim1,
    hidden_dim2=hidden_dim2,
    dropout=dropout_rate
)

In [None]:
# Hyperparameters for the emotion MLP: same layer sizes as the tweet MLP,
# but a lower dropout rate (0.2 instead of 0.4)
em_embedding_dim = X_em_train.shape[1]
em_hidden_dim1 = 128
em_hidden_dim2 = 64
em_dropout_rate = 0.2

# MLP for the emotion data
em_mlp_model = SentimentMLP(
    embedding_dim=em_embedding_dim,
    hidden_dim1=em_hidden_dim1,
    hidden_dim2=em_hidden_dim2,
    dropout=em_dropout_rate
)

Implementation for training stage

In [None]:
def train(model, x_train, y_train, criterion, optimizer, num_epochs, batch_size, print_every):
    """
    Train a model with shuffled mini-batches.

    Parameters:
        model: the neural network model to train.
        x_train: training features (tensor, one sample per row).
        y_train: training labels (tensor, one entry per sample).
        criterion: loss function. The reported epoch loss assumes it returns
                   the mean loss over the batch (the default `reduction='mean'`).
        optimizer: optimizer to update model parameters.
        num_epochs: number of epochs to train.
        batch_size: number of samples per mini-batch.
        print_every: frequency of printing loss (e.g., every 5 epochs).

    Returns:
        A tuple `(model, epoch_losses)`: the same model object that was passed
        in (trained in place) and a list with the mean per-sample loss of
        each epoch.

    Raises:
        ValueError: if `x_train` contains no samples.

    """
    num_samples = x_train.size(0)
    if num_samples == 0:
        raise ValueError("x_train must contain at least one sample")

    epoch_losses = []

    model.train()

    for epoch in range(num_epochs):
        # Shuffle the data at each epoch
        permutation = torch.randperm(num_samples)
        epoch_loss = 0.0

        # Mini-batch training
        for i in range(0, num_samples, batch_size):
            # Select mini-batch (the last one may be smaller than batch_size)
            index = permutation[i:i + batch_size]
            batch_x, batch_y = x_train[index], y_train[index]

            # Zero gradients
            optimizer.zero_grad()

            # Forward pass. Reshape to the target's shape instead of squeeze():
            # squeeze() turns a single-sample batch into a 0-d tensor, which no
            # longer matches the (1,) target and makes the loss raise.
            outputs = model(batch_x).reshape(batch_y.shape)
            loss = criterion(outputs, batch_y)

            # Backward pass and optimization
            loss.backward()
            optimizer.step()

            # `loss` is a batch mean, so weight it by the batch size to get
            # back the summed loss of the samples in this batch
            epoch_loss += loss.item() * batch_x.size(0)

        # Mean per-sample loss for the epoch
        avg_epoch_loss = epoch_loss / num_samples
        epoch_losses.append(avg_epoch_loss)

        # Print progress every `print_every` epochs
        if (epoch + 1) % print_every == 0:
            print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_epoch_loss}")

    return model, epoch_losses

Define the loss function and optimizer

In [None]:
# Binary cross-entropy on the sigmoid outputs of the tweet MLP
criterion = nn.BCELoss()
# Adam over the tweet MLP's parameters; weight_decay adds a small L2 penalty
optimizer = optim.Adam(mlp_model.parameters(), lr=0.001, weight_decay=1e-5)

In [None]:
# Same loss and optimizer settings as above, bound to the emotion MLP's parameters
em_criterion = nn.BCELoss()
em_optimizer = optim.Adam(em_mlp_model.parameters(), lr=0.001, weight_decay=1e-5)

Train model

In [None]:
# Train the tweet MLP. `train` returns the model object it was given, so
# `model` and `mlp_model` refer to the same network afterwards.
model, losses = train(mlp_model, X_train, y_train, criterion, optimizer, num_epochs=20, batch_size=64, print_every=5)

In [None]:
# Train the emotion MLP (more epochs and a larger batch size than the tweet run).
# `em_model` is the same object as `em_mlp_model`.
em_model, em_losses = train(em_mlp_model, X_em_train, y_em_train, em_criterion, em_optimizer, num_epochs=30, batch_size=256, print_every=5)

Evaluate the model

In [None]:
def evaluate(model, x_test, y_test):
    """
    Evaluate the model on the test dataset and print performance metrics.

    Parameters:
        model (nn.Module): Trained model to evaluate; its outputs are treated
            as probabilities and thresholded at 0.5.
        x_test (torch.Tensor): Test features.
        y_test (torch.Tensor): Test labels.

    Returns:
        Accuracy of the model on the test set.

    """

    # Evaluation mode disables dropout
    model.eval()

    with torch.no_grad():
        outputs = model(x_test)

    # Flatten the (N, 1) model output so predictions and labels are both 1-D,
    # and hand plain numpy arrays to scikit-learn
    predictions = (outputs > 0.5).int().reshape(-1).cpu().numpy()
    y_true = torch.as_tensor(y_test).detach().reshape(-1).cpu().numpy()

    accuracy = accuracy_score(y_true, predictions)

    print('Test Accuracy:', accuracy)
    print(classification_report(y_true, predictions))

    return accuracy

In [None]:
# Tweet-trained MLP scored on the held-out tweet split
evaluate(model, X_test, y_test)

In [None]:
# Emotion-trained MLP scored on the held-out emotion split
evaluate(em_model, X_em_test, y_em_test)

In [None]:
# Task 4:  Evaluate emotion-trained model on tweet data
# NOTE(review): `X_test` holds tweet vectors built with `word2vec_model`, while
# `em_model` was trained on vectors from `word2vec_emotion_model`. The two Word2Vec
# models were trained independently, so their vector spaces are presumably not
# aligned - consider re-embedding the tweets with `word2vec_emotion_model` before
# drawing conclusions from this score.
evaluate(em_model, X_test, y_test)

In [None]:
# Task 5:  Evaluate tweet-trained model on emotion data
# NOTE(review): `X_em_test` holds sentence vectors built with `word2vec_emotion_model`,
# while `model` was trained on vectors from `word2vec_model`. The two Word2Vec models
# were trained independently, so their vector spaces are presumably not aligned -
# consider re-embedding the sentences with `word2vec_model` before drawing
# conclusions from this score.
evaluate(model, X_em_test, y_em_test)

Predict Sentiment for new data

In [None]:
def preprocess_text(text):
    """
    Turn raw text into a model-ready tensor using the tweet pipeline.

    The text is cleaned with `process_tweet`, averaged into one vector with
    the tweet Word2Vec model (`word2vec_model`), and returned as a float32
    tensor with a leading batch dimension of size 1.

    """
    tokens = process_tweet(text)
    vector = get_embedding(tokens, word2vec_model)

    # Add the batch dimension the model expects: (dim,) -> (1, dim)
    return torch.tensor(vector, dtype=torch.float32).unsqueeze(0)

In [None]:
def preprocess_emotion(text):
    """
    Turn raw text into a model-ready tensor using the emotion pipeline.

    The text is tokenized with `process_text`, averaged into one vector with
    the emotion Word2Vec model (`word2vec_emotion_model`), and returned as a
    float32 tensor with a leading batch dimension of size 1.

    """
    tokens = process_text(text)
    vector = get_embedding(tokens, word2vec_emotion_model)

    # Add the batch dimension the model expects: (dim,) -> (1, dim)
    return torch.tensor(vector, dtype=torch.float32).unsqueeze(0)

In [None]:
def predict_sentiment(model, text, is_emotion):
    """
    Predict the sentiment of a given text using the trained model.

    Parameters:
        model: the trained MLP model.
        text: input text to analyze.
        is_emotion: when truthy, the text is embedded with `preprocess_emotion`
                    (emotion pipeline); otherwise with `preprocess_text`
                    (tweet pipeline). Should match the data `model` was trained on.

    Returns:
        "Positive" if sentiment is positive, otherwise "Negative".
    """
    # Evaluation mode disables dropout
    model.eval()

    # Pick the embedding pipeline that matches the model
    embed = preprocess_emotion if is_emotion else preprocess_text
    input_tensor = embed(text)

    # No gradients are needed for inference; the model outputs a probability
    with torch.no_grad():
        probability = model(input_tensor).item()

    # Threshold at 0.5 for binary classification
    if probability > 0.5:
        return "Positive"
    return "Negative"

In [None]:
# Example text to analyze (shared by the two prediction cells below)
new_text = "Oh great, it's raining again!"

In [None]:
# Predict the sentiment with the tweet-trained MLP and the tweet embedding pipeline
sentiment = predict_sentiment(model, new_text, is_emotion=False)
print(sentiment)

In [None]:
# Predict the sentiment with the emotion-trained MLP and the emotion embedding pipeline
em_sentiment = predict_sentiment(em_model, new_text, is_emotion=True)
print(em_sentiment)