## Sentiment Analysis on Product Tweets

Following the structure of this guide from HuggingFace: https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification-tf.ipynb

Dataset used: https://www.kaggle.com/datasets/dshah1612/product-tweets-dataset?select=final_data.csv

Just download the final data file (`final_data.csv`) and add it to the `/content` directory on Colab to run the notebook.

Adapted for the dataset and task of classifying product review tweets.


~Samyukt Sriram

In [8]:
import pandas as pd

In [9]:
#Loading and Cleaning, EDA

df = pd.read_csv('/content/final_data.csv')

#Some cleanup and formatting: give the long annotation column a short name.
df = df.rename(columns = {'is_there_an_emotion_directed_at_a_brand_or_product':'sentiment'})
print('Set of responses in the dataset: ', set(df['sentiment']))

#Rows without text or without a sentiment annotation cannot be used for training.
df = df.dropna(subset = ['tweet_text','sentiment']).reset_index(drop = True)


#Creating integer labels for sentiments.
#Any sentiment value not listed here (e.g. "I can't tell") falls into class 3.
sentiment_to_label = {
    'Negative emotion': 0,
    'No emotion toward brand or product': 1,
    'Positive emotion': 2,
}
#Vectorized lookup instead of a per-row .loc loop; unmapped values become NaN,
#which fillna turns into the catch-all class before the integer cast.
df['labels'] = df['sentiment'].map(sentiment_to_label).fillna(3).astype(int)

print(df.head())
df.info()

Set of responses in the dataset:  {'Negative emotion', 'Positive emotion', "I can't tell", 'No emotion toward brand or product'}
                                          tweet_text  \
0  .@wesley83 I have a 3G iPhone. After 3 hrs twe...   
1  @jessedee Know about @fludapp ? Awesome iPad/i...   
2  @swonderlin Can not wait for #iPad 2 also. The...   
3  @sxsw I hope this year's festival isn't as cra...   
4  @sxtxstate great stuff on Fri #SXSW: Marissa M...   

  emotion_in_tweet_is_directed_at         sentiment  labels  
0                          iPhone  Negative emotion       0  
1              iPad or iPhone App  Positive emotion       2  
2                            iPad  Positive emotion       2  
3              iPad or iPhone App  Negative emotion       0  
4                          Google  Positive emotion       2  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9092 entries, 0 to 9091
Data columns (total 4 columns):
 #   Column                           Non-Null Count  Dt

In [10]:
#Installing packages

!pip install transformers
!pip install datasets


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [11]:
import scipy
import sklearn
import numpy as np

from transformers import AutoTokenizer, DataCollatorWithPadding, TFAutoModelForSequenceClassification, create_optimizer
from transformers.keras_callbacks import KerasMetricCallback
from tensorflow.keras.callbacks import TensorBoard

import datasets

import tensorflow as tf
from datasets import load_dataset, load_metric

In [12]:
#Loading into the datasets library from Huggingface

#80/20 train/test split. The seed is fixed so the partition (and therefore every
#downstream metric) is the same on each Restart & Run All.
dataset = datasets.Dataset.from_pandas(df).train_test_split(test_size=0.2, seed=42)

dataset


DatasetDict({
    train: Dataset({
        features: ['tweet_text', 'emotion_in_tweet_is_directed_at', 'sentiment', 'labels'],
        num_rows: 7273
    })
    test: Dataset({
        features: ['tweet_text', 'emotion_in_tweet_is_directed_at', 'sentiment', 'labels'],
        num_rows: 1819
    })
})

In [13]:
#Task and model configuration.

#GLUE task name: 'sst2' (Stanford Sentiment Treebank) is a supervised sentiment
#classification task, similar to the problem statement here.
task = "sst2"

#Pretrained checkpoint; it must be compatible with classification tasks.
model_checkpoint = "distilbert-base-uncased"

#May need to be tweaked depending on the task and model.
batch_size = 16

In [14]:
#Metric used to score the sentiment predictions.
#For this GLUE configuration the metric is accuracy; a different metric could be swapped in.
metric = load_metric(path='glue', config_name=task)
metric

Metric(name: "glue", features: {'predictions': Value(dtype='int64', id=None), 'references': Value(dtype='int64', id=None)}, usage: """
Compute GLUE evaluation metric associated to each GLUE dataset.
Args:
    predictions: list of predictions to score.
        Each translation should be tokenized into a list of tokens.
    references: list of lists of references for each translation.
        Each reference should be tokenized into a list of tokens.
Returns: depending on the GLUE subset, one or several of:
    "accuracy": Accuracy
    "f1": F1 score
    "pearson": Pearson Correlation
    "spearmanr": Spearman Correlation
    "matthews_correlation": Matthew Correlation
Examples:

    >>> glue_metric = datasets.load_metric('glue', 'sst2')  # 'sst2' or any of ["mnli", "mnli_mismatched", "mnli_matched", "qnli", "rte", "wnli", "hans"]
    >>> references = [0, 1]
    >>> predictions = [0, 1]
    >>> results = glue_metric.compute(predictions=predictions, references=references)
    >>> print(res

In [15]:
#Peek at one training example, then display the summary of both splits.
print(dataset["train"][5])
dataset

{'tweet_text': 'Photo: Still a lineup around the block at the #sxsw apple store. {link}', 'emotion_in_tweet_is_directed_at': None, 'sentiment': 'No emotion toward brand or product', 'labels': 1}


DatasetDict({
    train: Dataset({
        features: ['tweet_text', 'emotion_in_tweet_is_directed_at', 'sentiment', 'labels'],
        num_rows: 7273
    })
    test: Dataset({
        features: ['tweet_text', 'emotion_in_tweet_is_directed_at', 'sentiment', 'labels'],
        num_rows: 1819
    })
})

In [16]:
#Preprocessing
#Load the tokenizer for the checkpoint named in model_checkpoint.

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [17]:
def preprocess_function(examples):
  """Tokenize the 'tweet_text' field of a batch of examples.

  Args:
    examples: a batch from the dataset; only examples['tweet_text'] is read.

  Returns:
    Whatever the tokenizer returns for that text, with truncation enabled.
  """
  return tokenizer(examples['tweet_text'], truncation = True)


#batched = True passes many examples to preprocess_function per call.
encoded_dataset = dataset.map(preprocess_function, batched = True)


#For the DataCollator function, we need to specify which columns are tokenized inputs.
#These are the columns that exist after tokenization but not before it.
pre_tokenizer_columns = set(dataset['train'].features)
#Iterate the features in their own order (instead of taking a set difference) so the
#resulting column list is the same on every run.
tokenizer_columns = [
    column for column in encoded_dataset['train'].features
    if column not in pre_tokenizer_columns
]




  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [18]:
#Inspect the column schema of the training split after tokenization.
encoded_dataset['train'].features

{'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'emotion_in_tweet_is_directed_at': Value(dtype='string', id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'labels': Value(dtype='int64', id=None),
 'sentiment': Value(dtype='string', id=None),
 'tweet_text': Value(dtype='string', id=None)}

In [19]:
#Pads each batch with the tokenizer's padding settings and returns TensorFlow tensors.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors ='tf')

#Both datasets use the shared batch_size constant (rather than a literal 16) so they
#stay in sync with the steps-per-epoch calculation used for the learning-rate schedule.
tf_train_dataset = encoded_dataset['train'].to_tf_dataset(
    columns = tokenizer_columns,
    label_cols = ['labels'],
    shuffle = True,
    batch_size = batch_size,
    collate_fn = data_collator,
)

tf_validation_dataset = encoded_dataset['test'].to_tf_dataset(
    columns = tokenizer_columns,
    label_cols = ['labels'],
    shuffle = False, #No shuffling: the model does not learn from this split, so shuffling would only add computation.
    batch_size = batch_size,
    collate_fn = data_collator,
)

In [20]:
#Defining Loss and Model

#One output class per tweet sentiment label (0-3).
num_labels = 4
model = TFAutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,
)

#from_logits=True: the loss is given unnormalized scores and integer class labels.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)


Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_layer_norm', 'activation_13', 'vocab_projector', 'vocab_transform']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier', 'dropout_19', 'classifier']
You should probably TRAIN this model on a down-stream task to be able to use i

In [21]:
#Compiling the model

num_epochs = 5

#Total number of optimizer steps, needed by the learning-rate schedule.
batches_per_epoch = len(encoded_dataset['train']) // batch_size
total_train_steps = int(num_epochs * batches_per_epoch)

#create_optimizer() is AdamW with weight and learning rate decay
optimizer, schedule = create_optimizer(
    init_lr=3e-5,
    num_warmup_steps=0,
    num_train_steps=total_train_steps,
)

model.compile(loss=loss, optimizer=optimizer)

In [22]:
#KerasMetricCallback can handle any metric computation (like BLEU, ROUGE).
#Its results are useful for other callbacks like TensorBoard, EarlyStopping.


def compute_metrics(eval_predictions):
  """Score model outputs against reference labels with the loaded metric.

  Args:
    eval_predictions: a (predictions, labels) pair; predictions hold per-class scores
      along axis 1.

  Returns:
    The result of metric.compute for the arg-max class of every row.
  """
  class_scores, references = eval_predictions
  predicted_classes = np.argmax(class_scores, axis=1)
  return metric.compute(predictions=predicted_classes, references=references)

metric_callback = KerasMetricCallback(
    metric_fn=compute_metrics,
    eval_dataset=tf_validation_dataset,
)

In [23]:
#Training

#TensorBoard logs are written under the model save directory.
tensorboard_callback = TensorBoard(log_dir="./text_classification_model_save/logs")
callbacks = [metric_callback, tensorboard_callback]


#num_epochs was defined a few cells above.
model.fit(
    x=tf_train_dataset,
    epochs=num_epochs,
    validation_data=tf_validation_dataset,
    callbacks=callbacks,
)
#Observation: 5 epochs might be overfitting - val_loss gets significantly worse as the
#epochs go on, with very little improvement in accuracy.

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f9566487a50>

In [28]:
#Product Tweet Classifier

#Index i of this list is the sentiment name for integer label i used during training.
sentiments = ['Negative emotion', 'No emotion toward brand or product', 'Positive emotion',"I can't tell"]
input_tweet = "What were they thinking! The camera is a huge miss on the new ipad"

#Add an explicit leading batch axis so the model receives shape (1, sequence_length)
#instead of relying on a 1-D input being broadcast inside the model.
input_ids = tf.constant(tokenizer.encode(input_tweet))[None, :]  # Batch size 1
outputs = model(input_ids)
logits = outputs.logits

#Arg-max over the class axis of the single example in the batch.
predicted_label = int(np.argmax(logits, axis=-1)[0])

print(f'logits: {logits}')
print(f'Input tweet: {input_tweet}')
print(f'Highest probability prediction: {sentiments[predicted_label]}')

#Class order: 'Negative emotion', 'No emotion toward brand or product', 'Positive emotion', "I can't tell"
#Logits are unnormalized scores; apply a softmax over the class axis to get probabilities.
#The largest logit always corresponds to the most probable class.

logits: [[ 1.4376023  -0.60808384 -0.41176018 -0.40876502]]
Input tweet: What were they thinking! The camera is a huge miss on the new ipad
Highest probability prediction: Negative emotion
