In [1]:
import pandas as pd

# Load the dataset
df = pd.read_csv('C:/Data/Tweets.csv')

# Display the first few rows of the dataset
print(df.head())

# Check the columns available in the dataset
print(df.columns)

             tweet_id airline_sentiment  airline_sentiment_confidence  \
0  570306133677760513           neutral                        1.0000   
1  570301130888122368          positive                        0.3486   
2  570301083672813571           neutral                        0.6837   
3  570301031407624196          negative                        1.0000   
4  570300817074462722          negative                        1.0000   

  negativereason  negativereason_confidence         airline  \
0            NaN                        NaN  Virgin America   
1            NaN                     0.0000  Virgin America   
2            NaN                        NaN  Virgin America   
3     Bad Flight                     0.7033  Virgin America   
4     Can't Tell                     1.0000  Virgin America   

  airline_sentiment_gold        name negativereason_gold  retweet_count  \
0                    NaN     cairdin                 NaN              0   
1                    NaN    jnar

In [2]:
# Keep only the relevant columns: 'text' for the tweet and 'airline_sentiment' for the sentiment
df = df[['text', 'airline_sentiment']]

# Display the value counts of sentiment labels
print(df['airline_sentiment'].value_counts())

# Map the sentiments to numerical labels: negative = 0, neutral = 1, positive = 2
df['sentiment_label'] = df['airline_sentiment'].map({'negative': 0, 'neutral': 1, 'positive': 2})

# Display the first few rows of the processed dataset
print(df.head())

airline_sentiment
negative    9178
neutral     3099
positive    2363
Name: count, dtype: int64
                                                text airline_sentiment  \
0                @VirginAmerica What @dhepburn said.           neutral   
1  @VirginAmerica plus you've added commercials t...          positive   
2  @VirginAmerica I didn't today... Must mean I n...           neutral   
3  @VirginAmerica it's really aggressive to blast...          negative   
4  @VirginAmerica and it's a really big bad thing...          negative   

   sentiment_label  
0                1  
1                2  
2                1  
3                0  
4                0  


In [3]:
from transformers import RobertaTokenizer, RobertaModel
import torch
import numpy as np
from tqdm import tqdm

# Load the pre-trained RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaModel.from_pretrained('roberta-base')

# Set the model to evaluation mode (important for inference)
model.eval()

# Function to generate embeddings from RoBERTa
def get_roberta_embeddings(text):
    # Tokenize the input text
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    
    # Pass the tokens through RoBERTa without computing gradients (inference mode)
    with torch.no_grad():
        outputs = model(**inputs)
    
    # The hidden states (embeddings) from RoBERTa
    last_hidden_state = outputs.last_hidden_state  # Shape: (batch_size, sequence_length, hidden_size)
    
    # Use the [CLS] token embedding as the sentence embedding (position 0 in the sequence)
    cls_embedding = last_hidden_state[:, 0, :].squeeze().numpy()  # Shape: (hidden_size,)
    
    return cls_embedding

# Convert all tweets to RoBERTa embeddings
#df['embeddings'] = df['text'].apply(get_bert_embeddings)
embeddings = []
for text in tqdm(df['text'], desc="Embedding Tweets", unit="tweet", ncols=100):
    embeddings.append(get_roberta_embeddings(text))


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Embedding Tweets: 100%|████████████████████████████████████| 14640/14640 [37:42<00:00,  6.47tweet/s]


In [4]:
# Convert the embeddings to a numpy array
X = np.stack(embeddings)

# Target labels
y = df['sentiment_label'].values

print(X.shape)  # Shape of the feature matrix

(14640, 768)


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Split the dataset into training and test sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Logistic Regression model on the BERT embeddings
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# Predict on the test set
y_pred = clf.predict(X_test)

# Evaluate the model's accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

print(classification_report(y_test, y_pred, target_names=['negative', 'neutral', 'positive']))

Model Accuracy: 84.29%
              precision    recall  f1-score   support

    negative       0.89      0.93      0.91      1889
     neutral       0.71      0.63      0.67       580
    positive       0.79      0.74      0.77       459

    accuracy                           0.84      2928
   macro avg       0.80      0.77      0.78      2928
weighted avg       0.84      0.84      0.84      2928



In [7]:
# Example tweets to predict sentiment for
new_tweets = [
    "The flight was delayed by 2 hours. Terrible experience!",
    "Great service, I loved the extra legroom in business class.",
    "The flight was fine, nothing special."
]

# Convert new tweets to BERT embeddings
new_X = np.stack([get_roberta_embeddings(tweet) for tweet in new_tweets])

# Predict sentiment for the new tweets
new_predictions = clf.predict(new_X)

# Map the predictions back to sentiment labels
predicted_sentiments = ['negative' if pred == 0 else 'neutral' if pred == 1 else 'positive' for pred in new_predictions]

# Display the results
for tweet, sentiment in zip(new_tweets, predicted_sentiments):
    print(f"Tweet: {tweet} \nPredicted Sentiment: {sentiment}\n")


Tweet: The flight was delayed by 2 hours. Terrible experience! 
Predicted Sentiment: negative

Tweet: Great service, I loved the extra legroom in business class. 
Predicted Sentiment: positive

Tweet: The flight was fine, nothing special. 
Predicted Sentiment: negative

