In [1]:
#  1. Install Required Library (only if not already installed)
# Install the 'transformers' library from Hugging Face, which provides pre-trained models for NLP tasks like sentiment analysis
!pip install transformers




In [2]:
#  2. Import Libraries
import pandas as pd               # For handling data in tabular (DataFrame) form
import re                         # For text preprocessing using regular expressions
from transformers import pipeline  # Import the pipeline API from transformers for easy model access


In [3]:
#  3. Load Dataset and Take a Sample of 500 Tweets
# Read the tweets dataset from a CSV file. Ensure the CSV file 'tweets-data.csv' is in your working directory.
df = pd.read_csv("tweets-data.csv")

# Take a random sample of 500 tweets to make processing faster and reproducible (random_state ensures same result every time)
df_sample = df.sample(n=500, random_state=42).reset_index(drop=True)


In [4]:
#  4. Define Text Cleaning Function (Same as VADER practice)
def clean_text(text):
    text = str(text)  # Ensure the input is a string

    text = re.sub(r"http\S+", "", text)          # Remove URLs (e.g., http://...)
    text = re.sub(r"@\w+", "", text)             # Remove Twitter mentions (e.g., @username)
    text = re.sub(r"#", "", text)                # Remove the '#' symbol but keep the word
    text = re.sub(r"[^A-Za-z\s]", "", text)      # Remove any non-letter character (punctuation, numbers, emojis)
    text = text.lower().strip()                  # Convert text to lowercase and remove leading/trailing spaces

    return text

# Apply the cleaning function to the 'Tweets' column and store the result in a new column 'cleaned_text'
df_sample['cleaned_text'] = df_sample['Tweets'].apply(clean_text)


In [5]:
# 5. Load Transformer Pipeline
# Load a pre-trained sentiment-analysis pipeline from Hugging Face Transformers (by default uses 'distilbert-base-uncased-finetuned-sst-2-english')
# This model classifies text as either POSITIVE or NEGATIVE
sentiment_classifier = pipeline("sentiment-analysis")


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Device set to use cpu


In [6]:
#  6. Define Sentiment Scoring Function with Token Limit Handling
def sentiment_scores_transformers(text):
    text = text.strip()  # Remove any extra whitespace

    # Handle empty text after cleaning by returning NEUTRAL with score 0.0
    if not text:
        return "NEUTRAL", 0.0

    # Transformers models have a maximum token/character limit; here we truncate to 512 characters for safety
    max_length = 512
    text = text[:max_length]

    # Use the sentiment analysis pipeline to predict sentiment
    result = sentiment_classifier(text)[0]  # The pipeline returns a list of predictions; take the first one

    # Return both the predicted label (POSITIVE/NEGATIVE) and confidence score
    return result['label'], result['score']


In [7]:
#  7. Apply Sentiment Function and Create New Columns
# Apply the sentiment scoring function to each cleaned tweet
# Use pd.Series to expand the tuple output into two separate columns: 'sentiment_label' and 'sentiment_score'
df_sample[['sentiment_label', 'sentiment_score']] = df_sample['cleaned_text'].apply(
    lambda x: pd.Series(sentiment_scores_transformers(x))
)


In [8]:
#  8. (Optional) Save the Result to a New CSV
# Save the DataFrame with the new sentiment analysis results to a CSV file for later use or sharing
df_sample.to_csv("tweets_with_transformer_sentiment.csv", index=False)


In [9]:
#  9. Display Sample Output
# Print the first few rows of the DataFrame showing the original tweet, cleaned text, sentiment label, and score
print(df_sample[['Tweets', 'cleaned_text', 'sentiment_label', 'sentiment_score']].head())


                                              Tweets  \
0  Le #DessinDePresse de Sanaga : ls sont morts c...   
1  #Russia #Wagner #RussiaCivilWar https://t.co/P...   
2  Exclusive content -https://t.co/oEiSIIB2Z1\n.\...   
3  Auch heute geht die politische Nachricht des T...   
4  @crazyclipsonly Same type that would take a ho...   

                                        cleaned_text sentiment_label  \
0  le dessindepresse de sanaga  ls sont morts com...        NEGATIVE   
1                       russia wagner russiacivilwar        NEGATIVE   
2  exclusive content \n\ncosplay japan titan tita...        NEGATIVE   
3  auch heute geht die politische nachricht des t...        NEGATIVE   
4  same type that would take a homemade playstati...        NEGATIVE   

   sentiment_score  
0         0.981537  
1         0.962062  
2         0.961531  
3         0.975570  
4         0.994473  
