In [13]:
import pandas as pd
import re
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from sklearn.metrics import classification_report
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import numpy as np
import nltk
from scipy.special import softmax

In [14]:
# Read the tweet dataset. Column names are supplied explicitly through `names`,
# so pandas treats the first line of the file as data rather than as a header.
DATASET_ENCODING = "ISO-8859-1"
DATASET_COLUMNS = ['target', 'ids', 'date', 'flag', 'user', 'text']
data = pd.read_csv(
    'sentiment140.csv',
    names=DATASET_COLUMNS,
    encoding=DATASET_ENCODING,
)

In [15]:
# Show how many rows carry each label in the 'target' column
print(
    "Unique value counts in the target column:",
    data['target'].value_counts(),
    sep="\n",
)

Unique value counts in the target column:
target
0    800000
4    800000
Name: count, dtype: int64


In [16]:
# Stopwords dropped by clean_text. Kept as a module-level frozenset so the
# collection is built once (not on every call) and lookups are O(1).
_STOPWORDS = frozenset([
    'a', 'about', 'above', 'after', 'again', 'ain', 'all', 'am', 'an',
    'and', 'any', 'are', 'as', 'at', 'be', 'because', 'been', 'before',
    'being', 'below', 'between', 'both', 'by', 'can', 'd', 'did', 'do',
    'does', 'doing', 'down', 'during', 'each', 'few', 'for', 'from',
    'further', 'had', 'has', 'have', 'having', 'he', 'her', 'here',
    'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in',
    'into', 'is', 'it', "it's", 'its', 'itself', 'just', 'll', 'm', 'ma',
    'me', 'more', 'most', 'my', 'myself', 'needn', 'no', 'nor', 'now',
    'o', 'of', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves',
    'out', 'own', 're', 's', 'same', 'she', "shes", 'should', "shouldve", 'so', 'some', 'such',
    't', 'than', 'that', "thatll", 'the', 'their', 'theirs', 'them',
    'themselves', 'then', 'there', 'these', 'they', 'this', 'those',
    'through', 'to', 'too', 'under', 'until', 'up', 've', 'very', 'was',
    'we', 'were', 'what', 'when', 'where', 'which', 'while', 'who', 'whom',
    'why', 'will', 'with', 'won', 'y', 'you', "youd", "youll", "youre",
    "youve", 'your', 'yours', 'yourself', 'yourselves'
])

# Patterns used by clean_text, compiled once and applied in this order.
_URL_RE = re.compile(r'((www\.[^\s]+)|(https?://[^\s]+))')
_MENTION_RE = re.compile(r'@[\S]+')
_HASHTAG_RE = re.compile(r'#(\S+)')
_DIGITS_RE = re.compile(r'\d+')
_WHITESPACE_RE = re.compile(r'\s+')

# NLTK helpers are created on first use and then shared by every call.
_NLTK_TOOLS = {}


def _get_nltk_tools():
    """Return the shared (tokenizer, lemmatizer) pair, creating it on first use."""
    if not _NLTK_TOOLS:
        _NLTK_TOOLS['tokenizer'] = RegexpTokenizer(r'\w+|[^\w\s]')
        _NLTK_TOOLS['lemmatizer'] = WordNetLemmatizer()
    return _NLTK_TOOLS['tokenizer'], _NLTK_TOOLS['lemmatizer']


def _get_wordnet_pos(treebank_tag):
    """Map a Treebank POS tag to a WordNet POS constant (noun when unmatched)."""
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    if treebank_tag.startswith('V'):
        return wordnet.VERB
    if treebank_tag.startswith('R'):
        return wordnet.ADV
    # Covers tags starting with 'N' as well as every other tag.
    return wordnet.NOUN


def clean_text(text):
    """Normalise a tweet and return its lemmatised tokens joined by spaces.

    Steps, in order: lowercase; replace URLs with a space; replace @mentions
    with the placeholder 'USER'; strip the '#' from hashtags while keeping
    their text; delete digits; collapse whitespace; drop whitespace-separated
    words found in the stopword set; tokenise into word and punctuation
    tokens; POS-tag the tokens; lemmatise each token with its mapped WordNet
    POS.

    Parameters
    ----------
    text : str
        Raw tweet text. ``None`` and NaN are treated as an empty tweet; any
        other non-string value is converted with ``str()``.

    Returns
    -------
    str
        The cleaned text, or ``""`` when nothing is left after filtering.

    Notes
    -----
    Stopwords are matched against whitespace-separated words *before*
    tokenisation, so a stopword with punctuation attached (e.g. ``"the,"``)
    is not removed.
    POS tagging and lemmatisation rely on NLTK data (tagger model and WordNet
    corpus) that must already be available, e.g. via ``nltk.download``.
    """
    if not isinstance(text, str):
        # `text != text` is only true for NaN, which is how pandas marks missing values.
        if text is None or (isinstance(text, float) and text != text):
            return ""
        text = str(text)

    # Convert to lowercase
    text = text.lower()
    # Remove URLs
    text = _URL_RE.sub(' ', text)
    # Replace @mentions with 'USER'
    text = _MENTION_RE.sub('USER', text)
    # Remove hashtags but keep the text
    text = _HASHTAG_RE.sub(r'\1', text)
    # Remove digits
    text = _DIGITS_RE.sub('', text)
    # Collapse whitespace and strip both ends
    text = _WHITESPACE_RE.sub(' ', text).strip()
    # Remove stopwords
    text = " ".join(word for word in text.split() if word not in _STOPWORDS)

    # Nothing left to tag or lemmatise: skip the NLTK calls entirely.
    if not text:
        return ""

    word_tokenizer, lemmatizer = _get_nltk_tools()
    tokens = word_tokenizer.tokenize(text)

    # POS tagging
    pos_tags = nltk.pos_tag(tokens)

    # Lemmatize each token with the appropriate POS tag
    lemmatized_tokens = [
        lemmatizer.lemmatize(token, _get_wordnet_pos(tag))
        for token, tag in pos_tags
    ]

    return " ".join(lemmatized_tokens)


In [17]:
# Preprocess the text data.
# The untouched tweets are kept in 'text_raw' and cleaning always starts from
# that column, so re-running this cell does not clean already-cleaned text
# (clean_text is not idempotent: its 'USER' placeholder would be lowercased on
# a second pass).
# NOTE(review): this cleans every row, yet only a 1,000-row sample is used
# later in this notebook; sampling before cleaning would be far cheaper.
if 'text_raw' not in data.columns:
    data['text_raw'] = data['text']
data['text'] = data['text_raw'].apply(clean_text)

In [18]:
# Draw a balanced evaluation set: 500 tweets labelled 4 and 500 labelled 0,
# each drawn with a fixed seed so the selection is repeatable.
positive_samples = data.loc[data['target'] == 4].sample(n=500, random_state=42)
negative_samples = data.loc[data['target'] == 0].sample(n=500, random_state=42)
data_subset = pd.concat([positive_samples, negative_samples], axis=0)

# Recode the positive label from 4 to 1 so the targets are 0/1
data_subset['target'] = data_subset['target'].replace({4: 1})

In [19]:
# Name of the pretrained checkpoint to load
task = 'sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

# Run on the GPU when one is available, otherwise on the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Load the tokenizer and the classification model, then place the model on the chosen device
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
model = model.to(device)

# Binary sentiment prediction for a single piece of text
def predict_sentiment(text):
    """Classify `text` as positive (1) or negative (0).

    The text is tokenised with the module-level `tokenizer`, run through the
    module-level `model` without gradient tracking, and the first row of the
    first model output is converted to probabilities with softmax. Only the
    scores at index 0 (treated as negative) and index 2 (treated as positive)
    are compared; the middle (neutral) score is ignored. A tie yields 0.
    """
    inputs = tokenizer(text, return_tensors='pt').to(device)
    with torch.no_grad():
        logits = model(**inputs)[0][0]
    probabilities = softmax(logits.cpu().numpy())
    # Ignore neutral class
    negative, positive = probabilities[0], probabilities[2]
    if positive > negative:
        return 1
    return 0

# Predict a 0/1 sentiment label for every tweet in the balanced subset
data_subset['sentiment'] = data_subset['text'].map(predict_sentiment)



In [20]:
# Compare the predicted labels with the ground-truth labels
y_true, y_pred = data_subset['target'], data_subset['sentiment']
report = classification_report(
    y_true,
    y_pred,
    target_names=["negative", "positive"],
)

print(report)


              precision    recall  f1-score   support

    negative       0.75      0.71      0.73       500
    positive       0.72      0.76      0.74       500

    accuracy                           0.74      1000
   macro avg       0.74      0.74      0.74      1000
weighted avg       0.74      0.74      0.74      1000



In [23]:
def test_single_tweet(tweet):
    """Clean one raw tweet, classify it, and return "positive" or "negative".

    The tweet is passed through `clean_text` and then `predict_sentiment`;
    a prediction of 1 maps to "positive" and anything else to "negative".
    """
    label = predict_sentiment(clean_text(tweet))
    if label == 1:
        return "positive"
    return "negative"

In [26]:
# Run one hand-written tweet through the cleaning and prediction pipeline
test_tweet = "I #hatedata science brain #dsbrain"
predicted_sentiment = test_single_tweet(test_tweet)
print(
    f"The sentiment of the tweet '{test_tweet}' is {predicted_sentiment}."
)

The sentiment of the tweet 'I #hatedata science brain #dsbrain' is negative.
