In [1]:
import numpy as np
import pandas as pd
import torch
from scipy.special import softmax
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutomaticSpeechRecognitionPipeline

# Load the tweet export; later cells read its 'text' column.
df = pd.read_csv(filepath_or_buffer='pokemon_sunmoon_new2.csv')

In [2]:
# Number of rows loaded from the CSV.
df.shape[0]

429760

In [3]:
# Remove tweets containing either YouTube share phrase (presumably
# auto-generated posts rather than user-written opinions).
# na=True counts a missing 'text' value as a match, so such rows are dropped
# too; the preprocessing cell calls tweet.split() and needs real strings.
# regex=False matches each phrase literally instead of as a regular expression.
liked_video = df['text'].str.contains('I liked a @YouTube video', regex=False, na=True)
added_video = df['text'].str.contains('I added a video to a @YouTube', regex=False, na=True)
df = df[~(liked_video | added_video)]

In [4]:
# Number of rows remaining after the YouTube-share filter.
df.shape[0]

335378

In [5]:
# Preprocessing
def _mask_token(token):
    """Return the placeholder for a mention or link, else the token unchanged.

    A token that starts with '@' and is longer than one character becomes
    '@user'; a token that starts with 'http' becomes 'http'.
    """
    if token.startswith('@') and len(token) > 1:
        return '@user'
    if token.startswith('http'):
        return 'http'
    return token


# Split on single spaces (not arbitrary whitespace) and re-join with single
# spaces, so the spacing of each tweet is kept exactly as it was.
preprocessed_text = [
    " ".join(_mask_token(token) for token in tweet.split(' '))
    for tweet in df['text']
]

In [6]:
# Store the masked tweets next to the originals. The list is assigned by
# position, so it must have exactly one entry per row of df.
df['preprocessed tweets'] = preprocessed_text

In [7]:
# Renumber rows 0..n-1 after filtering; drop=True discards the old index
# instead of keeping it as a column.
df = df.reset_index(drop=True)

In [8]:
# Load model and tokenizer

# Hugging Face checkpoint shared by the tokenizer and the classifier.
roberta = "cardiffnlp/twitter-roberta-base-sentiment-latest"
#roberta = "cardiffnlp/twitter-roberta-base-2021-124m"

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=roberta)
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path=roberta
)

# Class names, listed in the order later used to index the model's scores
# (presumably matches the checkpoint's classes — TODO confirm against
# model.config.id2label).
labels = ['Negative', 'Neutral', 'Positive']

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
sentiment_scores = []

# Upper bound on tokens per tweet. 512 is the usual RoBERTa-base input
# limit — TODO confirm against model.config.max_position_embeddings.
max_tokens = 512

# Pure inference: no_grad() stops autograd from recording a graph for each
# forward pass, which saves time and memory over the whole dataset.
with torch.no_grad():
    for processed_tweet in df['preprocessed tweets']:
        # Truncate explicitly so an unusually long tweet cannot overflow the
        # model's input limit and abort the whole run.
        encoded_tweet = tokenizer(
            processed_tweet,
            return_tensors='pt',
            truncation=True,
            max_length=max_tokens,
        )

        output = model(**encoded_tweet)

        # output[0] is indexed again with [0] to take the single tweet's
        # score vector out of the batch of one.
        scores = output[0][0].detach().numpy()
        # Turn the raw scores into probabilities that sum to 1.
        scores = softmax(scores)

        sentiment_scores.append(scores)

In [10]:
# Attach one score array per tweet; the list is assigned by position, so it
# must have exactly one entry per row of df.
df['sentiment scores'] = sentiment_scores

In [11]:
# Class names, indexed by position within each score array (order presumably
# matches the model's classes — TODO confirm against model.config.id2label).
labels = ['Negative', 'Neutral', 'Positive']
index_no = list(range(3))

# Lookup table from score position to class name.
label_dict = {position: name for position, name in zip(index_no, labels)}

# Each tweet is labelled with the class whose score is highest.
overall_sentiment = [
    label_dict.get(scores.argmax())
    for scores in df['sentiment scores']
]

In [12]:
# Attach the winning label for each tweet; assigned by position, so the list
# must have exactly one entry per row of df.
df['sentiment'] = overall_sentiment

In [13]:
# Write the labelled tweets to disk. index=True (the default) also writes the
# row index as the first, unnamed column.
# NOTE(review): 'sentiment scores' holds NumPy arrays, which CSV stores as
# their string form — confirm that is acceptable for downstream readers.
df.to_csv(path_or_buf='pokemon_sunmoon_tweets_datatset.csv', index=True)