In [1]:
import numpy as np
import pandas as pd
import torch
from scipy.special import softmax
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutomaticSpeechRecognitionPipeline

# Read the raw tweet export; later cells expect a 'text' column in it.
df = pd.read_csv(filepath_or_buffer='pokemon_letsgo.csv')

In [2]:
# Row count of the frame as loaded.
df.shape[0]

66949

In [3]:
# Remove tweets that contain either of these two YouTube phrases.
# - regex=False: the phrases are plain literals, not patterns.
# - na=True: a missing 'text' counts as a match, so those rows are dropped too,
#   exactly as the previous `... == False` comparison did (NaN == False is False).
text_column = df['text']
is_youtube_share = (
    text_column.str.contains('I liked a @YouTube video', regex=False, na=True)
    | text_column.str.contains('I added a video to a @YouTube', regex=False, na=True)
)

# .copy() makes the result an independent frame, so the column and cell
# assignments done later do not raise SettingWithCopyWarning.
df = df[~is_youtube_share].copy()

In [4]:
# Row count after the YouTube filter.
df.shape[0]

65819

In [6]:
# Preprocessing
def _mask_token(token):
    """Return '@user' for a mention, 'http' for a link, else the token unchanged.

    A bare '@' (length 1) is not treated as a mention.
    """
    if token.startswith('@') and len(token) > 1:
        return '@user'
    if token.startswith('http'):
        return 'http'
    return token


# Tokens are split on a single space only, so a mention that directly follows
# a newline is part of a larger token and is left as-is.
preprocessed_text = [
    " ".join(_mask_token(token) for token in raw_tweet.split(' '))
    for raw_tweet in df['text']
]

In [7]:
# Attach the cleaned text as a new column. `preprocessed_text` was built by
# iterating df['text'] in order, so it lines up with the rows positionally.
df['preprocessed tweets'] = preprocessed_text
# Disabled: would drop the first column of df.
# df.drop(columns=df.columns[0], axis=1, inplace=True)

In [8]:
# Renumber rows 0..n-1 after the filtering so that row labels are contiguous.
df = df.reset_index(drop=True)

# Character length of every cleaned tweet, longest first.
tweet_lengths = df['preprocessed tweets'].str.len()
long_tweets = tweet_lengths.sort_values(ascending=False)

long_tweets

40424    375
57568    346
56096    328
45658    321
40890    317
        ... 
50904     11
54459     11
28150     10
43706     10
41709     10
Name: preprocessed tweets, Length: 65819, dtype: int64

In [11]:
# The 20 longest cleaned tweets (row label -> character count).
long_tweets.iloc[:20]

40424    375
57568    346
56096    328
45658    321
40890    317
35178    314
49857    308
33029    306
49486    302
55398    302
62856    301
51055    300
42818    300
11640    300
45224    298
40414    298
40696    298
63303    297
9074     297
53186    297
Name: preprocessed tweets, dtype: int64

In [25]:
# Hand-written replacement text for the seven longest cleaned tweets, keyed by
# row label. The labels are only meaningful for the frame as indexed after the
# reset_index call; if the input data or filtering changes they will point at
# different tweets.
# NOTE(review): presumably these rewrites shorten/clean the tweets before
# scoring - confirm the intent.
manual_rewrites = {
    40424: "Ohhh I'll play! 4 upcoming things I'm excited for. Tag 4 peeps.  ðŸŒ¼ðŸŒºðŸŒ·\n\n1. Assassin's Creed Odyssey (pc)\n2. Pokemon Lets' Go (switch)\n3. Stellaris Le Guin update (pc)\n4. Age of Wonders Planetfall (pc)\n\n@CivCat @user @user @user",
    57568: "lets go focuses heavily on pokemon go aspects, its been theorized that this is nintendos way of trying to transition pokemon go players to this hybrid game to hopefully convince them to play the new gen 8 2019 core game. like a smooth transition. Go -&gt; Go/Core hybrid -&gt; Core game",
    56096: "Core series means it takes place within the main Timeline/Storyline. Main series includes core games and a few others that link in. Pokemon Lets go is hybrid of Core/SpinOff. it implements the design of a core game with the mechanics of a spin-off, its a hybrid",
    45658: "Venusaur's ride setting is LAND. That's what I said. \n\n~ Which meant Ride or Following Behind \n~ Game Freak choose Following Behind\n\nCharizard is AIR \nBlastoise is SEA \n\nI even included some Emojis in the leak.\nYou have to decide which one to have first. \n#PokemonLetsGO #Venusaur http",
    40890: "Game Freak lost the picture with #PokemonLetsGo. They need to realize who their fans are and prove they are not so out of touch with us all. This game only shows they dont know the fans anymore. Every time news for LetsGo drops, more I see talking out against it.",
    35178: "Lets go is confirmed not a core game but instead a spin off the next core game is next year and lets go comes out november 16 or 17 i forget but not october also the point of lets go is to bring in fans of pkmn go and ease them into the main games and core mechanics",
    49857: "But Nintendo's stock has tumbled in price over the past couple of months as the sales of the switch has slowed dramaticly because the only big games that are coming soon are Pokemon lets go (mixed reactions) and smash which didn't help the Wii u's sales half as much as predicted http",
}

for row_label, replacement in manual_rewrites.items():
    df.at[row_label, 'preprocessed tweets'] = replacement

In [27]:
# Load model and tokenizer

# Checkpoint name passed to both loaders so they stay in sync.
roberta = "cardiffnlp/twitter-roberta-base-sentiment-latest"
#roberta = "cardiffnlp/twitter-roberta-base-2021-124m"

tokenizer = AutoTokenizer.from_pretrained(roberta)
model = AutoModelForSequenceClassification.from_pretrained(roberta)

# Human-readable class names, listed in output-index order (0, 1, 2).
# NOTE(review): assumes this order matches the checkpoint's label mapping - confirm.
labels = ['Negative', 'Neutral', 'Positive']

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [28]:
# Score every cleaned tweet with the classifier, one tweet per forward pass.
# Result: `sentiment_scores`, a list with one softmax-normalised score array
# per row of df, in row order.
sentiment_scores = []

# Inference only: no_grad() skips autograd bookkeeping on every forward pass,
# which saves time and memory over the whole dataset without changing the scores.
with torch.no_grad():
    for processed_tweet in df['preprocessed tweets']:
        # Truncate over-long inputs instead of letting the model raise on them.
        # 512 is the RoBERTa-base token limit; given explicitly so truncation
        # does not rely on the tokenizer config carrying a max length.
        encoded_tweet = tokenizer(
            processed_tweet,
            return_tensors='pt',
            truncation=True,
            max_length=512,
        )

        output = model(**encoded_tweet)

        # output[0] holds the logits; [0] selects the single item in the batch.
        scores = output[0][0].detach().numpy()
        scores = softmax(scores)

        sentiment_scores.append(scores)

In [29]:
# Store each tweet's score array in a new column; the list was built by
# iterating the rows in order, so it aligns positionally.
df['sentiment scores'] = sentiment_scores

In [30]:
# Map each score array to the name of its highest-scoring class.
labels = ['Negative', 'Neutral', 'Positive']
index_no = list(range(len(labels)))

# position in the score array -> class name
label_dict = {position: name for position, name in zip(index_no, labels)}

overall_sentiment = [
    label_dict.get(score_row.argmax())
    for score_row in df['sentiment scores']
]

In [31]:
# Add the predicted class name for each tweet as a new column.
df['sentiment'] = overall_sentiment

In [32]:
# Write the scored tweets to disk. The row index is written as well because
# `index` is left at its default.
df.to_csv(path_or_buf='pokemon_letsgo_tweets_datatset.csv')