In [None]:
import nltk, re, random

# get started with data and setting
# Labelled corpus as (text, label) pairs: every positive tweet first, then every negative one.
tweets = ([(t, "pos") for t in nltk.corpus.twitter_samples.strings("positive_tweets.json")] +
          [(t, "neg") for t in nltk.corpus.twitter_samples.strings("negative_tweets.json")])
# Unlabelled tweets; pipes and newlines are stripped so each text fits in one markdown table cell.
# The pattern is a raw string: in a normal literal "\|" is an invalid escape sequence.
new_tweets = [re.sub(r'\||\n', '', t) for t in nltk.corpus.twitter_samples.strings('tweets.20150430-223406.json')]
stopwords = nltk.corpus.stopwords.words('english')
stemmer = nltk.stem.SnowballStemmer('english')
tokenizer = nltk.tokenize.casual.TweetTokenizer()

# print stats and head of data
n_tweets = len(tweets)
n_pos_tweets = sum(1 for _, label in tweets if label == "pos")
n_neg_tweets = sum(1 for _, label in tweets if label == "neg")
print(f"length of tweets: {n_tweets}, positive tweets: {n_pos_tweets}, negative tweets: {n_neg_tweets}\n")
for d in tweets[:6]:
    print(d)

In [None]:
# cleaning all the tweets --> set of words model
def set_of_words(text):
    """Convert a raw tweet into a set-of-words feature dict.

    The text is stripped of http(s) links and @user references, tokenized with
    the module-level ``tokenizer``, filtered (one-character tokens, numeric
    tokens and stopwords are dropped) and the remaining tokens are stemmed
    with the module-level ``stemmer``.

    Args:
        text: the raw tweet string.

    Returns:
        A dict mapping each distinct stemmed token to 1, in order of first
        appearance.
    """
    # remove http links and user references
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*(),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
    text = re.sub(r"@[A-Za-z0-9_]+", "", text)
    # tokenize
    tokens = tokenizer.tokenize(text)
    # remove stopwords; the comparison is done on the lowercased token because
    # the tokenizer is created with default settings (casing preserved) and a
    # case-sensitive lookup lets capitalised stopwords ("The", "And") through
    kept = [t for t in tokens
            if len(t) > 1 and not t.isnumeric() and t.lower() not in stopwords]
    # set of words: stem the surviving tokens, duplicates collapse into one key
    return {stemmer.stem(t): 1 for t in kept}

# One (feature dict, label) pair per labelled tweet, in the same order as `tweets`.
data = [
    (set_of_words(tweet_text), sentiment)
    for tweet_text, sentiment in tweets
]
# Show the first few feature sets as a sanity check.
for d in data[:6]:
    print(d)

In [None]:
# split training and test (stratify pos/neg samples)
# Per-class feature sets and the matching raw texts; both are filtered from
# lists that share the same order, so index i refers to the same tweet in each.
pos_data = [sample for sample in data if sample[1] == 'pos']
neg_data = [sample for sample in data if sample[1] == 'neg']
pos_tweets = [text for text, label in tweets if label == 'pos']
neg_tweets = [text for text, label in tweets if label == 'neg']

# The first 80% of each class goes to training, the remainder to test.
pos_split = (len(pos_data) * 80) // 100
neg_split = (len(neg_data) * 80) // 100

# Feature sets ...
train_data = pos_data[:pos_split] + neg_data[:neg_split]
test_data = pos_data[pos_split:] + neg_data[neg_split:]
# ... and the raw texts, concatenated in the same order.
train_tweets = pos_tweets[:pos_split] + neg_tweets[:neg_split]
test_tweets = pos_tweets[pos_split:] + neg_tweets[neg_split:]

In [None]:
# classify with Naive Bayes (bernoulli)
# Train on the training feature sets only; the test split is used solely for evaluation.
classifier = nltk.NaiveBayesClassifier.train(train_data)

print("Train Accuracy is:", nltk.classify.accuracy(classifier, train_data))
print("Test Accuracy is:", nltk.classify.accuracy(classifier, test_data))
print("\n")
# show_most_informative_features() prints its table itself and returns None,
# so it must not be wrapped in print() (that only appends a stray "None").
classifier.show_most_informative_features(10)

In [None]:
from IPython.display import Markdown
from tabulate import tabulate

# get all false predictions
def _collect_false_predictions(split_name, samples, texts):
    """Return one table row per misclassified sample of a split.

    Args:
        split_name: label written into the 'set' column ('train' or 'test').
        samples: list of (feature dict, actual label) pairs.
        texts: raw tweet texts, parallel to ``samples``.

    Returns:
        List of (split_name, predicted, actual, cleaned tweet, tokens) tuples.
    """
    rows = []
    for (features, actual), text in zip(samples, texts):
        # classify once and reuse the result for both the check and the row
        predicted = classifier.classify(features)
        if predicted != actual:
            # pipes and newlines would break the markdown table, so drop them
            cleaned = re.sub(r'\||\n', '', text)
            rows.append((split_name, predicted, actual, cleaned, ', '.join(features.keys())))
    return rows


false_predictions = (_collect_false_predictions('train', train_data, train_tweets)
                     + _collect_false_predictions('test', test_data, test_tweets))
print(f'false predictions: {len(false_predictions)} out of {len(tweets)}')

headers = ['set', 'predicted', 'actual', 'tweet', 'tokens']
Markdown(tabulate(false_predictions, headers, tablefmt='github'))

In [None]:
from tabulate import tabulate
headers = ['predicted', 'tweet']
# Draw up to 10 random tweets without reordering the shared `new_tweets` list,
# so re-running this cell does not change state other cells may read.
sampled_tweets = random.sample(new_tweets, min(10, len(new_tweets)))
predictions = [classifier.classify(set_of_words(text)) for text in sampled_tweets]
# Pair each prediction with exactly the tweet it was computed from.
Markdown(tabulate(zip(predictions, sampled_tweets), headers, tablefmt='github'))

In [None]:
from transformers import pipeline

# Sentiment-analysis pipeline built on the
# 'cardiffnlp/twitter-roberta-base-sentiment' model.
trf_classifier = pipeline(
    task='sentiment-analysis',
    model='cardiffnlp/twitter-roberta-base-sentiment',
)

Labels returned by the model: `LABEL_0` -> negative; `LABEL_1` -> neutral; `LABEL_2` -> positive.

In [None]:
from tabulate import tabulate
headers = ['predicted', 'score', 'actual', 'tweet']
# Run the transformer on the first 10 tweets of each class and collect one
# table row per tweet: predicted label, score, actual class, cleaned text.
rows = []
for actual, texts in (('pos', pos_tweets[0:10]), ('neg', neg_tweets[0:10])):
    predictions = [trf_classifier(text)[0] for text in texts]
    # pipes and newlines would break the markdown table, so drop them
    # (raw string: "\|" is an invalid escape in a normal literal)
    rows += [[p['label'], p['score'], actual, re.sub(r'\||\n', '', text)]
             for p, text in zip(predictions, texts)]
Markdown(tabulate(rows, headers, tablefmt='github'))

In [None]:
# Run the transformer classifier on the first 512 tweets of each class
# (positive first, then negative); the results are scored in the next cell.
pos_predicitions, neg_predicitions = (
    trf_classifier(class_tweets[:512])
    for class_tweets in (pos_tweets, neg_tweets)
)

In [None]:
# Confusion counts with "positive" as the positive class.
# A neutral prediction (LABEL_1) is counted as correct for both classes:
# a positive tweet is only wrong when predicted LABEL_0, a negative tweet
# only when predicted LABEL_2.
tp = sum(1 for x in pos_predicitions if x['label'] != 'LABEL_0')
# positive tweet predicted negative -> false negative
fn = sum(1 for x in pos_predicitions if x['label'] == 'LABEL_0')
tn = sum(1 for x in neg_predicitions if x['label'] != 'LABEL_2')
# negative tweet predicted positive -> false positive
fp = sum(1 for x in neg_predicitions if x['label'] == 'LABEL_2')

print(f'accuracy: {(tp+tn)/(tp+tn+fp+fn)}')

---