The object here is to train a machine learning model on a database of Tweets and use that model to predict sentiment of Tweets from a user, a hashtag, or any random variety of Tweets. 

**@author Ryan Herren**

**@author Tanner Dunn**

The following link is a good informational guide on how to roughly implement a model like we are aiming for. https://www.analyticsvidhya.com/blog/2021/06/twitter-sentiment-analysis-a-nlp-use-case-for-beginners/

# Initialize environment, install packages

In [2]:
import tweepy
import pandas as pd
import numpy as np
import operator 
import json
from collections import Counter
import os
import yaml
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import string
import math
import time
from tqdm import tqdm

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ryanherren/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
os.environ['python-bot-config'] = "/Users/ryanherren/python-bot-config.yaml"
# os.environ['python-bot-config'] = "/Users/dunnt/python-bot-config.yaml"
# os.environ['python-bot-config'] = "/Users/mariolgw/python-bot-config.yaml"

In [4]:
try:
    v_env = os.environ['python-bot-config']
except:
    print("Config file env variable is not set.")
    print("Set python-bot-config file")
    sys.exit(1)

with open(v_env, "r") as yamlConfig:
    cfg = yaml.safe_load(yamlConfig)

# Import Opinion Lexicon

In [5]:
# Reading negative words list:
with open('negative-words.txt', 'r', encoding = "ISO-8859-1") as t:
    neg = t.readlines()
    neg_words = []
    for i in neg:
        neg_words.append(i.strip())
    del neg_words[0 : 31]
    

# Reading positive words list:
with open('positive-words.txt', 'r', encoding = "ISO-8859-1") as t:
    pos = t.readlines()
    pos_words = []
    for i in pos:
        pos_words.append(i.strip())
    del pos_words[0 : 30]
    
positive_emojis = ['😂','🔥','😍','🤘','🤩','👍','💯','😎','✅','👏','😀','🐐',
                   '❤️','♥️','😘','😊','😄','😃','😆','😋','🤪','😜','😛','🤑']
negative_emojis = ['🥴','🤢','🤮','😧','😑','😰','🤬','😡','😭','😢','😩','🙁',
                   '☹️','😣','😖','😫','😟','😞','😔','😒','👿','🤕','🤒','😷']

# Function to count sentiment words in Tweets

In [6]:
# Function to count negative words in tweet:
def count_negative(tweet):
    neg_word_count = 0
    for word in tweet:
        if word.lower() in neg_words:
            neg_word_count = neg_word_count + 1
            # mg['neg_word_count'] = mg['text'].apply(lambda x: neg_word_count(x), axis = 1, result_type = 'expand')
    return neg_word_count
   
# Function to count positive words in tweet:
def count_positive(tweet):
    pos_word_count = 0
    for word in tweet:
        #print(word)
        if word.lower() in pos_words:
            pos_word_count = pos_word_count + 1
            # mg['neg_word_count'] = mg['text'].apply(lambda x: neg_word_count(x), axis = 1, result_type = 'expand')
    return pos_word_count
   
def count_mentions(tweet):
    num_mentions = 0
    for word in tweet:
        if word.startswith('@'):
            num_mentions = num_mentions + 1
    return num_mentions

def count_hashtags(tweet):
    num_hashtags = 0
    for word in tweet:
        if word.startswith('#'):
            num_hashtags = num_hashtags + 1
    return num_hashtags

def count_positive_emojis(tweet):
    pos_emoji_count = 0
    for word in tweet:
        #print(word)
        if word in positive_emojis:
            pos_emoji_count = pos_emoji_count + 1
            # mg['neg_word_count'] = mg['text'].apply(lambda x: neg_word_count(x), axis = 1, result_type = 'expand')
    return pos_emoji_count

def count_negative_emojis(tweet):
    neg_emoji_count = 0
    for word in tweet:
        #print(word)
        if word in negative_emojis:
            neg_emoji_count = neg_emoji_count + 1
            # mg['neg_word_count'] = mg['text'].apply(lambda x: neg_word_count(x), axis = 1, result_type = 'expand')
    return neg_emoji_count

# Tokenizing functions to break tweets into words, emojis, mentions, and hashtags

In [7]:
# This cell is the basis of how tokenizing the Tweet will work. Using regex statments, it accounts for emojis, 
# hashtags, mentions, and more.
import re
 
emoticons_str = r"""
    (?:
        [:=;] # Eyes
        [oO\-]? # Nose (optional)
        [D\)\]\(\]/\\OpP] # Mouth
    )"""
 
regex_str = [
    emoticons_str,
    r'<[^>]+>', # HTML tags
    r'(?:@[\w_]+)', # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&amp;+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs
 
    r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
    r'(?:[\w_]+)', # other words
    r'(?:\S)' # anything else
]
    
tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE)
 
def tokenize(s):
    return tokens_re.findall(s)
 
def preprocess(s, lowercase=False):
    tokens = tokenize(s)
    if lowercase:
        tokens = [token if emoticon_re.search(token) else token.lower() for token in tokens]
    return tokens

# Configure Twitter API Connection

In [8]:
consumer_key = cfg.get("TwitterAPI").get("consumer_key")
consumer_secret = cfg.get("TwitterAPI").get("consumer_secret")
access_token = cfg.get("TwitterAPI").get("access_token")
access_token_secret = cfg.get("TwitterAPI").get("access_token_secret")

In [9]:
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
 
api = tweepy.API(auth, wait_on_rate_limit=True)

# Get Tweets by User

In [10]:
# Currently limited to num_tweets Tweets
column_names = ['tweet_id', 'text','created_at']
def get_tweets_by_user(user, tweets):
    tweet_json = pd.DataFrame(columns=column_names)
    for i in range(math.ceil(tweets/20)):
        tweets = api.user_timeline(user, page=i)
#         print(str(i) + "WABBADABBADO\n")
#         print(tweets)
        my_list_of_dicts = []
        for each_json_tweet in tweets:
            my_list_of_dicts.append(each_json_tweet._json)
        filename = user + '.txt'
        if os.path.exists(filename):
            os.remove(filename)
        with open(filename, 'w') as file:
            file.write(json.dumps(my_list_of_dicts, indent=4))
        my_demo_list = []
        with open(filename, encoding='utf-8') as json_file:  
            all_data = json.load(json_file)
            for each_dictionary in all_data:
                tweet_id = each_dictionary['id']
                text = each_dictionary['text']
                favorite_count = each_dictionary['favorite_count']
                retweet_count = each_dictionary['retweet_count']
                created_at = each_dictionary['created_at']
                my_demo_list.append({'tweet_id': str(tweet_id),
                                     'text': str(text),
                                     'created_at': created_at,
                                    })
                #print(my_demo_list)
                temp_json = pd.DataFrame(my_demo_list, columns = 
                                          ['tweet_id', 'text', 
                                           'created_at'])
#             print("\n \n " + str(temp_json.shape) + str(type(temp_json)) + "\n\n")
        tweet_json = tweet_json.append(temp_json, ignore_index=True)
#         print("\n \n " + str(tweet_json.shape) + str(type(tweet_json)) + "\n\n")
    tweet_json = tweet_json.drop_duplicates()    
    return tweet_json

# Get Tweets by Hashtag

In [11]:
# TODO

# Loading in Training Data from Kaggle

In [15]:
# TODO
# https://www.kaggle.com/kazanova/sentiment140
# Download this dataset (~230mb) to your project directory
# Once this is done, begin training models

# 0 is negative
# 2 is neutral
# 4 is positive

# This dataset does not have any emojis, so we will have to find an alternative way if 
# we want to evaluate the effects that emojis have on sentiment

In [16]:
training_cnames = ['sentiment', 'tweet_id', 'created_at', 'mention', 'author', 'text']

training_data = pd.read_csv('training.1600000.processed.noemoticon.csv', names = training_cnames, encoding='latin-1')

training_data = training_data[['sentiment', 'tweet_id', 'created_at', 'text']]

In [17]:
# You can use this to slim down the data set.
# Loading and manipulating the full dataset takes about 30 minutes
# Pulling in 15000 tweets takes ~18 seconds to manipulate and will give a sufficient
# training set to use
training_data = training_data.sample(15000)

In [18]:
start = time.time()
training_data['tokenized'] = training_data['text'].apply(lambda x: tokenize(x))
training_data['pos_words'] = training_data['tokenized'].apply(lambda x: count_positive(x))
training_data['neg_words'] = training_data['tokenized'].apply(lambda x: count_negative(x))
training_data['num_mentions'] = training_data['tokenized'].apply(lambda x: count_mentions(x))
training_data['num_hashtags'] = training_data['tokenized'].apply(lambda x: count_hashtags(x))
training_data['num_positive_emojis'] = training_data['tokenized'].apply(lambda x: count_positive_emojis(x))
training_data['num_negative_emojis'] = training_data['tokenized'].apply(lambda x: count_negative_emojis(x))
end = time.time()
print(end - start)

24.458809852600098


In [19]:
training_data

Unnamed: 0,sentiment,tweet_id,created_at,text,tokenized,pos_words,neg_words,num_mentions,num_hashtags,num_positive_emojis,num_negative_emojis
968662,4,1827880408,Sun May 17 11:38:24 PDT 2009,I am super excited to read Romeo and Juliet S...,"[I, am, super, excited, to, read, Romeo, and, ...",3,0,0,0,0,0
1124608,4,1974693790,Sat May 30 13:39:18 PDT 2009,At the mall!! ï¿½LGV!ï¿½,"[At, the, mall, !, !, ï, ¿, ½LGV, !, ï, ¿, ½]",0,0,0,0,0,0
171119,0,1963124747,Fri May 29 12:14:28 PDT 2009,-was just told by Dan to NOT say anything anym...,"[-, was, just, told, by, Dan, to, NOT, say, an...",1,1,0,0,0,0
1462900,4,2064077865,Sun Jun 07 05:08:48 PDT 2009,OmyGawd I just sang Whitney's Grammy performan...,"[OmyGawd, I, just, sang, Whitney's, Grammy, pe...",1,0,0,0,0,0
1195988,4,1984716534,Sun May 31 15:39:51 PDT 2009,http://twitpic.com/6d7cj - prints for sale.......,"[http://twitpic.com/6d7cj, -, prints, for, sal...",1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
257911,0,1985072835,Sun May 31 16:22:30 PDT 2009,OMG ! I cannot more attend MTV movie awards ! ...,"[OMG, !, I, cannot, more, attend, MTV, movie, ...",1,0,0,0,0,0
1158900,4,1979303470,Sun May 31 02:28:20 PDT 2009,getting an oatmeal mask first lmao. then off ...,"[getting, an, oatmeal, mask, first, lmao, ., t...",0,0,0,0,0,0
945814,4,1822432515,Sat May 16 19:33:56 PDT 2009,I have the cutest dog,"[I, have, the, cutest, dog]",0,0,0,0,0,0
960523,4,1826418299,Sun May 17 08:27:54 PDT 2009,Mission to do before I die: See Marshall Mathe...,"[Mission, to, do, before, I, die, :, See, Mars...",1,2,0,0,0,0


# Load in Emoji Training Data from DS301 Bot Timeline

In [20]:
training_data_emojis = get_tweets_by_user('DS301Bot', 100)
training_data_emojis['tokenized'] = training_data_emojis['text'].apply(lambda x: tokenize(x))
training_data_emojis['pos_words'] = training_data_emojis['tokenized'].apply(lambda x: count_positive(x))
training_data_emojis['neg_words'] = training_data_emojis['tokenized'].apply(lambda x: count_negative(x))
training_data_emojis['num_mentions'] = training_data_emojis['tokenized'].apply(lambda x: count_mentions(x))
training_data_emojis['num_hashtags'] = training_data_emojis['tokenized'].apply(lambda x: count_hashtags(x))
training_data_emojis['num_positive_emojis'] = training_data_emojis['tokenized'].apply(lambda x: count_positive_emojis(x))
training_data_emojis['num_negative_emojis'] = training_data_emojis['tokenized'].apply(lambda x: count_negative_emojis(x))

In [21]:
training_data_emojis

Unnamed: 0,tweet_id,text,created_at,tokenized,pos_words,neg_words,num_mentions,num_hashtags,num_positive_emojis,num_negative_emojis
0,1467703601913077760,A+ on my Data Science final? Perfect. ✅,Mon Dec 06 03:53:01 +0000 2021,"[A, +, on, my, Data, Science, final, ?, Perfec...",1,0,0,0,1,0
1,1467703471000412168,I love rock n' roll 🤘,Mon Dec 06 03:52:30 +0000 2021,"[I, love, rock, n, ', roll, 🤘]",1,0,0,0,1,0
2,1467703361424216072,I am sick. COVID sucks. 😷,Mon Dec 06 03:52:04 +0000 2021,"[I, am, sick, ., COVID, sucks, ., 😷]",0,2,0,0,0,1
3,1467702988101898242,I am so pissed off that there is not any snowf...,Mon Dec 06 03:50:35 +0000 2021,"[I, am, so, pissed, off, that, there, is, not,...",0,1,0,0,0,0
4,1467702798032777218,It is cold outside and I am really mad about i...,Mon Dec 06 03:49:50 +0000 2021,"[It, is, cold, outside, and, I, am, really, ma...",1,2,0,0,0,1
5,1467702582416220163,"Breathe deep darling, you're gonna be alright....",Mon Dec 06 03:48:58 +0000 2021,"[Breathe, deep, darling, ,, you're, gonna, be,...",1,0,0,0,0,0
6,1467687257171795970,If I tell you I don't like something and you c...,Mon Dec 06 02:48:05 +0000 2021,"[If, I, tell, you, I, don't, like, something, ...",1,1,0,0,0,1
7,1467686277030064133,What a bad day. 😞,Mon Dec 06 02:44:11 +0000 2021,"[What, a, bad, day, ., 😞]",0,1,0,0,0,1
8,1467684957514567684,What a good day. 😄,Mon Dec 06 02:38:56 +0000 2021,"[What, a, good, day, ., 😄]",1,0,0,0,1,0
9,1467684474855075844,Phil Hellmuth is one of the most legendary pok...,Mon Dec 06 02:37:01 +0000 2021,"[Phil, Hellmuth, is, one, of, the, most, legen...",1,0,0,0,0,0


In [22]:
# Custom sentiment assignments for tweets with emojis
emojis_sentiment = [4,4,0,0,0,4,0,0,4,4,4,4,0,4,4,4,0,4,4,4,0,0,4,0,4,0,4,0,0,4,0,0,0,4,4,4,0,4,4]

In [23]:
training_data_emojis.insert(0, 'Sentiment', emojis_sentiment)

In [24]:
training_data_emojis

Unnamed: 0,Sentiment,tweet_id,text,created_at,tokenized,pos_words,neg_words,num_mentions,num_hashtags,num_positive_emojis,num_negative_emojis
0,4,1467703601913077760,A+ on my Data Science final? Perfect. ✅,Mon Dec 06 03:53:01 +0000 2021,"[A, +, on, my, Data, Science, final, ?, Perfec...",1,0,0,0,1,0
1,4,1467703471000412168,I love rock n' roll 🤘,Mon Dec 06 03:52:30 +0000 2021,"[I, love, rock, n, ', roll, 🤘]",1,0,0,0,1,0
2,0,1467703361424216072,I am sick. COVID sucks. 😷,Mon Dec 06 03:52:04 +0000 2021,"[I, am, sick, ., COVID, sucks, ., 😷]",0,2,0,0,0,1
3,0,1467702988101898242,I am so pissed off that there is not any snowf...,Mon Dec 06 03:50:35 +0000 2021,"[I, am, so, pissed, off, that, there, is, not,...",0,1,0,0,0,0
4,0,1467702798032777218,It is cold outside and I am really mad about i...,Mon Dec 06 03:49:50 +0000 2021,"[It, is, cold, outside, and, I, am, really, ma...",1,2,0,0,0,1
5,4,1467702582416220163,"Breathe deep darling, you're gonna be alright....",Mon Dec 06 03:48:58 +0000 2021,"[Breathe, deep, darling, ,, you're, gonna, be,...",1,0,0,0,0,0
6,0,1467687257171795970,If I tell you I don't like something and you c...,Mon Dec 06 02:48:05 +0000 2021,"[If, I, tell, you, I, don't, like, something, ...",1,1,0,0,0,1
7,0,1467686277030064133,What a bad day. 😞,Mon Dec 06 02:44:11 +0000 2021,"[What, a, bad, day, ., 😞]",0,1,0,0,0,1
8,4,1467684957514567684,What a good day. 😄,Mon Dec 06 02:38:56 +0000 2021,"[What, a, good, day, ., 😄]",1,0,0,0,1,0
9,4,1467684474855075844,Phil Hellmuth is one of the most legendary pok...,Mon Dec 06 02:37:01 +0000 2021,"[Phil, Hellmuth, is, one, of, the, most, legen...",1,0,0,0,0,0


# Train ML Models on Training Data

In [95]:
# TODO

# Experimentation

In [25]:
user_tweets = get_tweets_by_user('CycloneLarry69', 109)

In [26]:
user_tweets.shape

(100, 3)

In [27]:
user_tweets

Unnamed: 0,tweet_id,text,created_at
0,1467702232380493826,Iowa State football is favored in a post seaso...,Mon Dec 06 03:47:35 +0000 2021
1,1467700179847589899,Coincidence that he jumped ship only a few hou...,Mon Dec 06 03:39:26 +0000 2021
2,1467679345451974664,Lol imagine telling an Iowa State fan in 2015 ...,Mon Dec 06 02:16:38 +0000 2021
3,1467678904357904386,@travisclones You’d be crazy not to,Mon Dec 06 02:14:53 +0000 2021
4,1467648646250483713,@itshanklol Hateful 8 stays winning baby,Mon Dec 06 00:14:39 +0000 2021
...,...,...,...
115,1467297371985952776,Football was meant to be played in 30 degree w...,Sun Dec 05 00:58:49 +0000 2021
116,1467289345791086596,We are all Bearcats now,Sun Dec 05 00:26:55 +0000 2021
117,1467283257205178368,1. Alabama\n2. Michigan\n3. Cincy\n4. Notre Da...,Sun Dec 05 00:02:43 +0000 2021
118,1467275698855456770,Don’t put Georgia in,Sat Dec 04 23:32:41 +0000 2021


In [28]:
user_tweets['tokenized'] = user_tweets['text'].apply(lambda x: tokenize(x))
user_tweets['pos_words'] = user_tweets['tokenized'].apply(lambda x: count_positive(x))
user_tweets['neg_words'] = user_tweets['tokenized'].apply(lambda x: count_negative(x))
user_tweets['num_mentions'] = user_tweets['tokenized'].apply(lambda x: count_mentions(x))
user_tweets['num_hashtags'] = user_tweets['tokenized'].apply(lambda x: count_hashtags(x))
user_tweets['num_positive_emojis'] = user_tweets['tokenized'].apply(lambda x: count_positive_emojis(x))
user_tweets['num_negative_emojis'] = user_tweets['tokenized'].apply(lambda x: count_negative_emojis(x))

In [29]:
user_tweets

Unnamed: 0,tweet_id,text,created_at,tokenized,pos_words,neg_words,num_mentions,num_hashtags,num_positive_emojis,num_negative_emojis
0,1467702232380493826,Iowa State football is favored in a post seaso...,Mon Dec 06 03:47:35 +0000 2021,"[Iowa, State, football, is, favored, in, a, po...",1,0,0,0,0,0
1,1467700179847589899,Coincidence that he jumped ship only a few hou...,Mon Dec 06 03:39:26 +0000 2021,"[Coincidence, that, he, jumped, ship, only, a,...",0,0,0,0,0,0
2,1467679345451974664,Lol imagine telling an Iowa State fan in 2015 ...,Mon Dec 06 02:16:38 +0000 2021,"[Lol, imagine, telling, an, Iowa, State, fan, ...",0,0,0,0,0,0
3,1467678904357904386,@travisclones You’d be crazy not to,Mon Dec 06 02:14:53 +0000 2021,"[@travisclones, You, ’, d, be, crazy, not, to]",0,1,1,0,0,0
4,1467648646250483713,@itshanklol Hateful 8 stays winning baby,Mon Dec 06 00:14:39 +0000 2021,"[@itshanklol, Hateful, 8, stays, winning, baby]",1,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...
115,1467297371985952776,Football was meant to be played in 30 degree w...,Sun Dec 05 00:58:49 +0000 2021,"[Football, was, meant, to, be, played, in, 30,...",0,0,0,0,0,0
116,1467289345791086596,We are all Bearcats now,Sun Dec 05 00:26:55 +0000 2021,"[We, are, all, Bearcats, now]",0,0,0,0,0,0
117,1467283257205178368,1. Alabama\n2. Michigan\n3. Cincy\n4. Notre Da...,Sun Dec 05 00:02:43 +0000 2021,"[1, ., Alabama, 2, ., Michigan, 3, ., Cincy, 4...",0,0,0,0,0,0
118,1467275698855456770,Don’t put Georgia in,Sat Dec 04 23:32:41 +0000 2021,"[Don, ’, t, put, Georgia, in]",0,0,0,0,0,0


In [62]:
# punctuation = list(string.punctuation)
# stop = stopwords.words('english') + punctuation + ['rt', 'via', '’', 'RT', '️', '…']

In [63]:
# count_all = Counter()
# for tweet in user_tweets['text']:
#     terms_all = [term for term in preprocess(tweet)]
#     terms_stop = [term for term in preprocess(tweet) if term not in stop]
#     count_all.update(terms_stop)
#     print(tweet + "\n")
# print(count_all.most_common(10))