The object here is to train a machine learning model on a database of Tweets and use that model to predict sentiment of Tweets from a user, a hashtag, or any random variety of Tweets. 

**@author Ryan Herren**

**@author Tanner Dunn**

The following link is a good informational guide on how to roughly implement a model like we are aiming for. https://www.analyticsvidhya.com/blog/2021/06/twitter-sentiment-analysis-a-nlp-use-case-for-beginners/

# Initialize environment, install packages

In [42]:
import tweepy
import pandas as pd
import numpy as np
import operator 
import json
from collections import Counter
import os
import yaml
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import string
import math
import time
from tqdm import tqdm

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ryanherren/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [43]:
os.environ['python-bot-config'] = "/Users/ryanherren/python-bot-config.yaml"
# os.environ['python-bot-config'] = "/Users/dunnt/python-bot-config.yaml"
# os.environ['python-bot-config'] = "/Users/mariolgw/python-bot-config.yaml"

In [44]:
try:
    v_env = os.environ['python-bot-config']
except:
    print("Config file env variable is not set.")
    print("Set python-bot-config file")
    sys.exit(1)

with open(v_env, "r") as yamlConfig:
    cfg = yaml.safe_load(yamlConfig)

# Import Opinion Lexicon

In [45]:
# Reading negative words list:
with open('negative-words.txt', 'r', encoding = "ISO-8859-1") as t:
    neg = t.readlines()
    neg_words = []
    for i in neg:
        neg_words.append(i.strip())
    del neg_words[0 : 31]
    

# Reading positive words list:
with open('positive-words.txt', 'r', encoding = "ISO-8859-1") as t:
    pos = t.readlines()
    pos_words = []
    for i in pos:
        pos_words.append(i.strip())
    del pos_words[0 : 30]
    
positive_emojis = ['😂','🔥','😍','🤘','🤩','👍','💯','😎','✅','👏','😀','🐐',
                   '❤️','♥️','😘','😊','😄','😃','😆','😋','🤪','😜','😛','🤑']
negative_emojis = ['🥴','🤢','🤮','😧','😑','😰','🤬','😡','😭','😢','😩','🙁',
                   '☹️','😣','😖','😫','😟','😞','😔','😒','👿','🤕','🤒','😷']

# Function to count sentiment words in Tweets

In [92]:
# Function to count negative words in tweet:
def count_negative(tweet):
    neg_word_count = 0
    for word in tweet:
        if word.lower() in neg_words:
            neg_word_count = neg_word_count + 1
            # mg['neg_word_count'] = mg['text'].apply(lambda x: neg_word_count(x), axis = 1, result_type = 'expand')
    return neg_word_count
   
# Function to count positive words in tweet:
def count_positive(tweet):
    pos_word_count = 0
    for word in tweet:
        #print(word)
        if word.lower() in pos_words:
            pos_word_count = pos_word_count + 1
            # mg['neg_word_count'] = mg['text'].apply(lambda x: neg_word_count(x), axis = 1, result_type = 'expand')
    return pos_word_count
   
def count_mentions(tweet):
    num_mentions = 0
    for word in tweet:
        if word.startswith('@'):
            num_mentions = num_mentions + 1
    return num_mentions

def count_hashtags(tweet):
    num_hashtags = 0
    for word in tweet:
        if word.startswith('#'):
            num_hashtags = num_hashtags + 1
    return num_hashtags

def count_positive_emojis(tweet):
    pos_emoji_count = 0
    for word in tweet:
        #print(word)
        if word in positive_emojis:
            pos_emoji_count = pos_emoji_count + 1
            # mg['neg_word_count'] = mg['text'].apply(lambda x: neg_word_count(x), axis = 1, result_type = 'expand')
    return pos_emoji_count

def count_negative_emojis(tweet):
    neg_emoji_count = 0
    for word in tweet:
        #print(word)
        if word in negative_emojis:
            neg_emoji_count = neg_emoji_count + 1
            # mg['neg_word_count'] = mg['text'].apply(lambda x: neg_word_count(x), axis = 1, result_type = 'expand')
    return neg_emoji_count

# Tokenizing functions to break tweets into words, emojis, mentions, and hashtags

In [47]:
# This cell is the basis of how tokenizing the Tweet will work. Using regex statments, it accounts for emojis, 
# hashtags, mentions, and more.
import re
 
emoticons_str = r"""
    (?:
        [:=;] # Eyes
        [oO\-]? # Nose (optional)
        [D\)\]\(\]/\\OpP] # Mouth
    )"""
 
regex_str = [
    emoticons_str,
    r'<[^>]+>', # HTML tags
    r'(?:@[\w_]+)', # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&amp;+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs
 
    r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
    r'(?:[\w_]+)', # other words
    r'(?:\S)' # anything else
]
    
tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE)
 
def tokenize(s):
    return tokens_re.findall(s)
 
def preprocess(s, lowercase=False):
    tokens = tokenize(s)
    if lowercase:
        tokens = [token if emoticon_re.search(token) else token.lower() for token in tokens]
    return tokens

# Configure Twitter API Connection

In [53]:
consumer_key = cfg.get("TwitterAPI").get("consumer_key")
consumer_secret = cfg.get("TwitterAPI").get("consumer_secret")
access_token = cfg.get("TwitterAPI").get("access_token")
access_token_secret = cfg.get("TwitterAPI").get("access_token_secret")

In [54]:
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
 
api = tweepy.API(auth, wait_on_rate_limit=True)

# Get Tweets by User

In [111]:
# Currently limited to num_tweets Tweets
column_names = ['tweet_id', 'text','created_at']
def get_tweets_by_user(user, tweets):
    tweet_json = pd.DataFrame(columns=column_names)
    for i in range(math.ceil(tweets/20)):
        tweets = api.user_timeline(user, page=i)
#         print(str(i) + "WABBADABBADO\n")
#         print(tweets)
        my_list_of_dicts = []
        for each_json_tweet in tweets:
            my_list_of_dicts.append(each_json_tweet._json)
        filename = user + '.txt'
        if os.path.exists(filename):
            os.remove(filename)
        with open(filename, 'w') as file:
            file.write(json.dumps(my_list_of_dicts, indent=4))
        my_demo_list = []
        with open(filename, encoding='utf-8') as json_file:  
            all_data = json.load(json_file)
            for each_dictionary in all_data:
                tweet_id = each_dictionary['id']
                text = each_dictionary['text']
                favorite_count = each_dictionary['favorite_count']
                retweet_count = each_dictionary['retweet_count']
                created_at = each_dictionary['created_at']
                my_demo_list.append({'tweet_id': str(tweet_id),
                                     'text': str(text),
                                     'created_at': created_at,
                                    })
                #print(my_demo_list)
                temp_json = pd.DataFrame(my_demo_list, columns = 
                                          ['tweet_id', 'text', 
                                           'created_at'])
#             print("\n \n " + str(temp_json.shape) + str(type(temp_json)) + "\n\n")
        tweet_json = tweet_json.append(temp_json, ignore_index=True)
#         print("\n \n " + str(tweet_json.shape) + str(type(tweet_json)) + "\n\n")
    tweet_json = tweet_json.drop_duplicates()    
    return tweet_json

# Get Tweets by Hashtag

In [56]:
# TODO

# Loading in Training Data from Kaggle

In [64]:
# TODO
# https://www.kaggle.com/kazanova/sentiment140
# Download this dataset (~230mb) to your project directory
# Once this is done, begin training models

# 0 is negative
# 2 is neutral
# 4 is positive

# This dataset does not have any emojis, so we will have to find an alternative way if 
# we want to evaluate the effects that emojis have on sentiment

In [49]:
training_cnames = ['sentiment', 'tweet_id', 'created_at', 'mention', 'author', 'text']

training_data = pd.read_csv('training.1600000.processed.noemoticon.csv', names = training_cnames, encoding='latin-1')

training_data = training_data[['sentiment', 'tweet_id', 'created_at', 'text']]

In [50]:
# You can use this to slim down the data set.
# Loading and manipulating the full dataset takes about 30 minutes
# Pulling in 15000 tweets takes ~18 seconds to manipulate and will give a sufficient
# training set to use
training_data = training_data.sample(15000)

In [51]:
start = time.time()
training_data['tokenized'] = training_data['text'].apply(lambda x: tokenize(x))
training_data['pos_words'] = training_data['tokenized'].apply(lambda x: count_positive(x))
training_data['neg_words'] = training_data['tokenized'].apply(lambda x: count_negative(x))
training_data['num_mentions'] = training_data['tokenized'].apply(lambda x: count_mentions(x))
training_data['num_hashtags'] = training_data['tokenized'].apply(lambda x: count_hashtags(x))
training_data['num_positive_emojis'] = training_data['tokenized'].apply(lambda x: count_positive_emojis(x))
training_data['num_negative_emojis'] = training_data['tokenized'].apply(lambda x: count_negative_emojis(x))
end = time.time()
print(end - start)

19.19286298751831


In [52]:
training_data

Unnamed: 0,sentiment,tweet_id,created_at,text,tokenized,pos_words,neg_words,num_mentions,num_hashtags,num_positive_emojis,num_negative_emojis
877522,4,1685006851,Sat May 02 22:44:37 PDT 2009,"@fillanypdf thank you, i think i am looking mo...","[@fillanypdf, thank, you, ,, i, think, i, am, ...",1,0,1,0,0,0
756524,0,2288758323,Mon Jun 22 19:34:49 PDT 2009,Days like today make me crazy no matter what I...,"[Days, like, today, make, me, crazy, no, matte...",2,1,0,0,0,0
607307,0,2222912154,Thu Jun 18 07:36:20 PDT 2009,"needs every1 to PRAY for me, just went to orde...","[needs, every, 1, to, PRAY, for, me, ,, just, ...",1,0,0,0,0,0
1261966,4,1998735324,Mon Jun 01 19:02:25 PDT 2009,@iEllie The house we're buying has a pond in t...,"[@iEllie, The, house, we're, buying, has, a, p...",0,0,1,0,0,0
887966,4,1687153078,Sun May 03 07:55:15 PDT 2009,just woke up to a big pile of puppy shit good...,"[just, woke, up, to, a, big, pile, of, puppy, ...",1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
693727,0,2252808073,Sat Jun 20 07:02:15 PDT 2009,@piiyaahn Yeah but I'm using it everyday so pe...,"[@piiyaahn, Yeah, but, I'm, using, it, everyda...",0,0,1,0,0,0
1576987,4,2189617539,Tue Jun 16 00:28:11 PDT 2009,@vampirefreak101 sorry to hear that I feel you...,"[@vampirefreak101, sorry, to, hear, that, I, f...",1,2,1,0,0,0
974181,4,1833357034,Sun May 17 23:02:13 PDT 2009,Watched Star Trek the movie yesterday. WOOHOO!...,"[Watched, Star, Trek, the, movie, yesterday, ....",0,0,0,0,0,0
161742,0,1957387521,Fri May 29 00:20:42 PDT 2009,@106jackfm I'm not that's why I'm sulking not...,"[@106jackfm, I'm, not, that's, why, I'm, sulki...",1,0,1,0,0,0


# Load in Emoji Training Data from DS301 Bot Timeline

In [112]:
training_data_emojis = get_tweets_by_user('DS301Bot', 100)
training_data_emojis['tokenized'] = training_data_emojis['text'].apply(lambda x: tokenize(x))
training_data_emojis['pos_words'] = training_data_emojis['tokenized'].apply(lambda x: count_positive(x))
training_data_emojis['neg_words'] = training_data_emojis['tokenized'].apply(lambda x: count_negative(x))
training_data_emojis['num_mentions'] = training_data_emojis['tokenized'].apply(lambda x: count_mentions(x))
training_data_emojis['num_hashtags'] = training_data_emojis['tokenized'].apply(lambda x: count_hashtags(x))
training_data_emojis['num_positive_emojis'] = training_data_emojis['tokenized'].apply(lambda x: count_positive_emojis(x))
training_data_emojis['num_negative_emojis'] = training_data_emojis['tokenized'].apply(lambda x: count_negative_emojis(x))

In [113]:
training_data_emojis

Unnamed: 0,tweet_id,text,created_at,tokenized,pos_words,neg_words,num_mentions,num_hashtags,num_positive_emojis,num_negative_emojis
0,1467687257171795970,If I tell you I don't like something and you c...,Mon Dec 06 02:48:05 +0000 2021,"[If, I, tell, you, I, don't, like, something, ...",1,1,0,0,0,1
1,1467686277030064133,What a bad day. 😞,Mon Dec 06 02:44:11 +0000 2021,"[What, a, bad, day, ., 😞]",0,1,0,0,0,1
2,1467684957514567684,What a good day. 😄,Mon Dec 06 02:38:56 +0000 2021,"[What, a, good, day, ., 😄]",1,0,0,0,1,0
3,1467684474855075844,Phil Hellmuth is one of the most legendary pok...,Mon Dec 06 02:37:01 +0000 2021,"[Phil, Hellmuth, is, one, of, the, most, legen...",1,0,0,0,0,0
4,1467684285402468355,Jennifer Lawrence at press conference #DontLoo...,Mon Dec 06 02:36:16 +0000 2021,"[Jennifer, Lawrence, at, press, conference, #D...",0,0,0,1,4,0
5,1467674221853786113,"- Okay so it’s true, Taehyungie staying back w...",Mon Dec 06 01:56:17 +0000 2021,"[-, Okay, so, it, ’, s, true, ,, Taehyungie, s...",0,0,0,0,0,0
6,1467673381172654087,😩 why do you break my heart \n@Raiders 💔,Mon Dec 06 01:52:56 +0000 2021,"[😩, why, do, you, break, my, heart, @Raiders, 💔]",0,1,1,0,0,1
7,1467673144647376902,Raiders are in last place 😂😂😂😂,Mon Dec 06 01:52:00 +0000 2021,"[Raiders, are, in, last, place, 😂, 😂, 😂, 😂]",0,0,0,0,4,0
8,1467655653917401089,I love lasagna. Unequivocally the top choice i...,Mon Dec 06 00:42:30 +0000 2021,"[I, love, lasagna, ., Unequivocally, the, top,...",3,0,0,0,1,0
9,1467654980664381442,What is the best coding language and why is it...,Mon Dec 06 00:39:49 +0000 2021,"[What, is, the, best, coding, language, and, w...",1,0,0,0,1,0


In [116]:
training_data

Unnamed: 0,sentiment,tweet_id,created_at,text,tokenized,pos_words,neg_words,num_mentions,num_hashtags,num_positive_emojis,num_negative_emojis
877522,4,1685006851,Sat May 02 22:44:37 PDT 2009,"@fillanypdf thank you, i think i am looking mo...","[@fillanypdf, thank, you, ,, i, think, i, am, ...",1,0,1,0,0,0
756524,0,2288758323,Mon Jun 22 19:34:49 PDT 2009,Days like today make me crazy no matter what I...,"[Days, like, today, make, me, crazy, no, matte...",2,1,0,0,0,0
607307,0,2222912154,Thu Jun 18 07:36:20 PDT 2009,"needs every1 to PRAY for me, just went to orde...","[needs, every, 1, to, PRAY, for, me, ,, just, ...",1,0,0,0,0,0
1261966,4,1998735324,Mon Jun 01 19:02:25 PDT 2009,@iEllie The house we're buying has a pond in t...,"[@iEllie, The, house, we're, buying, has, a, p...",0,0,1,0,0,0
887966,4,1687153078,Sun May 03 07:55:15 PDT 2009,just woke up to a big pile of puppy shit good...,"[just, woke, up, to, a, big, pile, of, puppy, ...",1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
693727,0,2252808073,Sat Jun 20 07:02:15 PDT 2009,@piiyaahn Yeah but I'm using it everyday so pe...,"[@piiyaahn, Yeah, but, I'm, using, it, everyda...",0,0,1,0,0,0
1576987,4,2189617539,Tue Jun 16 00:28:11 PDT 2009,@vampirefreak101 sorry to hear that I feel you...,"[@vampirefreak101, sorry, to, hear, that, I, f...",1,2,1,0,0,0
974181,4,1833357034,Sun May 17 23:02:13 PDT 2009,Watched Star Trek the movie yesterday. WOOHOO!...,"[Watched, Star, Trek, the, movie, yesterday, ....",0,0,0,0,0,0
161742,0,1957387521,Fri May 29 00:20:42 PDT 2009,@106jackfm I'm not that's why I'm sulking not...,"[@106jackfm, I'm, not, that's, why, I'm, sulki...",1,0,1,0,0,0


# Train ML Models on Training Data

In [95]:
# TODO

# Experimentation

In [57]:
user_tweets = get_tweets_by_user('CycloneLarry69', 109)

In [58]:
user_tweets.shape

(100, 5)

In [59]:
user_tweets

Unnamed: 0,tweet_id,text,favorite_count,retweet_count,created_at
0,1467591782607343624,@GACyclone91 The SEC scan lick my butt and I d...,2,0,Sun Dec 05 20:28:42 +0000 2021
1,1467591477496848388,What a time to be alive https://t.co/i3bevycYtV,22,0,Sun Dec 05 20:27:29 +0000 2021
2,1467591379266293768,The drunk game watch in Florida for the #5 Cyc...,7,0,Sun Dec 05 20:27:06 +0000 2021
3,1467591161074360322,I think I’m actually going to cheer for Iowa a...,10,0,Sun Dec 05 20:26:13 +0000 2021
4,1467590241917849600,@baylortk Yes sir,1,0,Sun Dec 05 20:22:34 +0000 2021
...,...,...,...,...,...
115,1467229885202632705,@RJHINDM Or force a turnover I guess,0,0,Sat Dec 04 20:30:39 +0000 2021
116,1467229697780109323,@RJHINDM Have to,0,0,Sat Dec 04 20:29:54 +0000 2021
117,1467229585620275210,Oh man that’s a brutal call,9,1,Sat Dec 04 20:29:27 +0000 2021
118,1467227767049797637,Ain’t no way lol,9,0,Sat Dec 04 20:22:14 +0000 2021


In [60]:
user_tweets['tokenized'] = user_tweets['text'].apply(lambda x: tokenize(x))
user_tweets['pos_words'] = user_tweets['tokenized'].apply(lambda x: count_positive(x))
user_tweets['neg_words'] = user_tweets['tokenized'].apply(lambda x: count_negative(x))
user_tweets['num_mentions'] = user_tweets['tokenized'].apply(lambda x: count_mentions(x))
user_tweets['num_hashtags'] = user_tweets['tokenized'].apply(lambda x: count_hashtags(x))
user_tweets['num_positive_emojis'] = user_tweets['tokenized'].apply(lambda x: count_positive_emojis(x))
user_tweets['num_negative_emojis'] = user_tweets['tokenized'].apply(lambda x: count_negative_emojis(x))

In [61]:
user_tweets

Unnamed: 0,tweet_id,text,favorite_count,retweet_count,created_at,tokenized,pos_words,neg_words,num_mentions,num_hashtags,num_positive_emojis,num_negative_emojis
0,1467591782607343624,@GACyclone91 The SEC scan lick my butt and I d...,2,0,Sun Dec 05 20:28:42 +0000 2021,"[@GACyclone91, The, SEC, scan, lick, my, butt,...",1,0,1,0,0,0
1,1467591477496848388,What a time to be alive https://t.co/i3bevycYtV,22,0,Sun Dec 05 20:27:29 +0000 2021,"[What, a, time, to, be, alive, https://t.co/i3...",0,0,0,0,0,0
2,1467591379266293768,The drunk game watch in Florida for the #5 Cyc...,7,0,Sun Dec 05 20:27:06 +0000 2021,"[The, drunk, game, watch, in, Florida, for, th...",1,1,0,2,0,0
3,1467591161074360322,I think I’m actually going to cheer for Iowa a...,10,0,Sun Dec 05 20:26:13 +0000 2021,"[I, think, I, ’, m, actually, going, to, cheer...",1,0,0,0,0,0
4,1467590241917849600,@baylortk Yes sir,1,0,Sun Dec 05 20:22:34 +0000 2021,"[@baylortk, Yes, sir]",0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
115,1467229885202632705,@RJHINDM Or force a turnover I guess,0,0,Sat Dec 04 20:30:39 +0000 2021,"[@RJHINDM, Or, force, a, turnover, I, guess]",0,0,1,0,0,0
116,1467229697780109323,@RJHINDM Have to,0,0,Sat Dec 04 20:29:54 +0000 2021,"[@RJHINDM, Have, to]",0,0,1,0,0,0
117,1467229585620275210,Oh man that’s a brutal call,9,1,Sat Dec 04 20:29:27 +0000 2021,"[Oh, man, that, ’, s, a, brutal, call]",0,1,0,0,0,0
118,1467227767049797637,Ain’t no way lol,9,0,Sat Dec 04 20:22:14 +0000 2021,"[Ain, ’, t, no, way, lol]",0,0,0,0,0,0


In [62]:
# punctuation = list(string.punctuation)
# stop = stopwords.words('english') + punctuation + ['rt', 'via', '’', 'RT', '️', '…']

In [63]:
# count_all = Counter()
# for tweet in user_tweets['text']:
#     terms_all = [term for term in preprocess(tweet)]
#     terms_stop = [term for term in preprocess(tweet) if term not in stop]
#     count_all.update(terms_stop)
#     print(tweet + "\n")
# print(count_all.most_common(10))