The object here is to train a machine learning model on a database of Tweets and use that model to predict sentiment of Tweets from a user, a hashtag, or any random variety of Tweets. 

**@author Ryan Herren**

**@author Tanner Dunn**

# Initialize environment, install packages

In [51]:
import tweepy
import pandas as pd
import numpy as np
import operator 
import json
from collections import Counter
import os
import yaml
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import string
import math

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ryanherren/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [52]:
os.environ['python-bot-config'] = "/Users/ryanherren/python-bot-config.yaml"
# os.environ['python-bot-config'] = "/Users/dunnt/python-bot-config.yaml"
# os.environ['python-bot-config'] = "/Users/mariolgw/python-bot-config.yaml"

In [53]:
try:
    v_env = os.environ['python-bot-config']
except:
    print("Config file env variable is not set.")
    print("Set python-bot-config file")
    sys.exit(1)

with open(v_env, "r") as yamlConfig:
    cfg = yaml.safe_load(yamlConfig)

# Import Opinion Lexicon

In [109]:
# Reading negative words list:
with open('negative-words.txt', 'r', encoding = "ISO-8859-1") as t:
    neg = t.readlines()
    neg_words = []
    for i in neg:
        neg_words.append(i.strip())
    del neg_words[0 : 31]
    

# Reading positive words list:
with open('positive-words.txt', 'r', encoding = "ISO-8859-1") as t:
    pos = t.readlines()
    pos_words = []
    for i in pos:
        pos_words.append(i.strip())
    del pos_words[0 : 30]
    
positive_emojis = ['😂','🔥','😍','🤘','🤩','👍','💯','😎','✅','👏','😀','🐐',
                   '❤️','♥️','😘','😊','😄','😃','😆','😋','🤪','😜','😛','🤑']
negative_emojis = ['🥴','🤢','🤮','😧','😑','😰','🤬','😡','😭','😢','😩','🙁',
                   '☹️','😣','😖','😫','😟','😞','😔','😒','👿','🤕','🤒','😷']

# Function to count sentiment words in Tweets

In [110]:
# Function to count negative words in tweet:
def count_negative(tweet):
    neg_word_count = 0
    for word in tweet:
        if word in neg_words:
            neg_word_count = neg_word_count + 1
            # mg['neg_word_count'] = mg['text'].apply(lambda x: neg_word_count(x), axis = 1, result_type = 'expand')
    return neg_word_count
   
# Function to count positive words in tweet:
def count_positive(tweet):
    pos_word_count = 0
    for word in tweet:
        #print(word)
        if word in pos_words:
            pos_word_count = pos_word_count + 1
            # mg['neg_word_count'] = mg['text'].apply(lambda x: neg_word_count(x), axis = 1, result_type = 'expand')
    return pos_word_count
   
def count_mentions(tweet):
    num_mentions = 0
    for word in tweet:
        if word.startswith('@'):
            num_mentions = num_mentions + 1
    return num_mentions

def count_hashtags(tweet):
    num_hashtags = 0
    for word in tweet:
        if word.startswith('#'):
            num_hashtags = num_hashtags + 1
    return num_hashtags

def count_positive_emojis(tweet):
    pos_emoji_count = 0
    for word in tweet:
        #print(word)
        if word in positive_emojis:
            pos_emoji_count = pos_emoji_count + 1
            # mg['neg_word_count'] = mg['text'].apply(lambda x: neg_word_count(x), axis = 1, result_type = 'expand')
    return pos_emoji_count

def count_negative_emojis(tweet):
    neg_emoji_count = 0
    for word in tweet:
        #print(word)
        if word in negative_emojis:
            neg_emoji_count = neg_emoji_count + 1
            # mg['neg_word_count'] = mg['text'].apply(lambda x: neg_word_count(x), axis = 1, result_type = 'expand')
    return neg_emoji_count

# Tokenizing functions to break tweets into words, emojis, mentions, and hashtags

In [90]:
# This cell is the basis of how tokenizing the Tweet will work. Using regex statments, it accounts for emojis, 
# hashtags, mentions, and more.
import re
 
emoticons_str = r"""
    (?:
        [:=;] # Eyes
        [oO\-]? # Nose (optional)
        [D\)\]\(\]/\\OpP] # Mouth
    )"""
 
regex_str = [
    emoticons_str,
    r'<[^>]+>', # HTML tags
    r'(?:@[\w_]+)', # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&amp;+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs
 
    r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
    r'(?:[\w_]+)', # other words
    r'(?:\S)' # anything else
]
    
tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE)
 
def tokenize(s):
    return tokens_re.findall(s)
 
def preprocess(s, lowercase=False):
    tokens = tokenize(s)
    if lowercase:
        tokens = [token if emoticon_re.search(token) else token.lower() for token in tokens]
    return tokens

# Configure Twitter API Connection

In [56]:
consumer_key = cfg.get("TwitterAPI").get("consumer_key")
consumer_secret = cfg.get("TwitterAPI").get("consumer_secret")
access_token = cfg.get("TwitterAPI").get("access_token")
access_token_secret = cfg.get("TwitterAPI").get("access_token_secret")

In [57]:
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
 
api = tweepy.API(auth, wait_on_rate_limit=True)

# Get Tweets by User

In [58]:
# Currently limited to num_tweets Tweets
column_names = ['tweet_id', 'text','favorite_count','retweet_count','created_at']
def get_tweets_by_user(user, tweets):
    tweet_json = pd.DataFrame(columns=column_names)
    for i in range(math.ceil(tweets/20)):
        tweets = api.user_timeline(user, page=i)
#         print(str(i) + "WABBADABBADO\n")
#         print(tweets)
        my_list_of_dicts = []
        for each_json_tweet in tweets:
            my_list_of_dicts.append(each_json_tweet._json)
        filename = user + '.txt'
        if os.path.exists(filename):
            os.remove(filename)
        with open(filename, 'w') as file:
            file.write(json.dumps(my_list_of_dicts, indent=4))
        my_demo_list = []
        with open(filename, encoding='utf-8') as json_file:  
            all_data = json.load(json_file)
            for each_dictionary in all_data:
                tweet_id = each_dictionary['id']
                text = each_dictionary['text']
                favorite_count = each_dictionary['favorite_count']
                retweet_count = each_dictionary['retweet_count']
                created_at = each_dictionary['created_at']
                my_demo_list.append({'tweet_id': str(tweet_id),
                                     'text': str(text),
                                     'favorite_count': int(favorite_count),
                                     'retweet_count': int(retweet_count),
                                     'created_at': created_at,
                                    })
                #print(my_demo_list)
                temp_json = pd.DataFrame(my_demo_list, columns = 
                                          ['tweet_id', 'text', 
                                           'favorite_count', 'retweet_count', 
                                           'created_at'])
#             print("\n \n " + str(temp_json.shape) + str(type(temp_json)) + "\n\n")
        tweet_json = tweet_json.append(temp_json, ignore_index=True)
#         print("\n \n " + str(tweet_json.shape) + str(type(tweet_json)) + "\n\n")
    tweet_json = tweet_json.drop_duplicates()    
    return tweet_json

# Get Tweets by Hashtag

In [59]:
# TODO

# Experimentation

In [111]:
user_tweets = get_tweets_by_user('DS301Bot', 109)

In [112]:
user_tweets.shape

(3, 5)

In [113]:
user_tweets

Unnamed: 0,tweet_id,text,favorite_count,retweet_count,created_at
0,1467259093375143940,🥴🤢🤮😧😑😰🤬😡😭😢😩🙁☹️😣😖😫😟😞😔😒👿🤕🤒😷,0,0,Sat Dec 04 22:26:42 +0000 2021
1,1467256679561633792,😂🔥😍🤘🏼🤩👍🏼💯😎✅👏🏼😀🐐❤️♥️😘😊😄😃😆😋🤪😜😛🤑,0,0,Sat Dec 04 22:17:07 +0000 2021
2,1463255944565977092,Hello friends. I am a friendly bot.,1,0,Tue Nov 23 21:19:37 +0000 2021


In [114]:
user_tweets['tokenized'] = user_tweets['text'].apply(lambda x: tokenize(x))
user_tweets['pos_words'] = user_tweets['tokenized'].apply(lambda x: count_positive(x))
user_tweets['neg_words'] = user_tweets['tokenized'].apply(lambda x: count_negative(x))
user_tweets['num_mentions'] = user_tweets['tokenized'].apply(lambda x: count_mentions(x))
user_tweets['num_hashtags'] = user_tweets['tokenized'].apply(lambda x: count_hashtags(x))
user_tweets['num_positive_emojis'] = user_tweets['tokenized'].apply(lambda x: count_positive_emojis(x))
user_tweets['num_negative_emojis'] = user_tweets['tokenized'].apply(lambda x: count_negative_emojis(x))

In [115]:
user_tweets

Unnamed: 0,tweet_id,text,favorite_count,retweet_count,created_at,tokenized,pos_words,neg_words,num_mentions,num_hashtags,num_positive_emojis,num_negative_emojis
0,1467259093375143940,🥴🤢🤮😧😑😰🤬😡😭😢😩🙁☹️😣😖😫😟😞😔😒👿🤕🤒😷,0,0,Sat Dec 04 22:26:42 +0000 2021,"[🥴, 🤢, 🤮, 😧, 😑, 😰, 🤬, 😡, 😭, 😢, 😩, 🙁, ☹, ️, 😣, ...",0,0,0,0,0,23
1,1467256679561633792,😂🔥😍🤘🏼🤩👍🏼💯😎✅👏🏼😀🐐❤️♥️😘😊😄😃😆😋🤪😜😛🤑,0,0,Sat Dec 04 22:17:07 +0000 2021,"[😂, 🔥, 😍, 🤘, 🏼, 🤩, 👍, 🏼, 💯, 😎, ✅, 👏, 🏼, 😀, 🐐, ...",0,0,0,0,22,0
2,1463255944565977092,Hello friends. I am a friendly bot.,1,0,Tue Nov 23 21:19:37 +0000 2021,"[Hello, friends, ., I, am, a, friendly, bot, .]",1,0,0,0,0,0


In [83]:
# punctuation = list(string.punctuation)
# stop = stopwords.words('english') + punctuation + ['rt', 'via', '’', 'RT', '️', '…']

In [84]:
# count_all = Counter()
# for tweet in user_tweets['text']:
#     terms_all = [term for term in preprocess(tweet)]
#     terms_stop = [term for term in preprocess(tweet) if term not in stop]
#     count_all.update(terms_stop)
#     print(tweet + "\n")
# print(count_all.most_common(10))

YOU ALL NEED TO TOUCH GRASS

@ElamarDaGreat Love you man best of luck!

@tomh1138 @jakebrend32 There is no friction between Campbell and Pollard and Matt Campbell will be in Ames, Iowa co… https://t.co/CupKqzCsH9

RT @WillBlackmon: Brian Kelly at first interview after practice at LSU https://t.co/J4HneZqUVy

@ImDerBatman @GilletteLD Boyyyyyyy

@GilletteLD @ImDerBatman Bill O’Brien

@drakectoll A Baylor fan that can afford losing a hamstring?

FIFTEEN MILLION DOLLARS

@dennisdoddcbs @PeteThamel It’s Dennis Dodd’s time to lick my nuts

Do you think Steele Jantz knows about all this?

Congrats bro @itshanklol!

They said your hamstrings weren’t good enough, but keep proven the haters wrong😤✊ https://t.co/A0Z8vPlrDr

RT @G_Bombastic: YOOOO 😭😭 https://t.co/Gu8ogF4nnk

Yup.

Bob Stoops and Lincoln Riley situation, but Brian decided to get that bread before retiring all the way https://t.co/X0vVeIPQ4o

@AidenWyatt01 AIDEN

@tKCyclone Brother what?

@steveryancarter It’s gotta be my nuts to yo