The object here is to train a machine learning model on a database of Tweets and use that model to predict sentiment of Tweets from a user, a hashtag, or any random variety of Tweets. 

@author Ryan Herren
@author Tanner Dunn

In [152]:
import tweepy
import pandas as pd
import numpy as np
import operator 
import json
from collections import Counter
import os
import yaml
import nltk
nltk.download('stopwords')
import string

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ryanherren/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [153]:
os.environ['python-bot-config'] = "/Users/ryanherren/python-bot-config.yaml"
# os.environ['python-bot-config'] = "/Users/dunnt/python-bot-config.yaml"
# os.environ['python-bot-config'] = "/Users/mariolgw/python-bot-config.yaml"

In [154]:
try:
    v_env = os.environ['python-bot-config']
except:
    print("Config file env variable is not set.")
    print("Set python-bot-config file")
    sys.exit(1)

with open(v_env, "r") as yamlConfig:
    cfg = yaml.safe_load(yamlConfig)

In [155]:
consumer_key = cfg.get("TwitterAPI").get("consumer_key")
consumer_secret = cfg.get("TwitterAPI").get("consumer_secret")
access_token = cfg.get("TwitterAPI").get("access_token")
access_token_secret = cfg.get("TwitterAPI").get("access_token_secret")

In [156]:
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
 
api = tweepy.API(auth, wait_on_rate_limit=True)

In [157]:
# # Iterates through and writes individual objects to the file, doesn't work when parsing file later
# f = open("tweets.json", "w")
# for tweet in tweepy.Cursor(api.home_timeline).items(10):
#     json.dump(tweet._json, f)
# f.close()

In [158]:
# # Evaluates all Tweets on a the bot's home timeline
# # Dumps raw json
# api.home_timeline(items=10)

In [159]:
# Currently limited to num_tweets Tweets
def get_tweets_by_user(user):
    tweets = api.user_timeline(user, items=12)
    my_list_of_dicts = []
    for each_json_tweet in tweets:
        my_list_of_dicts.append(each_json_tweet._json)
    filename = user + '.txt'
    if os.path.exists(filename):
        os.remove(filename)
    with open(filename, 'w') as file:
        file.write(json.dumps(my_list_of_dicts, indent=4))
    my_demo_list = []
    with open(filename, encoding='utf-8') as json_file:  
        all_data = json.load(json_file)
        for each_dictionary in all_data:
            tweet_id = each_dictionary['id']
            text = each_dictionary['text']
            favorite_count = each_dictionary['favorite_count']
            retweet_count = each_dictionary['retweet_count']
            created_at = each_dictionary['created_at']
            my_demo_list.append({'tweet_id': str(tweet_id),
                                 'text': str(text),
                                 'favorite_count': int(favorite_count),
                                 'retweet_count': int(retweet_count),
                                 'created_at': created_at,
                                })
            #print(my_demo_list)
            tweet_json = pd.DataFrame(my_demo_list, columns = 
                                      ['tweet_id', 'text', 
                                       'favorite_count', 'retweet_count', 
                                       'created_at'])
    return tweet_json

In [160]:
ryan_tweets2 = get_tweets_by_user('nick')

In [161]:
ryan_tweets2.shape

(20, 5)

In [162]:
ryan_tweets2

Unnamed: 0,tweet_id,text,favorite_count,retweet_count,created_at
0,1444867435840950281,@eviloars Come visit,1,0,Mon Oct 04 03:30:15 +0000 2021
1,1444851476275236865,Colorado PSA: Aspens are at peak color in the ...,11,1,Mon Oct 04 02:26:50 +0000 2021
2,1444024870950473728,@cg Don’t worry. It was more a commentary on t...,1,0,Fri Oct 01 19:42:12 +0000 2021
3,1443970787229065223,how october is going so far https://t.co/cumD2...,9,1,Fri Oct 01 16:07:18 +0000 2021
4,1433674940864024580,RT @_tomcashman: Somehow we ended up here http...,0,60802,Fri Sep 03 06:15:16 +0000 2021
5,1390007073232982018,RT @1AbbyRoad: “No worries if not!” https://t....,0,27791,Wed May 05 18:14:45 +0000 2021
6,1359618955913007104,RT @bimadew: this is so gentle https://t.co/Cb...,0,1819,Wed Feb 10 21:43:14 +0000 2021
7,1334552503203950596,RT @reganbich: hi twitter i never do stuff lik...,0,2023,Thu Dec 03 17:38:06 +0000 2020
8,1325125756633018368,RT @VanJones68: Today is a good day. \nIt’s ea...,0,72633,Sat Nov 07 17:19:34 +0000 2020
9,1325114551176552451,I’ve been waiting four years for this moment,10,2,Sat Nov 07 16:35:03 +0000 2020


In [112]:
# This cell is the basis of how tokenizing the Tweet will work. Using regex statments, it accounts for emojis, 
# hashtags, mentions, and more.
import re
 
emoticons_str = r"""
    (?:
        [:=;] # Eyes
        [oO\-]? # Nose (optional)
        [D\)\]\(\]/\\OpP] # Mouth
    )"""
 
regex_str = [
    emoticons_str,
    r'<[^>]+>', # HTML tags
    r'(?:@[\w_]+)', # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&amp;+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs
 
    r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
    r'(?:[\w_]+)', # other words
    r'(?:\S)' # anything else
]
    
tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE)
 
def tokenize(s):
    return tokens_re.findall(s)
 
def preprocess(s, lowercase=False):
    tokens = tokenize(s)
    if lowercase:
        tokens = [token if emoticon_re.search(token) else token.lower() for token in tokens]
    return tokens

In [113]:
preprocess(ryan_tweets.loc[8]['text'])

['RT',
 '@CycloneMBB',
 ':',
 'Lights',
 'always',
 'shine',
 'brighter',
 'at',
 'Hilton',
 'Coliseum',
 '!',
 '🤩',
 '#C5C',
 '|',
 '#Cyclones',
 '🌪',
 'https://t.co/5Lt0Cd9j6I']

In [114]:
tokenize(ryan_tweets.loc[8]['text'])

['RT',
 '@CycloneMBB',
 ':',
 'Lights',
 'always',
 'shine',
 'brighter',
 'at',
 'Hilton',
 'Coliseum',
 '!',
 '🤩',
 '#C5C',
 '|',
 '#Cyclones',
 '🌪',
 'https://t.co/5Lt0Cd9j6I']

In [115]:
punctuation = list(string.punctuation)
stop = stopwords.words('english') + punctuation + ['rt', 'via', '’', 'RT', '️']

In [116]:
count_all = Counter()
for tweet in ryan_tweets['text']:
    #terms_all = [term for term in preprocess(tweet)]
    terms_stop = [term for term in preprocess(tweet) if term not in stop]
    count_all.update(terms_stop)
    #print(tweet + "\n")
print(count_all.most_common(5))

[('🌪', 17), ('#Cyclones', 6), ('@CycloneWBB', 4), ('🏀', 4), ('🎥', 3)]
