The object here is to train a machine learning model on a database of Tweets and use that model to predict sentiment of Tweets from a user, a hashtag, or any random variety of Tweets. 

**@author Ryan Herren**

**@author Tanner Dunn**

In [1]:
import tweepy
import pandas as pd
import numpy as np
import operator 
import json
from collections import Counter
import os
import yaml
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import string
import math

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ryanherren/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
os.environ['python-bot-config'] = "/Users/ryanherren/python-bot-config.yaml"
# os.environ['python-bot-config'] = "/Users/dunnt/python-bot-config.yaml"
# os.environ['python-bot-config'] = "/Users/mariolgw/python-bot-config.yaml"

In [3]:
try:
    v_env = os.environ['python-bot-config']
except:
    print("Config file env variable is not set.")
    print("Set python-bot-config file")
    sys.exit(1)

with open(v_env, "r") as yamlConfig:
    cfg = yaml.safe_load(yamlConfig)

In [4]:
consumer_key = cfg.get("TwitterAPI").get("consumer_key")
consumer_secret = cfg.get("TwitterAPI").get("consumer_secret")
access_token = cfg.get("TwitterAPI").get("access_token")
access_token_secret = cfg.get("TwitterAPI").get("access_token_secret")

In [5]:
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
 
api = tweepy.API(auth, wait_on_rate_limit=True)

In [73]:
# Currently limited to num_tweets Tweets
column_names = ['tweet_id', 'text','favorite_count','retweet_count','created_at']
def get_tweets_by_user(user, tweets):
    tweet_json = pd.DataFrame(columns=column_names)
    for i in range(math.ceil(tweets/20)):
        tweets = api.user_timeline(user, page=i)
#         print(str(i) + "WABBADABBADO\n")
#         print(tweets)
        my_list_of_dicts = []
        for each_json_tweet in tweets:
            my_list_of_dicts.append(each_json_tweet._json)
        filename = user + '.txt'
        if os.path.exists(filename):
            os.remove(filename)
        with open(filename, 'w') as file:
            file.write(json.dumps(my_list_of_dicts, indent=4))
        my_demo_list = []
        with open(filename, encoding='utf-8') as json_file:  
            all_data = json.load(json_file)
            for each_dictionary in all_data:
                tweet_id = each_dictionary['id']
                text = each_dictionary['text']
                favorite_count = each_dictionary['favorite_count']
                retweet_count = each_dictionary['retweet_count']
                created_at = each_dictionary['created_at']
                my_demo_list.append({'tweet_id': str(tweet_id),
                                     'text': str(text),
                                     'favorite_count': int(favorite_count),
                                     'retweet_count': int(retweet_count),
                                     'created_at': created_at,
                                    })
                #print(my_demo_list)
                temp_json = pd.DataFrame(my_demo_list, columns = 
                                          ['tweet_id', 'text', 
                                           'favorite_count', 'retweet_count', 
                                           'created_at'])
#             print("\n \n " + str(temp_json.shape) + str(type(temp_json)) + "\n\n")
        tweet_json = tweet_json.append(temp_json, ignore_index=True)
#         print("\n \n " + str(tweet_json.shape) + str(type(tweet_json)) + "\n\n")
    tweet_json = tweet_json.drop_duplicates()    
    return tweet_json

In [78]:
user_tweets = get_tweets_by_user('CycloneLarry69', 109)

In [79]:
user_tweets.shape

(100, 5)

In [80]:
user_tweets

Unnamed: 0,tweet_id,text,favorite_count,retweet_count,created_at
0,1465532547971047425,YOU ALL NEED TO TOUCH GRASS,19,0,Tue Nov 30 04:06:02 +0000 2021
1,1465530407231885312,@ElamarDaGreat Love you man best of luck!,31,0,Tue Nov 30 03:57:31 +0000 2021
2,1465530316727201796,@tomh1138 @jakebrend32 There is no friction be...,12,0,Tue Nov 30 03:57:10 +0000 2021
3,1465520829412880393,RT @WillBlackmon: Brian Kelly at first intervi...,0,2796,Tue Nov 30 03:19:28 +0000 2021
4,1465520593424596997,@ImDerBatman @GilletteLD Boyyyyyyy,0,0,Tue Nov 30 03:18:32 +0000 2021
...,...,...,...,...,...
115,1465174194044522499,Goodnight my beautiful friends,59,0,Mon Nov 29 04:22:04 +0000 2021
116,1465171231217537031,Just remembered OU has to play at Nebraska nex...,137,1,Mon Nov 29 04:10:17 +0000 2021
117,1465163973297422336,@ImDerBatman #neverforget,1,0,Mon Nov 29 03:41:27 +0000 2021
118,1465163851415101442,@SpurHorn I’m skeptical,1,0,Mon Nov 29 03:40:58 +0000 2021


In [81]:
# This cell is the basis of how tokenizing the Tweet will work. Using regex statments, it accounts for emojis, 
# hashtags, mentions, and more.
import re
 
emoticons_str = r"""
    (?:
        [:=;] # Eyes
        [oO\-]? # Nose (optional)
        [D\)\]\(\]/\\OpP] # Mouth
    )"""
 
regex_str = [
    emoticons_str,
    r'<[^>]+>', # HTML tags
    r'(?:@[\w_]+)', # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&amp;+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs
 
    r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
    r'(?:[\w_]+)', # other words
    r'(?:\S)' # anything else
]
    
tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE)
 
def tokenize(s):
    return tokens_re.findall(s)
 
def preprocess(s, lowercase=False):
    tokens = tokenize(s)
    if lowercase:
        tokens = [token if emoticon_re.search(token) else token.lower() for token in tokens]
    return tokens

In [82]:
preprocess(user_tweets.loc[11]['text'])

['RT', '@G_Bombastic', ':', 'YOOOO', '😭', '😭', 'https://t.co/Gu8ogF4nnk']

In [83]:
punctuation = list(string.punctuation)
stop = stopwords.words('english') + punctuation + ['rt', 'via', '’', 'RT', '️', '…']

In [84]:
count_all = Counter()
for tweet in user_tweets['text']:
    terms_all = [term for term in preprocess(tweet)]
    terms_stop = [term for term in preprocess(tweet) if term not in stop]
    count_all.update(terms_stop)
    print(tweet + "\n")
print(count_all.most_common(10))

YOU ALL NEED TO TOUCH GRASS

@ElamarDaGreat Love you man best of luck!

@tomh1138 @jakebrend32 There is no friction between Campbell and Pollard and Matt Campbell will be in Ames, Iowa co… https://t.co/CupKqzCsH9

RT @WillBlackmon: Brian Kelly at first interview after practice at LSU https://t.co/J4HneZqUVy

@ImDerBatman @GilletteLD Boyyyyyyy

@GilletteLD @ImDerBatman Bill O’Brien

@drakectoll A Baylor fan that can afford losing a hamstring?

FIFTEEN MILLION DOLLARS

@dennisdoddcbs @PeteThamel It’s Dennis Dodd’s time to lick my nuts

Do you think Steele Jantz knows about all this?

Congrats bro @itshanklol!

They said your hamstrings weren’t good enough, but keep proven the haters wrong😤✊ https://t.co/A0Z8vPlrDr

RT @G_Bombastic: YOOOO 😭😭 https://t.co/Gu8ogF4nnk

Yup.

Bob Stoops and Lincoln Riley situation, but Brian decided to get that bread before retiring all the way https://t.co/X0vVeIPQ4o

@AidenWyatt01 AIDEN

@tKCyclone Brother what?

@steveryancarter It’s gotta be my nuts to yo