In [1]:
import tweepy
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt

# To split the data as necessary for modelling
from sklearn.model_selection import train_test_split

# To build a simple model
from sklearn.linear_model import LogisticRegression

# To get rid of logistic regression default solver warnings that appear if sklearn hasn't been updated
import warnings
warnings.filterwarnings('ignore')

# To "pickle" things, like accuracies or even an entire fitted model
import joblib

# To cross-validate
from sklearn.model_selection import cross_val_score

# To try scaling the data in various ways
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from bs4 import BeautifulSoup
import re
import preprocessor as p
p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.MENTION, p.OPT.SMILEY, p.OPT.NUMBER)
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import nltk

from nltk.corpus import stopwords
from wordcloud import WordCloud
nltk.download('stopwords')
ENGLISH_STOP_WORDS = stopwords.words('english')

# To try dimensionality reduction
from sklearn.decomposition import PCA

# To do a cross-validated grid search
from sklearn.model_selection import GridSearchCV


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\chadh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Authenticate to Twitter
auth = tweepy.OAuthHandler("OENDOLh2rnnwsUKOO6XO1WED9", 
    "opxleI1SC7wrCLDZpQXWpoDhL2ntNXvv7fSx09OiQnh9rqWStd")
auth.set_access_token("1292145515925307393-YT3Bg347EwCst3XgRS43EtLM7i9B8l", 
    "VvX4ikUwSukdZzUMyMCYa4tUM4He4craOxZXDAb5BtSDK")

api = tweepy.API(auth)

try:
    api.verify_credentials()
    print("Authentication OK")
except:
    print("Error during authentication")

Authentication OK


In [5]:
# Setting notifier for hitting the wait limit
api = tweepy.API(auth, wait_on_rate_limit=True,
    wait_on_rate_limit_notify=True)

In [6]:
# Pulling the Republican tweets and assigning them a value of 1 in our target column

username = {'GOPLeader','SteveScalise','Jim_Jordan','RandPaul','GOP','Mike_Pence','MarshaBlackburn',
           'tedcruz','marcorubio','DanCrenshawTX','LindseyGrahamSC'}
count = 1000
total_list = []
for user in username:
    tweets = tweepy.Cursor(api.user_timeline,id=user, include_rts = True, tweet_mode = 'extended').items(count)
    tweets_list = [[tweet.created_at, tweet.full_text, user, 1] for tweet in tweets]
    total_list.extend(tweets_list)

In [7]:
# Pulling the Democratic tweets and assigning them a value of 0 in our target column

username = {'BernieSanders','JoeBiden','ewarren','PeteButtigieg',
           'KamalaHarris','JohnDelaney','amyklobuchar', 'MichaelBennet', 'AOC', 'SpeakerPelosi','TheDemocrats'}
count = 1000
for user in username:
    tweets = tweepy.Cursor(api.user_timeline,id=user, include_rts = True, tweet_mode = 'extended').items(count)
    tweets_list = [[tweet.created_at, tweet.full_text, user, 0] for tweet in tweets]
    total_list.extend(tweets_list)
    

In [8]:
# Turning the scrape into a dataframe and taking a look
df1 = pd.DataFrame(total_list)

In [9]:
df1

Unnamed: 0,0,1,2,3
0,2020-10-30 15:52:39,RT @SelectGOP: Democrats’ latest partisan repo...,SteveScalise,1
1,2020-10-30 14:04:01,RT @Jim_Jordan: Republicans:\n\n-Lower taxes\n...,SteveScalise,1
2,2020-10-29 23:43:19,Under Democrats: More job-crushing lockdowns.\...,SteveScalise,1
3,2020-10-29 20:35:23,Yesterday's Senate hearings should just be the...,SteveScalise,1
4,2020-10-29 19:53:43,You've got to be kidding.\n\nThe governor whos...,SteveScalise,1
...,...,...,...,...
21995,2020-08-20 02:43:49,“Donald Trump hasn’t grown into the job becaus...,JoeBiden,0
21996,2020-08-20 02:31:00,"Hillary: tonight, I’m reminded of one of your ...",JoeBiden,0
21997,2020-08-20 02:25:00,"I authored the Violence Against Women Act, and...",JoeBiden,0
21998,2020-08-20 02:23:56,RT @TeamJoe: .@ewarren knows that the only way...,JoeBiden,0


In [10]:
# Checking Distribution
df1[3].value_counts()

1    11000
0    11000
Name: 3, dtype: int64

In [11]:
# Renaming columns and checking
df1 = df1.rename(columns = {0 : 'Timestamp of Tweet', 1 : 'Tweet Text', 2 : 'Twitter Handle', 3: 'Is Republican'})

In [12]:
df1

Unnamed: 0,Timestamp of Tweet,Tweet Text,Twitter Handle,Is Republican
0,2020-10-30 15:52:39,RT @SelectGOP: Democrats’ latest partisan repo...,SteveScalise,1
1,2020-10-30 14:04:01,RT @Jim_Jordan: Republicans:\n\n-Lower taxes\n...,SteveScalise,1
2,2020-10-29 23:43:19,Under Democrats: More job-crushing lockdowns.\...,SteveScalise,1
3,2020-10-29 20:35:23,Yesterday's Senate hearings should just be the...,SteveScalise,1
4,2020-10-29 19:53:43,You've got to be kidding.\n\nThe governor whos...,SteveScalise,1
...,...,...,...,...
21995,2020-08-20 02:43:49,“Donald Trump hasn’t grown into the job becaus...,JoeBiden,0
21996,2020-08-20 02:31:00,"Hillary: tonight, I’m reminded of one of your ...",JoeBiden,0
21997,2020-08-20 02:25:00,"I authored the Violence Against Women Act, and...",JoeBiden,0
21998,2020-08-20 02:23:56,RT @TeamJoe: .@ewarren knows that the only way...,JoeBiden,0


In [13]:
# Saving to a CSV file so we dont have to scrape again for analysis in other books
df1.to_csv('TwitterDataScrape10302020.csv')

In [None]:
#restart from here ###

In [9]:
df1 = pd.read_csv('TwitterDataScrape10182020.csv', index_col=0)

In [14]:
df1

Unnamed: 0,Timestamp of Tweet,Tweet Text,Twitter Handle,Is Republican
0,2020-10-30 15:52:39,RT @SelectGOP: Democrats’ latest partisan repo...,SteveScalise,1
1,2020-10-30 14:04:01,RT @Jim_Jordan: Republicans:\n\n-Lower taxes\n...,SteveScalise,1
2,2020-10-29 23:43:19,Under Democrats: More job-crushing lockdowns.\...,SteveScalise,1
3,2020-10-29 20:35:23,Yesterday's Senate hearings should just be the...,SteveScalise,1
4,2020-10-29 19:53:43,You've got to be kidding.\n\nThe governor whos...,SteveScalise,1
...,...,...,...,...
21995,2020-08-20 02:43:49,“Donald Trump hasn’t grown into the job becaus...,JoeBiden,0
21996,2020-08-20 02:31:00,"Hillary: tonight, I’m reminded of one of your ...",JoeBiden,0
21997,2020-08-20 02:25:00,"I authored the Violence Against Women Act, and...",JoeBiden,0
21998,2020-08-20 02:23:56,RT @TeamJoe: .@ewarren knows that the only way...,JoeBiden,0


In [15]:
# Learning from our previous mistake of trying to find information from the Twitter handles
df1.drop('Twitter Handle',axis = 1, inplace = True)

In [17]:
# Checking to see if the table dropped
df1

Unnamed: 0,Timestamp of Tweet,Tweet Text,Is Republican
0,2020-10-30 15:52:39,RT @SelectGOP: Democrats’ latest partisan repo...,1
1,2020-10-30 14:04:01,RT @Jim_Jordan: Republicans:\n\n-Lower taxes\n...,1
2,2020-10-29 23:43:19,Under Democrats: More job-crushing lockdowns.\...,1
3,2020-10-29 20:35:23,Yesterday's Senate hearings should just be the...,1
4,2020-10-29 19:53:43,You've got to be kidding.\n\nThe governor whos...,1
...,...,...,...
21995,2020-08-20 02:43:49,“Donald Trump hasn’t grown into the job becaus...,0
21996,2020-08-20 02:31:00,"Hillary: tonight, I’m reminded of one of your ...",0
21997,2020-08-20 02:25:00,"I authored the Violence Against Women Act, and...",0
21998,2020-08-20 02:23:56,RT @TeamJoe: .@ewarren knows that the only way...,0


In retrospect, it would've been smarter to save the csv file AFTER the cleaning process however let's use this as an opportunity to create a singular for loop to quickly clean the data!

In [18]:
# For loop for cleaning
prelist1 = []
prelist2 = []
prelist3 = []
for text in df1['Tweet Text']:
    souper2 = BeautifulSoup(text, "lxml")
    prelist1.append(souper2.get_text())
        
for text in prelist1:
    prelist2.append(p.clean(text))
    
for text in prelist2:
    prelist3.append(re.sub(r"^RT.*:","",text))

df1['Cleaned Tweet Text']= prelist3

In [19]:
# Check
df1

Unnamed: 0,Timestamp of Tweet,Tweet Text,Is Republican,Cleaned Tweet Text
0,2020-10-30 15:52:39,RT @SelectGOP: Democrats’ latest partisan repo...,1,Democrats latest partisan report underscores ...
1,2020-10-30 14:04:01,RT @Jim_Jordan: Republicans:\n\n-Lower taxes\n...,1,-Higher taxes -Less freedom -More lockdo
2,2020-10-29 23:43:19,Under Democrats: More job-crushing lockdowns.\...,1,Under Democrats: More job-crushing lockdowns. ...
3,2020-10-29 20:35:23,Yesterday's Senate hearings should just be the...,1,Yesterday's Senate hearings should just be the...
4,2020-10-29 19:53:43,You've got to be kidding.\n\nThe governor whos...,1,You've got to be kidding. The governor whose o...
...,...,...,...,...
21995,2020-08-20 02:43:49,“Donald Trump hasn’t grown into the job becaus...,0,Donald Trump hasnt grown into the job because ...
21996,2020-08-20 02:31:00,"Hillary: tonight, I’m reminded of one of your ...",0,"Hillary: tonight, Im reminded of one of your f..."
21997,2020-08-20 02:25:00,"I authored the Violence Against Women Act, and...",0,"I authored the Violence Against Women Act, and..."
21998,2020-08-20 02:23:56,RT @TeamJoe: .@ewarren knows that the only way...,0,. knows that the only way to defeat Donald Tr...


In [20]:
# Dropping the unclean column
df1.drop('Tweet Text',axis = 1, inplace = True)

In [21]:
# Check
df1

Unnamed: 0,Timestamp of Tweet,Is Republican,Cleaned Tweet Text
0,2020-10-30 15:52:39,1,Democrats latest partisan report underscores ...
1,2020-10-30 14:04:01,1,-Higher taxes -Less freedom -More lockdo
2,2020-10-29 23:43:19,1,Under Democrats: More job-crushing lockdowns. ...
3,2020-10-29 20:35:23,1,Yesterday's Senate hearings should just be the...
4,2020-10-29 19:53:43,1,You've got to be kidding. The governor whose o...
...,...,...,...
21995,2020-08-20 02:43:49,0,Donald Trump hasnt grown into the job because ...
21996,2020-08-20 02:31:00,0,"Hillary: tonight, Im reminded of one of your f..."
21997,2020-08-20 02:25:00,0,"I authored the Violence Against Women Act, and..."
21998,2020-08-20 02:23:56,0,. knows that the only way to defeat Donald Tr...


In [22]:
# Dropping the timestamp column since we wont need it
df2 = df1.drop('Timestamp of Tweet',axis = 1)

In [23]:
# Check
df2

Unnamed: 0,Is Republican,Cleaned Tweet Text
0,1,Democrats latest partisan report underscores ...
1,1,-Higher taxes -Less freedom -More lockdo
2,1,Under Democrats: More job-crushing lockdowns. ...
3,1,Yesterday's Senate hearings should just be the...
4,1,You've got to be kidding. The governor whose o...
...,...,...
21995,0,Donald Trump hasnt grown into the job because ...
21996,0,"Hillary: tonight, Im reminded of one of your f..."
21997,0,"I authored the Violence Against Women Act, and..."
21998,0,. knows that the only way to defeat Donald Tr...


In [24]:
# Setting the variables
X = df2['Cleaned Tweet Text']
y = df2['Is Republican']

In [25]:
# Creating train test split before the vectorizing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=9)

#### Any model can be run from here - not necessarily the one that comes in order

In [27]:
# Running our most successful model from the last workbook
tfidf = TfidfVectorizer(min_df=5, stop_words=ENGLISH_STOP_WORDS)
tfidf.fit(X_train)
X_train_tfidf = tfidf.transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# fit the model
my_best_model = LogisticRegression(C=1)
my_best_model.fit(X_train_tfidf, y_train)

# extract the coefficients
coefs = my_best_model.coef_
tokens = tfidf.get_feature_names()
results = pd.DataFrame({'tokens': tokens, 'coef': coefs[0]})

# Sort the rows by the coefficient of the word/token (from lowest to highest)
results.sort_values(by='coef', inplace=True)

print("Democratic Tokens")
display(results.head(100).tokens.values)


print("Republican Tokens")
display(results.tail(100).tokens.values)

Democratic Tokens


array(['democracy', 'donald', 'climate', 'familiesfirst', 'ballot',
       'trump', 'build', 'fitn', 'demconvention', 'voting', 'moment',
       'crisis', 'white', 'need', 'racism', 'chip', 'heroesact', 'vote',
       'plan', 'folks', 'everyone', 'families', 'lgbtq', 'mcconnell',
       'together', 'future', 'make', 'pandemic', 'lives', 'health',
       'works', 'act', 'public', 'people', 'progressive', 'early', 'mask',
       'billionaires', 'must', 'elect', 'capitol', 'wealth', 'reporters',
       'change', 'leaders', 'mail', 'trumps', 'us', 'flip', 'covid',
       'latino', 'workers', 'got', 'cannot', 'ensure', 'affordable',
       'jill', 'grassroots', 'corporate', 'infrastructure', 'country',
       'register', 'experience', 'dejoy', 'minnesota', 'class',
       'conversation', 'colorado', 'black', 'bennet', 'essential',
       'promise', 'economic', 'poverty', 'community', 'cases', 'etc',
       'voice', 'progress', 'care', 'everything', 'science', 'george',
       'masks', 'affo

Republican Tokens


array(['obama', 'ukraine', 'paycheck', 'antifa', 'pro', 'renew', 'pence',
       'evil', 'judges', 'fourmoreyears', 'amy', 'results', 'proxy',
       'book', 'nancy', 'conservatives', 'agree', 'speaker', 'announced',
       'packing', 'socialism', 'policies', 'thread', 'covid19', 'rt',
       'speaking', 'pennsylvania', 'operation', 'houston', 'round',
       'pack', 'confident', 'israel', 'ted', 'whistleblower', 'scotus',
       'socialist', 'peace', 'schiff', 'coney', 'patients', 'hunter',
       'america', 'smallbiz', 'football', 'conservative', 'bidens',
       'proverbs', 'smallbusiness', 'psalms', 'freedom', 'comey',
       'afghanistan', 'twitter', 'enforcement', 'liberty', 'report',
       'away', 'cruz', 'flynn', 'prayers', 'committee', 'lord',
       '100yearsofwomenssuffrage', 'praying', 'great', 'chinas',
       'wewanttoplay', 'left', 'cuomo', 'ccp', 'tech', 'media', 'pres',
       'greatest', 'florida', 'liberal', 'radical', 'fisa',
       'goodnewsoftheday', 'nursing', '

In [28]:
# Optimizing min_df parameter
dfvalue = [1,2,3,4,5,6,7,8,9,10]
nltk.download('stopwords')
ENGLISH_STOP_WORDS = stopwords.words('english')

results_df = pd.DataFrame(index=dfvalue, 
                          columns=["Training Accuracies", "Test Accuracies", "Min_DF"])
for d in dfvalue:
    tfidf = TfidfVectorizer(min_df=d, stop_words=ENGLISH_STOP_WORDS)
    tfidf.fit(X_train)
    X_train_tfidf = tfidf.transform(X_train)
    X_test_tfidf = tfidf.transform(X_test)

    # fit the model
    lr_model = LogisticRegression(C=1)
    lr_model.fit(X_train_tfidf, y_train)
    
    results_df.loc[d, "Training Accuracies"] = lr_model.score(X_train_tfidf, y_train)
    results_df.loc[d, "Test Accuracies"] = lr_model.score(X_test_tfidf, y_test)
    results_df.loc[d, "Min_DF"] = d

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\chadh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [30]:
# display with a heatmap background
display(results_df.apply(pd.to_numeric).style.background_gradient(cmap='Blues', 
                                                                  axis=0, 
                                                                  subset = ["Training Accuracies", "Test Accuracies"]))

Unnamed: 0,Training Accuracies,Test Accuracies,Min_DF
1,0.891879,0.827273,1
2,0.888,0.827455,2
3,0.883152,0.828182,3
4,0.880727,0.827818,4
5,0.877394,0.826545,5
6,0.874788,0.827818,6
7,0.872364,0.825636,7
8,0.869697,0.825818,8
9,0.867152,0.823091,9
10,0.864545,0.821636,10


In [32]:
# looking at the coefficients and the words 
results

Unnamed: 0,tokens,coef
1178,democracy,-4.764826
1177,demconvention,-4.476918
778,climate,-4.088613
1331,donald,-3.909582
1655,familiesfirst,-3.796348
...,...,...
1185,dems,3.298642
843,communist,3.522482
3182,pelosi,4.599541
731,china,4.748484


In [29]:
# Running our most successful model from the last workbook
tfidf2 = TfidfVectorizer(min_df=5, stop_words=ENGLISH_STOP_WORDS, ngram_range = (2,3))
tfidf2.fit(X_train)
X_train_tfidf2 = tfidf2.transform(X_train)
X_test_tfidf2 = tfidf2.transform(X_test)

# fit the model
my_best_model2 = LogisticRegression(C=1)
my_best_model2.fit(X_train_tfidf2, y_train)

# extract the coefficients
coefs = my_best_model2.coef_
tokens = tfidf2.get_feature_names()
results2gram = pd.DataFrame({'tokens': tokens, 'coef': coefs[0]})

# Sort the rows by the coefficient of the word/token (from lowest to highest)
results2gram.sort_values(by='coef', inplace=True)

print("Democratic Tokens")
display(results2gram.head(100).tokens.values)

print("Republican Tokens")
display(results2gram.tail(100).tokens.values)

Democratic Tokens


array(['donald trump', 'working families', 'climate change', 'vote early',
       'health care', 'donald trumps', 'plan vote', 'need president',
       'pm et', 'voting plan', 'make plan', 'register vote',
       'young people', 'working people', 'vote mail', 'next president',
       'systemic racism', 'new hampshire', 'mitch mcconnell',
       'working class', 'proud endorse', 'million americans',
       'take action', 'climate crisis', 'voter suppression',
       'cast ballot', 'fitn nhpolitics', 'real deal', 'fitn primary',
       'house passed', 'voting rights', 'come together',
       'affordable care', 'back better', 'wall street', 'commander chief',
       'join us', 'americans died', 'ballot box', 'white supremacists',
       'friends family', 'economic crisis', 'racial justice',
       'john lewis', 'build back', 'black brown', 'deserve president',
       'campaign trail', 'million people', 'white supremacist',
       'white house', 'voting mail', 'american democracy',
       

Republican Tokens


array(['death rate', 'amy coney barrett', 'great american',
       'war afghanistan', 'god bless', 'deep state',
       'chinese communist party', 'pack supreme', 'pack supreme court',
       'sham impeachment', 'prayers family', 'restore way',
       'career politician', 'free markets', 'thanks president',
       'dr fauci', 'vp pence', 'house republicans', 'make america great',
       'air force', 'second amendment', 'commitment america',
       'joe bidens', 'north carolina', 'discharge petition', 'law order',
       'back work', 'working hard', 'years president', 'restore order',
       'house democrats', 'coney barrett', 'michael flynn', 'text vote',
       'proxy voting', 'op ed', 'great news', 'welcome back',
       'china accountable', 'chuck schumer', 'senate democrats',
       'defunding police', 'middle east', 'make america', 'back home',
       'kamala harris', 'nursing home', 'red tape', 'million jobs',
       'way life', 'law enforcement', 'four years president',
       '