In [1]:
import tweepy
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt

# To split the data as necessary for modelling
from sklearn.model_selection import train_test_split

# To build a simple model
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# To get rid of logistic regression default solver warnings that appear if sklearn hasn't been updated
import warnings
warnings.filterwarnings('ignore')

# To "pickle" things, like accuracies or even an entire fitted model
import joblib

# To cross-validate
from sklearn.model_selection import cross_val_score

# To try scaling the data in various ways
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler

# To try dimensionality reduction
from sklearn.decomposition import PCA

# To do a cross-validated grid search
from sklearn.model_selection import GridSearchCV

from bs4 import BeautifulSoup
import re
import preprocessor as p
p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.MENTION, p.OPT.SMILEY, p.OPT.NUMBER)
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import nltk
nltk.download('stopwords')
ENGLISH_STOP_WORDS = stopwords.words('english')
from nltk.corpus import stopwords
from wordcloud import WordCloud

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\chadh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


NameError: name 'stopwords' is not defined

In [2]:
# Reading in the data from the CSV file we created
df1=pd.read_csv('TwitterDataScrape.csv',index_col=0)

In [3]:
# Check
df1

Unnamed: 0,Timestamp of Tweet,Tweet Text,Twitter Handle,Is Republican
0,2020-09-12 21:56:57,"RT @rpof_gamble: Rain🌧 or Shine🌞, Raquel &amp;...",GOP,1
1,2020-09-12 21:55:09,RT @mfreckletonGOP: The road to November conti...,GOP,1
2,2020-09-12 21:54:40,RT @oliviaintheusa: Trump Victory Fellow &amp;...,GOP,1
3,2020-09-12 21:53:56,RT @EliseStefanik: HUGE enthusiasm for @realDo...,GOP,1
4,2020-09-12 21:52:12,RT @sukyeesmith: Incredible #VeteransForTrump ...,GOP,1
...,...,...,...,...
10995,2020-08-07 13:04:00,While this president only cares about the rich...,TheDemocrats,0
10996,2020-08-07 02:57:00,RT @DemConvention: If we want to truly honor t...,TheDemocrats,0
10997,2020-08-07 01:57:00,Trump failed to take decisive action against t...,TheDemocrats,0
10998,2020-08-07 00:57:00,RT @JoeBiden: The fight for civil rights is wh...,TheDemocrats,0


In [8]:
# Learning from our previous mistake of trying to find information from the Twitter handles
df1.drop('Twitter Handle',axis = 1, inplace = True)

In [9]:
# Checking to see if the table dropped
df1

Unnamed: 0,Timestamp of Tweet,Tweet Text,Is Republican
0,2020-09-12 21:56:57,"RT @rpof_gamble: Rain🌧 or Shine🌞, Raquel &amp;...",1
1,2020-09-12 21:55:09,RT @mfreckletonGOP: The road to November conti...,1
2,2020-09-12 21:54:40,RT @oliviaintheusa: Trump Victory Fellow &amp;...,1
3,2020-09-12 21:53:56,RT @EliseStefanik: HUGE enthusiasm for @realDo...,1
4,2020-09-12 21:52:12,RT @sukyeesmith: Incredible #VeteransForTrump ...,1
...,...,...,...
10995,2020-08-07 13:04:00,While this president only cares about the rich...,0
10996,2020-08-07 02:57:00,RT @DemConvention: If we want to truly honor t...,0
10997,2020-08-07 01:57:00,Trump failed to take decisive action against t...,0
10998,2020-08-07 00:57:00,RT @JoeBiden: The fight for civil rights is wh...,0


In retrospect, it would've been smarter to save the csv file AFTER the cleaning process however let's use this as an opportunity to create a singular for loop to quickly clean the data!

In [10]:
# For loop for cleaning
prelist1 = []
prelist2 = []
prelist3 = []
for text in df1['Tweet Text']:
    souper2 = BeautifulSoup(text, "lxml")
    prelist1.append(souper2.get_text())
        
for text in prelist1:
    prelist2.append(p.clean(text))
    
for text in prelist2:
    prelist3.append(re.sub(r"^RT.*:","",text))

df1['Cleaned Tweet Text']= prelist3

In [11]:
# Check
df1

Unnamed: 0,Timestamp of Tweet,Tweet Text,Is Republican,Cleaned Tweet Text
0,2020-09-12 21:56:57,"RT @rpof_gamble: Rain🌧 or Shine🌞, Raquel &amp;...",1,"Rain or Shine, Raquel & I are hitting doors f..."
1,2020-09-12 21:55:09,RT @mfreckletonGOP: The road to November conti...,1,The road to November continues! Team and #TVM...
2,2020-09-12 21:54:40,RT @oliviaintheusa: Trump Victory Fellow &amp;...,1,Trump Victory Fellow & Campus Team Leader Car...
3,2020-09-12 21:53:56,RT @EliseStefanik: HUGE enthusiasm for @realDo...,1,HUGE enthusiasm for at our Rally in Cambridge...
4,2020-09-12 21:52:12,RT @sukyeesmith: Incredible #VeteransForTrump ...,1,Incredible #VeteransForTrump and #Sportsmenfo...
...,...,...,...,...
10995,2020-08-07 13:04:00,While this president only cares about the rich...,0,While this president only cares about the rich...
10996,2020-08-07 02:57:00,RT @DemConvention: If we want to truly honor t...,0,If we want to truly honor the legacy of John ...
10997,2020-08-07 01:57:00,Trump failed to take decisive action against t...,0,Trump failed to take decisive action against t...
10998,2020-08-07 00:57:00,RT @JoeBiden: The fight for civil rights is wh...,0,The fight for civil rights is what drove me t...


In [95]:
# Dropping the unclean column
df1.drop('Tweet Text',axis = 1, inplace = True)

In [96]:
# Check
df1

Unnamed: 0,Timestamp of Tweet,Is Republican,Cleaned Tweet Text
0,2020-09-12 21:56:57,1,"Rain or Shine, Raquel & I are hitting doors f..."
1,2020-09-12 21:55:09,1,The road to November continues! Team and #TVM...
2,2020-09-12 21:54:40,1,Trump Victory Fellow & Campus Team Leader Car...
3,2020-09-12 21:53:56,1,HUGE enthusiasm for at our Rally in Cambridge...
4,2020-09-12 21:52:12,1,Incredible #VeteransForTrump and #Sportsmenfo...
...,...,...,...
10995,2020-08-07 13:04:00,0,While this president only cares about the rich...
10996,2020-08-07 02:57:00,0,If we want to truly honor the legacy of John ...
10997,2020-08-07 01:57:00,0,Trump failed to take decisive action against t...
10998,2020-08-07 00:57:00,0,The fight for civil rights is what drove me t...


In [12]:
# Dropping the timestamp column since we wont need it
df2 = df1.drop('Timestamp of Tweet',axis = 1)

In [13]:
# Check
df2

Unnamed: 0,Tweet Text,Is Republican,Cleaned Tweet Text
0,"RT @rpof_gamble: Rain🌧 or Shine🌞, Raquel &amp;...",1,"Rain or Shine, Raquel & I are hitting doors f..."
1,RT @mfreckletonGOP: The road to November conti...,1,The road to November continues! Team and #TVM...
2,RT @oliviaintheusa: Trump Victory Fellow &amp;...,1,Trump Victory Fellow & Campus Team Leader Car...
3,RT @EliseStefanik: HUGE enthusiasm for @realDo...,1,HUGE enthusiasm for at our Rally in Cambridge...
4,RT @sukyeesmith: Incredible #VeteransForTrump ...,1,Incredible #VeteransForTrump and #Sportsmenfo...
...,...,...,...
10995,While this president only cares about the rich...,0,While this president only cares about the rich...
10996,RT @DemConvention: If we want to truly honor t...,0,If we want to truly honor the legacy of John ...
10997,Trump failed to take decisive action against t...,0,Trump failed to take decisive action against t...
10998,RT @JoeBiden: The fight for civil rights is wh...,0,The fight for civil rights is what drove me t...


In [14]:
# Setting the variables
X = df2['Cleaned Tweet Text']
y = df2['Is Republican']

In [15]:
# Creating train test split before the vectorizing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=9)

In [18]:
# Running our most successful model from the last workbook
tfidf = TfidfVectorizer(min_df=5, stop_words=ENGLISH_STOP_WORDS)
tfidf.fit(X_train)
X_train_tfidf = tfidf.transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# fit the model
my_best_model = LogisticRegression(C=1)
my_best_model.fit(X_train_tfidf, y_train)

# extract the coefficients
coefs = my_best_model.coef_
tokens = tfidf.get_feature_names()
results = pd.DataFrame({'tokens': tokens, 'coef': coefs[0]})

# Sort the rows by the coefficient of the word/token (from lowest to highest)
results.sort_values(by='coef', inplace=True)

print("Democratic Tokens")
display(results.head(50).tokens.values)


print("Republican Tokens")
display(results.tail(50).tokens.values)

Democratic Tokens


array(['demconvention', 'donald', 'democracy', 'climate', 'build', 'plan',
       'heroesact', 'trump', 'need', 'elect', 'white', 'racism', 'fight',
       'together', 'change', 'crisis', 'vote', 'everyone', 'fitn', 'us',
       'people', 'black', 'voting', 'ballot', 'must', 'moment', 'promise',
       'hampshire', 'health', 'country', 'pandemic', 'mail', 'better',
       'covid', 'dejoy', 'leadership', 'workers', 'families', 'iowa',
       'early', 'next', 'wealth', 'future', 'racial', 'got', 'tonight',
       'failed', 'family', 'capitol', 'mask'], dtype=object)

Republican Tokens


array(['proverbs', 'tennessee', 'covid19', 'comeback', 'open', 'nancy',
       'pence', 'texas', 'republican', 'chinas', 'reform', 'added',
       'thread', 'government', 'football', 'weeks', 'american', 'rate',
       'police', '100yearsofwomenssuffrage', 'dems', 'operation',
       'liberal', 'great', 'houston', 'home', 'greatest', 'law',
       'speaker', 'lord', 'wewanttoplay', 'florida', 'pennsylvania',
       'ccp', 'cuomo', 'fbi', 'carolina', 'nursing', 'god', 'mob',
       'biden', 'communist', 'chinese', 'fisa', 'left', 'democrat',
       'pelosi', 'china', 'democrats', 'rnc2020'], dtype=object)

In [35]:
# Optimizing min_df parameter
dfvalue = [1,2,3,4,5,6,7,8,9,10]
nltk.download('stopwords')
ENGLISH_STOP_WORDS = stopwords.words('english')

results_df = pd.DataFrame(index=dfvalue, 
                          columns=["Training Accuracies", "Test Accuracies", "Min_DF"])
for d in dfvalue:
    tfidf = TfidfVectorizer(min_df=d, stop_words=ENGLISH_STOP_WORDS)
    tfidf.fit(X_train)
    X_train_tfidf = tfidf.transform(X_train)
    X_test_tfidf = tfidf.transform(X_test)

    # fit the model
    lr_model = LogisticRegression(C=1)
    lr_model.fit(X_train_tfidf, y_train)
    
    results_df.loc[d, "Training Accuracies"] = lr_model.score(X_train_tfidf, y_train)
    results_df.loc[d, "Test Accuracies"] = lr_model.score(X_test_tfidf, y_test)
    results_df.loc[d, "Min_DF"] = d

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\chadh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [22]:
# display with a heatmap background
display(results_df.apply(pd.to_numeric).style.background_gradient(cmap='Blues', 
                                                                  axis=0, 
                                                                  subset = ["Training Accuracies", "Test Accuracies"]))

Unnamed: 0,Training Accuracies,Test Accuracies,Min_DF
1,0.903515,0.819273,1
2,0.899273,0.815273,2
3,0.895152,0.814909,3
4,0.890909,0.813091,4
5,0.886545,0.811273,5
6,0.884364,0.808727,6
7,0.881939,0.806909,7
8,0.878424,0.802182,8
9,0.87503,0.800727,9
10,0.87103,0.800727,10


Looks like the min_df of 5 worked best so we will stick with that. Lets try some different ML models.

In [None]:
# GridSearch for the best TF-IDF model

from sklearn.pipeline import make_pipeline

# 2. Setup the param grid
param_grid = [
    {'classifier__C':[0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]}, 
    {'classifier':[KNeighborsClassifier()],
     'classifier__n_neighbors':[1,2,3,4,5,6,7,8,9,10]},
    {'classifier':[DecisionTreeClassifier()],
     'classifier__max_depth':[1,2,3,4,5,6,7,8,9,10]}] # Decision Tree with PCA

In [184]:
from sklearn.pipeline import Pipeline
# 1. Make the pipeline (instantiate a placeholder)
estimators = [('classifier', LogisticRegression())]
mypipeline = Pipeline(estimators)

In [36]:
# prepare the data
tfidf = TfidfVectorizer(min_df=5, stop_words=ENGLISH_STOP_WORDS)
tfidf.fit(X_train)
X_train_tfidf = tfidf.transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [37]:
# Checking the shapes
X_train_tfidf.shape

(8250, 3654)

In [187]:
y_train.shape

(8250,)

In [188]:
mygs = GridSearchCV(mypipeline, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1)

In [189]:
# 4. Fit
mygs.fit(X_train_tfidf, y_train)

Fitting 5 folds for each of 28 candidates, totalling 140 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done 140 out of 140 | elapsed:    8.0s finished


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('classifier', LogisticRegression())]),
             n_jobs=-1,
             param_grid=[{'classifier__C': [0.0001, 0.001, 0.01, 0.1, 1, 10,
                                            100, 1000]},
                         {'classifier': [KNeighborsClassifier()],
                          'classifier__n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9,
                                                      10]},
                         {'classifier': [DecisionTreeClassifier()],
                          'classifier__max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9,
                                                    10]}],
             verbose=1)

In [190]:
mygs.best_estimator_

Pipeline(steps=[('classifier', LogisticRegression(C=1))])

In [213]:
# 5. Score
mygs.score(X_test_tfidf, y_test)

0.8112727272727273

Looks like the original Logistic Regression model we have already run worked the best: TF-IDF vectorizer with min_df of 5 and an L2 regularization of C=1. Lets try a count vectorizer for experiment's sake even though it doesnt perform well and see if the highly predictive words change.

In [19]:
# Fitting a countvectorizer and taking a look at the tokens
countvec = CountVectorizer(min_df=5, stop_words=ENGLISH_STOP_WORDS)
countvec.fit(X_train)
X_train_countvec = countvec.transform(X_train)
X_test_countvec = countvec.transform(X_test)

# fit the model
my_best_model2 = LogisticRegression(C=1)
my_best_model2.fit(X_train_countvec, y_train)

# extract the coefficients
coefs = my_best_model2.coef_
tokens = countvec.get_feature_names()
results2 = pd.DataFrame({'tokens': tokens, 'coef': coefs[0]})

# Sort the rows by the coefficient of the word/token (from lowest to highest)
results2.sort_values(by='coef', inplace=True)

print("Democratic Tokens")
display(results2.head(50).tokens.values)


print("Republican Tokens")
display(results2.tail(50).tokens.values)

Democratic Tokens


array(['demconvention', 'climate', 'heroesact', 'fitn', 'donald',
       'democracy', 'racism', 'elect', 'international',
       'justiceinpolicing', 'g7parliament', 'urgently', 'promisesbroken',
       'dejoy', 'educators', 'promise', 'bennet', 'ice', 'build',
       'progressive', 'bounties', 'iowa', 'mind', 'trumpchaos',
       'affordable', 'etc', 'something', 'klobuchar', 'misinformation',
       'realdealroadtrip', 'moral', 'nhprimary2020', 'dontmesswithusps',
       'daughter', 'housing', 'investing', 'el', 'capitol', 'wealth',
       'colorado', 'transform', 'presidency', 'accountability',
       'financial', 'designed', 'organize', 'hampshire', 'familiesfirst',
       'square', 'postmaster'], dtype=object)

Republican Tokens


array(['robert', 'supply', 'analysis', 'phase', 'tiktok', 'restrictions',
       'proverbs', 'judges', 'wow', 'florida', 'comeback', 'confidence',
       'pence', 'psalms', 'weeks', 'fbi', 'laid', 'storm',
       'smallbusiness', 'expected', 'rebuilding', 'proxy', 'kelley',
       'tennessee', 'usmca', '100yearsofwomenssuffrage', 'added',
       'operation', 'left', 'football', 'china', 'liberal', 'cuomo',
       'god', 'greatest', 'pennsylvania', 'lord', 'wewanttoplay',
       'thread', 'mob', 'houston', 'carolina', 'ccp', 'democrats',
       'communist', 'chinese', 'fisa', 'democrat', 'pelosi', 'rnc2020'],
      dtype=object)

Looks like most of the words stayed the same here with some minor differences!

In [212]:
my_best_model2.score(X_test_countvec, y_test)

0.8141818181818182

In [201]:
from sklearn.pipeline import make_pipeline

# 2. Setup the param grid
param_grid2 = [
    {'classifier__C':[0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]}, 
    {'classifier':[KNeighborsClassifier()],
     'classifier__n_neighbors':[1,2,3,4,5,6,7,8,9,10]},
    {'classifier':[DecisionTreeClassifier()],
     'classifier__max_depth':[1,2,3,4,5,6,7,8,9,10]}] # Decision Tree with PCA

In [202]:
from sklearn.pipeline import Pipeline
# 1. Make the pipeline (instantiate a placeholder)
estimators2 = [('classifier', LogisticRegression())]
mypipeline2 = Pipeline(estimators2)

In [203]:
X_train_countvec.shape

(8250, 3654)

In [204]:
y_train.shape

(8250,)

In [205]:
mygs2 = GridSearchCV(mypipeline2, param_grid=param_grid2, cv=5, n_jobs=-1, verbose=1)

In [206]:
# 4. Fit
mygs2.fit(X_train_countvec, y_train)

Fitting 5 folds for each of 28 candidates, totalling 140 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 129 out of 140 | elapsed:    6.6s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done 140 out of 140 | elapsed:    6.7s finished


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('classifier', LogisticRegression())]),
             n_jobs=-1,
             param_grid=[{'classifier__C': [0.0001, 0.001, 0.01, 0.1, 1, 10,
                                            100, 1000]},
                         {'classifier': [KNeighborsClassifier()],
                          'classifier__n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9,
                                                      10]},
                         {'classifier': [DecisionTreeClassifier()],
                          'classifier__max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9,
                                                    10]}],
             verbose=1)

In [207]:
mygs.best_estimator_

Pipeline(steps=[('classifier', LogisticRegression(C=1))])

In [208]:
# 5. Score
mygs.score(X_test_countvec, y_test)

0.7654545454545455

In [24]:
# Authenticate to Twitter
auth = tweepy.OAuthHandler("OENDOLh2rnnwsUKOO6XO1WED9", 
    "opxleI1SC7wrCLDZpQXWpoDhL2ntNXvv7fSx09OiQnh9rqWStd")
auth.set_access_token("1292145515925307393-YT3Bg347EwCst3XgRS43EtLM7i9B8l", 
    "VvX4ikUwSukdZzUMyMCYa4tUM4He4craOxZXDAb5BtSDK")

api = tweepy.API(auth)

try:
    api.verify_credentials()
    print("Authentication OK")
except:
    print("Error during authentication")

Authentication OK


In [25]:
# Setting notifier for hitting the wait limit
api = tweepy.API(auth, wait_on_rate_limit=True,
    wait_on_rate_limit_notify=True)

In [334]:
count = 500
anguskingtweets = []

tweets = tweepy.Cursor(api.user_timeline,id='SenAngusKing', include_rts = True, tweet_mode = 'extended').items(count)
tweets_list2 = [[tweet.full_text] for tweet in tweets]
anguskingtweets.extend(tweets_list2)

In [335]:
anguskingtweets = pd.DataFrame(anguskingtweets, dtype='string')

In [336]:
anguskingtweets = anguskingtweets.astype('string')

In [337]:
anguskingtweets

Unnamed: 0,0
0,"For decades @PortlandChamber’s Eggs &amp; Issues has convened for important conversations – and now, that tradition continues virtually! Grateful to speak to hundreds of Maine people about election security, economic development, and how we can support each other during the pandemic. https://t.co/RB9pS7qYNR"
1,"Maine’s artists drive significant economic activity, inspire us – and employ thousands of Maine people in communities across our state. During yesterday’s call with @MaineArts, I emphasized my belief that Congress should provide aid to this hard-hit industry. https://t.co/7FMmqGZM8O"
2,"On the anniversary of #September11, we must come together to honor those who were lost on that terrible day, and to recommit ourselves to the values that shone through in the aftermath of these vicious attacks. https://t.co/TcXUEHu2iA"
3,I’ll be joining @JohnBerman on @NewDay at ~7:15 — hope you can tune in.
4,"Before passing the CARES Act, the Senate voted down Senator McConnell’s partisan proposal – twice – leading to negotiations that secured vital funding for Maine and the nation. Today's vote should do the same; the American people are counting on it. My full statement: https://t.co/B0P76RmlrA"
...,...
495,"Good News for Maine: @YorkCountyCAP (Sanford area) and Downeast Community Partners (Ellsworth area) have received a total of $2.5m towards Head Start programs – a fun, safe way for Maine kids to learn and grow while their parents attend school or work. https://t.co/9keC0g4cqE"
496,"Don’t wait! The ACA deadline is next Sunday (12/15) which leaves only 7 days to review your options and find the best healthcare plan for you and your family. To sign up, Maine people can call (800) 965-7476 or visit https://t.co/qn8HPHH2iH"
497,"Today, we remember the horror that shook our country 78 years ago: the attack on Pearl Harbor. That grave day claimed the lives of over 2,000, wounded many others, and plunged the U.S. into WWII. We memorialize their patriotism, bravery, and sense of duty in the face of peril."
498,"Extending a big thanks to Sen. Alexander, and the folks at @ORNL for welcoming me to their lab! The innovations underway here, in collaboration with @UMaine and @ENERGY, are helping to revolutionize Maine’s forest economy. Great news for all who rely on this historic industry. https://t.co/ajWKf51hbJ"


In [338]:
anguslist1 = []
for text in anguskingtweets[0]:
    souper2 = BeautifulSoup(text, "lxml")
    anguslist1.append(souper2.get_text())

In [None]:
p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.MENTION, p.OPT.SMILEY, p.OPT.NUMBER)

In [339]:
anguslist2 =[]
for text in anguslist1:
    anguslist2.append(p.clean(text))

In [340]:
anguslist3 =[]
for text in anguslist2:
    anguslist3.append(re.sub(r"^RT.*:","",text))

In [341]:
angus_test_tfidf = tfidf.transform(anguslist3)

In [342]:
angus = my_best_model.predict_proba(angus_test_tfidf)

In [343]:
angusdf = pd.DataFrame(angus)

In [345]:
angusdf

Unnamed: 0,0,1
0,0.849156,0.150844
1,0.757710,0.242290
2,0.798679,0.201321
3,0.589169,0.410831
4,0.763283,0.236717
...,...,...
495,0.400304,0.599696
496,0.713842,0.286158
497,0.680503,0.319497
498,0.328771,0.671229


In [346]:
angusdf[0].mean()

0.5995061041418653

In [347]:
angusdf[1].mean()

0.4004938958581348

The above method, although initially seemed correct was definetely the wrong way to go about it. Lets instead get a prediction for each tweet and then get a value count of how many tweets get classified as either republican or democratic.

In [348]:
testpredict2 = []
for x in range(len(angusdf[0])):
    testpredict2.append(angusdf.iloc[x,:].idxmax(axis=1))
    

In [349]:
anguspredic = pd.DataFrame(testpredict2)

In [350]:
anguspredic[0].value_counts()

0    355
1    145
Name: 0, dtype: int64

In [351]:
355/500

0.71

Angus King has 71% of his tweets classify as democratic. This makes sense because although he is an independant, he caucuses with the democratic party for committee assignment purposes. I expect any actual Democrat to have more tweets than Angus King that classify as so.

In [79]:
count = 500
trumptweets = []

tweets = tweepy.Cursor(api.user_timeline,id='realDonaldTrump', include_rts = True, tweet_mode = 'extended').items(count)
tweets_list2 = [[tweet.full_text] for tweet in tweets]
trumptweets.extend(tweets_list2)

In [80]:
trumptweets = pd.DataFrame(trumptweets, dtype='string')

In [81]:
trumptweets = trumptweets.astype('string')

In [82]:
trumptweets

Unnamed: 0,0
0,I want to defend &amp; preserve our nation’s h...
1,Comey is a disaster who cheated and lied. How ...
2,Last night I did what the corrupt media has re...
3,"Thank you Duluth, Minnesota! https://t.co/OY9..."
4,RT @paulsperry_: all this memory loss and out-...
...,...
495,Michelle @FischbachMN7 is running for Congress...
496,Congressman @PeteStauber fights for the hard-w...
497,".@ScottTaylorVA is a fighter for Virginia, and..."
498,Bob Good (@GoodForCongress) will be a terrific...


In [83]:
trumplist1 = []
for text in trumptweets[0]:
    souper2 = BeautifulSoup(text, "lxml")
    trumplist1.append(souper2.get_text())

In [85]:
trumplist2 =[]
for text in trumplist1:
    trumplist2.append(p.clean(text))

In [86]:
trumplist3 =[]
for text in trumplist2:
    trumplist3.append(re.sub(r"^RT.*:","",text))

In [87]:
trump_test_tfidf = tfidf.transform(trumplist3)

In [88]:
trump = my_best_model.predict_proba(trump_test_tfidf)

In [89]:
trumpdf = pd.DataFrame(trump)

In [90]:
trumpdf

Unnamed: 0,0,1
0,0.050463,0.949537
1,0.180065,0.819935
2,0.475768,0.524232
3,0.368139,0.631861
4,0.329517,0.670483
...,...,...
495,0.245745,0.754255
496,0.441464,0.558536
497,0.265884,0.734116
498,0.199918,0.800082


In [91]:
testpredict = []
for x in range(len(trumpdf[0])):
    testpredict.append(trumpdf.iloc[x,:].idxmax(axis=1))
    

In [92]:
trumppredic = pd.DataFrame(testpredict)

In [93]:
trumppredic[0].value_counts()

1    413
0     87
Name: 0, dtype: int64

In [94]:
87/500

0.174

In [95]:
413/500

0.826

82.6% of Donald Trumps tweets classify as Republican!

In [112]:
count = 500
mrtweets = []

tweets = tweepy.Cursor(api.user_timeline,id='MittRomney', include_rts = True, tweet_mode = 'extended').items(count)
tweets_list2 = [[tweet.full_text] for tweet in tweets]
mrtweets.extend(tweets_list2)

In [113]:
mrtweets = pd.DataFrame(mrtweets, dtype='string')

In [114]:
mrtweets = mrtweets.astype('string')

In [115]:
mrtweets

Unnamed: 0,0
0,They got my casual Saturday night look just ri...
1,"Happy to hear all is well, Ron! Ann and I will..."
2,Fundamental to democracy is the peaceful trans...
3,My statement on the passing of Ruth Bader Gins...
4,Never forget the heartbreak of love and lives ...
...,...
495,The murder of Boris Nemtsov reminds us that ty...
496,My heart goes out to @AuditorSchweich's family...
497,"Love you too, sweetie. RT @AnnDRomney One of m..."
498,"With the angels, we weep today for Kayla Muell..."


In [116]:
mrlist1 = []
for text in mrtweets[0]:
    souper2 = BeautifulSoup(text, "lxml")
    mrlist1.append(souper2.get_text())

In [117]:
mrlist2 =[]
for text in mrlist1:
    mrlist2.append(p.clean(text))

In [118]:
mrlist3 =[]
for text in mrlist2:
    mrlist3.append(re.sub(r"^RT.*:","",text))

In [119]:
mr_test_tfidf = tfidf.transform(mrlist3)

In [120]:
mr = my_best_model.predict_proba(mr_test_tfidf)

In [121]:
mrdf = pd.DataFrame(mr)

In [122]:
mrdf

Unnamed: 0,0,1
0,0.827318,0.172682
1,0.259007,0.740993
2,0.458034,0.541966
3,0.580198,0.419802
4,0.744134,0.255866
...,...,...
495,0.358638,0.641362
496,0.604397,0.395603
497,0.590624,0.409376
498,0.234946,0.765054


In [123]:
testpredict3 = []
for x in range(len(mrdf[0])):
    testpredict3.append(mrdf.iloc[x,:].idxmax(axis=1))
    

In [124]:
mrpredic = pd.DataFrame(testpredict3)

In [125]:
mrpredic[0].value_counts()

1    286
0    214
Name: 0, dtype: int64

In [126]:
286/500

0.572

In [107]:
count = 500
botweets = []

tweets = tweepy.Cursor(api.user_timeline,id='BarackObama', include_rts = True, tweet_mode = 'extended').items(count)
tweets_list2 = [[tweet.full_text] for tweet in tweets]
botweets.extend(tweets_list2)

In [353]:
botweets = pd.DataFrame(botweets, dtype='string')

In [354]:
botweets = botweets.astype('string')

In [355]:
botweets

Unnamed: 0,0
0,"Today we remember the lives we lost on 9/11. Even the smallest act of service is a way to honor them. And just like Jay Winuk, we can honor their memories through service. https://t.co/BdKDozK5t0 https://t.co/Jc4vg4Dqoh"
1,The fires across the West Coast are just the latest examples of the very real ways our changing climate is changing our communities. Protecting our planet is on the ballot. Vote like your life depends on it—because it does. https://t.co/gKGegXWxQu
2,"Great to catch up with our next Vice President, @KamalaHarris. I wanted to make sure to share a few tips about serving alongside our friend @JoeBiden. https://t.co/ncidvmylch"
3,"RT @JoeBiden: Today, we honor the generations of union workers who fought for the rights, power, wages, and benefits that built and sustain…"
4,"This Labor Day, let’s thank all those who've kept our country going this year—nurses, teachers, delivery drivers, food service workers, and many more. We can honor them by building our system back even better—so that essential workers are treated like it, pandemic or not."
...,...
495,We remember everyone we lost on 9/11 and honor all who defend our country and our ideals. No act of terror will ever change who we are.
496,"RT @GeorgeHWBush: Across the U.S., Americans have answered the call to help with hurricane recovery. Praying for all Floridians. #Irma http…"
497,Proud of these McKinley Tech students—inspiring young minds that make me hopeful about our future. https://t.co/nqYC1mjjTB
498,Americans always answer the call. https://t.co/SV1jixOExu https://t.co/ktEvL6s89d


In [356]:
bolist1 = []
for text in botweets[0]:
    souper2 = BeautifulSoup(text, "lxml")
    bolist1.append(souper2.get_text())

In [357]:
bolist2 =[]
for text in bolist1:
    bolist2.append(p.clean(text))

In [358]:
bolist3 =[]
for text in bolist2:
    bolist3.append(re.sub(r"^RT.*:","",text))

In [359]:
bo_test_tfidf = tfidf.transform(bolist3)

In [360]:
bo = my_best_model.predict_proba(bo_test_tfidf)

In [361]:
bodf = pd.DataFrame(bo)

In [362]:
bodf

Unnamed: 0,0,1
0,0.738570,0.261430
1,0.782382,0.217618
2,0.656545,0.343455
3,0.881509,0.118491
4,0.909949,0.090051
...,...,...
495,0.827470,0.172530
496,0.247807,0.752193
497,0.559868,0.440132
498,0.627293,0.372707


In [363]:
testpredict3 = []
for x in range(len(bodf[0])):
    testpredict3.append(bodf.iloc[x,:].idxmax(axis=1))
    

In [364]:
bopredic = pd.DataFrame(testpredict3)

In [365]:
bopredic[0].value_counts()

0    388
1    112
Name: 0, dtype: int64

In [366]:
388/500

0.776

Makes sense that Obama has more tweets than Angus King that classify as Democratic!

In [26]:
count = 500
qatweets = []

tweets = tweepy.Cursor(api.user_timeline,id='QanonAnonymous', include_rts = True, tweet_mode = 'extended').items(count)
tweets_list2 = [[tweet.full_text] for tweet in tweets]
qatweets.extend(tweets_list2)

In [28]:
qatweets = pd.DataFrame(qatweets, dtype='string')

In [29]:
qatweets

Unnamed: 0,0
0,@JordanUhl @rickyftw @mattg00d @tomfromstray @...
1,RT @JordanUhl: I'm in. Don't forget @rickyftw ...
2,The QAA boys prepare to go toe-to-toe with the...
3,RT @travis_view: Broke: Biden was wearing a de...
4,RT @julianfeeld: QAnon dating profiles in a de...
...,...
495,RT @julianfeeld: This self-described shaman is...
496,Here the crowd breaks into “where we go one we...
497,Today a group of pizzagate and QAnon believers...
498,@julianfeeld Here with @travis_view and anothe...


In [30]:
qalist1 = []
for text in qatweets[0]:
    souper2 = BeautifulSoup(text, "lxml")
    qalist1.append(souper2.get_text())

In [31]:
qalist2 =[]
for text in qalist1:
    qalist2.append(p.clean(text))

In [32]:
qalist3 =[]
for text in qalist2:
    qalist3.append(re.sub(r"^RT.*:","",text))

In [38]:
qa_test_tfidf = tfidf.transform(qalist3)

In [39]:
qa = my_best_model.predict_proba(qa_test_tfidf)

In [40]:
qadf = pd.DataFrame(qa)

In [41]:
qadf

Unnamed: 0,0,1
0,0.227280,0.772720
1,0.209569,0.790431
2,0.245298,0.754702
3,0.188042,0.811958
4,0.325461,0.674539
...,...,...
495,0.580965,0.419035
496,0.560535,0.439465
497,0.281256,0.718744
498,0.414493,0.585507


In [43]:
testpredict4 = []
for x in range(len(qadf[0])):
    testpredict4.append(qadf.iloc[x,:].idxmax(axis=1))
    

In [44]:
qapredic = pd.DataFrame(testpredict4)

In [45]:
qapredic[0].value_counts()

1    352
0    148
Name: 0, dtype: int64

In [46]:
352/500

0.704

In [47]:
count = 500
aftweets = []

tweets = tweepy.Cursor(api.user_timeline,id='antifaintl', include_rts = True, tweet_mode = 'extended').items(count)
tweets_list2 = [[tweet.full_text] for tweet in tweets]
aftweets.extend(tweets_list2)

In [48]:
aftweets = pd.DataFrame(aftweets, dtype='string')

In [49]:
aftweets

Unnamed: 0,0
0,@Florida_Thinker https://t.co/rde3E5fgKA
1,"RT @ThoughtSlime: Boy, can I pick a timely sub..."
2,"@moranmatthewp NOPE, 'TWAS A GLITCH! THEY'RE ..."
3,@artipatel Does Canada have Outward Bound?
4,Reminder: https://t.co/lrtrRlocF5
...,...
495,RT @MetroATLDSA: .@flowerunited won and declar...
496,@HarmReduxDoc That's how you know he's Threepe...
497,#ThreeperTriggerDiscipline - who wants to give...
498,"Even in our wildest imaginations, we could not..."


In [50]:
aflist1 = []
for text in aftweets[0]:
    souper2 = BeautifulSoup(text, "lxml")
    aflist1.append(souper2.get_text())

In [51]:
aflist2 =[]
for text in aflist1:
    aflist2.append(p.clean(text))

In [52]:
aflist3 =[]
for text in aflist2:
    aflist3.append(re.sub(r"^RT.*:","",text))

In [53]:
af_test_tfidf = tfidf.transform(aflist3)

In [54]:
af = my_best_model.predict_proba(af_test_tfidf)

In [55]:
afdf = pd.DataFrame(af)

In [56]:
afdf

Unnamed: 0,0,1
0,0.325225,0.674775
1,0.388003,0.611997
2,0.221795,0.778205
3,0.325225,0.674775
4,0.464733,0.535267
...,...,...
495,0.434815,0.565185
496,0.803573,0.196427
497,0.697385,0.302615
498,0.792981,0.207019


In [57]:
testpredict5 = []
for x in range(len(afdf[0])):
    testpredict5.append(afdf.iloc[x,:].idxmax(axis=1))
    

In [58]:
afpredic = pd.DataFrame(testpredict5)

In [59]:
afpredic[0].value_counts()

1    409
0     91
Name: 0, dtype: int64

In [60]:
409/500

0.818