In [96]:
# pandas and numpy
import pandas as pd
import numpy as numpy

# punctuation, stop words and English language model
from string import punctuation
from spacy.lang.en.stop_words import STOP_WORDS
import en_core_web_sm
nlp = en_core_web_sm.load()

# textblob
from textblob import TextBlob

# countvectorizer, tfidfvectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# gensim
import gensim
from gensim import models

# plotting
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [97]:
congress_tweets = pd.read_csv("116th Congressional Tweets and Demographics.csv")
congress_tweets = congress_tweets.dropna(subset = ['text']).reset_index()
# fill in this line of code with a sufficient number of tweets, depending on your computational resources
congress_tweets.head()

Unnamed: 0,index,tweet_id,screen_name,datetime,text,name_wikipedia,position,joined_congress_date,birthday,gender,state,district_number,party,trump_2016_state_share,clinton_2016_state_share,obama_2012_state_share,romney_2012_state_share
0,0,1.08101e+18,RepByrne,2019-01-03T21:23:00-05:00,Great news for Baldwin County! The economy of ...,Bradley Byrne,Rep,8-Jan-14,2/16/1955,M,AL,1,Republican,1318255,729547,795696,1255925
1,1,1.08088e+18,RepByrne,2019-01-03T12:30:38-05:00,Outstanding news today from @Airbus! @JetBlue ...,Bradley Byrne,Rep,8-Jan-14,2/16/1955,M,AL,1,Republican,1318255,729547,795696,1255925
2,2,1.08083e+18,RepByrne,2019-01-03T09:12:07-05:00,RT @senatemajldr Democrats will have to get se...,Bradley Byrne,Rep,8-Jan-14,2/16/1955,M,AL,1,Republican,1318255,729547,795696,1255925
3,3,1.08089e+18,RepByrne,2019-01-03T13:20:53-05:00,Here is a sign of things to come: As Democrats...,Bradley Byrne,Rep,8-Jan-14,2/16/1955,M,AL,1,Republican,1318255,729547,795696,1255925
4,4,1.08087e+18,RepByrne,2019-01-03T12:10:26-05:00,Let's understand what we're dealing with here:...,Bradley Byrne,Rep,8-Jan-14,2/16/1955,M,AL,1,Republican,1318255,729547,795696,1255925


In [98]:
# using only the text column of the dataframe
#text = congress_tweets[['text']]
#text = congress_tweets.filter(['text'], axis=1)
#text.head()
congress_tweets.shape

(946791, 17)

In [99]:
congress_tweets = congress_tweets.sample(frac=0.001, replace = True, random_state=1)

In [6]:
# converting dataframe to string for NLP
#text = str(text)
#text
#write.csv for dataframe
# json.dump or something
#text['text']

"                                                     text\n0       Great news for Baldwin County! The economy of ...\n1       Outstanding news today from @Airbus! @JetBlue ...\n2       RT @senatemajldr Democrats will have to get se...\n3       Here is a sign of things to come: As Democrats...\n4       Let's understand what we're dealing with here:...\n...                                                   ...\n946786  From our family to yours, we wish you a very M...\n946787  Bobbi and I wish everyone a wonderful Christma...\n946788  God bless the six brave law enforcement office...\n946789  Wishing a healthy and very happy New Year to e...\n946790  Bobbi and I wish you all a healthy and happy N...\n\n[946791 rows x 1 columns]"

In [79]:
nlp = en_core_web_sm.load()
##cfpb['tokens'] = cfpb['Consumer complaint narrative'].map(lambda x: rem_punc_stop(x))
congress_tweets['text_parsed'] = congress_tweets['text'].apply(nlp)

In [80]:
congress_tweets.head()

Unnamed: 0,index,tweet_id,screen_name,datetime,text,name_wikipedia,position,joined_congress_date,birthday,gender,state,district_number,party,trump_2016_state_share,clinton_2016_state_share,obama_2012_state_share,romney_2012_state_share,text_parsed
645097,645097,1.25882e+18,RepAGonzalez,2020-05-08T14:03:49-04:00,Rebuilding America will be a long and bumpy ro...,Anthony Gonzalez,Rep,3-Jan-19,9/19/1984,M,OH,16,Republican,2841005,2394164,2827709,2661437,"(Rebuilding, America, will, be, a, long, and, ..."


In [81]:
def rem_punc_stop(text):
    stop_words = STOP_WORDS
    punc = set(punctuation)
    
    punc_free = "".join([ch for ch in text if ch not in punc])
    
    doc = nlp(punc_free)
    
    spacy_words = [token.text for token in doc]
    
    spacy_words = [word for word in spacy_words if not word.startswith('http')]
    
    no_punc = [word for word in spacy_words if word not in stop_words]
    
    return no_punc

In [88]:
#tokens_reduced = rem_punc_stop(text)
#congress_tweets['text_parsed_new'] = rem_punc_stop(congress_tweets['text_parsed'])
congress_tweets['text'] =congress_tweets['text'].apply(rem_punc_stop)

In [100]:
bow_vector = CountVectorizer(tokenizer = rem_punc_stop, ngram_range=(1,1))

In [102]:
bow_matrix = bow_vector.fit_transform(congress_tweets['text'])

In [103]:
bow_matrix.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 1, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

In [104]:
feature_names = bow_vector.get_feature_names()
feature_names[0:10]

['\n', '\n\n', '\n\n ', '\n ', '\n \n', '\n\xa0\n', ' ', ' \n', ' \n\n', '  ']