In [51]:
import pandas as pd
import string
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [52]:
# YOU ONLY NEED TO RUN  THIS ONCE(IF YOU HAVEN'T)
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/hcao/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# read the raw data and show the data
raw_data = pd.read_csv("./Data/Raw/ExtractedTweets.csv")
raw_data.head()

Unnamed: 0,Party,Handle,Tweet
0,Democrat,RepDarrenSoto,"Today, Senate Dems vote to #SaveTheInternet. P..."
1,Democrat,RepDarrenSoto,RT @WinterHavenSun: Winter Haven resident / Al...
2,Democrat,RepDarrenSoto,RT @NBCLatino: .@RepDarrenSoto noted that Hurr...
3,Democrat,RepDarrenSoto,RT @NALCABPolicy: Meeting with @RepDarrenSoto ...
4,Democrat,RepDarrenSoto,RT @Vegalteno: Hurricane season starts on June...


In [5]:
raw_data.Party.unique()

array(['Democrat', 'Republican'], dtype=object)

In [8]:
raw_data.info

<bound method DataFrame.info of             Party         Handle  \
0        Democrat  RepDarrenSoto   
1        Democrat  RepDarrenSoto   
2        Democrat  RepDarrenSoto   
3        Democrat  RepDarrenSoto   
4        Democrat  RepDarrenSoto   
...           ...            ...   
86455  Republican    RepTomPrice   
86456  Republican    RepTomPrice   
86457  Republican    RepTomPrice   
86458  Republican    RepTomPrice   
86459  Republican    RepTomPrice   

                                                   Tweet  
0      Today, Senate Dems vote to #SaveTheInternet. P...  
1      RT @WinterHavenSun: Winter Haven resident / Al...  
2      RT @NBCLatino: .@RepDarrenSoto noted that Hurr...  
3      RT @NALCABPolicy: Meeting with @RepDarrenSoto ...  
4      RT @Vegalteno: Hurricane season starts on June...  
...                                                  ...  
86455  Check out my op-ed on need for End Executive O...  
86456  Yesterday, Betty &amp; I had a great time lear...  
8645

# Data cleaning
Before we start to vectorizing the tweets, we firstly need to clean the tweets text

- Remove punctuations
- Remove special symbols, e.g. @xxx
- Remove stop words
- stemmming

TODO: nltk has a class called TweetTokenizer, poentially can be used.

In [12]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [26]:
def remove_punctuations(t):
    t = re.sub('@\w+','', t)
    t = "".join([char for char in t if char not in string.punctuation])
    t = re.sub('[0-9]+', '', t)
    return t

In [27]:
raw_data['tweet_no_punc'] = raw_data.Tweet.apply(lambda x: remove_punctuations(x))

In [28]:
raw_data.head(10)

Unnamed: 0,Party,Handle,Tweet,tweet_no_punc
0,Democrat,RepDarrenSoto,"Today, Senate Dems vote to #SaveTheInternet. P...",Today Senate Dems vote to SaveTheInternet Prou...
1,Democrat,RepDarrenSoto,RT @WinterHavenSun: Winter Haven resident / Al...,RT Winter Haven resident Alta Vista teacher ...
2,Democrat,RepDarrenSoto,RT @NBCLatino: .@RepDarrenSoto noted that Hurr...,RT noted that Hurricane Maria has left appro...
3,Democrat,RepDarrenSoto,RT @NALCABPolicy: Meeting with @RepDarrenSoto ...,RT Meeting with Thanks for taking the time ...
4,Democrat,RepDarrenSoto,RT @Vegalteno: Hurricane season starts on June...,RT Hurricane season starts on June st Puerto ...
5,Democrat,RepDarrenSoto,RT @EmgageActionFL: Thank you to all who came ...,RT Thank you to all who came out to our Orlan...
6,Democrat,RepDarrenSoto,Hurricane Maria left approx $90 billion in dam...,Hurricane Maria left approx billion in damage...
7,Democrat,RepDarrenSoto,RT @Tharryry: I am delighted that @RepDarrenSo...,RT I am delighted that will be voting for th...
8,Democrat,RepDarrenSoto,RT @HispanicCaucus: Trump's anti-immigrant pol...,RT Trumps antiimmigrant policies are hurting ...
9,Democrat,RepDarrenSoto,RT @RepStephMurphy: Great joining @WeAreUnidos...,RT Great joining and for a roundtable in Or...


In [34]:
def tokenization(t):
    t = re.split('\W+', t)
    return t

In [35]:
raw_data['tokenized'] = raw_data.tweet_no_punc.apply(lambda x: tokenization(x))

In [36]:
raw_data.head(10)

Unnamed: 0,Party,Handle,Tweet,tweet_no_punc,tokenized
0,Democrat,RepDarrenSoto,"Today, Senate Dems vote to #SaveTheInternet. P...",Today Senate Dems vote to SaveTheInternet Prou...,"[Today, Senate, Dems, vote, to, SaveTheInterne..."
1,Democrat,RepDarrenSoto,RT @WinterHavenSun: Winter Haven resident / Al...,RT Winter Haven resident Alta Vista teacher ...,"[RT, Winter, Haven, resident, Alta, Vista, tea..."
2,Democrat,RepDarrenSoto,RT @NBCLatino: .@RepDarrenSoto noted that Hurr...,RT noted that Hurricane Maria has left appro...,"[RT, noted, that, Hurricane, Maria, has, left,..."
3,Democrat,RepDarrenSoto,RT @NALCABPolicy: Meeting with @RepDarrenSoto ...,RT Meeting with Thanks for taking the time ...,"[RT, Meeting, with, Thanks, for, taking, the, ..."
4,Democrat,RepDarrenSoto,RT @Vegalteno: Hurricane season starts on June...,RT Hurricane season starts on June st Puerto ...,"[RT, Hurricane, season, starts, on, June, st, ..."
5,Democrat,RepDarrenSoto,RT @EmgageActionFL: Thank you to all who came ...,RT Thank you to all who came out to our Orlan...,"[RT, Thank, you, to, all, who, came, out, to, ..."
6,Democrat,RepDarrenSoto,Hurricane Maria left approx $90 billion in dam...,Hurricane Maria left approx billion in damage...,"[Hurricane, Maria, left, approx, billion, in, ..."
7,Democrat,RepDarrenSoto,RT @Tharryry: I am delighted that @RepDarrenSo...,RT I am delighted that will be voting for th...,"[RT, I, am, delighted, that, will, be, voting,..."
8,Democrat,RepDarrenSoto,RT @HispanicCaucus: Trump's anti-immigrant pol...,RT Trumps antiimmigrant policies are hurting ...,"[RT, Trumps, antiimmigrant, policies, are, hur..."
9,Democrat,RepDarrenSoto,RT @RepStephMurphy: Great joining @WeAreUnidos...,RT Great joining and for a roundtable in Or...,"[RT, Great, joining, and, for, a, roundtable, ..."


In [37]:
def remove_stop_words(t):
    t = [word for word in t if word not in stopword]
    return t

In [40]:
stopword = nltk.corpus.stopwords.words('english')

In [42]:
raw_data['no_stop_words'] = raw_data.tokenized.apply(lambda x: remove_stop_words(x))

In [43]:
raw_data.head(10)

Unnamed: 0,Party,Handle,Tweet,tweet_no_punc,tokenized,no_stop_words
0,Democrat,RepDarrenSoto,"Today, Senate Dems vote to #SaveTheInternet. P...",Today Senate Dems vote to SaveTheInternet Prou...,"[Today, Senate, Dems, vote, to, SaveTheInterne...","[Today, Senate, Dems, vote, SaveTheInternet, P..."
1,Democrat,RepDarrenSoto,RT @WinterHavenSun: Winter Haven resident / Al...,RT Winter Haven resident Alta Vista teacher ...,"[RT, Winter, Haven, resident, Alta, Vista, tea...","[RT, Winter, Haven, resident, Alta, Vista, tea..."
2,Democrat,RepDarrenSoto,RT @NBCLatino: .@RepDarrenSoto noted that Hurr...,RT noted that Hurricane Maria has left appro...,"[RT, noted, that, Hurricane, Maria, has, left,...","[RT, noted, Hurricane, Maria, left, approximat..."
3,Democrat,RepDarrenSoto,RT @NALCABPolicy: Meeting with @RepDarrenSoto ...,RT Meeting with Thanks for taking the time ...,"[RT, Meeting, with, Thanks, for, taking, the, ...","[RT, Meeting, Thanks, taking, time, meet, ED, ..."
4,Democrat,RepDarrenSoto,RT @Vegalteno: Hurricane season starts on June...,RT Hurricane season starts on June st Puerto ...,"[RT, Hurricane, season, starts, on, June, st, ...","[RT, Hurricane, season, starts, June, st, Puer..."
5,Democrat,RepDarrenSoto,RT @EmgageActionFL: Thank you to all who came ...,RT Thank you to all who came out to our Orlan...,"[RT, Thank, you, to, all, who, came, out, to, ...","[RT, Thank, came, Orlando, gala, It, successfu..."
6,Democrat,RepDarrenSoto,Hurricane Maria left approx $90 billion in dam...,Hurricane Maria left approx billion in damage...,"[Hurricane, Maria, left, approx, billion, in, ...","[Hurricane, Maria, left, approx, billion, dama..."
7,Democrat,RepDarrenSoto,RT @Tharryry: I am delighted that @RepDarrenSo...,RT I am delighted that will be voting for th...,"[RT, I, am, delighted, that, will, be, voting,...","[RT, I, delighted, voting, CRA, overrule, FCC,..."
8,Democrat,RepDarrenSoto,RT @HispanicCaucus: Trump's anti-immigrant pol...,RT Trumps antiimmigrant policies are hurting ...,"[RT, Trumps, antiimmigrant, policies, are, hur...","[RT, Trumps, antiimmigrant, policies, hurting,..."
9,Democrat,RepDarrenSoto,RT @RepStephMurphy: Great joining @WeAreUnidos...,RT Great joining and for a roundtable in Or...,"[RT, Great, joining, and, for, a, roundtable, ...","[RT, Great, joining, roundtable, Orlando, fede..."


In [44]:
ps = nltk.PorterStemmer()

In [45]:
def stemming(t):
    t = [ps.stem(word) for word in t]
    return t

In [46]:
raw_data['stemmed'] = raw_data.no_stop_words.apply(lambda x: stemming(x))

In [47]:
raw_data.head(10)

Unnamed: 0,Party,Handle,Tweet,tweet_no_punc,tokenized,no_stop_words,stemmed
0,Democrat,RepDarrenSoto,"Today, Senate Dems vote to #SaveTheInternet. P...",Today Senate Dems vote to SaveTheInternet Prou...,"[Today, Senate, Dems, vote, to, SaveTheInterne...","[Today, Senate, Dems, vote, SaveTheInternet, P...","[today, senat, dem, vote, savetheinternet, pro..."
1,Democrat,RepDarrenSoto,RT @WinterHavenSun: Winter Haven resident / Al...,RT Winter Haven resident Alta Vista teacher ...,"[RT, Winter, Haven, resident, Alta, Vista, tea...","[RT, Winter, Haven, resident, Alta, Vista, tea...","[RT, winter, haven, resid, alta, vista, teache..."
2,Democrat,RepDarrenSoto,RT @NBCLatino: .@RepDarrenSoto noted that Hurr...,RT noted that Hurricane Maria has left appro...,"[RT, noted, that, Hurricane, Maria, has, left,...","[RT, noted, Hurricane, Maria, left, approximat...","[RT, note, hurrican, maria, left, approxim, bi..."
3,Democrat,RepDarrenSoto,RT @NALCABPolicy: Meeting with @RepDarrenSoto ...,RT Meeting with Thanks for taking the time ...,"[RT, Meeting, with, Thanks, for, taking, the, ...","[RT, Meeting, Thanks, taking, time, meet, ED, ...","[RT, meet, thank, take, time, meet, ED, marucc..."
4,Democrat,RepDarrenSoto,RT @Vegalteno: Hurricane season starts on June...,RT Hurricane season starts on June st Puerto ...,"[RT, Hurricane, season, starts, on, June, st, ...","[RT, Hurricane, season, starts, June, st, Puer...","[RT, hurrican, season, start, june, st, puerto..."
5,Democrat,RepDarrenSoto,RT @EmgageActionFL: Thank you to all who came ...,RT Thank you to all who came out to our Orlan...,"[RT, Thank, you, to, all, who, came, out, to, ...","[RT, Thank, came, Orlando, gala, It, successfu...","[RT, thank, came, orlando, gala, It, success, ..."
6,Democrat,RepDarrenSoto,Hurricane Maria left approx $90 billion in dam...,Hurricane Maria left approx billion in damage...,"[Hurricane, Maria, left, approx, billion, in, ...","[Hurricane, Maria, left, approx, billion, dama...","[hurrican, maria, left, approx, billion, damag..."
7,Democrat,RepDarrenSoto,RT @Tharryry: I am delighted that @RepDarrenSo...,RT I am delighted that will be voting for th...,"[RT, I, am, delighted, that, will, be, voting,...","[RT, I, delighted, voting, CRA, overrule, FCC,...","[RT, I, delight, vote, cra, overrul, fcc, save..."
8,Democrat,RepDarrenSoto,RT @HispanicCaucus: Trump's anti-immigrant pol...,RT Trumps antiimmigrant policies are hurting ...,"[RT, Trumps, antiimmigrant, policies, are, hur...","[RT, Trumps, antiimmigrant, policies, hurting,...","[RT, trump, antiimmigr, polici, hurt, small, b..."
9,Democrat,RepDarrenSoto,RT @RepStephMurphy: Great joining @WeAreUnidos...,RT Great joining and for a roundtable in Or...,"[RT, Great, joining, and, for, a, roundtable, ...","[RT, Great, joining, roundtable, Orlando, fede...","[RT, great, join, roundtabl, orlando, feder, i..."


In [48]:
wn = nltk.WordNetLemmatizer()

In [50]:
# no need if already stemed, but we can see which one works better tho
def lemmatizer(t):
    t = [wn.lemmatize(w) for w in t]
    return t

# Vectorization