# Tweet Analysis Notebook

In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None
import matplotlib.pyplot as plt
import src.tweet_func as t
import nltk
from nltk.corpus import stopwords
plt.style.use('ggplot')

In [2]:
def remove_empties(x):
    """Removes empty strings """
    res = []
    for val in x:
        if val != '' and val != ' ':
            res.append(val)
    return res

def remove_stopwords(x, stop_words):
    """Removes any word from x that is contained in stop_words"""
    valids = []
    for word in x:
        if word not in stop_words:
            valids.append(word)
    return valids

def remove_phrase(x, leading_char):
    """Removes phrases like mentions or hashtags from a list
    input: x - list
           leading_char - character like @ or #
    output: list without phrases"""
    filtered_list = []
    for word in x:
        if word[0] != leading_char:
            filtered_list.append(word)
    return filtered_list

def process_tweets(df, *args):
    """process tweet data by applying filters specified in args.
    choices are:
        get_mentions     = create column with @twitter_user mentions
        get_hashtags     = create column with #hashtag(s)
        split_tweets     = create column with a list of words from tweet
        lowercase        = force all letters to lowercase in split_tweets list
        remove_nonalpha  = removes special characters and numbers from list
        remove_stopwords = removes common words referencing stop_words list
        
        return: processed dataframe"""
    
    if 'get_mentions' in args:
        df['mentions'] = df['tweets'].str.findall(r'@.*?(?=\s|$)')
    if 'get_hashtags' in args:
        df['hashtags'] = df['tweets'].str.findall(r'#.*?(?=\s|$)')
    if 'split_tweets' in args:
        df['split_tweets'] = df['tweets'].str.split(' ')
        df['split_tweets'] = df['split_tweets'].apply(lambda x: list(remove_empties(x)))
    if 'remove_mentions' in args:
        df['split_tweets'] = df['split_tweets'].apply(lambda x: list(remove_phrase(x, '@')))
    if 'remove_hashtags' in args:
        df['split_tweets'] = df['split_tweets'].apply(lambda x: list(remove_phrase(x, '#')))
    if 'lowercase' in args:
        df['split_tweets'] = df['split_tweets'].apply(lambda x: list(map(str.lower, x)))
    if 'remove_nonalpha' in args:
        df['split_tweets'] = df['split_tweets'].apply(lambda x: list(map(t.letters, x)))
    if 'remove_stopwords' in args:
        df['split_tweets'] = df['split_tweets'].apply(lambda x: list(remove_stopwords(x, stop_words)))
    return df

In [3]:
df = pd.read_csv('data/senators.csv', encoding = "ISO-8859-1")
stop_words = set(stopwords.words('english'))
df.rename(columns={"text": "tweets"}, inplace=True)
retweets_df = df.copy()[df['tweets'].str.contains('RT @')]
tweets_df = df[~df["tweets"].str.contains('RT @')]
tweets_df = process_tweets(tweets_df, 
                           'get_mentions', 
                           'get_hashtags', 
                           'split_tweets', 
                          'remove_mentions', 
                           'remove_hashtags', 
                           'lowercase', 
                           'remove_nonalpha', 
                           'remove_stopwords'
                          )
retweets_df = process_tweets(retweets_df, 
                           'get_mentions', 
                           'get_hashtags', 
                           'split_tweets', 
                          'remove_mentions', 
                           'remove_hashtags', 
                           'lowercase', 
                           'remove_nonalpha', 
                           'remove_stopwords'
                          )


In [4]:
tweets_df.head()

Unnamed: 0,created_at,tweets,url,replies,retweets,favorites,user,bioguide_id,party,state,mentions,hashtags,split_tweets
0,10/19/17 21:47,We released bipartisan healthcare bill today &...,https://twitter.com/amyklobuchar/status/921130...,21,129,533,amyklobuchar,K000367,D,MN,[],[],"[released, bipartisan, healthcare, bill, today..."
1,10/19/17 18:48,I spoke with @Morning_Joe team abt #HonestAds ...,https://twitter.com/amyklobuchar/status/921085...,8,46,150,amyklobuchar,K000367,D,MN,[@Morning_Joe],[#HonestAds],"[spoke, team, abt, bill, yesterdays, sessions,..."
2,10/19/17 18:14,Lots of interest in my bill with Senators Warn...,https://twitter.com/amyklobuchar/status/921077...,36,227,932,amyklobuchar,K000367,D,MN,[],[],"[lots, interest, bill, senators, warner, mccai..."
3,10/19/17 18:04,"Today's the day @MarkWarner, @SenJohnMcCain &a...",https://twitter.com/amyklobuchar/status/921074...,17,167,550,amyklobuchar,K000367,D,MN,"[@MarkWarner,, @SenJohnMcCain]",[#HonestAds],"[todays, day, amp, intro, bill, protect, democ..."
4,10/19/17 16:33,".@MarkWarner, @SenJohnMcCain &amp; I put toget...",https://twitter.com/amyklobuchar/status/921051...,31,279,893,amyklobuchar,K000367,D,MN,"[@MarkWarner,, @SenJohnMcCain]",[#HonestAds],"[markwarner, amp, put, together, bill, make, p..."


In [5]:
retweets_df.head()

Unnamed: 0,created_at,tweets,url,replies,retweets,favorites,user,bioguide_id,party,state,mentions,hashtags,split_tweets
34,10/13/17 20:59,RT @MarkWarner: This is why we need more trans...,https://twitter.com/amyklobuchar/status/918944...,33,188,500,amyklobuchar,K000367,D,MN,"[@MarkWarner:, @AmyKlobuchar]",[],"[rt, need, transparency, online, political, ad..."
36,10/13/17 18:32,RT @mspairport: Senator @amyklobuchar said we ...,https://twitter.com/amyklobuchar/status/918907...,5,13,68,amyklobuchar,K000367,D,MN,"[@mspairport:, @amyklobuchar]",[#humblebrag],"[rt, senator, said, jewel, state, httpstcofzun..."
88,9/28/17 20:14,RT @JerryMoran: Intro'd bipartisan #StartupAct...,https://twitter.com/amyklobuchar/status/913497...,10,10,47,amyklobuchar,K000367,D,MN,"[@JerryMoran:, @MarkWarner,, @RoyBlunt, @amykl...",[#StartupAct],"[rt, introd, bipartisan, w, amp, encourage, in..."
135,9/13/17 14:27,RT @martinmatishak: .@amyklobuchar to @DHSgov ...,https://twitter.com/amyklobuchar/status/907974...,0,11,14,amyklobuchar,K000367,D,MN,"[@martinmatishak:, @amyklobuchar, @DHSgov]","[#elections, #cybersecurity]","[rt, amyklobuchar, , kaspersky, protecting, us..."
167,9/3/17 20:00,RT @MinnesotaDFL: She's here! We're excited to...,https://twitter.com/amyklobuchar/status/904433...,17,33,198,amyklobuchar,K000367,D,MN,"[@MinnesotaDFL:, @amyklobuchar]",[],"[rt, shes, excited, sen, dfl, booth, httpstcoe..."


How many tweets are retweets?

In [6]:
print(f'shape of tweets dataframe: {tweets_df.shape}')
print(f'shape of retweets dataframe: {retweets_df.shape}')
print(f'percent not retweets: {len(tweets_df)/len(df):2.2f}')
print(f'percent retweets: {len(retweets_df)/len(df):2.2f}')

shape of tweets dataframe: (242213, 13)
shape of retweets dataframe: (46402, 13)
percent not retweets: 0.84
percent retweets: 0.16


In [7]:
t.filter_df(retweets_df, 'favorites', 1000000, max)

Unnamed: 0,created_at,tweets,url,replies,retweets,favorites,user,bioguide_id,party,state,mentions,hashtags,split_tweets
87415,1/20/17 15:49,RT @POTUS: It's been the honor of my life to s...,https://twitter.com/SenatorCantwell/status/822...,19158,632470,1642617,SenatorCantwell,C000127,D,WA,[@POTUS:],[],"[rt, honor, life, serve, made, better, leader,..."
89414,7/20/17 16:48,RT @BarackObama: John McCain is an American he...,https://twitter.com/SenatorCardin/status/88807...,29569,419445,2108865,SenatorCardin,C000141,D,MD,[@BarackObama:],[],"[rt, john, mccain, american, hero, amp, one, b..."
92567,7/20/17 2:43,RT @BarackObama: John McCain is an American he...,https://twitter.com/SenatorCarper/status/88786...,29569,419445,2108865,SenatorCarper,C000174,D,DE,[@BarackObama:],[],"[rt, john, mccain, american, hero, amp, one, b..."
120520,10/3/17 14:19,RT @MichelleObama: Happy 25th anniversary @bar...,https://twitter.com/SenatorLeahy/status/915219...,25827,287230,1315041,SenatorLeahy,L000174,D,VT,"[@MichelleObama:, @barackobama.]",[],"[rt, happy, th, anniversary, quarter, century,..."
129663,10/10/17 20:59,"RT @Malala: 5 years ago, I was shot in an atte...",https://twitter.com/SenatorShaheen/status/9178...,11053,356284,1156950,SenatorShaheen,S001181,D,NH,[@Malala:],[],"[rt, , years, ago, shot, attempt, stop, speaki..."
160184,1/20/17 22:13,RT @POTUS44: It's been the honor of my life to...,https://twitter.com/SenBooker/status/822567739...,19158,632470,1642608,SenBooker,B001288,D,NJ,[@POTUS44:],[],"[rt, honor, life, serve, made, better, leader,..."
196359,1/20/17 1:25,RT @FLOTUS: Being your First Lady has been the...,https://twitter.com/SenGillibrand/status/82225...,19142,368413,1167951,SenGillibrand,G000555,D,NY,[@FLOTUS:],[],"[rt, first, lady, honor, lifetime, bottom, hea..."


In [8]:
t.filter_df(tweets_df, 'favorites', 500000, max)

Unnamed: 0,created_at,tweets,url,replies,retweets,favorites,user,bioguide_id,party,state,mentions,hashtags,split_tweets
244285,8/15/17 22:06,".@realDonaldTrump, you are embarrassing our co...",https://twitter.com/SenSanders/status/89758034...,7510,201134,565707,SenSanders,S000033,I,VT,"[@realDonaldTrump,]",[],"[realdonaldtrump, embarrassing, country, milli..."
245162,2/25/17 13:56,.@realDonaldTrump They did. It wasn't. https:/...,https://twitter.com/SenSanders/status/83548856...,11072,207143,520384,SenSanders,S000033,I,VT,[@realDonaldTrump],[],"[realdonaldtrump, wasnt, httpstcoxqtrjper]"
245420,1/21/17 22:15,"President Trump, you made a big mistake. By tr...",https://twitter.com/SenSanders/status/82293062...,13929,454660,975012,SenSanders,S000033,I,VT,[],[],"[president, trump, made, big, mistake, trying,..."


In [9]:
t.filter_df(tweets_df, 'retweets', 0, max)

Unnamed: 0,created_at,tweets,url,replies,retweets,favorites,user,bioguide_id,party,state,mentions,hashtags,split_tweets
245420,1/21/17 22:15,"President Trump, you made a big mistake. By tr...",https://twitter.com/SenSanders/status/82293062...,13929,454660,975012,SenSanders,S000033,I,VT,[],[],"[president, trump, made, big, mistake, trying,..."


In [10]:
t.filter_df(retweets_df, 'retweets', 0, max)

Unnamed: 0,created_at,tweets,url,replies,retweets,favorites,user,bioguide_id,party,state,mentions,hashtags,split_tweets
167894,4/11/17 17:47,RT @carterjwm: HELP ME PLEASE. A MAN NEEDS HIS...,https://twitter.com/SenCortezMasto/status/8518...,38225,3644423,1016576,SenCortezMasto,C001113,D,NV,[@carterjwm:],[],"[rt, help, please, man, needs, nuggs, httpstco..."


In [11]:
t.filter_df(retweets_df, 'replies', 0, max)

Unnamed: 0,created_at,tweets,url,replies,retweets,favorites,user,bioguide_id,party,state,mentions,hashtags,split_tweets
155684,8/12/17 20:09,RT @realDonaldTrump: We ALL must be united &am...,https://twitter.com/SenBobCorker/status/896463...,66872,59242,194351,SenBobCorker,C001071,R,TN,[@realDonaldTrump:],[],"[rt, must, united, amp, condemn, hate, stands,..."
173545,8/13/17 0:06,RT @realDonaldTrump: We ALL must be united &am...,https://twitter.com/sendavidperdue/status/8965...,66872,59242,194346,sendavidperdue,P000612,R,GA,[@realDonaldTrump:],[],"[rt, must, united, amp, condemn, hate, stands,..."


In [12]:
t.filter_df(tweets_df, 'replies', 0, max)

Unnamed: 0,created_at,tweets,url,replies,retweets,favorites,user,bioguide_id,party,state,mentions,hashtags,split_tweets
211234,9/22/17 18:06,I cannot in good conscience vote for Graham-Ca...,https://twitter.com/SenJohnMcCain/status/91129...,38416,56606,190133,SenJohnMcCain,M000303,R,AZ,[],[],"[cannot, good, conscience, vote, grahamcassidy..."
