# Tweet Analysis Notebook

### Things to look at:

* Key words: which words could contribute to more retweets/favorites/replies?

* Hashtags: Does adding hashtags contribute to more retweets/favorites/replies?

* Tags: Could tagging specific people contribute to more retweets/favorites/replies?

In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import src.tweet_func as t
plt.style.use('ggplot')

## Read in the data

In [2]:
df = pd.read_csv('data/senators.csv', encoding = "ISO-8859-1")

In [3]:
df.shape

(288615, 10)

In [4]:
df.head()

Unnamed: 0,created_at,text,url,replies,retweets,favorites,user,bioguide_id,party,state
0,10/19/17 21:47,We released bipartisan healthcare bill today &...,https://twitter.com/amyklobuchar/status/921130...,21,129,533,amyklobuchar,K000367,D,MN
1,10/19/17 18:48,I spoke with @Morning_Joe team abt #HonestAds ...,https://twitter.com/amyklobuchar/status/921085...,8,46,150,amyklobuchar,K000367,D,MN
2,10/19/17 18:14,Lots of interest in my bill with Senators Warn...,https://twitter.com/amyklobuchar/status/921077...,36,227,932,amyklobuchar,K000367,D,MN
3,10/19/17 18:04,"Today's the day @MarkWarner, @SenJohnMcCain &a...",https://twitter.com/amyklobuchar/status/921074...,17,167,550,amyklobuchar,K000367,D,MN
4,10/19/17 16:33,".@MarkWarner, @SenJohnMcCain &amp; I put toget...",https://twitter.com/amyklobuchar/status/921051...,31,279,893,amyklobuchar,K000367,D,MN


In [5]:
for column in df.columns:
    print(column)

created_at
text
url
replies
retweets
favorites
user
bioguide_id
party
state


In [44]:
df = pd.read_csv('data/senators.csv', encoding = "ISO-8859-1")
# tweets_df = df.copy()[['text']][:50] # only take 50 for now
df.rename(columns={"text": "tweets"}, inplace=True)
retweets_df = df.copy()[df['tweets'].str.contains('RT @')]
tweets_df = df[~df["tweets"].str.contains('RT @')]
# tweets_
# tweets_df['split_tweets'] = tweets_df['tweets'].str.split(' ')
# tweets_df['split_tweets'] = tweets_df['split_tweets'].apply(lambda x: list(map(t.letters, x)))
# tweets_df['split_tweets'] = tweets_df['split_tweets'].apply(lambda x: list(map(str.lower, x)))
#tweets_df['word_count'] = tweets_df['split_tweets'].apply(lambda x: t.count_dictionary(x))

In [51]:
print(f'percent not retweets: {len(tweets_df)/len(df):2.2f}')
print(f'percent retweets: {len(retweets_df)/len(df):2.2f}')

percent not retweets: 0.84
percent retweets: 0.16


In [8]:
tweets_df.shape

(288615, 11)

In [15]:
df['text'].iloc[5]

"Great chat w @chrislhayes about Sessions hearing &amp; my concerns w AG's answers on Russia &amp; press freedom. Don't wantäó_ https://t.co/lP02MrySvd"

In [14]:
df['url'].iloc[5]

'https://twitter.com/amyklobuchar/status/921031727556022274'

In [31]:
count_d = dict()
for word_list in tweets_df['split_tweets']:
    for word in word_list:
        if word not in count_d.keys():
            count_d[word] = 1
        else:
            count_d[word] += 1

In [36]:
max(count_d, key=count_d.get)

'to'

In [37]:
count_d['to']

194869

In [40]:
for k,v in count_d.items():
    if v > 10000:
        print(k,v)

we 28904
bill 14882
today 22770
amp 68558
to 194869
this 30203
i 31329
with 30772
on 63818
of 86755
in 86045
my 28761
and 54468
the 166934
our 33026
who 11603
an 10469
as 12872
great 14979
w 14952
about 17489
by 16176
it 16141
that 21487
is 42254
for 76704
a 65312
at 31015
will 20723
 84506
be 20619
help 10431
not 10783
their 12109
more 15779
from 21737
you 21844
all 12101
rt 47520
are 18437
have 14603
new 10018
us 15027
health 10954
senate 14603


In [21]:
def is_retweet(lst):
    if 'rt' in lst:
        return True
    else:
        return False

In [35]:
df_sub = df.copy()[:1000]

In [37]:
df[df['text'].str.contains('RT @')]

46402

In [20]:
tweets_df[['split_tweets']].apply(lambda x: list(map('rt' not in x, x)))

TypeError: 'bool' object is not callable

In [52]:
t.filter_df(retweets_df, 'favorites', 1000000, max)

Unnamed: 0,created_at,tweets,url,replies,retweets,favorites,user,bioguide_id,party,state
87415,1/20/17 15:49,RT @POTUS: It's been the honor of my life to s...,https://twitter.com/SenatorCantwell/status/822...,19158,632470,1642617,SenatorCantwell,C000127,D,WA
89414,7/20/17 16:48,RT @BarackObama: John McCain is an American he...,https://twitter.com/SenatorCardin/status/88807...,29569,419445,2108865,SenatorCardin,C000141,D,MD
92567,7/20/17 2:43,RT @BarackObama: John McCain is an American he...,https://twitter.com/SenatorCarper/status/88786...,29569,419445,2108865,SenatorCarper,C000174,D,DE
120520,10/3/17 14:19,RT @MichelleObama: Happy 25th anniversary @bar...,https://twitter.com/SenatorLeahy/status/915219...,25827,287230,1315041,SenatorLeahy,L000174,D,VT
129663,10/10/17 20:59,"RT @Malala: 5 years ago, I was shot in an atte...",https://twitter.com/SenatorShaheen/status/9178...,11053,356284,1156950,SenatorShaheen,S001181,D,NH
160184,1/20/17 22:13,RT @POTUS44: It's been the honor of my life to...,https://twitter.com/SenBooker/status/822567739...,19158,632470,1642608,SenBooker,B001288,D,NJ
196359,1/20/17 1:25,RT @FLOTUS: Being your First Lady has been the...,https://twitter.com/SenGillibrand/status/82225...,19142,368413,1167951,SenGillibrand,G000555,D,NY


In [57]:
t.filter_df(tweets_df, 'favorites', 500000, max)

Unnamed: 0,created_at,tweets,url,replies,retweets,favorites,user,bioguide_id,party,state
244285,8/15/17 22:06,".@realDonaldTrump, you are embarrassing our co...",https://twitter.com/SenSanders/status/89758034...,7510,201134,565707,SenSanders,S000033,I,VT
245162,2/25/17 13:56,.@realDonaldTrump They did. It wasn't. https:/...,https://twitter.com/SenSanders/status/83548856...,11072,207143,520384,SenSanders,S000033,I,VT
245420,1/21/17 22:15,"President Trump, you made a big mistake. By tr...",https://twitter.com/SenSanders/status/82293062...,13929,454660,975012,SenSanders,S000033,I,VT


In [60]:
t.filter_df(tweets_df, 'retweets', 0, max)

Unnamed: 0,created_at,tweets,url,replies,retweets,favorites,user,bioguide_id,party,state
245420,1/21/17 22:15,"President Trump, you made a big mistake. By tr...",https://twitter.com/SenSanders/status/82293062...,13929,454660,975012,SenSanders,S000033,I,VT


In [61]:
t.filter_df(retweets_df, 'retweets', 0, max)

Unnamed: 0,created_at,tweets,url,replies,retweets,favorites,user,bioguide_id,party,state
167894,4/11/17 17:47,RT @carterjwm: HELP ME PLEASE. A MAN NEEDS HIS...,https://twitter.com/SenCortezMasto/status/8518...,38225,3644423,1016576,SenCortezMasto,C001113,D,NV


In [62]:
t.filter_df(retweets_df, 'replies', 0, max)

Unnamed: 0,created_at,tweets,url,replies,retweets,favorites,user,bioguide_id,party,state
155684,8/12/17 20:09,RT @realDonaldTrump: We ALL must be united &am...,https://twitter.com/SenBobCorker/status/896463...,66872,59242,194351,SenBobCorker,C001071,R,TN
173545,8/13/17 0:06,RT @realDonaldTrump: We ALL must be united &am...,https://twitter.com/sendavidperdue/status/8965...,66872,59242,194346,sendavidperdue,P000612,R,GA


In [63]:
t.filter_df(tweets_df, 'replies', 0, max)

Unnamed: 0,created_at,tweets,url,replies,retweets,favorites,user,bioguide_id,party,state
211234,9/22/17 18:06,I cannot in good conscience vote for Graham-Ca...,https://twitter.com/SenJohnMcCain/status/91129...,38416,56606,190133,SenJohnMcCain,M000303,R,AZ


In [50]:
df.head()

Unnamed: 0,created_at,text,url,replies,retweets,favorites,user,bioguide_id,party,state
0,10/19/17 21:47,We released bipartisan healthcare bill today &...,https://twitter.com/amyklobuchar/status/921130...,21,129,533,amyklobuchar,K000367,D,MN
1,10/19/17 18:48,I spoke with @Morning_Joe team abt #HonestAds ...,https://twitter.com/amyklobuchar/status/921085...,8,46,150,amyklobuchar,K000367,D,MN
2,10/19/17 18:14,Lots of interest in my bill with Senators Warn...,https://twitter.com/amyklobuchar/status/921077...,36,227,932,amyklobuchar,K000367,D,MN
3,10/19/17 18:04,"Today's the day @MarkWarner, @SenJohnMcCain &a...",https://twitter.com/amyklobuchar/status/921074...,17,167,550,amyklobuchar,K000367,D,MN
4,10/19/17 16:33,".@MarkWarner, @SenJohnMcCain &amp; I put toget...",https://twitter.com/amyklobuchar/status/921051...,31,279,893,amyklobuchar,K000367,D,MN


In [71]:
import nltk
from nltk.corpus import stopwords
set(stopwords.words('english'))

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [72]:
stopwords

<WordListCorpusReader in '/home/azathoth/nltk_data/corpora/stopwords'>

In [67]:
from nltk.corpus import stopwords

In [70]:
import nltk
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [74]:
from nltk.tokenize import word_tokenize 
  
example_sent = "This is a sample sentence, showing off the stop words filtration."
  
stop_words = set(stopwords.words('english')) 
  
word_tokens = word_tokenize(example_sent) 
  
filtered_sentence = [word for word in word_tokens if not w in stop_words] 
  
# filtered_sentence = [] 
  
# for w in word_tokens: 
#     if w not in stop_words: 
#         filtered_sentence.append(w) 
  
print(word_tokens) 
print(filtered_sentence) 

['This', 'is', 'a', 'sample', 'sentence', ',', 'showing', 'off', 'the', 'stop', 'words', 'filtration', '.']
['This', 'sample', 'sentence', ',', 'showing', 'stop', 'words', 'filtration', '.']
