In [1]:
import GetOldTweets3 as got
import pandas as pd
import datetime as dt

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

import re
import time

stopWords = set(stopwords.words('english'))

In [2]:
username = '@JoeBiden'
startDate = "2020-05-12"
endDate = "2020-05-19"

In [3]:
tweetCriteria = got.manager.TweetCriteria().setUsername(username).setSince(startDate).setUntil(endDate)
tweets = got.manager.TweetManager.getTweets(tweetCriteria)

In [4]:
df = pd.DataFrame([tweet.__dict__ for tweet in tweets])
originalDF = df

df = df.drop([], axis=1)

In [5]:
df.sample(3)

Unnamed: 0,username,to,text,retweets,favorites,replies,id,permalink,author_id,date,formatted_date,hashtags,mentions,geo,urls
0,JoeBiden,,One of President Trump's top advisors admitted...,4613,16987,1072,1262508260411621377,https://twitter.com/JoeBiden/status/1262508260...,939091,2020-05-18 22:20:00+00:00,Mon May 18 22:20:00 +0000 2020,,,,https://www.cnn.com/2020/05/17/politics/cdc-of...
55,JoeBiden,NPR,The Trump Administration clearly knows what's ...,8001,27272,1348,1260339651509919746,https://twitter.com/JoeBiden/status/1260339651...,939091,2020-05-12 22:42:43+00:00,Tue May 12 22:42:43 +0000 2020,,,,https://twitter.com/NPR/status/126001245611963...
4,JoeBiden,,In the middle of the worst public health crisi...,6009,19491,1510,1262386206865797124,https://twitter.com/JoeBiden/status/1262386206...,939091,2020-05-18 14:15:00+00:00,Mon May 18 14:15:00 +0000 2020,,,,https://joebiden.com/fact-sheet-how-joe-biden-...


## Finding number of important words (keywords)

In [6]:
# This function  returns count of nouns, verbs, aadjectives, adverbs in the given tweet
def getkeywordCount(tweet):
    # Remove non alphanumeric characters 
    tweet = re.sub(r'[^\w]', ' ', tweet)
    # make tokens
    wordTokens = word_tokenize(tweet)
    # remove stopwords &
    # perform lemmatisation 
    lemmatizer = WordNetLemmatizer() 
    filtered = [lemmatizer.lemmatize(w) for w in wordTokens if not w in stopWords]
    # tag parts of speech
    tagged = nltk.pos_tag(filtered)
    keywords = [w[0] for w in tagged if w[1][0] in ["N", "J", "V", "R"]]
    return len(keywords)

In [7]:
df['keywordCount'] = df.apply(lambda x : getkeywordCount(x["text"]), axis=1)

In [8]:
df.sample(3)

Unnamed: 0,username,to,text,retweets,favorites,replies,id,permalink,author_id,date,formatted_date,hashtags,mentions,geo,urls,keywordCount
35,JoeBiden,,"Sherian, a small business owner from South Car...",1457,4995,528,1261061226886574081,https://twitter.com/JoeBiden/status/1261061226...,939091,2020-05-14 22:30:00+00:00,Thu May 14 22:30:00 +0000 2020,,,,https://medium.com/sincerely-joe/sincerely-joe...,22
46,JoeBiden,CNN,I would trust the guy who’s one of our nation'...,31674,148057,7426,1260717021840445452,https://twitter.com/JoeBiden/status/1260717021...,939091,2020-05-13 23:42:15+00:00,Wed May 13 23:42:15 +0000 2020,,,,https://twitter.com/CNN/status/126033557062836...,15
31,JoeBiden,,We are months into this crisis. There is simpl...,20475,113307,7404,1261098722852777984,https://twitter.com/JoeBiden/status/1261098722...,939091,2020-05-15 00:59:00+00:00,Fri May 15 00:59:00 +0000 2020,,,,,11


## Normalise the columns

In [9]:
for feature in ["retweets", "favorites", "replies", 'keywordCount']:
    df[feature+"NormalisedCount"] = df[feature]/(df[feature].max())#-df[feature].min())

In [10]:
df.sample(3)

Unnamed: 0,username,to,text,retweets,favorites,replies,id,permalink,author_id,date,formatted_date,hashtags,mentions,geo,urls,keywordCount,retweetsNormalisedCount,favoritesNormalisedCount,repliesNormalisedCount,keywordCountNormalisedCount
39,JoeBiden,,With the void of leadership in the White House...,1798,5900,746,1260986986820599812,https://twitter.com/JoeBiden/status/1260986986...,939091,2020-05-14 17:35:00+00:00,Thu May 14 17:35:00 +0000 2020,,,,http://JoeBiden.com/Live,22,0.044232,0.023476,0.02605,0.578947
44,JoeBiden,,Not only does the Trump Administration want to...,7561,19037,1832,1260744136212140034,https://twitter.com/JoeBiden/status/1260744136...,939091,2020-05-14 01:30:00+00:00,Thu May 14 01:30:00 +0000 2020,,,,https://time.com/5836504/usda-snap-appeal-rule...,20,0.186007,0.075749,0.063973,0.526316
38,JoeBiden,,Small businesses are the backbone of this coun...,2264,7833,823,1261013914025750528,https://twitter.com/JoeBiden/status/1261013914...,939091,2020-05-14 19:22:00+00:00,Thu May 14 19:22:00 +0000 2020,,,,https://joebiden.com/smallbusinesses/,9,0.055696,0.031168,0.028739,0.236842


## Find out if the tweet is a retweet to some other persons tweet
#### We do this because retweeted tweets get a far greater reach and thus skews the scores

In [11]:
df["ifTo"] = df.apply(lambda x: 0.6 if x["to"] else 1,axis = 1)

In [12]:
df.sample(3)

Unnamed: 0,username,to,text,retweets,favorites,replies,id,permalink,author_id,date,...,hashtags,mentions,geo,urls,keywordCount,retweetsNormalisedCount,favoritesNormalisedCount,repliesNormalisedCount,keywordCountNormalisedCount,ifTo
47,JoeBiden,realDonaldTrump,Enough with the anger and division and fear-mo...,10005,47238,12770,1260701573765836805,https://twitter.com/JoeBiden/status/1260701573...,939091,2020-05-13 22:40:52+00:00,...,,,,https://twitter.com/realDonaldTrump/status/126...,20,0.246132,0.187961,0.445927,0.526316,0.6
40,JoeBiden,,As the economy continues to spiral downwards a...,7961,27437,2073,1260968615450271744,https://twitter.com/JoeBiden/status/1260968615...,939091,2020-05-14 16:22:00+00:00,...,,,,https://medium.com/@JoeBiden/statement-by-vice...,23,0.195847,0.109172,0.072389,0.605263,1.0
61,JoeBiden,,I believe everyone deserves a fair shot at suc...,15354,105992,9700,1260011558899499010,https://twitter.com/JoeBiden/status/1260011558...,939091,2020-05-12 00:59:00+00:00,...,,,,,9,0.377721,0.421745,0.338723,0.236842,1.0


## Get time elapsed since every tweet.
### We hypothise that older tweets get more exposure

In [13]:
df["formatted_date"] = df.apply(lambda x: x["formatted_date"][:-10]+x["formatted_date"][-4:], axis=1)

df["timeElapsedHours"] = df.apply(lambda x: (pd.Timestamp(endDate) - pd.to_datetime(x["formatted_date"])).total_seconds()//3600, axis = 1 )

df["timeElapsedScore"] = 1+0.04*(7*12-df["timeElapsedHours"])/7/24

## Assign Score to tweet on basis of normalised count of retweets, favorites, replies, keyword count and time Elapsed

In [14]:
df["score"] = (1.5*df["retweetsNormalisedCount"] + 1.2*df["repliesNormalisedCount"] + 1*df["favoritesNormalisedCount"] + 2*df["keywordCount"])*df["ifTo"]*df["timeElapsedScore"]

In [15]:
sortedDF = df.sort_values(by=['score'], inplace=False,  ascending=False)

In [16]:
sortedDF.head(5)

Unnamed: 0,username,to,text,retweets,favorites,replies,id,permalink,author_id,date,...,urls,keywordCount,retweetsNormalisedCount,favoritesNormalisedCount,repliesNormalisedCount,keywordCountNormalisedCount,ifTo,timeElapsedHours,timeElapsedScore,score
25,JoeBiden,,Los fracasos internacionales de Trump han abie...,1293,5125,1468,1261404739109113856,https://twitter.com/JoeBiden/status/1261404739...,939091,2020-05-15 21:15:00+00:00,...,,38,0.031809,0.020392,0.051262,1.0,1.0,74.0,1.002381,76.310882
26,JoeBiden,,Trump's international failures have cleared a ...,6525,32228,6326,1261403984100818944,https://twitter.com/JoeBiden/status/1261403984...,939091,2020-05-15 21:12:00+00:00,...,,27,0.160521,0.128236,0.220903,0.710526,1.0,74.0,1.002381,54.764182
8,JoeBiden,,As we reflect on the incredible legacy of Brow...,6168,30730,1360,1262133289982164992,https://twitter.com/JoeBiden/status/1262133289...,939091,2020-05-17 21:30:00+00:00,...,,26,0.151738,0.122275,0.047491,0.684211,1.0,26.0,1.01381,53.130586
6,JoeBiden,,"A year ago today, the House passed the #Equali...",6838,26337,1103,1262171039863394305,https://twitter.com/JoeBiden/status/1262171039...,939091,2020-05-18 00:00:00+00:00,...,,25,0.168221,0.104796,0.038517,0.657895,1.0,24.0,1.014286,51.123394
1,JoeBiden,,- Expand vote-by-mail and early voting - Imple...,9191,37621,2820,1262480579624235008,https://twitter.com/JoeBiden/status/1262480579...,939091,2020-05-18 20:30:00+00:00,...,,24,0.226106,0.149695,0.098474,0.631579,1.0,3.0,1.019286,49.544444


In [17]:
# finalDF = sortedDF.drop(['to', 'retweets', 'favorites', 'replies', 'date',
#        'hashtags', 'mentions', 'geo', 'urls', 'keywordCount',
#        'ifTo', 'timeElapsedHours', 'timeElapsedScore'], axis = 1)

# finalDF.head(5).to_csv("JoeBiden.csv", index = False)

In [19]:
tweetsDF = sortedDF.head(5)

In [20]:
tweetsDF.columns

Index(['username', 'to', 'text', 'retweets', 'favorites', 'replies', 'id',
       'permalink', 'author_id', 'date', 'formatted_date', 'hashtags',
       'mentions', 'geo', 'urls', 'keywordCount', 'retweetsNormalisedCount',
       'favoritesNormalisedCount', 'repliesNormalisedCount',
       'keywordCountNormalisedCount', 'ifTo', 'timeElapsedHours',
       'timeElapsedScore', 'score'],
      dtype='object')

In [23]:
tweetsList = []
for i, row in df.iterrows():
    d = {}
    d['username'] = row['username']
    d['date'] = row['formatted_date']
    d['replies'] = row['replies']
    d['retweets'] = row['retweets']
    d['favorites'] = row['favorites']
    d['text'] = row['text']
    d['link'] = row['permalink']
#     d['hashtag'] = row['hashtags']
#     d['mentions'] = row['mentions']
    d['author_id'] = row['author_id']
    tweetsList.append(d)

In [24]:
tweetsList[0]

{'username': 'JoeBiden',
 'date': 'Mon May 18 22:20:00 2020',
 'replies': 1072,
 'retweets': 4613,
 'favorites': 16987,
 'text': 'One of President Trump\'s top advisors admitted that this administration "really let the country down" on testing. He\'s right. It\'s long past time for Trump to step up and implement a national testing strategy so we can begin safely reopening the economy.',
 'link': 'https://twitter.com/JoeBiden/status/1262508260411621377',
 'author_id': 939091}