In [1]:
import GetOldTweets3 as got
import pandas as pd
import datetime as dt

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

import re
import time

stopWords = set(stopwords.words('english'))

In [2]:
username = '@JoeBiden'
startDate = "2020-05-12"
endDate = "2020-05-19"

In [3]:
tweetCriteria = got.manager.TweetCriteria().setUsername(username).setSince(startDate).setUntil(endDate)
tweets = got.manager.TweetManager.getTweets(tweetCriteria)

In [4]:
df = pd.DataFrame([tweet.__dict__ for tweet in tweets])
originalDF = df

df = df.drop([], axis=1)

In [5]:
df.sample(3)

Unnamed: 0,username,to,text,retweets,favorites,replies,id,permalink,author_id,date,formatted_date,hashtags,mentions,geo,urls
58,JoeBiden,realDonaldTrump,That’s the plan. http://IWillVote.com,23430,128870,5095,1260267215921844225,https://twitter.com/JoeBiden/status/1260267215...,939091,2020-05-12 17:54:53+00:00,Tue May 12 17:54:53 +0000 2020,,,,"http://IWillVote.com,https://twitter.com/realD..."
45,JoeBiden,NPR,Moments of crisis require swift and decisive a...,4982,18009,1418,1260733099652022273,https://twitter.com/JoeBiden/status/1260733099...,939091,2020-05-14 00:46:08+00:00,Thu May 14 00:46:08 +0000 2020,,,,https://twitter.com/NPR/status/126040338410607...
16,JoeBiden,,We need a president who puts hardworking Ameri...,4079,16299,2032,1261760080422322176,https://twitter.com/JoeBiden/status/1261760080...,939091,2020-05-16 20:47:00+00:00,Sat May 16 20:47:00 +0000 2020,,,,


## Finding number of important words (keywords)

In [6]:
# This function  returns count of nouns, verbs, aadjectives, adverbs in the given tweet
def getkeywordCount(tweet):
    # Remove non alphanumeric characters 
    tweet = re.sub(r'[^\w]', ' ', tweet)
    # make tokens
    wordTokens = word_tokenize(tweet)
    # remove stopwords &
    # perform lemmatisation 
    lemmatizer = WordNetLemmatizer() 
    filtered = [lemmatizer.lemmatize(w) for w in wordTokens if not w in stopWords]
    # tag parts of speech
    tagged = nltk.pos_tag(filtered)
    keywords = [w[0] for w in tagged if w[1][0] in ["N", "J", "V", "R"]]
    return len(keywords)

In [7]:
df['keywordCount'] = df.apply(lambda x : getkeywordCount(x["text"]), axis=1)

In [8]:
df.sample(3)

Unnamed: 0,username,to,text,retweets,favorites,replies,id,permalink,author_id,date,formatted_date,hashtags,mentions,geo,urls,keywordCount
45,JoeBiden,NPR,Moments of crisis require swift and decisive a...,4982,18009,1418,1260733099652022273,https://twitter.com/JoeBiden/status/1260733099...,939091,2020-05-14 00:46:08+00:00,Thu May 14 00:46:08 +0000 2020,,,,https://twitter.com/NPR/status/126040338410607...,19
59,JoeBiden,,Here's the deal: We can choose who our country...,11332,53361,4275,1260225720435032065,https://twitter.com/JoeBiden/status/1260225720...,939091,2020-05-12 15:10:00+00:00,Tue May 12 15:10:00 +0000 2020,,,,,16
20,JoeBiden,,Science over fiction.,22560,163707,13255,1261665206184312838,https://twitter.com/JoeBiden/status/1261665206...,939091,2020-05-16 14:30:00+00:00,Sat May 16 14:30:00 +0000 2020,,,,,2


## Normalise the columns

In [9]:
for feature in ["retweets", "favorites", "replies", 'keywordCount']:
    df[feature+"NormalisedCount"] = df[feature]/(df[feature].max())#-df[feature].min())

In [10]:
df.sample(3)

Unnamed: 0,username,to,text,retweets,favorites,replies,id,permalink,author_id,date,formatted_date,hashtags,mentions,geo,urls,keywordCount,retweetsNormalisedCount,favoritesNormalisedCount,repliesNormalisedCount,keywordCountNormalisedCount
58,JoeBiden,realDonaldTrump,That’s the plan. http://IWillVote.com,23430,128870,5095,1260267215921844225,https://twitter.com/JoeBiden/status/1260267215...,939091,2020-05-12 17:54:53+00:00,Tue May 12 17:54:53 +0000 2020,,,,"http://IWillVote.com,https://twitter.com/realD...",4,0.576384,0.512779,0.177917,0.105263
34,JoeBiden,,Donald Trump can deflect and shift blame all h...,14149,37688,3054,1261070033905577985,https://twitter.com/JoeBiden/status/1261070033...,939091,2020-05-14 23:05:00+00:00,Thu May 14 23:05:00 +0000 2020,,,,,17,0.348069,0.149962,0.106645,0.447368
12,JoeBiden,,"Today, on the International Day Against Homoph...",8979,40872,1828,1262021805180506117,https://twitter.com/JoeBiden/status/1262021805...,939091,2020-05-17 14:07:00+00:00,Sun May 17 14:07:00 +0000 2020,#IDAHOTB,,,https://medium.com/@JoeBiden/statement-by-vice...,22,0.220886,0.162631,0.063834,0.578947


## Find out if the tweet is a retweet to some other persons tweet
#### We do this because retweeted tweets get a far greater reach and thus skews the scores

In [11]:
df["ifTo"] = df.apply(lambda x: 0.6 if x["to"] else 1,axis = 1)

In [12]:
df.sample(3)

Unnamed: 0,username,to,text,retweets,favorites,replies,id,permalink,author_id,date,...,hashtags,mentions,geo,urls,keywordCount,retweetsNormalisedCount,favoritesNormalisedCount,repliesNormalisedCount,keywordCountNormalisedCount,ifTo
20,JoeBiden,,Science over fiction.,22560,163707,13255,1261665206184312838,https://twitter.com/JoeBiden/status/1261665206...,939091,2020-05-16 14:30:00+00:00,...,,,,,2,0.554982,0.651396,0.462863,0.052632,1.0
42,JoeBiden,,Happy Birthday to the one and only @StevieWonder.,2106,18265,700,1260764765556740098,https://twitter.com/JoeBiden/status/1260764765...,939091,2020-05-14 02:51:58+00:00,...,,@StevieWonder,,,3,0.051808,0.072677,0.024444,0.078947,1.0
3,JoeBiden,,Donald Trump is the most corrupt president in ...,37786,206478,28637,1262420180690505733,https://twitter.com/JoeBiden/status/1262420180...,939091,2020-05-18 16:30:00+00:00,...,,,,,12,0.929545,0.821584,1.0,0.315789,1.0


## Get time elapsed since every tweet.
### We hypothise that older tweets get more exposure

In [13]:
df["formatted_date"] = df.apply(lambda x: x["formatted_date"][:-10]+x["formatted_date"][-4:], axis=1)

df["timeElapsedHours"] = df.apply(lambda x: (pd.Timestamp(endDate) - pd.to_datetime(x["formatted_date"])).total_seconds()//3600, axis = 1 )

df["timeElapsedScore"] = 1+0.04*(7*12-df["timeElapsedHours"])/7/24

## Assign Score to tweet on basis of normalised count of retweets, favorites, replies, keyword count and time Elapsed

In [14]:
df["score"] = (1.5*df["retweetsNormalisedCount"] + 1.2*df["repliesNormalisedCount"] + 1*df["favoritesNormalisedCount"] + 2*df["keywordCount"])*df["ifTo"]*df["timeElapsedScore"]

In [15]:
sortedDF = df.sort_values(by=['score'], inplace=False,  ascending=False)

In [16]:
sortedDF.head(5)

Unnamed: 0,username,to,text,retweets,favorites,replies,id,permalink,author_id,date,...,urls,keywordCount,retweetsNormalisedCount,favoritesNormalisedCount,repliesNormalisedCount,keywordCountNormalisedCount,ifTo,timeElapsedHours,timeElapsedScore,score
25,JoeBiden,,Los fracasos internacionales de Trump han abie...,1293,5126,1468,1261404739109113856,https://twitter.com/JoeBiden/status/1261404739...,939091,2020-05-15 21:15:00+00:00,...,,38,0.031808,0.020397,0.051262,1.0,1.0,74.0,1.002381,76.310885
26,JoeBiden,,Trump's international failures have cleared a ...,6525,32228,6326,1261403984100818944,https://twitter.com/JoeBiden/status/1261403984...,939091,2020-05-15 21:12:00+00:00,...,,27,0.160517,0.128236,0.220903,0.710526,1.0,74.0,1.002381,54.764176
8,JoeBiden,,As we reflect on the incredible legacy of Brow...,6168,30730,1360,1262133289982164992,https://twitter.com/JoeBiden/status/1262133289...,939091,2020-05-17 21:30:00+00:00,...,,26,0.151734,0.122276,0.047491,0.684211,1.0,26.0,1.01381,53.13058
6,JoeBiden,,"A year ago today, the House passed the #Equali...",6838,26338,1103,1262171039863394305,https://twitter.com/JoeBiden/status/1262171039...,939091,2020-05-18 00:00:00+00:00,...,,25,0.168216,0.1048,0.038517,0.657895,1.0,24.0,1.014286,51.123392
1,JoeBiden,,- Expand vote-by-mail and early voting - Imple...,9191,37622,2820,1262480579624235008,https://twitter.com/JoeBiden/status/1262480579...,939091,2020-05-18 20:30:00+00:00,...,,24,0.226101,0.149699,0.098474,0.631579,1.0,3.0,1.019286,49.544441


In [17]:
# finalDF = sortedDF.drop(['to', 'retweets', 'favorites', 'replies', 'date',
#        'hashtags', 'mentions', 'geo', 'urls', 'keywordCount',
#        'ifTo', 'timeElapsedHours', 'timeElapsedScore'], axis = 1)

# finalDF.head(5).to_csv("JoeBiden.csv", index = False)

In [21]:
tweetsDF = sortedDF.head(5)

In [22]:
tweetsDF.columns

Index(['username', 'to', 'text', 'retweets', 'favorites', 'replies', 'id',
       'permalink', 'author_id', 'date', 'formatted_date', 'hashtags',
       'mentions', 'geo', 'urls', 'keywordCount', 'retweetsNormalisedCount',
       'favoritesNormalisedCount', 'repliesNormalisedCount',
       'keywordCountNormalisedCount', 'ifTo', 'timeElapsedHours',
       'timeElapsedScore', 'score'],
      dtype='object')

In [25]:
tweetsList = []
for i, row in df.iterrows():
    d = {}
    d['username'] = row['username']
    d['date'] = row['formatted_date']
    d['replies'] = row['replies']
    d['retweets'] = row['retweets']
    d['favorites'] = row['favorites']
    d['text'] = d['text']
    tweetsList.append(d)

In [27]:
tweetsList

[{'username': 'JoeBiden',
  'date': 'Fri May 15 21:15:00 2020',
  'replies': 1468,
  'retweets': 1293,
  'favorites': 5126},
 {'username': 'JoeBiden',
  'date': 'Fri May 15 21:12:00 2020',
  'replies': 6326,
  'retweets': 6525,
  'favorites': 32228},
 {'username': 'JoeBiden',
  'date': 'Sun May 17 21:30:00 2020',
  'replies': 1360,
  'retweets': 6168,
  'favorites': 30730},
 {'username': 'JoeBiden',
  'date': 'Mon May 18 00:00:00 2020',
  'replies': 1103,
  'retweets': 6838,
  'favorites': 26338},
 {'username': 'JoeBiden',
  'date': 'Mon May 18 20:30:00 2020',
  'replies': 2820,
  'retweets': 9191,
  'favorites': 37622}]