In [1]:
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
import seaborn as sns
# Shared bag-of-words vectorizer; it is fitted later on the cleaned tweet text.
count = CountVectorizer()
# Shared Porter stemmer used by stemSentence().
porter = PorterStemmer()

# Part 1 - Building the dataset used to predict tweet likes

In [2]:
# VADER sentiment analyser; used below to compute each tweet's compound score.
analyser = SentimentIntensityAnalyzer()

In [3]:
# Load the raw tweet export into a pandas DataFrame.
df = pd.read_csv("tweets_with_retweet_count.csv")
# Optional: uncomment to work on a random 25% sample of the rows.
#df = df.sample(frac = .25)
#df.info()

In [4]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,text,created_at,tweet_id,url,source,user_name,screen_name,user_location,user_verified,user_followers_count,is_quoted_tweet,quote_count,reply_count,retweet_count,favorite_count,hashtags,user_mentions,full_text
0,Conflicting reports. Some say he has passed aw...,2020-02-06 20:31:38.000000,1.225517e+18,https://twitter.com/barbara_volkwyn/status/122...,Twitter Web App,Michael Volkwyn's sister,barbara_volkwyn,"Cape Town, South Africa",0.0,649.0,0.0,0.0,0.0,0,0,#Coronavirus #CoronavirusOutbreak,,Conflicting reports. Some say he has passed aw...
1,Coronavirus: Reports of Brighton patient havin...,2020-02-06 20:31:38.000000,1.225517e+18,https://twitter.com/Fund999ER/status/122551742...,Twitter for Android,#FundOurERServices,Fund999ER,,0.0,371.0,0.0,0.0,0.0,0,0,#coronavirus #coronavirusuk,,Coronavirus: Reports of Brighton patient havin...
2,People are worried about the #CoronaVirus and ...,2020-02-06 20:31:40.000000,1.225517e+18,https://twitter.com/deuceohsixx/status/1225517...,Twitter for iPhone,D.E. Jones,deuceohsixx,Seattle 🇺🇸,0.0,8488.0,0.0,0.0,0.0,0,5,#CoronaVirus,,People are worried about the #CoronaVirus and ...
3,Complete breakdown of these meds is here:\n\nh...,2020-02-06 20:31:49.000000,1.225517e+18,https://twitter.com/ContagionTrack/status/1225...,Twitter Web App,Contagion Tracker,ContagionTrack,Pandemic,0.0,1946.0,1.0,0.0,0.0,2,6,,,Complete breakdown of these meds is here:\n\nh...
4,May his soul rest in eternal peace,2020-02-06 20:31:50.000000,1.225517e+18,https://twitter.com/thabeanstalk/status/122551...,Twitter for Android,Jack Bean,thabeanstalk,,0.0,275.0,1.0,0.0,0.0,0,0,,,May his soul rest in eternal peace https://t.c...


In [5]:
# Columns not used as features in Part 1. retweet_count is dropped here while
# favorite_count is kept (presumably as the likes target -- confirm downstream).
likes_unused_columns = [
    "retweet_count",
    "text",
    "created_at",
    "tweet_id",
    "url",
    "user_name",
    "screen_name",
    "user_location",
    "source",
    "user_mentions",
]
df = df.drop(columns=likes_unused_columns)

In [6]:
# Derive three numeric features from the raw text columns.

# word_count: number of whitespace-separated words in the tweet text.
# (Lower-casing first is unnecessary: it does not change how the text splits.)
df["word_count"] = df['full_text'].str.split().str.len()

# hash_count: number of whitespace-separated entries in `hashtags`.
# Missing hashtags (NaN) propagate through .str and are counted as 0.
df["hash_count"] = df['hashtags'].str.split().str.len().fillna(0).astype("int64")

# sentiment: VADER compound score of the original (un-lowercased) tweet text.
df["sentiment"] = df['full_text'].apply(lambda x: analyser.polarity_scores(x)['compound'])

In [7]:
# Preview the frame with the new word_count, hash_count and sentiment columns.
df.head()

Unnamed: 0,user_verified,user_followers_count,is_quoted_tweet,quote_count,reply_count,favorite_count,hashtags,full_text,word_count,hash_count,sentiment
0,0.0,649.0,0.0,0.0,0.0,0,#Coronavirus #CoronavirusOutbreak,Conflicting reports. Some say he has passed aw...,13,2,-0.4019
1,0.0,371.0,0.0,0.0,0.0,0,#coronavirus #coronavirusuk,Coronavirus: Reports of Brighton patient havin...,10,2,0.0
2,0.0,8488.0,0.0,0.0,0.0,5,#CoronaVirus,People are worried about the #CoronaVirus and ...,34,1,-0.9477
3,0.0,1946.0,1.0,0.0,0.0,6,,Complete breakdown of these meds is here:\n\nh...,21,0,0.7717
4,0.0,275.0,1.0,0.0,0.0,0,,May his soul rest in eternal peace https://t.c...,8,0,0.5423


In [8]:
def remove_punct(sentence):
    """Tokenize ``sentence`` and drop tokens that are a single punctuation mark.

    Parameters
    ----------
    sentence : str
        Text to tokenize with ``nltk.word_tokenize``.

    Returns
    -------
    list of str
        The tokens in their original order, without any token that is exactly
        one of the characters in ``)(@?:`!.&,;``.
    """
    # A set gives exact, whole-token matching; testing against the raw string
    # would be a substring test and could also match multi-character runs.
    punctuations = frozenset(")(@?:`!.&,;")
    # Build a new list instead of calling list.remove() while iterating:
    # removing during iteration shifts the remaining items left, so the token
    # right after each removed one was skipped and consecutive punctuation
    # tokens survived.
    return [word for word in nltk.word_tokenize(sentence) if word not in punctuations]

def remove_links_hash(sentence):
    """Return ``sentence`` without any word that contains "http" or "#".

    The text is split on whitespace; the surviving words are re-joined with
    single spaces, so original spacing and line breaks are not preserved.
    """
    kept = [tok for tok in sentence.split() if "http" not in tok and "#" not in tok]
    return " ".join(kept)


def stemSentence(sentence):
    """Stem every token of ``sentence`` with the module-level Porter stemmer.

    Returns the stems concatenated into one string, each followed by a single
    space (so a non-empty result ends with a trailing space).
    """
    stems = (porter.stem(token) for token in word_tokenize(sentence))
    return "".join(stem + " " for stem in stems)

In [9]:
# Normalised text for the bag-of-words model: lowercase, drop link/hashtag
# words, stem each token, then drop standalone punctuation tokens.
df['manipulated_text'] = (
    df['full_text']
    .str.lower()
    .apply(remove_links_hash)
    .apply(stemSentence)
    .apply(lambda text: " ".join(remove_punct(text)))
)

In [10]:
# Learn the vocabulary from the cleaned text and build the sparse
# document-term count matrix (one row per tweet, one column per term).
bag_of_words = count.fit_transform(df['manipulated_text'])
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer its replacement and fall back only on older installations.
if hasattr(count, "get_feature_names_out"):
    feature_names = list(count.get_feature_names_out())
else:
    feature_names = count.get_feature_names()
# Vocabulary size.
len(feature_names)

17877

In [11]:
# Dense per-tweet term counts. Reuse df's index so the later index-on-index
# merge pairs every tweet with its own counts even if df was sampled or
# filtered (a fresh RangeIndex would silently misalign or drop rows).
wc = pd.DataFrame(bag_of_words.toarray(), columns=feature_names, index=df.index)
wc.columns = wc.columns.astype(str)
# Total occurrences of each term across all tweets, most frequent first.
s = wc.sum().sort_values(ascending=False)


In [12]:
# Keep only the 100 most frequent terms, in descending order of total count.
# Selecting them directly avoids first reordering (and copying) the entire
# dense frame, and lets this cell be re-run after wc has been narrowed.
top_terms = s.index[:100]
wc = wc.loc[:, top_terms]

wc.head()

Unnamed: 0,the,to,of,in,is,and,thi,on,for,it,...,my,go,warn,test,over,need,wenliang,updat,novel,tri
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,1,0,0,1,1,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,0,1,1,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# The raw and intermediate text columns are no longer needed once the numeric
# features and term counts have been derived from them.
text_columns = ["hashtags", "full_text", "manipulated_text"]
df = df.drop(columns=text_columns)

In [14]:
# Preview the frame after the text columns were dropped.
df.head()

Unnamed: 0,user_verified,user_followers_count,is_quoted_tweet,quote_count,reply_count,favorite_count,word_count,hash_count,sentiment
0,0.0,649.0,0.0,0.0,0.0,0,13,2,-0.4019
1,0.0,371.0,0.0,0.0,0.0,0,10,2,0.0
2,0.0,8488.0,0.0,0.0,0.0,5,34,1,-0.9477
3,0.0,1946.0,1.0,0.0,0.0,6,21,0,0.7717
4,0.0,275.0,1.0,0.0,0.0,0,8,0,0.5423


In [15]:
# Attach the term-count columns to the tweet-level features, matching rows by index.
merged = pd.merge(df, wc, left_index=True, right_index=True)

In [16]:
# Preview the combined feature + term-count frame.
merged.head()

Unnamed: 0,user_verified,user_followers_count,is_quoted_tweet,quote_count,reply_count,favorite_count,word_count,hash_count,sentiment,the,...,my,go,warn,test,over,need,wenliang,updat,novel,tri
0,0.0,649.0,0.0,0.0,0.0,0,13,2,-0.4019,0,...,0,0,0,0,0,0,0,0,0,0
1,0.0,371.0,0.0,0.0,0.0,0,10,2,0.0,0,...,0,0,0,0,0,0,0,0,0,0
2,0.0,8488.0,0.0,0.0,0.0,5,34,1,-0.9477,3,...,0,0,0,0,0,1,0,0,0,0
3,0.0,1946.0,1.0,0.0,0.0,6,21,0,0.7717,0,...,0,0,0,0,0,0,0,0,0,0
4,0.0,275.0,1.0,0.0,0.0,0,8,0,0.5423,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# Write both datasets to CSV (without the index column):
# the feature-only frame, and the frame that also includes the term counts.
df.to_csv("simplified_tweets_for_likes.csv", index = False)
merged.to_csv('cleaned_tweets_for_likes.csv', index = False)

# Part 2 - Building the dataset used to predict retweet counts

In [18]:
# Reload the raw tweet export into a fresh DataFrame, replacing the frame
# that was modified in the cells above.
df = pd.read_csv("tweets_with_retweet_count.csv")
# Optional: uncomment to work on a random 25% sample of the rows.
#df = df.sample(frac = .25)
#df.info()
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,text,created_at,tweet_id,url,source,user_name,screen_name,user_location,user_verified,user_followers_count,is_quoted_tweet,quote_count,reply_count,retweet_count,favorite_count,hashtags,user_mentions,full_text
0,Conflicting reports. Some say he has passed aw...,2020-02-06 20:31:38.000000,1.225517e+18,https://twitter.com/barbara_volkwyn/status/122...,Twitter Web App,Michael Volkwyn's sister,barbara_volkwyn,"Cape Town, South Africa",0.0,649.0,0.0,0.0,0.0,0,0,#Coronavirus #CoronavirusOutbreak,,Conflicting reports. Some say he has passed aw...
1,Coronavirus: Reports of Brighton patient havin...,2020-02-06 20:31:38.000000,1.225517e+18,https://twitter.com/Fund999ER/status/122551742...,Twitter for Android,#FundOurERServices,Fund999ER,,0.0,371.0,0.0,0.0,0.0,0,0,#coronavirus #coronavirusuk,,Coronavirus: Reports of Brighton patient havin...
2,People are worried about the #CoronaVirus and ...,2020-02-06 20:31:40.000000,1.225517e+18,https://twitter.com/deuceohsixx/status/1225517...,Twitter for iPhone,D.E. Jones,deuceohsixx,Seattle 🇺🇸,0.0,8488.0,0.0,0.0,0.0,0,5,#CoronaVirus,,People are worried about the #CoronaVirus and ...
3,Complete breakdown of these meds is here:\n\nh...,2020-02-06 20:31:49.000000,1.225517e+18,https://twitter.com/ContagionTrack/status/1225...,Twitter Web App,Contagion Tracker,ContagionTrack,Pandemic,0.0,1946.0,1.0,0.0,0.0,2,6,,,Complete breakdown of these meds is here:\n\nh...
4,May his soul rest in eternal peace,2020-02-06 20:31:50.000000,1.225517e+18,https://twitter.com/thabeanstalk/status/122551...,Twitter for Android,Jack Bean,thabeanstalk,,0.0,275.0,1.0,0.0,0.0,0,0,,,May his soul rest in eternal peace https://t.c...


In [19]:
# Columns not used as features in Part 2. favorite_count is dropped here while
# retweet_count is kept (presumably as the retweet target -- confirm downstream).
retweet_unused_columns = [
    "favorite_count",
    "text",
    "created_at",
    "tweet_id",
    "url",
    "user_name",
    "screen_name",
    "user_location",
    "source",
    "user_mentions",
]
df = df.drop(columns=retweet_unused_columns)

In [20]:
# Derive three numeric features from the raw text columns.

# word_count: number of whitespace-separated words in the tweet text.
# (Lower-casing first is unnecessary: it does not change how the text splits.)
df["word_count"] = df['full_text'].str.split().str.len()

# hash_count: number of whitespace-separated entries in `hashtags`.
# Missing hashtags (NaN) propagate through .str and are counted as 0.
df["hash_count"] = df['hashtags'].str.split().str.len().fillna(0).astype("int64")

# sentiment: VADER compound score of the original (un-lowercased) tweet text.
df["sentiment"] = df['full_text'].apply(lambda x: analyser.polarity_scores(x)['compound'])

In [21]:
# Normalised text for the bag-of-words model: lowercase, drop link/hashtag
# words, stem each token, then drop standalone punctuation tokens.
df['manipulated_text'] = (
    df['full_text']
    .str.lower()
    .apply(remove_links_hash)
    .apply(stemSentence)
    .apply(lambda text: " ".join(remove_punct(text)))
)

In [22]:
# Learn the vocabulary from the cleaned text and build the sparse
# document-term count matrix (one row per tweet, one column per term).
bag_of_words = count.fit_transform(df['manipulated_text'])
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer its replacement and fall back only on older installations.
if hasattr(count, "get_feature_names_out"):
    feature_names = list(count.get_feature_names_out())
else:
    feature_names = count.get_feature_names()
# Vocabulary size.
len(feature_names)

17877

In [23]:
# Dense per-tweet term counts. Reuse df's index so the later index-on-index
# merge pairs every tweet with its own counts even if df was sampled or
# filtered (a fresh RangeIndex would silently misalign or drop rows).
wc = pd.DataFrame(bag_of_words.toarray(), columns=feature_names, index=df.index)
wc.columns = wc.columns.astype(str)
# Total occurrences of each term across all tweets, most frequent first.
s = wc.sum().sort_values(ascending=False)


In [24]:
# Keep only the 100 most frequent terms, in descending order of total count.
# Selecting them directly avoids first reordering (and copying) the entire
# dense frame, and lets this cell be re-run after wc has been narrowed.
top_terms = s.index[:100]
wc = wc.loc[:, top_terms]

wc.head()

Unnamed: 0,the,to,of,in,is,and,thi,on,for,it,...,my,go,warn,test,over,need,wenliang,updat,novel,tri
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,1,0,0,1,1,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,0,1,1,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
# The raw and intermediate text columns are no longer needed once the numeric
# features and term counts have been derived from them.
text_columns = ["hashtags", "full_text", "manipulated_text"]
df = df.drop(columns=text_columns)

In [26]:
# Attach the term-count columns to the tweet-level features, matching rows by index.
merged = pd.merge(df, wc, left_index=True, right_index=True)

In [27]:
# Write both datasets to CSV (without the index column):
# the feature-only frame, and the frame that also includes the term counts.
df.to_csv("simplified_tweets_for_retweet.csv", index = False)
merged.to_csv('cleaned_tweets_for_retweet.csv', index = False)