# Check (That Tweet) Yo Self 
## Prioritizing Tweets to Fact Check
###### Part 2: Data Cleaning
In this notebook, we'll clean the tweets we've gathered about Coronavirus.

Import necessary libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests
import time
import warnings
import regex as re
import seaborn as sns
import re

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

from nltk.sentiment.vader import SentimentIntensityAnalyzer
warnings.filterwarnings('ignore')
np.random.seed(824)
from bs4 import BeautifulSoup 

# Import stopwords.
from nltk.corpus import stopwords # Import the stopword list
import nltk

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

plt.style.use('fivethirtyeight')

### Read in collected dataframe

In [2]:
tweet = pd.read_csv('../data/all_tweets.csv')

### Drop duplicate tweets (a little over 100 tweets removed)

In [3]:
tweet.shape

(103608, 14)

In [4]:
tweet.head(2)

Unnamed: 0,id,type,time,author,author_id,re_tweeter,associated_tweet,text,links,hashtags,mentions,reply_count,favorite_count,retweet_count
0,1254198473819291649,tweet,1587859192000,PulpNews,100986964,,1254198473819291649,Isolation and boredom of staying at home can b...,['https://t.co/49b7W0d6V5'],[],[],0,0,0
1,1254198461563637763,tweet,1587859189000,aishacs,15809934,,1254197958595301386,We left the trail early once we saw that the s...,[],[],[],2,21,0


In [5]:
tweet = tweet.drop_duplicates(subset='id', keep="first")

In [6]:
tweet.shape

(103492, 14)

### Drop "re_tweeter" column: it comprises only null entries
### Drop "type" column: it comprises only "tweet" entries, no knowledge gain.

In [7]:
tweet.isnull().sum()

id                       0
type                     0
time                     0
author                   0
author_id                0
re_tweeter          103492
associated_tweet         0
text                    14
links                    0
hashtags                 0
mentions                 0
reply_count              0
favorite_count           0
retweet_count            0
dtype: int64

In [8]:
tweet['type'].value_counts()

tweet    103492
Name: type, dtype: int64

In [9]:
tweet = tweet.drop(columns = ['re_tweeter', 'type'])

### Remove additional missing data

In [10]:
tweet.isnull().sum()

id                   0
time                 0
author               0
author_id            0
associated_tweet     0
text                14
links                0
hashtags             0
mentions             0
reply_count          0
favorite_count       0
retweet_count        0
dtype: int64

In [11]:
tweet = tweet.dropna()

### 1. Convert time column to readable text
### 2. Addition of day column to see time distribution of collected tweets

In [12]:
tweet.dtypes

id                   int64
time                 int64
author              object
author_id            int64
associated_tweet     int64
text                object
links               object
hashtags            object
mentions            object
reply_count          int64
favorite_count       int64
retweet_count        int64
dtype: object

In [13]:
tweet['time'] = pd.to_numeric(tweet['time'])

import time

def change_time(x):
    """Format an epoch timestamp in milliseconds as 'YYYY-MM-DD HH:MM:SS'.

    The timestamp is interpreted with ``time.localtime``, i.e. in the local
    timezone of the machine running the notebook, so the returned string (and
    any date derived from it) depends on where the notebook is executed.
    """
    # Source values are milliseconds since the epoch; localtime wants seconds.
    epoch_seconds = x / 1000
    local_struct = time.localtime(epoch_seconds)
    return time.strftime('%Y-%m-%d %H:%M:%S', local_struct)

tweet['time'] = tweet['time'].apply(change_time)

In [14]:
def day_classification(string):
    """Return the weekday name for the collection date found in ``string``.

    Only the four dates of the collection window (2020-04-23 .. 2020-04-26)
    are recognised. Any other input yields ``None``, which shows up as a
    missing value in the resulting column.
    """
    # Checked top to bottom; the first date contained in the string wins.
    day_by_date = (
        ("2020-04-26", "Sunday"),
        ("2020-04-25", "Saturday"),
        ("2020-04-24", "Friday"),
        ("2020-04-23", "Thursday"),
    )
    for date_text, day_name in day_by_date:
        if date_text in string:
            return day_name
    return None

tweet['day'] = tweet['time'].map(day_classification)    

In [15]:
tweet['day'].value_counts()

Saturday    46873
Friday      39723
Thursday    11034
Name: day, dtype: int64

### There are 5848 tweets that were collected that aren't within our timeframe of interest. Dropping these tweets from the data

In [16]:
tweet.isnull().sum()

id                     0
time                   0
author                 0
author_id              0
associated_tweet       0
text                   0
links                  0
hashtags               0
mentions               0
reply_count            0
favorite_count         0
retweet_count          0
day                 5848
dtype: int64

In [17]:
tweet = tweet.dropna()

### Keep only text that is predominantly recognized as English (checked against the NLTK words corpus)

In [18]:
#set english words
english = set(nltk.corpus.words.words())

def is_english(string, vocabulary=None):
    """Return the proportion of tokens in ``string`` not recognised as English.

    The text is split on whitespace and every token is lower-cased and looked
    up verbatim in ``vocabulary``. Tokens with attached punctuation (for
    example ``"home,"``) are therefore counted as non-English unless that
    exact form is in the vocabulary.

    Scoring per token:
      * not in the vocabulary                           -> 1
      * in the vocabulary but not purely alphabetic     -> 0.33
      * otherwise                                       -> 0

    Parameters
    ----------
    string : str
        Raw tweet text.
    vocabulary : collection of str, optional
        Lower-case words treated as English. Defaults to the notebook-level
        ``english`` set, so ``Series.apply(is_english)`` keeps working.

    Returns
    -------
    float
        Score sum divided by the number of tokens, rounded to 4 decimals.
        Text with no tokens at all (empty or whitespace-only) returns 1.0,
        because it has no recognisable English content; previously this case
        raised ``ZeroDivisionError``.
    """
    if vocabulary is None:
        vocabulary = english

    words = string.split()
    # Guard against blank text: there is nothing to divide by.
    if not words:
        return 1.0

    count = 0
    for word in words:
        lowered = word.lower()
        if lowered not in vocabulary:
            # Unknown token: counts fully as non-English.
            count += 1
        elif not lowered.isalpha():
            # Known token containing non-letters: partial penalty.
            count += .33
    return round(count / len(words), 4)

#apply to this function and create a new column
tweet['not_english'] = tweet['text'].apply(is_english)

In [19]:
tweet.head(2)

Unnamed: 0,id,time,author,author_id,associated_tweet,text,links,hashtags,mentions,reply_count,favorite_count,retweet_count,day,not_english
0,1254198473819291649,2020-04-25 16:59:52,PulpNews,100986964,1254198473819291649,Isolation and boredom of staying at home can b...,['https://t.co/49b7W0d6V5'],[],[],0,0,0,Saturday,0.48
1,1254198461563637763,2020-04-25 16:59:49,aishacs,15809934,1254197958595301386,We left the trail early once we saw that the s...,[],[],[],2,21,0,Saturday,0.2174


In [20]:
tweet.describe()

Unnamed: 0,id,author_id,associated_tweet,reply_count,favorite_count,retweet_count,not_english
count,97630.0,97630.0,97630.0,97630.0,97630.0,97630.0,97630.0
mean,1.253957e+18,3.914429e+17,1.253282e+18,1.456345,15.607426,3.84599,0.363189
std,244166200000000.0,5.131221e+17,1.487264e+16,56.718205,625.546981,152.979652,0.203048
min,1.253218e+18,767.0,15078550000.0,0.0,0.0,0.0,0.0
25%,1.253824e+18,222753100.0,1.253794e+18,0.0,0.0,0.0,0.2222
50%,1.253835e+18,2259061000.0,1.253834e+18,0.0,0.0,0.0,0.3125
75%,1.254191e+18,9.518731e+17,1.254183e+18,1.0,2.0,0.0,0.4545
max,1.254199e+18,1.254107e+18,1.254199e+18,11718.0,112659.0,28381.0,1.0


### Setting a criteria for amount of english within the tweet
- Example: 
- (tweet[tweet['not_english'] < .4]) 
- fewer than 40% of the tweet's words are flagged as non-English by the NLTK word list (i.e. the tweet is more than 60% English)

In [24]:
tweet[tweet['not_english'] < 0.5]

Unnamed: 0,id,time,author,author_id,associated_tweet,text,links,hashtags,mentions,reply_count,favorite_count,retweet_count,day,not_english
0,1254198473819291649,2020-04-25 16:59:52,PulpNews,100986964,1254198473819291649,Isolation and boredom of staying at home can b...,['https://t.co/49b7W0d6V5'],[],[],0,0,0,Saturday,0.4800
1,1254198461563637763,2020-04-25 16:59:49,aishacs,15809934,1254197958595301386,We left the trail early once we saw that the s...,[],[],[],2,21,0,Saturday,0.2174
2,1254198450494885893,2020-04-25 16:59:47,nonatofilho,50183821,1254198450494885893,"During the period of isolation in Brazil, I wa...",[],[],['@realDonaldTrump'],0,1,0,Saturday,0.2857
3,1254198394022768640,2020-04-25 16:59:33,abbiesbuswell,1086009979692335108,1254198394022768640,@pritchardfan happy birthday ella !! hope you ...,[],[],['@pritchardfan'],1,1,0,Saturday,0.2143
4,1254198364209664000,2020-04-25 16:59:26,Oof_utd,1243653781679734784,1254198103504228352,Had a wank in isolation,[],[],[],1,1,0,Saturday,0.2000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103602,1253829172020908032,2020-04-24 16:32:24,JimKingTweet,800030102935306242,1253829172020908032,Maybe if we promise to drink bleach at dinner ...,[],[],[],0,0,0,Friday,0.1111
103603,1253829170204798976,2020-04-24 16:32:23,VictorNarraway,944181190285598724,1253481322141679619,A frightening 89 thousand approved of Trump's ...,[],[],[],0,0,0,Friday,0.2500
103604,1253829169080926215,2020-04-24 16:32:23,Stephaniegiann8,763887982898212864,1253751812194070529,Trump didn’t say to drink bleach,[],[],[],0,0,0,Friday,0.1667
103605,1253829166861959169,2020-04-24 16:32:23,William48013192,1198328078688112640,1253801998127857664,Not drinking bleach?,[],[],[],0,2,0,Friday,0.3333


### using 50% english now, might change that later..

In [25]:
# Keep only tweets in which fewer than 50% of the words were flagged as non-English.
tweet = tweet[tweet['not_english'] < .5]

In [26]:
tweet.shape

(75468, 14)

In [27]:
tweet.isnull().sum()

id                  0
time                0
author              0
author_id           0
associated_tweet    0
text                0
links               0
hashtags            0
mentions            0
reply_count         0
favorite_count      0
retweet_count       0
day                 0
not_english         0
dtype: int64

### Utilize iterable hashtag and mention items to add columns for counts of hashtags and mentions associated with each tweet

In [28]:
def to_list(string):
    """Parse the text form of a list (as stored in the CSV) into a real list.

    Square brackets and single quotes are stripped, the remainder is split on
    commas and each piece is trimmed of surrounding whitespace.

    Parameters
    ----------
    string : str
        Text such as ``"['#covid', '#stayhome']"`` or ``"[]"``.

    Returns
    -------
    list of str
        The individual items. An empty list representation (``"[]"``) now
        yields ``[]``; previously it produced ``['']``, a list containing a
        single empty string.
    """
    # Drop the list brackets and the quotes wrapped around every item.
    cleaned = string.replace('[', '').replace(']', '').replace('\'', '')
    # Split on commas, trim whitespace, and skip pieces that end up empty.
    pieces = (piece.strip() for piece in cleaned.split(','))
    return [piece for piece in pieces if piece]

tweet['hashtags'] = tweet['hashtags'].map(to_list)
tweet['mentions'] = tweet['mentions'].map(to_list)

In [29]:
def count_hash(count_list):
    """Return how many items of ``count_list`` contain a '#' character."""
    return sum(1 for entry in count_list if '#' in entry)

tweet['hashtag_count'] = tweet['hashtags'].map(count_hash)

In [30]:
tweet['hashtag_count'].value_counts(normalize = True)

0     0.863174
1     0.066863
2     0.029867
3     0.017544
4     0.009289
5     0.005632
6     0.002942
7     0.002173
8     0.000954
9     0.000663
10    0.000411
11    0.000239
12    0.000119
13    0.000093
14    0.000040
Name: hashtag_count, dtype: float64

In [31]:
def count_at(count_list):
    """Return how many items of ``count_list`` contain an '@' character."""
    return sum(1 for entry in count_list if "@" in entry)

tweet['mention_count'] = tweet['mentions'].map(count_at)

In [32]:
tweet['mention_count'].value_counts(normalize = True)

0     0.890232
1     0.084089
2     0.015238
3     0.005486
4     0.002478
5     0.001153
6     0.000610
7     0.000331
8     0.000159
9     0.000106
10    0.000080
11    0.000027
12    0.000013
Name: mention_count, dtype: float64

### Create columns for tweet word count and character count

In [33]:
def word_count(string):
    """Return the number of whitespace-separated tokens in ``string``."""
    tokens = string.split()
    return len(tokens)

tweet['word_count'] = tweet['text'].map(word_count)

In [34]:
tweet['word_count'].value_counts(normalize = True)

15    0.034544
13    0.032742
16    0.031828
17    0.030967
11    0.030914
        ...   
60    0.000172
61    0.000053
62    0.000027
68    0.000013
63    0.000013
Name: word_count, Length: 64, dtype: float64

In [35]:
def char_count(string):
    """Return the length of ``string`` in characters, whitespace included."""
    n_chars = len(string)
    return n_chars

tweet['char_count'] = tweet['text'].map(char_count)

In [36]:
tweet['char_count'].value_counts(normalize = True)

280    0.012191
279    0.010455
278    0.010428
277    0.009620
276    0.008626
         ...   
478    0.000013
480    0.000013
647    0.000013
739    0.000013
447    0.000013
Name: char_count, Length: 486, dtype: float64

In [37]:
tweet.head(2)

Unnamed: 0,id,time,author,author_id,associated_tweet,text,links,hashtags,mentions,reply_count,favorite_count,retweet_count,day,not_english,hashtag_count,mention_count,word_count,char_count
0,1254198473819291649,2020-04-25 16:59:52,PulpNews,100986964,1254198473819291649,Isolation and boredom of staying at home can b...,['https://t.co/49b7W0d6V5'],[],[],0,0,0,Saturday,0.48,0,0,25,238
1,1254198461563637763,2020-04-25 16:59:49,aishacs,15809934,1254197958595301386,We left the trail early once we saw that the s...,[],[],[],2,21,0,Saturday,0.2174,0,0,46,253


### Adding column for count of attached links to tweet

In [38]:
# `links` still holds the raw CSV text of each list at this point, so an empty
# list is the 2-character string '[]' and anything longer has at least one link.
# The column is therefore a 0/1 "has a link" flag rather than a true count.
tweet['link_count'] = [int(len(raw_links) > 2) for raw_links in tweet['links']]

### Sentiment Analysis

In [39]:
def sentiment_score(string):
    """Return the VADER ``compound`` sentiment score for ``string``.

    The ``SentimentIntensityAnalyzer`` is created on the first call and then
    reused (stored on the function object as ``sentiment_score._analyzer``).
    Previously a new analyzer was built for every tweet, repeating its setup
    work once per row when mapped over the whole dataframe.

    Parameters
    ----------
    string : str
        Text to score.

    Returns
    -------
    float
        The ``'compound'`` entry of ``polarity_scores(string)``.
    """
    analyzer = getattr(sentiment_score, '_analyzer', None)
    if analyzer is None:
        # First use: build the analyzer once and remember it for later calls.
        analyzer = sentiment_score._analyzer = SentimentIntensityAnalyzer()
    return analyzer.polarity_scores(string)['compound']

tweet['text_sentiment'] = tweet['text'].map(sentiment_score)

In [40]:
tweet.describe()

Unnamed: 0,id,author_id,associated_tweet,reply_count,favorite_count,retweet_count,not_english,hashtag_count,mention_count,word_count,char_count,link_count,text_sentiment
count,75468.0,75468.0,75468.0,75468.0,75468.0,75468.0,75468.0,75468.0,75468.0,75468.0,75468.0,75468.0,75468.0
mean,1.253946e+18,3.938128e+17,1.253157e+18,1.602361,17.34086,3.992328,0.2724,0.300922,0.156146,25.420297,164.930779,0.307693,-0.030452
std,248130300000000.0,5.125541e+17,1.629141e+16,63.302824,701.278927,166.245342,0.102384,0.99034,0.555433,13.48498,86.561078,0.461542,0.499193
min,1.253218e+18,767.0,15078550000.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,-0.9901
25%,1.253822e+18,230266500.0,1.253774e+18,0.0,0.0,0.0,0.2,0.0,0.0,14.0,92.0,0.0,-0.4404
50%,1.253835e+18,2303449000.0,1.253832e+18,0.0,0.0,0.0,0.2727,0.0,0.0,23.0,155.0,0.0,0.0
75%,1.254189e+18,9.51214e+17,1.254176e+18,1.0,2.0,0.0,0.3438,0.0,0.0,37.0,240.0,1.0,0.3802
max,1.254199e+18,1.254107e+18,1.254199e+18,11718.0,112659.0,28381.0,0.4915,14.0,12.0,68.0,779.0,1.0,0.9917


### Adding column of text with links removed

In [41]:
# Remove https:// links (optionally preceded by an opening parenthesis) from the
# tweet text. The pattern is a raw string so that `\(`, `\/` and `\s` reach the
# regex engine unchanged instead of being treated as (invalid) string escapes,
# and it is compiled once rather than looked up for every row.
link_pattern = re.compile(r'(\(https:\/\/[^\s]+)|https:\/\/[^\s]+')
tweet['text_links_removed'] = [link_pattern.sub('', row) for row in tweet['text']]


In [42]:
# sklearn's stopwords, extracted
sklearn_stopwords = list(CountVectorizer(stop_words = 'english').get_stop_words())
#Custom created list
#custom_stopwords = ['nosleep', 
#                    'scarystories',
#                    'quarantine',
#                    'virus', 
#                    'covid',
#                    'pandemic',
#                    'coronavirus',
#                    'corona',
#                    'don',
#                    've',
#                    'pt',
#                    'didn',
#                    'wasn',
#                    'll',
#                    'shouldn',
#                    'woo',
#                    'hadn'
#                    'notext',
#                   ]
# Personalized stopwords
#personal_stopwords = sklearn_stopwords + custom_stopwords
#print(personal_stopwords)

### Alter text data by removing HTML, non-letters, and stopwords in addition to tokenizing and converting text to all lowercase

In [43]:
def tweet_to_words(raw_tweet, stops=None):
    """Reduce a raw tweet to a lower-case, stopword-free, space-joined string.

    Steps: strip HTML markup, replace every non-letter with a space,
    lower-case and split into words, drop stopwords, re-join with spaces.

    Parameters
    ----------
    raw_tweet : str
        Tweet text (here: text with links already removed).
    stops : collection of str, optional
        Words to discard. Defaults to NLTK's English stopword list; pass a
        custom set (e.g. ``set(personal_stopwords)``) to customise.

    Returns
    -------
    str
        The remaining words separated by single spaces (may be empty).
    """
    if stops is None:
        # Build the default stopword set once and reuse it on later calls
        # instead of re-reading the stopword list for every tweet.
        stops = getattr(tweet_to_words, '_default_stops', None)
        if stops is None:
            stops = tweet_to_words._default_stops = set(stopwords.words('english'))

    # 1. Remove HTML. The parser is named explicitly so the result does not
    #    depend on which optional parsers happen to be installed.
    tweet_text = BeautifulSoup(raw_tweet, 'html.parser').get_text()

    # 2. Remove non-letters.
    letters_only = re.sub("[^a-zA-Z]", " ", tweet_text)

    # 3. Convert to lower case, split into individual words.
    words = letters_only.lower().split()

    # 4. Remove stopwords.
    meaningful_words = [w for w in words if w not in stops]

    # 5. Join the words back into one string separated by space.
    return " ".join(meaningful_words)

In [46]:
# Get the number of tweets based on the dataframe size.
total_tweets = tweet.shape[0]
print(f'There are {total_tweets} tweets related to coronavirus.')

There are 75468 tweets related to coronavirus.


### Apply function to clean our tweet data

In [47]:
# Collect the cleaned version of every tweet, in dataframe order.
clean_tweets = []

print("Cleaning and parsing twitter data...")

# Walk the link-free tweet texts, counting from 1 for the progress messages.
for position, raw_text in enumerate(tweet['text_links_removed'], start=1):

    # Clean the tweet and store the result.
    clean_tweets.append(tweet_to_words(raw_text))

    # Report progress after every 5000 tweets.
    if position % 5000 == 0:
        print(f'Tweet {position} of {total_tweets}.')

Cleaning and parsing twitter data...
Tweet 5000 of 75468.
Tweet 10000 of 75468.
Tweet 15000 of 75468.
Tweet 20000 of 75468.
Tweet 25000 of 75468.
Tweet 30000 of 75468.
Tweet 35000 of 75468.
Tweet 40000 of 75468.
Tweet 45000 of 75468.
Tweet 50000 of 75468.
Tweet 55000 of 75468.
Tweet 60000 of 75468.
Tweet 65000 of 75468.
Tweet 70000 of 75468.
Tweet 75000 of 75468.


### Add cleaned tweet text to dataframe 

In [48]:
tweet = tweet.assign(clean_text = clean_tweets)

### Creating word and character count columns for cleaned text

In [49]:
tweet['clean_word_count'] = tweet['clean_text'].map(word_count)

In [50]:
tweet['clean_char_count'] = tweet['clean_text'].map(char_count)

In [51]:
tweet.head()

Unnamed: 0,id,time,author,author_id,associated_tweet,text,links,hashtags,mentions,reply_count,...,hashtag_count,mention_count,word_count,char_count,link_count,text_sentiment,text_links_removed,clean_text,clean_word_count,clean_char_count
0,1254198473819291649,2020-04-25 16:59:52,PulpNews,100986964,1254198473819291649,Isolation and boredom of staying at home can b...,['https://t.co/49b7W0d6V5'],[],[],0,...,0,0,25,238,1,-0.6124,Isolation and boredom of staying at home can b...,isolation boredom staying home harmful way doc...,11,64
1,1254198461563637763,2020-04-25 16:59:49,aishacs,15809934,1254197958595301386,We left the trail early once we saw that the s...,[],[],[],2,...,0,0,46,253,0,-0.296,We left the trail early once we saw that the s...,left trail early saw sentiment shifted worried...,24,156
2,1254198450494885893,2020-04-25 16:59:47,nonatofilho,50183821,1254198450494885893,"During the period of isolation in Brazil, I wa...",[],[],[@realDonaldTrump],0,...,0,1,28,168,0,0.6249,"During the period of isolation in Brazil, I wa...",period isolation brazil impressed film parasit...,12,99
3,1254198394022768640,2020-04-25 16:59:33,abbiesbuswell,1086009979692335108,1254198394022768640,@pritchardfan happy birthday ella !! hope you ...,[],[],[@pritchardfan],1,...,0,1,14,85,0,0.8652,@pritchardfan happy birthday ella !! hope you ...,pritchardfan happy birthday ella hope best day...,9,65
4,1254198364209664000,2020-04-25 16:59:26,Oof_utd,1243653781679734784,1254198103504228352,Had a wank in isolation,[],[],[],1,...,0,0,5,23,0,-0.4019,Had a wank in isolation,wank isolation,2,14


In [52]:
tweet.describe()

Unnamed: 0,id,author_id,associated_tweet,reply_count,favorite_count,retweet_count,not_english,hashtag_count,mention_count,word_count,char_count,link_count,text_sentiment,clean_word_count,clean_char_count
count,75468.0,75468.0,75468.0,75468.0,75468.0,75468.0,75468.0,75468.0,75468.0,75468.0,75468.0,75468.0,75468.0,75468.0,75468.0
mean,1.253946e+18,3.938128e+17,1.253157e+18,1.602361,17.34086,3.992328,0.2724,0.300922,0.156146,25.420297,164.930779,0.307693,-0.030452,14.064517,97.773546
std,248130300000000.0,5.125541e+17,1.629141e+16,63.302824,701.278927,166.245342,0.102384,0.99034,0.555433,13.48498,86.561078,0.461542,0.499193,7.444092,53.188216
min,1.253218e+18,767.0,15078550000.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,-0.9901,0.0,0.0
25%,1.253822e+18,230266500.0,1.253774e+18,0.0,0.0,0.0,0.2,0.0,0.0,14.0,92.0,0.0,-0.4404,8.0,54.0
50%,1.253835e+18,2303449000.0,1.253832e+18,0.0,0.0,0.0,0.2727,0.0,0.0,23.0,155.0,0.0,0.0,13.0,89.0
75%,1.254189e+18,9.51214e+17,1.254176e+18,1.0,2.0,0.0,0.3438,0.0,0.0,37.0,240.0,1.0,0.3802,20.0,140.0
max,1.254199e+18,1.254107e+18,1.254199e+18,11718.0,112659.0,28381.0,0.4915,14.0,12.0,68.0,779.0,1.0,0.9917,63.0,444.0


In [53]:
tweet.to_csv('../data/cleaned_with_lysol.csv', index = False)