# Cleaning Tweets for Anti-Vax Article Clustering

In [1]:
import pandas as pd
import sys, os
import ast
import string, re, requests, urllib3
try:
    from unshortenit import UnshortenIt
    unshortener_available = True
except:
    unshortener_available = False
from urllib.parse import urljoin, urlparse, parse_qs

from bs4 import BeautifulSoup
import requests as reqs
import pickle
from tqdm import tqdm
tqdm.pandas()
import time
import signal

### text_cleaning
import nltk
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger') # for part of speech tagging, required for lemmatization
nltk.download('stopwords')
from nltk.corpus import stopwords



import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
import gensim.models.phrases



  from pandas import Panel
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/richardkuzma/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/richardkuzma/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/richardkuzma/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# ensure directory where data is stored is in the path
sys.path.append(os.path.join('./data/'))

In [3]:
# comment out once ready for CMU machine
NROWS = 10000 
SKIP = 0

CSV_name = 'CovidVaxTweetsWithGSDMMTopicsMayJune'
# CSV_name = 'CovidVaxTweetsWithGSDMMTopics'

In [4]:
# load df
pd.set_option('display.max_columns', None)
DATA_PATH = './data/'

df = pd.read_csv(DATA_PATH + CSV_name + '.csv', skiprows=SKIP, nrows=NROWS)

In [5]:
# downselect to columns we want, so speed up calculations
cols_to_keep = ['created_at', 
                'id_str', 
                'entities', 
                'user', 
                'retweet_count', 
                'favorite_count', 
#                 'retweeted', 
#                 'favorited', 
                'retweeted_status', 
                'Hash words', 
                'link', 
                'Topic Label']

df = df[cols_to_keep]

In [6]:
def get_base_url(url):
    if urlparse(url).netloc.lower() in link_shorteners:
        if unshortener_available:
            try:
                unshortener = UnshortenIt(default_timeout=3)
                base_url = unshortener.unshorten(url, unshorten_nested=True)
                return base_url
            except:
                return 'unreachable_shortened_url'
        else:
            urllib3.disable_warnings()
            session = requests.Session()
            try:
                resp = session.head(url, allow_redirects=True, timeout=3.0, verify=False)
                base_url = resp.url
                return base_url
            except:
                return 'unreachable_shortened_url'
    else:
        return url


def process_url(entities):
    ents = ast.literal_eval(entities)
    if ents is not None:
        if "urls" in ents:
            urls = ents["urls"]
            new_urls =[]
            if len(urls) > 0:
                for u in [url["expanded_url"] for url in ents["urls"]]:
                    new_url = get_base_url(u)
                    new_urls.append(new_url)
            else:
                new_urls = ['None']
        else:
            new_urls = ['None']
    else:
        new_urls = ['None']
        
    return new_urls


def get_youtube_id(urls):
    you_tube_ids =[]
    for url in urls:
        if url !='None':
            query = urlparse(url)
            if query.hostname == 'youtu.be':
                you_tube_ids.append(query.path[1:])
            elif query.hostname in ('www.youtube.com', 'youtube.com', 'm.youtube.com'):
                p = parse_qs(query.query)
                you_tube_ids.append(p.get('v',['None'])[0])
            else:
                you_tube_ids.append('None')
        else:
            you_tube_ids.append('None')
    return you_tube_ids



def get_domain(urls):
    domains =[]
    for url in urls:
        if url !='None':
            domains.append(urlparse(url).netloc.lower())
        else:
            domains.append('None')
            
    return domains
        

link_shorteners =['trib.al', 'bit.ly','www.bit.ly','tinyurl','ow.ly','buff.ly',
                           'rebrand.ly', 'dlvr.it','sco.lt', 'shar.es', 'spr.ly',
                           'zpr.io', 'zurl.co', 'tinyurl.com', 'ht.ly', 'youtu.be',
                           't.ly', 'snip.ly', 'qoo.ly', 'loom.ly', 'invst.ly',
                           'hubs.ly', 'gates.ly', 'frost.ly', 'fcld.ly', 'cutt.ly',
                           'all.be', 'amzn.to', 'goo.gl', 'is.gd', 'bit.do', 'mcaf.ee',
                           'shorte.st', 'bc.vc', 'lnkd.in', 't.co', 'ift.tt', 'flip.it',
                           'reut.rs', 'nyti.ms', 'chng.it', 'cnn.it', 'cnb.cx', 'mol.im',
                           'paper.li', 'toi.in', 'flip.it', 'hill.cm', 'bbc.in',
                           'ti.me', 'politi.co', 'aje.io', 'gizmo.do', 'youtu.be']    

def check_for_link_shortener(entities):
    ents = ast.literal_eval(entities)
    if ents is not None:
        if "urls" in ents:
            urls = ents["urls"]
            shortened =[]
            if len(urls) > 0:
                for u in [url["expanded_url"] for url in ents["urls"]]:
                    if urlparse(u).netloc.lower() in link_shorteners:
                        shortened.append('True')
                    else:
                        shortened.append('False')
            else:
                shortened = ['None']
        else:
            shortened = ['None']
    else:
        shortened = ['None']
        
    return shortened

def process_hashtags(entities):
    ents = ast.literal_eval(entities)
    if ents is not None:
        if "hashtags" in ents:
            hashtags_info = ents["hashtags"]
            if len(hashtags_info) > 0:
                hashtags=[]
                for h in hashtags_info:
                    hashtags.append(h["text"])
            else:
                hashtags = ['None']
        else:
            hashtags = ['None']
    else:
        hashtags = ['None']
        
    return hashtags

def no_link(urls: list):
    if urls == ['None']:
        return True
    else:
        return False

def twitter_or_youtube_in_domain(domain_list: list):
    for domain in domain_list: # deal with one, not entire list
        if ('twitter.com' in domain) or ('youtube.com' in domain):
            return True
    return False

def pull_first_url(url_list: list):
    return url_list[0]


# def signal_handler(signum, frame):
#     raise Exception("Timed out!")

# signal.signal(signal.SIGALRM, signal_handler)
# signal.alarm(300)   # Ten seconds
# try:
#     long_function_call()
# except Exception, msg:
#     print "Timed out!"

def get_article_text_from_url_list(link):
    count = 1
#     link = link_list[0] # I made it so we're only looking at the first URL in list
    
#     signal.signal(signal.SIGALRM, signal_handler)
#     signal.alarm(11)   # 11 sec
    try:
        r = requests.get(link, timeout=6)
        content = r.content
        soup = BeautifulSoup(content, 'html.parser')
        temp = ''
        for tag in soup.find_all('p'):
            temp = temp + ' ' + tag.get_text()
        r.close() 
#         print(count)
#         count +=1
        return temp
    except:
#         print(count)
#         count +=1
        return "exception occurred"



In [7]:
def process_data(data_df):
    print('original shape: {}'.format(data_df.shape))
    # process urls first
    data_df['urls'] = data_df['entities'].apply(process_url)

    
    # identify which tweets have no link. remove them
    data_df['no_link'] = data_df['urls'].apply(lambda x: no_link(x))
    data_df = data_df[data_df['no_link'] == False]
    print('links only: {}'.format(data_df.shape))
    
    # identify twitter or youtube links (can't be embed). Remove them
    data_df['twitter_or_youtube'] = data_df['urls'].apply(lambda x: twitter_or_youtube_in_domain(x))
    data_df = data_df[data_df['twitter_or_youtube'] == False]
    print('no youtube or twitter links: {}'.format(data_df.shape))
    
    # pull only first url from list for simplicity
    data_df['first_url'] = data_df['urls'].apply(lambda x: pull_first_url(x))
    
    # iain's processing
    data_df['domains']=data_df['urls'].apply(get_domain)
    data_df['day_of_tweet'] = pd.to_datetime(data_df['created_at']).dt.to_period('D')
    data_df['month_year'] = pd.to_datetime(data_df['created_at']).dt.to_period('M')
    data_df['week_month_year'] = pd.to_datetime(data_df['created_at']).dt.weekofyear
    data_df['user'] = data_df['user'].apply(lambda x: ast.literal_eval(x))
    data_df['Topic Label'] = data_df['Topic Label'].apply(lambda x: ast.literal_eval(x))
    data_df['user_id'] = data_df['user'].map(lambda x: x['id_str'])
    data_df['cluster'] = data_df['Topic Label'].map(lambda x: x[0])
    data_df['is_retweet'] = ~data_df['retweeted_status'].isnull()
    data_df['hashtags']=data_df['entities'].apply(process_hashtags)
    data_df['link_shorteners'] = data_df['entities'].apply(check_for_link_shortener)
    
    return data_df
    
def scrape_articles(data_df):
    # extract text using requests and beautiful soup... leaves something to be desired
    data_df['article_text'] = data_df['first_url'].progress_apply(lambda x: get_article_text_from_url_list(x))

    return data_df


def drop_short_articles(data_df):    
    index_names = data_df[data_df['article_text'].str.len() < 500].index
    data_df.drop(index_names, inplace = True)
    
    return data_df


In [8]:
# Iain's processing
df = process_data(df)
df.reset_index(inplace=True, drop=True)

### sum total retweets and total favorites for each url shared in a tweet
df['url_total_retweets'] = df.groupby(['first_url'])['retweet_count'].transform('sum')
df['url_total_favorites'] = df.groupby(['first_url'])['favorite_count'].transform('sum')


### drop duplicate URLs
df.drop_duplicates(subset=['first_url'], inplace=True)


original shape: (10000, 12)
links only: (1845, 14)
no youtube or twitter links: (1339, 15)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


In [9]:
### scrape articles (6sec timeout)
df = scrape_articles(df)    

### drop short articles
df = drop_short_articles(df)


 15%|█▌        | 145/964 [03:31<08:46,  1.56it/s] Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
 17%|█▋        | 165/964 [03:50<05:55,  2.25it/s]  Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
 49%|████▊     | 468/964 [14:23<14:12,  1.72s/it]   Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
 74%|███████▍  | 711/964 [21:27<02:01,  2.08it/s]  Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
 77%|███████▋  | 742/964 [22:13<06:33,  1.77s/it]Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
 81%|████████  | 780/964 [23:28<03:07,  1.02s/it]Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
100%|██████████| 964/964 [28:12<00:00,  1.76s/it]


## Clean text

In [10]:
def remove_stopwords_and_simple_preprocess(texts):
    stop_words = stopwords.words('english')
    return [[word for word in simple_preprocess(str(doc), deacc = True) if word not in stop_words] for doc in texts]

In [11]:
df['article_text_preprocessed'] = remove_stopwords_and_simple_preprocess(df['article_text'])

## Save df

In [None]:
# save df to pkl
df_name = 'tweets_urls_articles_cleaned.pkl'

with open(DATA_PATH + df_name, 'wb') as f:
    pickle.dump(df, f)

# ====================================