## Project Outline

#### Get text data
- ingest csv to df
- drop tweets without URLs
- ignore youtube and twitter urls
- for a given tweet with a url:
    - for a given url
    - get year, month, date from 2020-01-31 23:04:55+00:00 format
    - request url content
    - scrape and aggregate paragraph tags or article tag
    - preprocess text
    - save text in df
- load google news pretrained d2v model
    - embed text from each article
    - store in dataframe with url, embedding, text?, year, month, week
- url_df
    - t-sne or PCA by month
    - https://towardsdatascience.com/visualizing-word-embedding-with-pca-and-t-sne-961a692509f5


In [None]:
import pandas as pd
import sys, os
import ast
import string, re, requests, urllib3
try:
    from unshortenit import UnshortenIt
    unshortener_available = True
except:
    unshortener_available = False
from urllib.parse import urljoin, urlparse, parse_qs

from bs4 import BeautifulSoup
import requests as reqs
import pickle
from tqdm import tqdm
tqdm.pandas()
import time
import signal



In [None]:
sys.path.append(os.path.join('./data/'))

In [None]:
NROWS = 10000
SKIP = 100000

# CSV_name = 'CovidVaxTweetsWithGSDMMTopicsMayJune'
CSV_name = 'CovidVaxTweetsWithGSDMMTopics'

In [None]:
# keep column names
df_with_cols = pd.read_csv('./data/' + CSV_name + '.csv', nrows=50)
df_colnames = list(df_with_cols.columns)

# load big df
pd.set_option('display.max_columns', None)
df_big = pd.read_csv('./data/' + CSV_name + '.csv', skiprows = SKIP, nrows=NROWS, names = df_colnames)


In [None]:
df_big.head()

In [None]:
# CSV_name = 'CovidVaxTweetsWithGSDMMTopicsMayJune'

# skip 500k --> jan 27
# skip 400k --> feb 13
# skip 300k --> feb 29
# skip 2
# skip 275k --> apr 04
# skip 250k --> apr 10
# skip 200k --> apr25
# skip 170k --> may01
# skip 150k --> may05
# skip 100k --> may 15
# skip 50k  --> may 21
# skip 0k   --> may 31

# CSV_name = 'CovidVaxTweetsWithGSDMMTopics'
# skip 100k --> 29FEB



### currently have...
# january 31  -- 1182 articles
# february 13 -- 1367 articles
# april 01    -- 532 articles
# april 10    -- 567 articles
# may 31      -- 897 articles


In [None]:
# downselect to columns we want, so speed up calculations
cols_to_keep = ['created_at', 'user', 'id_str', 'retweeted_status', 'Hash words', 'link', 'entities', 'Topic Label']
# .copy() makes df an independent frame, so adding columns to it later does not
# write into a slice of df_big (which triggers pandas' SettingWithCopyWarning).
df = df_big[cols_to_keep].copy()
df.head()

In [None]:
def get_base_url(url):
    """Resolve a shortened URL to the address it redirects to.

    URLs whose host is not in the module-level ``link_shorteners`` list are
    returned unchanged. Shortened URLs are expanded with ``UnshortenIt`` when
    that package was importable (``unshortener_available``); otherwise the
    redirects are followed with a HEAD request.

    Args:
        url: The URL to inspect.

    Returns:
        The expanded URL, the original URL when its host is not a listed
        shortener, or the sentinel string 'unreachable_shortened_url' when
        the expansion fails.
    """
    if urlparse(url).netloc.lower() not in link_shorteners:
        return url

    if unshortener_available:
        try:
            unshortener = UnshortenIt(default_timeout=3)
            return unshortener.unshorten(url, unshorten_nested=True)
        except Exception:
            # Best effort: any lookup failure maps to the sentinel. Unlike a bare
            # `except:`, KeyboardInterrupt still stops a long-running apply.
            return 'unreachable_shortened_url'

    # NOTE(review): certificate verification is switched off below (verify=False),
    # so the matching urllib3 warning is silenced. Confirm this is acceptable.
    urllib3.disable_warnings()
    try:
        # The context manager releases the session's pooled connections.
        with requests.Session() as session:
            resp = session.head(url, allow_redirects=True, timeout=3.0, verify=False)
            return resp.url
    except Exception:
        return 'unreachable_shortened_url'


def process_url(entities):
    """Return the resolved URLs found in a tweet's ``entities`` field.

    Args:
        entities: String holding a Python literal (parsed with
            ``ast.literal_eval``), expected to be ``None`` or a mapping that
            may contain a "urls" list of dicts with an "expanded_url" key.

    Returns:
        A list with one ``get_base_url`` result per URL entry, or ``['None']``
        when the literal is None, has no "urls" key, or the list is empty.
    """
    parsed = ast.literal_eval(entities)
    if parsed is None or "urls" not in parsed:
        return ['None']

    url_entries = parsed["urls"]
    if len(url_entries) == 0:
        return ['None']

    # Collect every expanded URL first, then resolve each one in order.
    expanded_links = [entry["expanded_url"] for entry in url_entries]
    return [get_base_url(link) for link in expanded_links]


def get_youtube_id(urls):
    """Return the YouTube video id for each URL, or 'None' where there is none.

    ``youtu.be`` links yield their path without the leading slash; links on
    the www/bare/m ``youtube.com`` hosts yield the first ``v`` query value.
    Every other entry, including the placeholder string 'None', maps to 'None'.

    Args:
        urls: Iterable of URL strings.

    Returns:
        A list of ids, parallel to ``urls``.
    """
    long_form_hosts = ('www.youtube.com', 'youtube.com', 'm.youtube.com')

    def _video_id(url):
        # Resolve a single URL to its video id or the 'None' placeholder.
        if url == 'None':
            return 'None'
        parts = urlparse(url)
        if parts.hostname == 'youtu.be':
            return parts.path[1:]
        if parts.hostname in long_form_hosts:
            return parse_qs(parts.query).get('v', ['None'])[0]
        return 'None'

    return [_video_id(url) for url in urls]


def get_domain(urls):
    """Return the lower-cased network location of each URL.

    Args:
        urls: Iterable of URL strings; the placeholder 'None' is passed through.

    Returns:
        A list parallel to ``urls`` holding each URL's netloc, or 'None' for
        placeholder entries.
    """
    return [
        urlparse(link).netloc.lower() if link != 'None' else 'None'
        for link in urls
    ]
        

# Hosts treated as link shorteners. URLs whose netloc appears here are expanded by
# get_base_url and flagged by check_for_link_shortener. Each host is listed once
# (the list previously repeated 'flip.it' and 'youtu.be').
# NOTE(review): 'tinyurl' has no TLD, so it only matches a URL whose netloc is
# literally 'tinyurl'; kept for backward compatibility.
link_shorteners = ['trib.al', 'bit.ly', 'www.bit.ly', 'tinyurl', 'ow.ly', 'buff.ly',
                   'rebrand.ly', 'dlvr.it', 'sco.lt', 'shar.es', 'spr.ly',
                   'zpr.io', 'zurl.co', 'tinyurl.com', 'ht.ly', 'youtu.be',
                   't.ly', 'snip.ly', 'qoo.ly', 'loom.ly', 'invst.ly',
                   'hubs.ly', 'gates.ly', 'frost.ly', 'fcld.ly', 'cutt.ly',
                   'all.be', 'amzn.to', 'goo.gl', 'is.gd', 'bit.do', 'mcaf.ee',
                   'shorte.st', 'bc.vc', 'lnkd.in', 't.co', 'ift.tt', 'flip.it',
                   'reut.rs', 'nyti.ms', 'chng.it', 'cnn.it', 'cnb.cx', 'mol.im',
                   'paper.li', 'toi.in', 'hill.cm', 'bbc.in',
                   'ti.me', 'politi.co', 'aje.io', 'gizmo.do']

def check_for_link_shortener(entities):
    """Flag which URLs in a tweet's ``entities`` field use a link shortener.

    Args:
        entities: String holding a Python literal (parsed with
            ``ast.literal_eval``), expected to be ``None`` or a mapping that
            may contain a "urls" list of dicts with an "expanded_url" key.

    Returns:
        A list of the strings 'True' / 'False' (one per URL, 'True' when the
        URL's netloc is in ``link_shorteners``), or ``['None']`` when there
        are no URLs to check.
    """
    parsed = ast.literal_eval(entities)
    if parsed is None or "urls" not in parsed:
        return ['None']

    url_entries = parsed["urls"]
    if len(url_entries) == 0:
        return ['None']

    expanded_links = [entry["expanded_url"] for entry in url_entries]
    # The flags are strings, not booleans, matching what downstream code stores.
    return [
        'True' if urlparse(link).netloc.lower() in link_shorteners else 'False'
        for link in expanded_links
    ]

def process_hashtags(entities):
    """Return the hashtag texts found in a tweet's ``entities`` field.

    Args:
        entities: String holding a Python literal (parsed with
            ``ast.literal_eval``), expected to be ``None`` or a mapping that
            may contain a "hashtags" list of dicts with a "text" key.

    Returns:
        The list of hashtag texts, or ``['None']`` when the literal is None,
        has no "hashtags" key, or the list is empty.
    """
    parsed = ast.literal_eval(entities)
    if parsed is None or "hashtags" not in parsed:
        return ['None']

    tag_entries = parsed["hashtags"]
    if len(tag_entries) == 0:
        return ['None']

    return [entry["text"] for entry in tag_entries]

def no_link(urls: list):
    """Return True when ``urls`` is exactly the placeholder list ``['None']``."""
    return bool(urls == ['None'])

def twitter_or_youtube_in_domain(domain_list: list):
    """Return True if any entry contains 'twitter.com' or 'youtube.com'.

    The check is a plain substring test on each string, so it matches the
    text anywhere in the entry. Stops at the first match.
    """
    return any(
        ('twitter.com' in entry) or ('youtube.com' in entry)
        for entry in domain_list
    )

def pull_first_url(url_list: list):
    """Return the first element of ``url_list`` (IndexError if it is empty)."""
    first_url = url_list[0]
    return first_url

def signal_handler(signum, frame):
    """Signal callback that aborts the running code by raising ``Exception("Timed out!")``.

    Both arguments are the standard signal-handler parameters and are unused.
    """
    timeout_error = Exception("Timed out!")
    raise timeout_error

# signal.signal(signal.SIGALRM, signal_handler)
# signal.alarm(300)   # 300 seconds
# try:
#     long_function_call()
# except Exception, msg:
#     print "Timed out!"

def get_article_text_from_url_list(link_list: list):
    """Download the first URL in ``link_list`` and return the text of its <p> tags.

    Only the first URL is scraped. An 11-second SIGALRM alarm, handled by
    ``signal_handler``, is armed as a hard limit on top of the 6-second
    ``requests`` timeout. The alarm is always cancelled before returning, so
    it cannot fire later in unrelated code.

    Args:
        link_list: List of URL strings; only element 0 is used.

    Returns:
        The paragraph texts, each prefixed with one space and concatenated, or
        the sentinel string "exception occurred" on any failure (empty list,
        bad URL, network error, timeout, parse error).
    """
    signal.signal(signal.SIGALRM, signal_handler)
    signal.alarm(11)   # 11 sec
    try:
        link = link_list[0] # I made it so we're only looking at the first URL in list
        # The context manager closes the response even if parsing fails.
        with requests.get(link, timeout=6) as r:
            soup = BeautifulSoup(r.content, 'html.parser')
        return ''.join(' ' + tag.get_text() for tag in soup.find_all('p'))
    except Exception:
        # Best effort: one bad article must not abort the whole scrape. The alarm
        # handler raises a plain Exception, so timeouts end up here too, while
        # KeyboardInterrupt still propagates.
        return "exception occurred"
    finally:
        # Disarm the pending alarm; otherwise it would fire up to 11 s after the
        # call returned and raise "Timed out!" wherever execution happens to be.
        signal.alarm(0)



In [None]:
def process_data(data_df):
    """Keep tweets that link to scrapable pages and derive per-tweet columns.

    Steps:
      1. Resolve each tweet's URLs from its 'entities' column.
      2. Drop tweets without a link, then tweets whose URLs mention
         twitter.com or youtube.com.
      3. Add the derived columns 'first_url', 'domains', 'day_of_tweet',
         'month_year', 'week_month_year', 'user_id', 'cluster', 'is_retweet',
         'hashtags' and 'link_shorteners'. The 'user' and 'Topic Label'
         columns are replaced by their parsed Python literals.

    The input frame is not modified; a new frame is returned. The helper
    columns 'urls', 'no_link' and 'twitter_or_youtube' are kept in the result.

    Args:
        data_df: DataFrame with at least the columns 'entities', 'created_at',
            'user', 'Topic Label' and 'retweeted_status'.

    Returns:
        The filtered DataFrame with the derived columns added.
    """
    print(data_df.shape)
    # Work on a copy so the caller's frame is untouched and the column
    # assignments below never write into a slice (SettingWithCopyWarning).
    data_df = data_df.copy()

    # process urls first
    data_df['urls'] = data_df['entities'].apply(process_url)

    # identify which tweets have no link. remove them
    data_df['no_link'] = data_df['urls'].apply(no_link)
    data_df = data_df[~data_df['no_link'].astype(bool)].copy()
    print('links only: {}'.format(data_df.shape))

    # identify twitter or youtube links (can't be embed). Remove them
    data_df['twitter_or_youtube'] = data_df['urls'].apply(twitter_or_youtube_in_domain)
    data_df = data_df[~data_df['twitter_or_youtube'].astype(bool)].copy()
    print('no youtube or twitter links: {}'.format(data_df.shape))

    # pull only first url from list for simplicity
    data_df['first_url'] = data_df['urls'].apply(pull_first_url)

    # iain's processing
    data_df['domains'] = data_df['urls'].apply(get_domain)
    # Parse the timestamps once and reuse them for all three date columns.
    created = pd.to_datetime(data_df['created_at'])
    data_df['day_of_tweet'] = created.dt.to_period('D')
    data_df['month_year'] = created.dt.to_period('M')
    # Despite its name, this column holds only the ISO week number.
    # Series.dt.weekofyear no longer exists in pandas 2.x; isocalendar() is its
    # replacement, with a fallback for pandas versions that predate it.
    if hasattr(created.dt, 'isocalendar'):
        data_df['week_month_year'] = created.dt.isocalendar().week
    else:
        data_df['week_month_year'] = created.dt.weekofyear
    data_df['user'] = data_df['user'].apply(ast.literal_eval)
    data_df['Topic Label'] = data_df['Topic Label'].apply(ast.literal_eval)
    data_df['user_id'] = data_df['user'].map(lambda x: x['id_str'])
    data_df['cluster'] = data_df['Topic Label'].map(lambda x: x[0])
    data_df['is_retweet'] = ~data_df['retweeted_status'].isnull()
    data_df['hashtags'] = data_df['entities'].apply(process_hashtags)
    data_df['link_shorteners'] = data_df['entities'].apply(check_for_link_shortener)

    return data_df


def scrape_articles(data_df):
    """Add an 'article_text' column scraped from each row's 'urls' list.

    Uses tqdm's ``progress_apply`` to run ``get_article_text_from_url_list``
    on every row. The frame is modified in place and also returned.
    """
    # extract text using requests and beautiful soup... leaves something to be desired
    scraped_text = data_df['urls'].progress_apply(get_article_text_from_url_list)
    data_df['article_text'] = scraped_text
    return data_df


def drop_short_articles(data_df):
    """Drop, in place, every row whose 'article_text' is shorter than 500 characters.

    Returns the same (mutated) DataFrame object.
    """
    is_too_short = data_df['article_text'].str.len() < 500
    short_labels = data_df.index[is_too_short.values]
    data_df.drop(short_labels, inplace = True)
    return data_df


In [None]:
df = process_data(df)
df_len = len(df)
with open('./data/cleaned/' + CSV_name + '_' + str(SKIP) + '_skip_' + str(NROWS) + '_rows_' + str(df_len) + '_processed.pkl', 'wb') as f:
    pickle.dump(df, f)



In [None]:
# df = scrape_articles(df)
# df_articles = len(df)
# with open('./data/cleaned/' + CSV_name + '_' + str(NROWS) + '_rows_' + str(df_len) + '_processed_'+ str(df_articles) + '_articles.pkl', 'wb') as f:
#     pickle.dump(df, f)
    

In [None]:
# df = drop_short_articles(df)
# df_articles_left = len(df)
# with open('./data/cleaned/' + CSV_name + '_' + str(NROWS) + '_rows_' + str(df_len) + '_processed_'+ str(df_articles) + '_articles_' + str(df_articles_left) + '_left.pkl', 'wb') as f:
#     pickle.dump(df, f)
    

In [None]:
print('finished.')


In [None]:
df.shape

# Restart Kernel, then start here

In [None]:
import pandas as pd
import sys, os
import ast
import string, re, requests, urllib3
try:
    from unshortenit import UnshortenIt
    unshortener_available = True
except:
    unshortener_available = False
from urllib.parse import urljoin, urlparse, parse_qs

from bs4 import BeautifulSoup
import requests as reqs
import pickle
from tqdm import tqdm
tqdm.pandas()
import time
import signal



sys.path.append(os.path.join('./data/'))

def get_base_url(url):
    """Resolve a shortened URL to the address it redirects to.

    URLs whose host is not in the module-level ``link_shorteners`` list are
    returned unchanged. Shortened URLs are expanded with ``UnshortenIt`` when
    that package was importable (``unshortener_available``); otherwise the
    redirects are followed with a HEAD request.

    Args:
        url: The URL to inspect.

    Returns:
        The expanded URL, the original URL when its host is not a listed
        shortener, or the sentinel string 'unreachable_shortened_url' when
        the expansion fails.
    """
    if urlparse(url).netloc.lower() not in link_shorteners:
        return url

    if unshortener_available:
        try:
            unshortener = UnshortenIt(default_timeout=3)
            return unshortener.unshorten(url, unshorten_nested=True)
        except Exception:
            # Best effort: any lookup failure maps to the sentinel. Unlike a bare
            # `except:`, KeyboardInterrupt still stops a long-running apply.
            return 'unreachable_shortened_url'

    # NOTE(review): certificate verification is switched off below (verify=False),
    # so the matching urllib3 warning is silenced. Confirm this is acceptable.
    urllib3.disable_warnings()
    try:
        # The context manager releases the session's pooled connections.
        with requests.Session() as session:
            resp = session.head(url, allow_redirects=True, timeout=3.0, verify=False)
            return resp.url
    except Exception:
        return 'unreachable_shortened_url'


def process_url(entities):
    """Return the resolved URLs found in a tweet's ``entities`` field.

    Args:
        entities: String holding a Python literal (parsed with
            ``ast.literal_eval``), expected to be ``None`` or a mapping that
            may contain a "urls" list of dicts with an "expanded_url" key.

    Returns:
        A list with one ``get_base_url`` result per URL entry, or ``['None']``
        when the literal is None, has no "urls" key, or the list is empty.
    """
    parsed = ast.literal_eval(entities)
    if parsed is None or "urls" not in parsed:
        return ['None']

    url_entries = parsed["urls"]
    if len(url_entries) == 0:
        return ['None']

    # Collect every expanded URL first, then resolve each one in order.
    expanded_links = [entry["expanded_url"] for entry in url_entries]
    return [get_base_url(link) for link in expanded_links]


def get_youtube_id(urls):
    """Return the YouTube video id for each URL, or 'None' where there is none.

    ``youtu.be`` links yield their path without the leading slash; links on
    the www/bare/m ``youtube.com`` hosts yield the first ``v`` query value.
    Every other entry, including the placeholder string 'None', maps to 'None'.

    Args:
        urls: Iterable of URL strings.

    Returns:
        A list of ids, parallel to ``urls``.
    """
    long_form_hosts = ('www.youtube.com', 'youtube.com', 'm.youtube.com')

    def _video_id(url):
        # Resolve a single URL to its video id or the 'None' placeholder.
        if url == 'None':
            return 'None'
        parts = urlparse(url)
        if parts.hostname == 'youtu.be':
            return parts.path[1:]
        if parts.hostname in long_form_hosts:
            return parse_qs(parts.query).get('v', ['None'])[0]
        return 'None'

    return [_video_id(url) for url in urls]


def get_domain(urls):
    """Return the lower-cased network location of each URL.

    Args:
        urls: Iterable of URL strings; the placeholder 'None' is passed through.

    Returns:
        A list parallel to ``urls`` holding each URL's netloc, or 'None' for
        placeholder entries.
    """
    return [
        urlparse(link).netloc.lower() if link != 'None' else 'None'
        for link in urls
    ]
        

# Hosts treated as link shorteners. URLs whose netloc appears here are expanded by
# get_base_url and flagged by check_for_link_shortener. Each host is listed once
# (the list previously repeated 'flip.it' and 'youtu.be').
# NOTE(review): 'tinyurl' has no TLD, so it only matches a URL whose netloc is
# literally 'tinyurl'; kept for backward compatibility.
link_shorteners = ['trib.al', 'bit.ly', 'www.bit.ly', 'tinyurl', 'ow.ly', 'buff.ly',
                   'rebrand.ly', 'dlvr.it', 'sco.lt', 'shar.es', 'spr.ly',
                   'zpr.io', 'zurl.co', 'tinyurl.com', 'ht.ly', 'youtu.be',
                   't.ly', 'snip.ly', 'qoo.ly', 'loom.ly', 'invst.ly',
                   'hubs.ly', 'gates.ly', 'frost.ly', 'fcld.ly', 'cutt.ly',
                   'all.be', 'amzn.to', 'goo.gl', 'is.gd', 'bit.do', 'mcaf.ee',
                   'shorte.st', 'bc.vc', 'lnkd.in', 't.co', 'ift.tt', 'flip.it',
                   'reut.rs', 'nyti.ms', 'chng.it', 'cnn.it', 'cnb.cx', 'mol.im',
                   'paper.li', 'toi.in', 'hill.cm', 'bbc.in',
                   'ti.me', 'politi.co', 'aje.io', 'gizmo.do']

def check_for_link_shortener(entities):
    """Flag which URLs in a tweet's ``entities`` field use a link shortener.

    Args:
        entities: String holding a Python literal (parsed with
            ``ast.literal_eval``), expected to be ``None`` or a mapping that
            may contain a "urls" list of dicts with an "expanded_url" key.

    Returns:
        A list of the strings 'True' / 'False' (one per URL, 'True' when the
        URL's netloc is in ``link_shorteners``), or ``['None']`` when there
        are no URLs to check.
    """
    parsed = ast.literal_eval(entities)
    if parsed is None or "urls" not in parsed:
        return ['None']

    url_entries = parsed["urls"]
    if len(url_entries) == 0:
        return ['None']

    expanded_links = [entry["expanded_url"] for entry in url_entries]
    # The flags are strings, not booleans, matching what downstream code stores.
    return [
        'True' if urlparse(link).netloc.lower() in link_shorteners else 'False'
        for link in expanded_links
    ]

def process_hashtags(entities):
    """Return the hashtag texts found in a tweet's ``entities`` field.

    Args:
        entities: String holding a Python literal (parsed with
            ``ast.literal_eval``), expected to be ``None`` or a mapping that
            may contain a "hashtags" list of dicts with a "text" key.

    Returns:
        The list of hashtag texts, or ``['None']`` when the literal is None,
        has no "hashtags" key, or the list is empty.
    """
    parsed = ast.literal_eval(entities)
    if parsed is None or "hashtags" not in parsed:
        return ['None']

    tag_entries = parsed["hashtags"]
    if len(tag_entries) == 0:
        return ['None']

    return [entry["text"] for entry in tag_entries]

def no_link(urls: list):
    """Return True when ``urls`` is exactly the placeholder list ``['None']``."""
    return bool(urls == ['None'])

def twitter_or_youtube_in_domain(domain_list: list):
    """Return True if any entry contains 'twitter.com' or 'youtube.com'.

    The check is a plain substring test on each string, so it matches the
    text anywhere in the entry. Stops at the first match.
    """
    return any(
        ('twitter.com' in entry) or ('youtube.com' in entry)
        for entry in domain_list
    )

def pull_first_url(url_list: list):
    """Return the first element of ``url_list`` (IndexError if it is empty)."""
    first_url = url_list[0]
    return first_url

def signal_handler(signum, frame):
    """Signal callback that aborts the running code by raising ``Exception("Timed out!")``.

    Both arguments are the standard signal-handler parameters and are unused.
    """
    timeout_error = Exception("Timed out!")
    raise timeout_error

# signal.signal(signal.SIGALRM, signal_handler)
# signal.alarm(300)   # 300 seconds
# try:
#     long_function_call()
# except Exception, msg:
#     print "Timed out!"

def get_article_text_from_url_list(link_list: list):
    """Download the first URL in ``link_list`` and return the text of its <p> tags.

    Only the first URL is scraped. An 11-second SIGALRM alarm, handled by
    ``signal_handler``, is armed as a hard limit on top of the 6-second
    ``requests`` timeout. The alarm is always cancelled before returning, so
    it cannot fire later in unrelated code.

    Args:
        link_list: List of URL strings; only element 0 is used.

    Returns:
        The paragraph texts, each prefixed with one space and concatenated, or
        the sentinel string "exception occurred" on any failure (empty list,
        bad URL, network error, timeout, parse error).
    """
    signal.signal(signal.SIGALRM, signal_handler)
    signal.alarm(11)   # 11 sec
    try:
        link = link_list[0] # I made it so we're only looking at the first URL in list
        # The context manager closes the response even if parsing fails.
        with requests.get(link, timeout=6) as r:
            soup = BeautifulSoup(r.content, 'html.parser')
        return ''.join(' ' + tag.get_text() for tag in soup.find_all('p'))
    except Exception:
        # Best effort: one bad article must not abort the whole scrape. The alarm
        # handler raises a plain Exception, so timeouts end up here too, while
        # KeyboardInterrupt still propagates.
        return "exception occurred"
    finally:
        # Disarm the pending alarm; otherwise it would fire up to 11 s after the
        # call returned and raise "Timed out!" wherever execution happens to be.
        signal.alarm(0)



def process_data(data_df):
    """Keep tweets that link to scrapable pages and derive per-tweet columns.

    Steps:
      1. Resolve each tweet's URLs from its 'entities' column.
      2. Drop tweets without a link, then tweets whose URLs mention
         twitter.com or youtube.com.
      3. Add the derived columns 'first_url', 'domains', 'day_of_tweet',
         'month_year', 'week_month_year', 'user_id', 'cluster', 'is_retweet',
         'hashtags' and 'link_shorteners'. The 'user' and 'Topic Label'
         columns are replaced by their parsed Python literals.

    The input frame is not modified; a new frame is returned. The helper
    columns 'urls', 'no_link' and 'twitter_or_youtube' are kept in the result.

    Args:
        data_df: DataFrame with at least the columns 'entities', 'created_at',
            'user', 'Topic Label' and 'retweeted_status'.

    Returns:
        The filtered DataFrame with the derived columns added.
    """
    print(data_df.shape)
    # Work on a copy so the caller's frame is untouched and the column
    # assignments below never write into a slice (SettingWithCopyWarning).
    data_df = data_df.copy()

    # process urls first
    data_df['urls'] = data_df['entities'].apply(process_url)

    # identify which tweets have no link. remove them
    data_df['no_link'] = data_df['urls'].apply(no_link)
    data_df = data_df[~data_df['no_link'].astype(bool)].copy()
    print('links only: {}'.format(data_df.shape))

    # identify twitter or youtube links (can't be embed). Remove them
    data_df['twitter_or_youtube'] = data_df['urls'].apply(twitter_or_youtube_in_domain)
    data_df = data_df[~data_df['twitter_or_youtube'].astype(bool)].copy()
    print('no youtube or twitter links: {}'.format(data_df.shape))

    # pull only first url from list for simplicity
    data_df['first_url'] = data_df['urls'].apply(pull_first_url)

    # iain's processing
    data_df['domains'] = data_df['urls'].apply(get_domain)
    # Parse the timestamps once and reuse them for all three date columns.
    created = pd.to_datetime(data_df['created_at'])
    data_df['day_of_tweet'] = created.dt.to_period('D')
    data_df['month_year'] = created.dt.to_period('M')
    # Despite its name, this column holds only the ISO week number.
    # Series.dt.weekofyear no longer exists in pandas 2.x; isocalendar() is its
    # replacement, with a fallback for pandas versions that predate it.
    if hasattr(created.dt, 'isocalendar'):
        data_df['week_month_year'] = created.dt.isocalendar().week
    else:
        data_df['week_month_year'] = created.dt.weekofyear
    data_df['user'] = data_df['user'].apply(ast.literal_eval)
    data_df['Topic Label'] = data_df['Topic Label'].apply(ast.literal_eval)
    data_df['user_id'] = data_df['user'].map(lambda x: x['id_str'])
    data_df['cluster'] = data_df['Topic Label'].map(lambda x: x[0])
    data_df['is_retweet'] = ~data_df['retweeted_status'].isnull()
    data_df['hashtags'] = data_df['entities'].apply(process_hashtags)
    data_df['link_shorteners'] = data_df['entities'].apply(check_for_link_shortener)

    return data_df


def scrape_articles(data_df):
    """Add an 'article_text' column scraped from each row's 'urls' list.

    Uses tqdm's ``progress_apply`` to run ``get_article_text_from_url_list``
    on every row. The frame is modified in place and also returned.
    """
    # extract text using requests and beautiful soup... leaves something to be desired
    scraped_text = data_df['urls'].progress_apply(get_article_text_from_url_list)
    data_df['article_text'] = scraped_text
    return data_df


def drop_short_articles(data_df):
    """Drop, in place, every row whose 'article_text' is shorter than 500 characters.

    Returns the same (mutated) DataFrame object.
    """
    is_too_short = data_df['article_text'].str.len() < 500
    short_labels = data_df.index[is_too_short.values]
    data_df.drop(short_labels, inplace = True)
    return data_df


In [None]:
df_len = 1548

In [None]:

NROWS = 10000
SKIP = 100000

# CSV_name = 'CovidVaxTweetsWithGSDMMTopicsMayJune'
CSV_name = 'CovidVaxTweetsWithGSDMMTopics'

In [None]:
df_filepath = './data/cleaned/' + CSV_name + '_' + str(SKIP) +'_skip_' + str(NROWS) + '_rows_' + str(df_len) + '_processed.pkl'
with open(df_filepath, 'rb') as f:
    df = pickle.load(f)
df_len = len(df)



In [None]:
df.head()

In [None]:
df = scrape_articles(df)
df_articles = len(df)
with open('./data/cleaned/' + CSV_name + '_' + str(SKIP) + '_skip_' + str(NROWS) + '_rows_' + str(df_len) + '_processed_'+ str(df_articles) + '_articles.pkl', 'wb') as f:
    pickle.dump(df, f)
    

In [None]:
df = drop_short_articles(df)
df_articles_left = len(df)
with open('./data/cleaned/' + CSV_name + '_' + str(SKIP) + '_skip_' + str(NROWS) + '_rows_' + str(df_len) + '_processed_'+ str(df_articles) + '_articles_' + str(df_articles_left) + '_left.pkl', 'wb') as f:
    pickle.dump(df, f)
    

In [None]:
print('finished')

In [None]:
# 15MAY2020

# Parking Lot

In [None]:
### useful commands


# r = requests.get(str_url)
# content = r.content
# soup = BeautifulSoup(content, 'html.parser')

# print(soup.prettify())




In [None]:
num = 0
urls.head()

In [None]:
urls['url'][0]

In [None]:
def get_domains(url):
    """Print ``url`` and its hostname, then return the hostname in a one-element list."""
    print(url)
    print(urlparse(url).hostname)
    return [urlparse(url).hostname]

In [None]:
urls['domains'] = urls['url'].apply(get_domains)


In [None]:
type(urls['url'][0])

In [None]:
url_list = ['https://www.businessinsider.com/australia-successfully-grows-wuhan-coronavirus-sample-from-sick-patient-2020-1',
         'https://www.dailymail.co.uk/news/article-7952287/Virologist-warns-coronavirus-deadlier-Qantas-waits-fly-Australians-Wuhan.html',
         'https://news.abs-cbn.com/overseas/02/01/20/coronavirus-vaccine-will-take-months-biotech-exec',
         'https://www.politico.com/news/2020/01/31/coronavirus-vaccine-missed-chance-109709']

In [None]:
soups = []


# Fetch and parse each sample article. The timeout keeps a single unresponsive
# host from hanging the cell indefinitely.
for i in url_list:
    soup = BeautifulSoup(requests.get(i, timeout=6).content, 'html.parser')
    soups.append(soup)

In [None]:
def show_article(soup):
    """Print the text of every <article> element found in ``soup``."""
    for article_tag in soup.find_all('article'):
        print(article_tag.get_text())

In [None]:
def show_paragraphs(soup):
    """Print the text of every <p> element in ``soup``, each followed by a 20-dash rule."""
    divider = '-'*20
    for paragraph_tag in soup.find_all('p'):
        print(paragraph_tag.get_text())
        print(divider)


In [None]:
for i in range(len(soups)):
    print('='*50)
    print(url_list[i])
    print('='*50)
    show_article(soups[i])


In [None]:
for i in range(len(soups)):
    print('='*50)
    print(url_list[i])
    print('='*50)
    show_paragraphs(soups[i])

In [None]:
def show_content(soup):
    """Print the page's <article> text, falling back to its <p> tags.

    When ``soup`` contains no <article> elements, a notice is printed and
    ``show_paragraphs`` is used instead.
    """
    article_tags = soup.find_all('article')
    if len(article_tags) == 0:
        print('using paragraphs instead')
        show_paragraphs(soup)
        return
    for article_tag in article_tags:
        print(article_tag.get_text())

In [None]:
for i in range(len(soups)):
    print('='*50)
    print(url_list[i])
    print('='*50)
    show_content(soups[i])

In [None]:
soups[0].find('article').get_text()

In [None]:
# Concatenate the text of every <p> tag in the first page. find_all runs once
# instead of being repeated for every index.
temp = ''.join(p_tag.get_text() for p_tag in soups[0].find_all('p'))
print(temp)

In [None]:
# Print the concatenated <p> text of each fetched page, separated by a ruled
# divider. find_all runs once per page instead of once per paragraph index.
for i in range(len(soups)):
    temp = ''.join(p_tag.get_text() for p_tag in soups[i].find_all('p'))
    print(temp)
    print('\n'*3 + '='*60 + '\n'*3)


In [None]:
def get_article_text_from_url(link: str, timeout: float = 6):
    """Download ``link`` and return the concatenated text of its <p> tags.

    Args:
        link: URL of the page to fetch.
        timeout: Seconds passed to ``requests.get`` as its timeout, so an
            unresponsive host cannot hang the call forever.

    Returns:
        The text of every <p> tag joined with no separator.

    Raises:
        Whatever ``requests.get`` raises for an unreachable or invalid URL;
        nothing is caught here.
    """
    # The context manager closes the response once the body has been parsed.
    with requests.get(link, timeout=timeout) as r:
        soup = BeautifulSoup(r.content, 'html.parser')
    return ''.join(tag.get_text() for tag in soup.find_all('p'))

In [None]:
print(get_article_text_from_url('https://www.dailymail.co.uk/news/article-7952287/Virologist-warns-coronavirus-deadlier-Qantas-waits-fly-Australians-Wuhan.html'))

In [None]:
def display_closestwords_tsnescatterplot(model, word):
    """Scatter-plot ``word`` and its nearest neighbours in 2-D t-SNE space.

    NOTE(review): relies on ``np``, ``TSNE`` and ``plt`` being in scope; none
    of them is imported in the visible part of this notebook.

    Args:
        model: Object offering ``similar_by_word(word)`` (an iterable of
            pairs whose first item is a word) and ``model[word]`` vector
            lookup. Presumably a gensim KeyedVectors; confirm.
        word: The query word; it is plotted together with its neighbours.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    # get close words
    close_words = model.similar_by_word(word)
    word_labels = [word] + [wrd_score[0] for wrd_score in close_words]

    # Stack all vectors in one step. The width comes from the model itself, so
    # this also works for embeddings that are not 300-dimensional.
    arr = np.array([model[label] for label in word_labels], dtype='f')

    # find tsne coords for 2 dimensions
    # t-SNE needs perplexity < number of samples, and the default of 30 exceeds
    # the handful of points plotted here, so clamp it.
    tsne = TSNE(n_components=2, random_state=0,
                perplexity=min(30.0, len(word_labels) - 1))
    np.set_printoptions(suppress=True)
    Y = tsne.fit_transform(arr)

    x_coords = Y[:, 0]
    y_coords = Y[:, 1]
    # display scatter plot
    plt.scatter(x_coords, y_coords)

    for label, x, y in zip(word_labels, x_coords, y_coords):
        plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points')
    # Pad both ends outward; the lower bound previously had the margin added,
    # which pushed the smallest point onto or past the axis edge.
    plt.xlim(x_coords.min()-0.00005, x_coords.max()+0.00005)
    plt.ylim(y_coords.min()-0.00005, y_coords.max()+0.00005)
    plt.show()

In [None]:
# glove pretrained
from gensim.scripts.glove2word2vec import glove2word2vec
# Plain ASCII quotes: the typographic quotes pasted here were a SyntaxError.
glove_input_file = 'glove.6B.100d.txt'
word2vec_output_file = 'glove.6B.100d.txt.word2vec'
glove2word2vec(glove_input_file, word2vec_output_file)
# You now have a copy of the GloVe model in word2vec format with the filename glove.6B.100d.txt.word2vec.
from gensim.models import KeyedVectors
# load the Stanford GloVe model
filename = 'glove.6B.100d.txt.word2vec'
model = KeyedVectors.load_word2vec_format(filename, binary=False)



# load keyedvectors
# NOTE: this rebinds `model`, replacing the GloVe vectors loaded above.
model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True, limit=10 ** 5)