In [None]:
!pip install bs4

In [None]:
import html
import os
import re

import pandas as pd
import psycopg2
import requests
from bs4 import BeautifulSoup
from sqlalchemy import create_engine

In [None]:
# Connection URL for the Postgres database queried below.
# Set the DATABASE_URL environment variable to point somewhere else (for
# example at the "dev4slack" database) instead of editing credentials into
# the notebook; the fallback is the previous hard-coded value.
db_string = os.environ.get(
    "DATABASE_URL",
    "postgresql://postgres:postgres@postgres/postgres",
)
# db_string = "postgresql://postgres:postgres@postgres/dev4slack"
db = create_engine(db_string)

def query_df(line_query, cell_query=None, conn=db):
    '''Run a SQL query and return the result as a DataFrame.

    Args:
        line_query: SQL to run when ``cell_query`` is not given.
        cell_query: Optional SQL that, when provided, is run instead of
            ``line_query``.
        conn: Connection or engine handed to ``pd.read_sql``; defaults to
            the module-level ``db``.

    Returns:
        Whatever ``pd.read_sql`` returns for the chosen query.
    '''
    # Identity check: `== None` would defer to the argument's own __eq__.
    query = line_query if cell_query is None else cell_query
    return pd.read_sql(query, conn)

# Custom notebook magic commands for loading sql.
from IPython.core.magic import register_line_cell_magic
def create_df_sql_magic(magic_name, conn):
    '''Register a line/cell magic that runs SQL and returns a DataFrame.

    Args:
        magic_name: Name the magic is registered under.
        conn: Connection or engine the magic queries through ``pd.read_sql``.
    '''
    # Bind the factory's `conn` as the default so the magic queries the
    # connection it was created for, not whatever the global `db` happens to be.
    def sql_df(line_query, cell_query=None, conn=conn):
        # As a cell magic the cell body carries the query; as a line magic
        # only `line_query` is passed.
        query = line_query if cell_query is None else cell_query
        return pd.read_sql(query, conn)

    # The magic is registered under the function's __name__, so rename first.
    sql_df.__name__ = magic_name
    register_line_cell_magic(sql_df)
create_df_sql_magic('sql_df', db)

In [None]:
# Messages in channel CFBBHV7AT with reply_count > 0, LEFT JOINed to their
# replies on thread_ts (a message without reply rows still yields one row),
# ordered by message.ts and then reply.ts.
ds_gen_parents_n_replies_query = '''
SELECT 
    message.text AS p_text, message.reply_count, message.user_id as p_id, message.ts,
    reply.text AS c_text, reply.user_id as c_id
FROM message
LEFT JOIN reply on reply.thread_ts=message.ts
WHERE message.channel_id='CFBBHV7AT' AND message.reply_count > 0
ORDER BY message.ts, reply.ts;
'''

# `df` is a second name for the very same DataFrame object, not a copy.
df = ds_gen_parents_n_replies_df = query_df(ds_gen_parents_n_replies_query)

In [None]:
# Pair each reply with the one after it: `replies` holds the next row's c_text.
df['replies'] = df['c_text'].shift(periods=-1)

# Within every group of rows sharing a parent `ts`, the final row got paired
# with text from the following group, so cut the last row off each group.
df = df.groupby('ts', as_index=False).apply(lambda thread_rows: thread_rows.iloc[:-1])
df.shape

In [None]:
def simple_clean(col):
    '''Scrub a text Series in place.

    Tabs and newlines become single spaces and double quotes are removed.
    URL substitution is not performed here. Returns None; ``col`` itself
    is modified.
    '''
    substitutions = {'\t': ' ', '\n': ' ', '"': ''}
    col.replace(substitutions, inplace=True, regex=True)

cols2clean = ['c_text', 'replies']
for col in cols2clean:
    # Clean a detached copy and assign it back: calling an in-place method on
    # `df[col]` is chained assignment, which pandas does not guarantee will
    # reach `df` (and never does under Copy-on-Write).
    cleaned = df[col].copy()
    simple_clean(cleaned)
    df[col] = cleaned

# Drop rows with duplicate column items.
df = df[df.c_text != df.replies]
df.shape

In [None]:
# Get text with "<@user>" formatted as replies to that user.
# na=False: rows whose reply text is missing count as "no mention" instead of
# putting NaN into the boolean mask, which pandas refuses to index with.
ats = df[df.replies.str.contains('<@', na=False)]

In [None]:
ats.shape

In [None]:
def describe_urls(text, timeout=10):
    '''Swap the first Slack-style link in ``text`` for its page's meta description.

    A link is ``<url>`` or ``<url|label>`` with an http(s) URL. The page is
    fetched and the content of its ``<meta name="description">`` tag is read.
    Every occurrence of that link in ``text`` is then rewritten to either

    * ``here is a link: URL it is about, and I quote, "DESCRIPTION"`` when a
      description was found, or
    * ``<URL>`` when the page has none or the fetch/parse failed.

    Only the first link found is processed; URLs that are not wrapped in
    angle brackets are left alone and trigger no request.

    Args:
        text: Message text. Non-string values (e.g. NaN coming from a
            DataFrame column) are returned unchanged.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The rewritten text, or ``text`` itself when it holds no such link.
    '''
    if not isinstance(text, str):
        return text

    # The URL part stops at '|', '<', '>' or whitespace, so neither the closing
    # bracket nor a "|label" suffix leaks into the address that gets fetched.
    link_found = re.search(r'<(https?://[^<>|\s]+)(?:\|[^<>]*)?>', text)
    if link_found is None:
        return text

    url = link_found.group(1)
    # re.escape: URLs are full of regex metacharacters ('?', '.', '+').
    link_pattern = re.compile(rf'<{re.escape(url)}(?:\|[^<>]*)?>')

    try:
        # Message text carries HTML entities (e.g. "&amp;" for "&"); decode
        # them so the server receives the real address.
        response = requests.get(html.unescape(url), timeout=timeout)
        soup = BeautifulSoup(response.text, 'html.parser')
        meta_description = [
            meta.attrs['content'] for meta in soup.find_all('meta')
            if meta.attrs.get('name') == 'description' and meta.attrs.get('content')
        ]
    except Exception:
        # Best effort: any network or parsing problem means "no description".
        # (KeyboardInterrupt/SystemExit are deliberately not swallowed.)
        meta_description = []

    if not meta_description:
        return link_pattern.sub('<URL>', text)

    described = f'here is a link: {url} it is about, and I quote, "{meta_description[0]}"'
    # Function replacement: the scraped text is inserted verbatim, so any
    # backslashes in it are not interpreted as regex template escapes.
    return link_pattern.sub(lambda _match: described, text)

In [None]:
# Sample links for trying out describe_urls by hand:
# url1 uses the "<url|label>" form; url2 carries an HTML-escaped "&" ("&amp;") in its query string.
url1 = '<https://twitter.com/nickpgeorge|https://twitter.com/nickpgeorge>'
url2 = "<https://www.youtube.com/watch?v=fyW_QTAPkLQ&amp;feature=youtu.be>"

In [None]:
describe_urls(url1)

In [None]:
# Malformed scheme ("htp://"): the http(s) URL pattern in describe_urls finds
# no match here, so the text should come back unchanged.
test = 'blah blah blah... <htp://lear;lsdkfnjsdata.com>'
describe_urls(test)

In [None]:
# df['replies'] = df['replies'].apply(describe_urls)

In [None]:
# Work on a fixed random sample of up to 100 rows; random_state pins the draw
# so re-running the cell (or the whole notebook) yields the same rows.
sample = df.sample(n=min(100, len(df)), random_state=42)

# Raw string: the pattern contains regex escapes such as \( that are not valid
# Python string escapes.
pat = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
# Number of texts in each column that contain at least one URL.
r_urls = sample.replies.str.contains(pat).sum()
c_urls = sample.c_text.str.contains(pat).sum()
print(f'total urls: {r_urls + c_urls}')

In [None]:
# Keep the rows of `sample` where either text column contains a URL;
# missing text counts as "no URL".
has_url = sample.replies.str.contains(pat, na=False) | sample.c_text.str.contains(pat, na=False)
sample_urls = sample[has_url]

In [None]:
sample_urls.replies.iloc[2]

In [None]:
# Build a new frame rather than assigning columns into `sample_urls`, which is
# a filtered selection of `sample` (column assignment there triggers pandas'
# SettingWithCopyWarning).
sample_urls = sample_urls.assign(
    replies=sample_urls['replies'].apply(describe_urls),
    c_text=sample_urls['c_text'].apply(describe_urls),
)

In [None]:
sample_urls

In [None]:
# Count on `sample_urls`: that is the frame describe_urls was applied to.
# `sample` still holds the untouched text, so it never contains "<URL>".
# A successful rewrite is recognised by the phrase describe_urls inserts in
# front of the quoted description; a failed one by the "<URL>" placeholder.
described_marker = ' it is about, and I quote, "'
r_nope = sample_urls.replies.str.contains('<URL>', regex=False, na=False).sum()
c_nope = sample_urls.c_text.str.contains('<URL>', regex=False, na=False).sum()
r_found = sample_urls.replies.str.contains(described_marker, regex=False, na=False).sum()
c_found = sample_urls.c_text.str.contains(described_marker, regex=False, na=False).sum()
print(
    f'''
    total descriptions found: {r_found + c_found}
    total not found: {r_nope + c_nope}
    ''')

In [None]:
# Plain Python list of the reply texts in sample_urls.
test_list = sample_urls.replies.tolist()

In [None]:
test_list

In [None]:
from multiprocessing import Pool
import tqdm

# Run describe_urls over the texts with a pool of 32 workers. imap_unordered
# yields results as workers finish (not in input order); tqdm wraps the
# iterator to show progress, with `total` given because the iterator has no length.
with Pool(32) as p:
    data_list = list(tqdm.tqdm(p.imap_unordered(describe_urls, test_list), total=len(test_list)))