In [None]:
# !pip install bs4

In [None]:
import os
import re
from multiprocessing import Pool

import pandas as pd
import psycopg2
from sqlalchemy import create_engine

In [None]:
# Database connection URL. Set the DATABASE_URL environment variable to point
# the notebook at another database, for example
#   postgresql://postgres:postgres@postgres/dev4slack
# When the variable is unset, the previously hard-coded URL is used so the
# notebook keeps running unchanged.
db_string = os.environ.get(
    "DATABASE_URL", "postgresql://postgres:postgres@postgres/postgres"
)
db = create_engine(db_string)

def query_df(line_query, cell_query=None, conn=db):
    """Run a SQL query and return the result of ``pd.read_sql``.

    The two-query signature mirrors an IPython line/cell magic: when
    ``cell_query`` is given it is the query that runs and ``line_query`` is
    ignored; otherwise ``line_query`` runs.

    Args:
        line_query: SQL to execute when no ``cell_query`` is supplied.
        cell_query: Optional SQL that takes precedence over ``line_query``.
        conn: Connection or engine handed to ``pd.read_sql``; defaults to the
            module-level ``db`` engine.

    Returns:
        Whatever ``pd.read_sql`` returns for the chosen query.
    """
    # Identity check: ``== None`` dispatches to the argument's ``__eq__``,
    # which objects may overload; ``is None`` cannot be fooled.
    query = line_query if cell_query is None else cell_query
    return pd.read_sql(query, conn)

# Custom notebook magic commands for loading sql.
from IPython.core.magic import register_line_cell_magic
def create_df_sql_magic(magic_name, conn):
    """Register an IPython line/cell magic that runs SQL via ``pd.read_sql``.

    Args:
        magic_name: Name given to the magic; the inner function's
            ``__name__`` is set to it before registration.
        conn: Connection or engine the magic passes to ``pd.read_sql``.
    """
    def sql_df(line_query, cell_query=None):
        # ``conn`` comes from the enclosing call. The inner function used to
        # declare its own ``conn=db`` default, which shadowed that argument,
        # so the magic always queried the global ``db``.
        query = line_query if cell_query is None else cell_query
        return pd.read_sql(query, conn)

    # The magic is registered under the function's __name__.
    sql_df.__name__ = magic_name
    register_line_cell_magic(sql_df)


create_df_sql_magic('sql_df', db)

In [None]:
# query = \
# '''
# SELECT 
#     message.text AS p, message.reply_count, message.user_id as p_id, message.ts,
#     reply.text AS c, reply.user_id as c_id
# FROM message
# LEFT JOIN reply on reply.thread_ts=message.ts
# WHERE message.channel_id='CFBBHV7AT' AND message.reply_count > 0
# ORDER BY message.ts, reply.ts;
# '''
# df = query_df(query)
# df.shape

In [None]:
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0; its
# replacement is `on_bad_lines`. 'warn' matches what `error_bad_lines=False`
# did with the default `warn_bad_lines=True`: malformed rows are skipped and
# each one is reported.
try:
    df = pd.read_csv('all_channels.csv', on_bad_lines='warn')
except TypeError:
    # pandas < 1.3 does not accept `on_bad_lines`.
    df = pd.read_csv('all_channels.csv', error_bad_lines=False)
df.shape

In [None]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis=0, how='any')
df.shape

In [None]:
# Optional step (disabled): drop rows where a reply refers to an unrelated parent.
# df = df.groupby('ts', as_index=False).apply(lambda x: x.iloc[:-1])
# df.shape

# Label the two columns 'p' (parent) and 'c' (child).
df.columns = ['p', 'c']
# df = df[['p', 'c']]

# Time-ordered pairing: 'r' holds the 'c' value of the following row, so the
# last row ends up with a missing value in 'r'.
df = df.assign(r=df['c'].shift(-1))

# Alternative (disabled): make every child refer to its original parent instead.
# df.columns = ['p', 'r']
df.shape

In [None]:
# Discard the original 'p' column; what remains is (message, following message),
# relabelled as parent 'p' and reply 'r'.
df = df[['c', 'r']].rename(columns={'c': 'p'})

In [None]:
# Total number of missing cells across the whole frame.
df.isna().values.sum()

In [None]:
# Remove rows left without a value after the pairing step.
df = df.dropna(axis=0, how='any')

In [None]:
# Row count before cleaning, kept to report how many rows cleaning removes.
started = df.shape[0]

In [None]:
def no_whitespace(text):
    """Return ``text`` with tabs and newlines turned into spaces and double quotes removed."""
    # Single pass: tab/newline -> space, '"' -> deleted.
    table = str.maketrans({"\t": " ", "\n": " ", '"': None})
    return text.translate(table)

def no_url(text):
    """Replace every whitespace-separated token containing 'http' with '<URL>'.

    Tokens are re-joined with single spaces, so runs of whitespace collapse
    and leading/trailing whitespace disappears.
    """
    return ' '.join(
        '<URL>' if 'http' in word else word
        for word in text.split()
    )

def no_short_reply(text, min_length=10):
    """Return ``text`` unchanged, or ``None`` when it is too short to keep.

    Args:
        text: The message text to check.
        min_length: Minimum number of characters required to keep ``text``.
            Defaults to 10, the threshold that used to be hard-coded.

    Returns:
        ``text`` if ``len(text) >= min_length``, otherwise ``None``.
    """
    return text if len(text) >= min_length else None

def cleaner(series):
    """Run the text-cleaning steps over every element of ``series``.

    The steps are applied in order: ``no_whitespace``, ``no_url``, then
    ``no_short_reply``. The resulting Series is returned.
    """
    for step in (no_whitespace, no_url, no_short_reply):
        series = series.apply(step)
    return series

def fast_clean(df, processes=16):
    """Clean every column of ``df`` in parallel and drop unusable rows.

    Requires ``df`` to have columns 'p' and 'r' for parent and reply.

    Each column is passed to ``cleaner`` in a worker process. Rows with a
    missing value in any cleaned column, and rows where 'p' equals 'r', are
    then removed.

    NOTE(review): the workers must be able to locate ``cleaner``. This is
    expected to work with the 'fork' start method; under 'spawn', functions
    defined in a notebook cell may not be importable by the children.

    Args:
        df: DataFrame with columns 'p' and 'r'.
        processes: Upper bound on the number of worker processes. Defaults to
            16, the value that used to be hard-coded.

    Returns:
        DataFrame of cleaned columns with the rejected rows removed.
    """
    columns = [df[col] for col in list(df)]
    # There is exactly one task per column, so any additional worker would be
    # started only to sit idle.
    workers = max(1, min(processes, len(columns)))
    with Pool(workers) as pool:
        cleaned = pool.map(cleaner, columns)
    # map() has already returned every result, so the pool can be released
    # before the frame is assembled.
    clean = pd.concat(cleaned, axis=1).dropna()
    return clean[clean.p != clean.r]

In [None]:
# Clean both columns and drop the rows rejected by fast_clean's filters.
df = fast_clean(df)
df.shape

In [None]:
# Report how many rows the cleaning step removed.
now = df.shape[0]
print('dropped rows: ' + str(started - now))

In [None]:
# Column names for the parent/reply pair; only referenced by the commented-out
# read_csv of train.tsv in the following cell.
headers = ['p', 'r']

In [None]:
# df = pd.read_csv('train.tsv', sep='\t', names=headers)

In [None]:
# df.shape

In [None]:
# !pip install scikit-learn

In [None]:
from sklearn.model_selection import train_test_split

# Shuffled split with scikit-learn's default proportion spelled out
# (25% of the rows go to the test set); the fixed seed keeps it reproducible.
train, test = train_test_split(df, test_size=0.25, random_state=42)

In [None]:
# Write both splits as tab-separated files with no index column and no header row.
for path, split in (('train.tsv', train), ('test.tsv', test)):
    split.to_csv(path, sep='\t', index=False, header=False)

In [None]:
# Sanity check: (rows, columns) of the train and test splits.
train.shape, test.shape

In [None]:
# df = '1.0 ' + df
# df.to_csv('train.tsv', sep ='\t', index=False, header=False)