In [39]:
import pandas as pd
import re
from multiprocessing import Pool

# Read the raw export and keep only the five columns of interest, in this order.
df = pd.read_csv('SO.csv')[['date', 'parent_id', 'score', 'title', 'reply']]
df.shape

(100000, 5)

In [40]:
# Word separators: newlines and non-breaking-space entities become a single space.
# The optional ';' consumes the entity terminator, so '&nbsp;' does not leave a stray ';'.
needs_space = re.compile('\n|&nbsp;?')
# Markup deleted outright: HTML tags, carriage returns and the &amp; / &gt; / &lt; entities
# (with or without their trailing ';').
no_space = re.compile('<.*?>|\r|&(?:amp|gt|lt);?')

def no_code(text):
    '''Strip HTML tags, line breaks and a few HTML entities from text.

    Newlines and ``&nbsp;`` are replaced by a space; tags, carriage returns
    and ``&amp;`` / ``&gt;`` / ``&lt;`` are removed.  The newline replacement
    runs first so that tags spanning several lines are matched by ``<.*?>``.

    Non-string values (e.g. the float NaN pandas uses for missing cells) are
    returned unchanged instead of raising TypeError.
    '''
    if not isinstance(text, str):
        return text
    text = needs_space.sub(' ', text)
    text = no_space.sub('', text)
    return text

def cleaner(series):
    '''Return a new Series with no_code applied to every element of series.'''
    return series.apply(no_code)

def _clean_in_chunks(series, pool, n_chunks):
    '''Clean one Series by mapping cleaner over up to n_chunks contiguous slices.'''
    if series.empty:
        # pd.concat rejects an empty list, and there is nothing to clean anyway.
        return series.copy()
    # Ceiling division so that at most n_chunks slices are produced.
    size = -(-len(series) // n_chunks)
    chunks = [series.iloc[start:start + size]
              for start in range(0, len(series), size)]
    # pool.map preserves input order, so concatenating restores the original row order and index.
    return pd.concat(pool.map(cleaner, chunks))


def fast_clean(df, columns=('title', 'reply'), processes=8):
    '''Clean text columns of df in parallel and return them as a tuple of Series.

    Each requested column is split into contiguous chunks that are cleaned by
    ``cleaner`` in a shared worker pool, so the work is actually spread over
    the workers (mapping over a single whole column keeps one worker busy and
    the rest idle).

    Parameters
    ----------
    df : DataFrame containing the columns named in ``columns``.
    columns : names of the columns to clean; the result follows this order,
        independent of the column order inside ``df``.
    processes : positive int, number of worker processes and chunks per column.

    Returns
    -------
    tuple of Series, one cleaned Series per entry in ``columns``, each keeping
    the index of the source column.

    Raises
    ------
    KeyError if a requested column is missing from ``df``.
    '''
    # Look the columns up before starting workers so a bad name fails fast.
    selected = [df[name] for name in columns]
    with Pool(processes) as pool:
        return tuple(_clean_in_chunks(series, pool, processes)
                     for series in selected)

In [41]:
%%time
# Clean the 'title' and 'reply' text and store the results in new columns,
# leaving the raw columns in place.
df['c_title'], df['c_reply'] = fast_clean(df)
# Two columns were added, so the column count should go up by two.
print(df.shape)
df.head()

CPU times: user 169 ms, sys: 535 ms, total: 704 ms
Wall time: 3.5 s


(100000, 7)

In [47]:
%%time
# Drop questions and answers longer than 510 characters.
df = df.loc[df['c_title'].str.len() < 511]
df = df.loc[df['c_reply'].str.len() < 511]

# Drop nans and reset index.
df = df.dropna()
df = df.reset_index(drop=True)
df.shape

CPU times: user 102 ms, sys: 0 ns, total: 102 ms
Wall time: 101 ms


(50785, 7)

In [49]:
# Keep only the columns written to the export; the raw 'date', 'title' and 'reply' columns are dropped.
df = df[['score', 'parent_id', 'c_title', 'c_reply']]
# NOTE: to_csv is called with its defaults, so the row index is written as an extra unnamed first column.
df.to_csv('SO_clean_50k.csv')

In [None]:
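# Unused alternative version of no_code, kept for reference only: it also removes
# the contents of <pre> and <code> blocks and deletes newlines instead of replacing them with spaces.
# def no_code(text):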
#   '''Cleans code and html tags.'''
#   text = re.sub('<pre>.*?</pre>', '', text, flags=re.DOTALL)
#   text = re.sub('<code>.*?</code>', '', text, flags=re.DOTALL)
#   text = re.sub('<[^>]+>', '', text, flags=re.DOTALL)
#   return text.replace("\n", "")