In [None]:
import pandas as pd
import re
from multiprocessing import Pool

# Load the raw export and keep only the columns used below, in this order.
wanted_columns = ['date', 'parent_id', 'score', 'title', 'reply']
df = pd.read_csv('SO.csv').loc[:, wanted_columns]
df.shape

In [57]:
# Fragments replaced by a single space so the words around them stay apart.
# The trailing ';' is optional so both '&nbsp;' and a bare '&nbsp' are consumed
# whole instead of leaving a stray ';' behind.
needs_space = re.compile(r'\n|&nbsp;?')
# Fragments deleted outright: tags, carriage returns and the &amp; / &gt; / &lt;
# entities (again with or without their terminating ';').
no_space = re.compile(r'<.*?>|\r|&amp;?|&gt;?|&lt;?')

def no_code1(text):
    '''Strip HTML tags, newlines and a few HTML entities from text.

    Newlines and non-breaking-space entities become a single space; tags,
    carriage returns and the amp/gt/lt entities are removed entirely.
    Newlines are replaced first, so a tag that spans several lines is still
    matched by the single-line tag pattern.

    Args:
        text: The raw HTML string to clean.

    Returns:
        The cleaned string.
    '''
    text = needs_space.sub(' ', text)
    return no_space.sub('', text)

# <pre> / <code> elements together with their contents. The opening tag may
# carry attributes (e.g. <pre class="lang-py">); DOTALL lets the element body
# span several lines.
_PRE_BLOCK = re.compile(r'<pre\b[^>]*>.*?</pre>', flags=re.DOTALL)
_CODE_BLOCK = re.compile(r'<code\b[^>]*>.*?</code>', flags=re.DOTALL)
# Any remaining tag; the negated class already matches newlines inside a tag.
_ANY_TAG = re.compile(r'<[^>]+>')

def no_code2(text):
    '''Remove code blocks, HTML tags and newlines from text.

    <pre>...</pre> and <code>...</code> elements are dropped together with
    their contents, whether or not the opening tag has attributes. Every
    other tag is then stripped while its inner text is kept, and finally all
    newline characters are deleted.

    Args:
        text: The raw HTML string to clean.

    Returns:
        The cleaned string.
    '''
    text = _PRE_BLOCK.sub('', text)
    text = _CODE_BLOCK.sub('', text)
    text = _ANY_TAG.sub('', text)
    # NOTE(review): newlines are deleted rather than replaced by a space, so
    # words on adjacent lines end up joined -- confirm this is intended.
    return text.replace("\n", "")

def no_short_reply(text, min_length=30):
    '''Return text unchanged, or None when it is shorter than min_length.

    Args:
        text: The string to check.
        min_length: Minimum number of characters required to keep the text.
            Defaults to 30.

    Returns:
        The original text if len(text) >= min_length, otherwise None.
    '''
    return text if len(text) >= min_length else None

def cleaner(series):
    '''Apply the cleaning steps to every element of a pandas Series.

    Each element is passed through no_code2 and the result is then passed
    through no_short_reply; the resulting Series is returned.
    '''
    return series.apply(no_code2).apply(no_short_reply)

def fast_clean(df, n_workers=8):
    '''Clean the 'title' and 'reply' columns of df using a process pool.

    Each column is cut into contiguous row chunks so that every worker
    receives a share of the rows. The chunks are cleaned with cleaner in a
    single pool and then stitched back together in their original order, so
    the returned Series keep the index of df.

    Args:
        df: DataFrame containing 'title' and 'reply' columns.
        n_workers: Number of worker processes (must be >= 1). Defaults to 8.

    Returns:
        A (cleaned_title, cleaned_reply) tuple of Series, always in that
        order regardless of the column order of df.

    Raises:
        KeyError: If df lacks a 'title' or 'reply' column.
    '''
    columns = ('title', 'reply')
    n_rows = len(df)
    # Ceiling division: at most n_workers chunks per column, each non-empty.
    chunk_size = max(1, -(-n_rows // n_workers))
    # An empty frame still yields one (empty) chunk so concat has input.
    starts = list(range(0, n_rows, chunk_size)) or [0]

    tasks = [
        df[name].iloc[start:start + chunk_size]
        for name in columns
        for start in starts
    ]

    # One pool for both columns; map returns results in task order.
    with Pool(n_workers) as pool:
        results = pool.map(cleaner, tasks)

    per_column = len(starts)
    cleaned_title = pd.concat(results[:per_column])
    cleaned_reply = pd.concat(results[per_column:])
    return cleaned_title, cleaned_reply

In [58]:
%%time
# Clean both text columns and attach the results as new columns.
cleaned_title, cleaned_reply = fast_clean(df)
df['c_title'] = cleaned_title
df['c_reply'] = cleaned_reply
print(df.shape)
df.head()

(100000, 7)
CPU times: user 160 ms, sys: 408 ms, total: 568 ms
Wall time: 2.38 s


Unnamed: 0,date,parent_id,score,title,reply,c_title,c_reply
0,2017-02-10T11:47:29.810,12692067,8,(and other unicode characters) in identifiers...,"<p>This is a known bug in GCC: <a href=""https:...",(and other unicode characters) in identifiers...,This is a known bug in GCC: Bug 67224 - UTF-8 ...
1,2017-01-12T05:34:57.210,2996139,2,500 Internal Server Error in ASP.NET MVC,<p>I got more details of the error from window...,500 Internal Server Error in ASP.NET MVC,I got more details of the error from windows e...
2,2017-07-20T14:30:21.050,10492178,10,Cannot find yasm even though I have installed...,<p>Just in case of someone got here from Googl...,Cannot find yasm even though I have installed...,"Just in case of someone got here from Google, ..."
3,2016-04-03T10:37:21.017,12430337,8,Embed Youtube videos :- with contains content...,<p>You can use <code>player vars</code> while ...,Embed Youtube videos :- with contains content...,You can use while initializing youtube sdk:En...
4,2016-09-22T22:59:37.590,12430337,2,Embed Youtube videos :- with contains content...,"<p>If you're embedding on a mobile app, you ne...",Embed Youtube videos :- with contains content...,"If you're embedding on a mobile app, you need ..."


In [59]:
%%time
# Keep only rows whose cleaned title and cleaned reply are each at most
# 510 characters long. A missing value has a NaN length, which fails the
# comparison, so those rows are removed here as well.
title_fits = df['c_title'].str.len() <= 510
reply_fits = df['c_reply'].str.len() <= 510

# Drop rows with any remaining missing value and renumber the index from 0.
df = df.loc[title_fits & reply_fits].dropna().reset_index(drop=True)
df.shape

CPU times: user 167 ms, sys: 7.15 ms, total: 174 ms
Wall time: 173 ms


(65823, 7)

In [60]:
# Keep only the columns needed downstream and write them out tab-separated
# (the index is written too, as the first column).
export_columns = ['score', 'parent_id', 'c_title', 'c_reply']
df = df.loc[:, export_columns]
df.to_csv('SO_clean_65k.csv', sep='\t')