In [1]:
from platform import python_version
python_version()

'3.6.7'

In [20]:
import pandas as pd
import multiprocessing
import nltk
import numpy as np
import sklearn
import re

In [3]:
pd.__version__, nltk.__version__, np.__version__, sklearn.__version__

('0.25.1', '3.4.5', '1.16.3', '0.21.1')

In [4]:
from multiprocessing import Pool

In [5]:
from nltk.corpus import brown

## generate a dataset that 3 times as big as the brown corpus by generating random permutations

In [6]:
def make_texts():
    return [" ".join(np.random.permutation(sents)) for sents in brown.sents()]

In [24]:
brown_df = pd.DataFrame({
    'text': make_texts() + make_texts() + make_texts() + make_texts()
})

In [25]:
brown_df.sample(10)

Unnamed: 0,text
136644,"Napoleon's was ideal as Macedon of His . was , Alexander Caesar Julius"
94946,foams Urethane
72356,"possible the , the talents incorporate range to offer exceptional freedom give and making to details design They which contours . of it free designer"
205270,purposes Impersonal
154171,", ; was which ; we , respected killing them trade about we their redcoats the of felt terms in Whatever"
145992,"It is full the summer this still amazing species swing happen an , , August while in in will for that instance some is fact . in"
5360,driver truck ; dairy The ;
104626,? What you `` here ? doing are ''
186249,it What : does
56572,"playing '' was As around every , , I nearly mine all with scene built picture me Mother . actually the `` Cabrini was"


In [26]:
brown_df.shape

(229360, 1)

In [27]:
def replace_weird_double_quotes(input_string):
    return input_string.replace("``",'"')

In [28]:
def to_lowercase(input_string):
    return input_string.lower()

In [29]:
def replace_digits_with_token(input_string):
    return re.sub(r"\b\d+\b","tok_num", input_string)

In [30]:
def get_text_length(input_string):
    return len(re.split(r"(?:\s+)|(?:,)|(?:\-)",input_string))

In [31]:
def process_df(df):
    
    output_df = df.copy()
    
    # replace weird double quotes with normal ones
    output_df['text']      = output_df['text'].apply(replace_weird_double_quotes)

    # text to lower case
    output_df['text']      = output_df['text'].apply(to_lowercase)
    
    # replace number with a special token
    output_df['text']      = output_df['text'].apply(replace_digits_with_token)
    
    # take out texts that are too large or too small
    output_df['num_words'] = output_df['text'].apply(get_text_length)   
        
    indices_to_remove_too_large = output_df[output_df['num_words'] > 50]
    output_df.drop(indices_to_remove_too_large.index, inplace=True)
    
    indices_to_remove_too_small = output_df[output_df['num_words'] < 10]
    output_df.drop(indices_to_remove_too_small.index, inplace=True)    
    
    output_df.reset_index(drop=True, inplace=True)
    
    return output_df

In [32]:
%%time
processed_df = process_df(brown_df)

CPU times: user 2.53 s, sys: 16 ms, total: 2.54 s
Wall time: 2.54 s


In [33]:
processed_df.head()

Unnamed: 0,text,num_words
0,"took election jury an . investigation the place atlanta's any "" produced evidence friday recent that fulton of irregularities no primary said county grand ''",25
1,"in city deserves thanks was manner election conducted the for committee . that the the charge city the '' , which of of term-end further atlanta the the in presentments and executive said jury "" which over-all praise of , had election the",47
2,"hard-fought judge primary september-october charged won '' the jr. irregularities jury term court by ivan by durwood pye had been reports . to in of possible superior which "" the mayor-nominate investigate fulton allen was",38
3,", received "" size of of , in handful widespread considering the only election '' such the relative "" the said a number the was '' and . jury of voters the interest , this reports city",40
4,"and that did jury "" registration outmoded laws find of election georgia's the often said are . ambiguous or inadequate '' and it many",24


In [34]:
processed_df.shape

(174440, 2)

## parallel version

In [43]:
NUM_CORES = 8
df_chunks = np.array_split(brown_df,NUM_CORES)

In [44]:
%%time

with multiprocessing.Pool(NUM_CORES) as pool:
    processed_df = pd.concat(pool.map(process_df, df_chunks), ignore_index=True)

CPU times: user 164 ms, sys: 136 ms, total: 300 ms
Wall time: 907 ms


In [42]:
processed_df.shape

(174440, 2)