In [54]:
import pandas as pd
import numpy as np
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from bs4 import BeautifulSoup
import string
import re
from multiprocessing import Pool
import multiprocessing as mp
from dask import delayed, compute
import dask.dataframe as dd
from dask.distributed import Client
from dask.diagnostics import ProgressBar
import time

In [3]:
num_partitions = 10 #number of partitions to split dataframe
num_cores = mp.cpu_count() #number of cores on your machine
Client(n_workers=4, threads_per_worker=2, memory_limit='4GB')

0,1
Client  Scheduler: tcp://127.0.0.1:53151  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 8  Memory: 16.00 GB


In [4]:
data = pd.read_csv('./data/train.csv')

In [5]:
data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [6]:
nlp = spacy.load('en_core_web_sm')
# Remove HTML Tags
def remove_html(text):
    soup = BeautifulSoup(text,'lxml')
    html_free_text = soup.get_text()
    return html_free_text

# Remove Punctuations
def punctuation_remover(text):
    punctuation_free_text = " ".join([word for word in text if word \
                                    not in string.punctuation])
    return punctuation_free_text


# Stop Word Removal
cached_stop_words = stopwords.words('english') # Provides 70 X Speedup
def stop_words_remover(text):
#     text = text.lower().split()
    words = [word for word in text if \
             word not in cached_stop_words]
    return words

# Convert to lower case
def convert_to_lowercase(tokens):
    low = []
    for tok in tokens:
        low.append(tok.lower().strip())
    return low

# Lemmatization
def lemmatize_words(text):
    words = nlp(str(text))
    return [word.lemma_ for word in words if word.lemma_ != '-PRON-']  

def replace_urls(tokens):
    re.sub(r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''', " ", tokens)
    return tokens

In [7]:
# @delayed
def clean_text_for_tasks(text, for_pos_tagging = False):
    cleaned_text = remove_html(text)
    cleaned_text = replace_urls(cleaned_text)
    cleaned_text = word_tokenize(cleaned_text)
    cleaned_text = stop_words_remover(cleaned_text)
    cleaned_text = lemmatize_words(cleaned_text)
    cleaned_text = punctuation_remover(cleaned_text)
    
    return cleaned_text

In [8]:
# def clean_text(text, remove_stopwords = True):
#     output = ""
#     text = str(text).replace("\n", "")
#     text = re.sub(r'[^\w\s]','',text).lower()
#     if remove_stopwords:
#         text = text.split(" ")
#         for word in text:
#             if word not in stopwords.words("english"):
#                 output = output + " " + word
#     else:
#         output = text
#     return str(output.strip())[1:-3].replace("  ", " ")

In [9]:
# def cleaner(data):
#     data['cleaned_comments'] = data['comment_text'].apply(lambda x : clean_text_for_tasks(x))
#     return data

# def parallelize_dataframe(df, func):
#     df_split = np.array_split(df, num_partitions)
#     pool = Pool(num_cores)
#     df = pd.concat(pool.map(func, df_split))
#     pool.close()
#     pool.join()
#     return df

In [10]:
# def delayed_cleaner(data):
#     cleaned_data = []
#     for index,row in data.iterrows():
#         cleaned_text = clean_text_for_tasks(row.comment_text)
#         cleaned_data.append(cleaned_text)
#     return cleaned_data

# def parallelize_list(df, func):
#     df_split = np.array_split(df, num_partitions)
#     pool = Pool(num_cores)
#     complete_list = complete_list.append(pool.map(func, df_split))
#     pool.close()
#     pool.join()
#     return df
    


In [11]:
# cleaned_data = delayed_cleaner(data)

# cleaned_data = compute(*cleaned_data)   

# cleaned_data_df = pd.DataFrame(cleaned_data)

In [43]:
temp_data = data[:10000]

In [44]:
temp_data['cleaned_text'] = -1

In [45]:
temp_data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,cleaned_text
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,-1
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,-1
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,-1
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,-1
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,-1


In [46]:
dask_df = dd.from_pandas(temp_data,npartitions=8)

In [47]:
dask_df

Unnamed: 0_level_0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,cleaned_text
npartitions=8,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,object,object,int64,int64,int64,int64,int64,int64,int64
1250,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...
8750,...,...,...,...,...,...,...,...,...
9999,...,...,...,...,...,...,...,...,...


In [48]:
missing_values = dask_df.isnull().sum()
with ProgressBar():
    missing_percent = ((missing_values / dask_df.index.size)*100).compute()
missing_percent   

id               0.0
comment_text     0.0
toxic            0.0
severe_toxic     0.0
obscene          0.0
threat           0.0
insult           0.0
identity_hate    0.0
cleaned_text     0.0
dtype: float64

In [49]:
with ProgressBar():
    no_of_toxic_comments = dask_df['toxic'].value_counts().compute()
no_of_toxic_comments  

0    9029
1     971
Name: toxic, dtype: int64

In [50]:
def clean_df(df):
    df['cleaned_text'] = df.comment_text.map(clean_text_for_tasks)
    return df

In [55]:
start = time.time()
result = dask_df.map_partitions(clean_df,meta=temp_data)

In [56]:
df = result.compute()
print(f'Time Taken for Processing {df.shape[0]} rows with Dask(4 Workers ,8 Cores) : {time.time()-start} ')

Time Taken for Processing 10000 rows with Dask(4 Workers ,8 Cores) : 345.70559334754944 


In [57]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,cleaned_text
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,explanation why edit make username Hardcore Me...
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,D'aww match background colour m seemingly stuc...
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,hey man m really try edit war 's guy constantl...
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,More ca n't make real suggestion improvement w...
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,sir hero any chance remember page 's


In [58]:
df_without_parallel = temp_data.copy()
start = time.time()
df_without_parallel = df_without_parallel.comment_text.apply(clean_text_for_tasks)

In [59]:
print(f'Time Taken for Processing {df_without_parallel.shape[0]} rows without parallelization : {time.time()-start} ')

Time Taken for Processing 10000 rows without parallelization : 2846.9455647468567 


In [60]:
df.to_csv('./data/toxic_comments_cleaned.csv',index=False)