In [1]:
import pandas as pd
import dask.dataframe as dd
import numpy as np
import re
import nltk
import string
from dask.multiprocessing import get
from nltk import word_tokenize
from nltk.corpus import stopwords
import datefinder
# Shared NLP resources, built once for the whole notebook.
# NOTE: this rebinds `stopwords`, shadowing the corpus reader imported from
# nltk.corpus above; from here on `stopwords` is the English stop-word list.
stopwords = nltk.corpus.stopwords.words('english')
wnl = nltk.stem.WordNetLemmatizer()
stm = nltk.stem.PorterStemmer()

In [2]:
def clean_dates(text):
    """Remove every date-like fragment that datefinder detects in ``text``.

    ``datefinder.find_dates`` is called with ``source=True``; the second
    element of each result is used as the matched source text.

    Args:
        text: The string to scrub.

    Returns:
        ``text`` with every occurrence of each matched fragment deleted.
    """
    # Collect all matches first so detection always runs on the unmodified text.
    fragments = [found[1] for found in datefinder.find_dates(text, source=True)]
    cleaned = text
    for fragment in fragments:
        # str.replace drops *every* occurrence of the fragment, not only the
        # position where datefinder found it.
        cleaned = cleaned.replace(fragment, '')
    return cleaned

# Dotted-quad style number groups (e.g. IP addresses).
_IP_RE = re.compile(r'[0-9]+(?:\.[0-9]+){3}')
# scheme://host[/path...] style URLs.
_URL_RE = re.compile(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*')
# ASCII punctuation is passed through re.escape so that characters such as
# "\", "]", "^" and "-" are literal members of the class; unescaped, the
# sequence "\]" is read as an escaped "]" and backslashes are never removed.
_PUNCT_RE = re.compile('[' + re.escape(string.punctuation) + '“”¨«»®´·º½¾¿¡§£₤‘’]')
_WS_RE = re.compile(r'\s+')

# Applied in order: the specific forms ("can't", "havn't") must come before
# the generic "n't" rule, otherwise they would turn into "ca not" / "hav not".
_CONTRACTIONS = (
    ("can't", "can not"),
    ("havn't", "have not"),
    ("n't", " not"),
    ("i'm", "i am"),
    ("it's", "it is"),
    ("there's", "there is"),
    ("'ve", " have"),
    ("e-mail", "email"),
    ("you'll", "you will"),
)


def clean_text(text):
    """Normalise a raw comment into lower-case, punctuation-free tokens.

    Steps, in order: lower-case, drop IP-like number groups, drop date
    fragments (via ``clean_dates``), drop URLs, expand a fixed set of
    contractions, delete punctuation, tokenise with ``nltk.word_tokenize``
    and re-join the tokens with single spaces.

    Args:
        text: The comment as a ``str``.

    Returns:
        The cleaned comment; an empty string if nothing survives.
    """
    text = text.lower()
    # Remove each match exactly where it occurs; a global str.replace per
    # match would also chew pieces out of longer IPs that contain it.
    text = _IP_RE.sub('', text)
    text = clean_dates(text)
    text = _URL_RE.sub('', text)
    for pattern, replacement in _CONTRACTIONS:
        text = text.replace(pattern, replacement)
    text = _PUNCT_RE.sub('', text)
    tokens = nltk.word_tokenize(text)
    text = ' '.join(token.strip() for token in tokens)
    text = _WS_RE.sub(' ', text)
    # The text was lower-cased on entry and nothing above adds upper case.
    return text.strip()

In [3]:
# Load only the raw comment column of the competition train and test sets.
train_data = pd.read_csv('../data/download/train.csv', usecols=['comment_text'])
test_data = pd.read_csv('../data/download/test.csv', usecols=['comment_text'])
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
internal = pd.concat([train_data, test_data])
# Missing comments become the literal string 'nan' so clean_text always gets a str.
internal['comment_text'] = internal['comment_text'].fillna('nan')
# Clean the comments in parallel: 10 partitions, processed by worker processes.
internal = dd.from_pandas(internal, npartitions=10)
internal = internal.map_partitions(lambda df: df['comment_text'].map(clean_text))
# `scheduler='processes'` is the supported spelling of the old `get=` keyword
# (dask.multiprocessing.get), which current dask no longer accepts.
internal = internal.compute(scheduler='processes')
internal = internal.to_frame(name='comment_text')
del train_data, test_data
print('internal data:', internal.shape)

internal data: (312735, 1)


In [4]:
def _read_annotated_comments(path):
    """Read one tab-separated annotated-comments file.

    The literal marker ``NEWLINE_TOKEN`` in the ``comment`` column is
    replaced by a single space.
    """
    frame = pd.read_csv(path, sep='\t')
    frame['comment'] = frame['comment'].map(lambda x: x.replace('NEWLINE_TOKEN', ' '))
    return frame


external_1 = _read_annotated_comments('../data/download/attack_annotated_comments.tsv')
external_2 = _read_annotated_comments('../data/download/aggression_annotated_comments.tsv')
external_3 = _read_annotated_comments('../data/download/toxicity_annotated_comments.tsv')

In [5]:
# Stack the comment column of the three external datasets into one frame.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0, and
# already returns a new frame, so no explicit copy is needed.
external = pd.concat([
    external_1[['comment']],
    external_2[['comment']],
    external_3[['comment']],
])
# Use the same column name as the internal data.
external = external.rename(columns={'comment': 'comment_text'})
del external_1, external_2, external_3
# Clean the comments in parallel: 10 partitions, processed by worker processes.
external = dd.from_pandas(external, npartitions=10)
external = external.map_partitions(lambda df: df['comment_text'].map(clean_text))
# `scheduler='processes'` is the supported spelling of the old `get=` keyword
# (dask.multiprocessing.get), which current dask no longer accepts.
external = external.compute(scheduler='processes')
external = external.to_frame(name='comment_text')
print('external data:', external.shape)

external data: (391414, 1)


In [6]:
# Combine both cleaned corpora; pd.concat replaces DataFrame.append, which
# was removed in pandas 2.0.
data = pd.concat([internal, external])
data['comment_text'] = data['comment_text'].fillna('nan')
# Shuffle all rows with a fixed seed so the output file is reproducible.
data = data.sample(frac=1.0, random_state=2017)
# One comment per line, no index and no header row: this is the fastText training input.
data[['comment_text']].to_csv('../data/data/fasttext/data.txt', index=False, header=False)

In [7]:
# Train fastText skip-gram word vectors on data.txt. `-output` is the path
# prefix for the files fastText writes, and `-minCount 7` sets fastText's
# minimal word-occurrence threshold.
!fasttext skipgram -input ../data/data/fasttext/data.txt -output ../data/data/fasttext/vector -minCount 7

Read 46M words
Number of words:  93650
Number of labels: 0
Progress: 100.0% words/sec/thread:  103683 lr:  0.000000 loss:  1.320773 ETA:   0h 0m
