In [1]:
import os
import re
import string

import dask.dataframe as dd
import datefinder
import nltk
import numpy as np
import pandas as pd
from dask.multiprocessing import get
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import KFold
# English stop-word list. NOTE: this rebinds `stopwords`, which the import
# block bound to the nltk.corpus reader, to a plain list of words.
stopwords = nltk.corpus.stopwords.words('english')
# Shared lemmatizer / stemmer instances, built once at import time.
wnl = nltk.stem.WordNetLemmatizer()
stm = nltk.stem.PorterStemmer()

In [2]:
def clean_dates(text):
    """Remove every substring of ``text`` that datefinder recognises as a date.

    Args:
        text: The string to scrub.

    Returns:
        ``text`` with each detected date substring deleted (all occurrences
        of that substring are removed, not only the detected position).
    """
    # With source=True each hit is a pair whose second item is the matched
    # substring; the first item (the parsed value) is not needed here.
    # Materialise all hits before editing so detection runs on the input as given.
    hits = list(datefinder.find_dates(text, source=True))
    for _, matched in hits:
        text = text.replace(matched, '')
    return text

# Dotted quads such as 10.0.0.1 (no range check on the octets).
_IP_RE = re.compile(r'[0-9]+(?:\.[0-9]+){3}')
# scheme://host[.more]*[/path]* style URLs.
_URL_RE = re.compile(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*')
# re.escape keeps `\`, `]`, `^` and `-` literal inside the character class.
# Unescaped, the `\]` pair in string.punctuation reads as an escaped bracket
# and backslashes are never stripped.
_PUNCT_RE = re.compile(
    '[' + re.escape(string.punctuation + '“”¨«»®´·º½¾¿¡§£₤‘’') + ']'
)
_WHITESPACE_RE = re.compile(r'\s+')
# Applied in order: the specific "can't" / "havn't" forms must run before the
# generic "n't" rule.
_CONTRACTIONS = (
    ("can't", "can not"),
    ("havn't", "have not"),
    ("n't", " not"),
    ("i'm", "i am"),
    ("it's", "it is"),
    ("there's", "there is"),
    ("'ve", " have"),
    ("e-mail", "email"),
    ("you'll", "you will"),
)


def clean_text(idx, text):
    """Normalise one comment and return it keyed by its row id.

    Steps, in order: lowercase, drop IP-like tokens, drop date substrings
    (via ``clean_dates``), drop URLs, expand a fixed set of contractions,
    strip punctuation, tokenise with NLTK and re-join with single spaces.

    Args:
        idx: Identifier of the row; used only as the key of the result.
        text: Raw comment string.

    Returns:
        A one-item dict ``{idx: cleaned_text}``.
    """
    text = text.lower()
    # One regex pass removes each address exactly where it matched, so a short
    # address can never eat part of a longer one (e.g. 10.0.0.1 inside 110.0.0.1).
    text = _IP_RE.sub('', text)
    text = clean_dates(text)
    text = _URL_RE.sub('', text)
    for contraction, expansion in _CONTRACTIONS:
        text = text.replace(contraction, expansion)
    text = _PUNCT_RE.sub('', text)
    tokens = nltk.word_tokenize(text)
    text = ' '.join(token.strip() for token in tokens)
    text = _WHITESPACE_RE.sub(' ', text)
    return {idx: text.strip().lower()}

In [3]:
# Column subsets of the raw CSVs; both lists lead with 'id'.
text = ['id', 'comment_text']
labels = ['id'] + [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

In [5]:
# Load the raw training set; missing cells become the literal string 'nan'
# so clean_text always receives a str.
train_data = pd.read_csv('../data/download/train.csv').fillna('nan')
print('train data:', train_data.shape)
train_labels = train_data[labels].drop_duplicates()

# Clean the comments in parallel. clean_text returns {id: cleaned_text}, so
# every element of the computed Series is a one-item dict.
train_text = dd.from_pandas(train_data[text], npartitions=10)
train_text = train_text.map_partitions(lambda df: df.apply((lambda row: clean_text(*row)), axis=1))
# Current dask releases no longer accept `get=`; the scheduler is chosen by name.
train_text = train_text.compute(scheduler='processes')

# Unpack the one-item dicts into explicit columns (key -> id, value -> text).
train_text = pd.DataFrame({
    'comment_text': train_text.map(lambda cleaned: next(iter(cleaned.values()))),
    'id': train_text.map(lambda cleaned: next(iter(cleaned))),
})
train_data = train_text.merge(train_labels, on='id')
print('train data:', train_data.shape)
del train_text, train_labels

train data: (159571, 8)
train data: (159571, 8)


In [6]:
# Load the raw test set; missing cells become the literal string 'nan'
# so clean_text always receives a str.
test_data = pd.read_csv('../data/download/test.csv').fillna('nan')

# Select the (id, comment_text) columns explicitly so clean_text(*row) always
# gets exactly two positional arguments in the expected order.
test_text = dd.from_pandas(test_data[text], npartitions=10)
test_text = test_text.map_partitions(lambda df: df.apply((lambda row: clean_text(*row)), axis=1))
# Current dask releases no longer accept `get=`; the scheduler is chosen by name.
test_text = test_text.compute(scheduler='processes')

# Unpack the one-item {id: cleaned_text} dicts into explicit columns.
test_text = pd.DataFrame({
    'comment_text': test_text.map(lambda cleaned: next(iter(cleaned.values()))),
    'id': test_text.map(lambda cleaned: next(iter(cleaned))),
})
test_data = test_text[['id', 'comment_text']].copy()
print('test data:', test_data.shape)
del test_text

train data: (153164, 2)


In [7]:
# Attach the precomputed feature tables to the cleaned text, joined on 'id'.
train_feats = pd.read_csv('../data/download/train_feats.csv')
test_feats = pd.read_csv('../data/download/test_feats.csv')
train_data = pd.merge(train_data, train_feats, on='id')
test_data = pd.merge(test_data, test_feats, on='id')
print('train data:', train_data.shape, train_feats.shape)
print('score data:', test_data.shape, test_feats.shape)
# Model inputs: id + comment text + every feature column after the first
# (the first column of train_feats is presumably 'id' -- TODO confirm).
feats = text + train_feats.columns[1:].tolist()

train data: (159571, 15) (159571, 8)
score data: (153164, 9) (153164, 8)


In [8]:
# 5-way shuffled split with a fixed seed; folds are drawn over the unique
# comment ids rather than over row positions.
folds = KFold(n_splits=5, shuffle=True, random_state=2017)
comments = train_data['id'].unique()
X, y = train_data[feats], train_data[labels]

In [9]:
source = 'source_1'
path = '../data/data/{}/train/'.format(source)
# to_csv does not create missing parent directories, so make sure the target
# exists before the first write.
os.makedirs(path, exist_ok=True)

# Folds are numbered from 1 in the output file names.
for fold, (train_idx, test_idx) in enumerate(folds.split(comments), start=1):
    # KFold yields positions into `comments`; translate them to comment ids.
    train_idx = comments[train_idx]
    test_idx = comments[test_idx]
    X_train = X[X['id'].isin(train_idx)]
    y_train = y[y['id'].isin(train_idx)]
    X_test = X[X['id'].isin(test_idx)]
    y_test = y[y['id'].isin(test_idx)]
    print('train data:', X_train.shape, y_train.shape)
    print('test data:', X_test.shape, y_test.shape)
    X_train.to_csv(path + 'train_data_{}.csv'.format(fold), index=False)
    y_train.to_csv(path + 'train_labels_{}.csv'.format(fold), index=False)
    X_test.to_csv(path + 'test_data_{}.csv'.format(fold), index=False)
    y_test.to_csv(path + 'test_labels_{}.csv'.format(fold), index=False)

train data: (127656, 9) (127656, 7)
test data: (31915, 9) (31915, 7)
train data: (127657, 9) (127657, 7)
test data: (31914, 9) (31914, 7)
train data: (127657, 9) (127657, 7)
test data: (31914, 9) (31914, 7)
train data: (127657, 9) (127657, 7)
test data: (31914, 9) (31914, 7)
train data: (127657, 9) (127657, 7)
test data: (31914, 9) (31914, 7)


In [10]:
# Write the cleaned, feature-augmented test set used for scoring.
source = 'source_1'
path = '../data/data/{}/score/'.format(source)
# to_csv does not create missing parent directories.
os.makedirs(path, exist_ok=True)
test_data[feats].to_csv(path + 'score_text.csv', index=False)
print('test data:', test_data.shape)

test data: (153164, 9)
