# Preprocessing and train/test split based only on the toxic label

## Quick look at the shape of the data

In [None]:
import pandas as pd
# Raise the notebook display limits so wide frames and long comments are
# shown instead of being elided.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)
# None disables column-width truncation. The former -1 sentinel was deprecated
# in pandas 1.0 and is no longer accepted by pandas 2.x.
pd.set_option("display.max_colwidth", None)

In [None]:
# Load the raw training CSV into a DataFrame; the path is relative to the
# notebook's working directory.
data = pd.read_csv("./data/raw-data/train.csv")

In [None]:
# (rows, columns) of the raw data.
data.shape

In [None]:
# Summary statistics as produced by pandas' default `describe` settings.
data.describe()

## Split into train and test sets (stratified on `toxic`)

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# One stratified shuffle split holding out 20% of the rows for testing.
# The fixed seed keeps the partition reproducible between runs.
split = StratifiedShuffleSplit(
    n_splits=1,
    test_size=0.2,
    random_state=543553,
)

In [None]:
# Take the first (train, test) pair of positional indices produced by the
# splitter, stratifying on the `toxic` column.
trainIndex, testIndex = next(split.split(data, data.toxic))

In [None]:
# Select the rows of each subset by positional index.
train = data.iloc[trainIndex]
test = data.iloc[testIndex]

## Filter punctuation (keep only letters and whitespace, then lowercase)

In [None]:
def filterPunctuation(dataset):
    """Strip everything except ASCII letters and whitespace, then lowercase.

    Args:
        dataset: A pandas Series of strings (accessed through ``.str``).

    Returns:
        A new Series in which every character other than ``A-Z``, ``a-z`` or
        whitespace has been removed, every remaining whitespace character
        (newline, tab, ...) has been replaced by a single space, and the text
        has been lowercased. Runs of whitespace are not collapsed.
    """
    # regex=True is spelled out because pandas >= 2.0 treats the pattern as a
    # literal string by default, which would silently leave the text unchanged.
    # Raw strings avoid the invalid "\s" escape sequence in a normal literal.
    lettersOnly = dataset.str.replace(r"[^A-Za-z\s]", "", regex=True)
    spacesNormalised = lettersOnly.str.replace(r"\s", " ", regex=True)
    return spacesNormalised.str.lower()

In [None]:
%%time

def _withFilteredComments(frame):
    """Return `frame` with `comment_text` replaced by its filtered version.

    The filtered column is appended after the remaining columns, so
    `comment_text` ends up as the last column of the result.
    """
    remainingColumns = frame.drop(["comment_text"], axis=1)
    filteredComments = filterPunctuation(frame.comment_text)
    return pd.concat([remainingColumns, filteredComments], axis=1)


# Apply the same text filtering to both subsets.
trainFiltered = _withFilteredComments(train)
testFiltered = _withFilteredComments(test)

## Export train and test

In [None]:
# Preview the first rows of the filtered training set.
trainFiltered.head()

In [None]:
# Preview the first rows of the filtered test set.
testFiltered.head()

In [None]:
# (rows, columns) of the filtered training set.
trainFiltered.shape

In [None]:
# (rows, columns) of the filtered test set.
testFiltered.shape

In [None]:
import csv
import os
from os import path

exportDirectory = "./data/preprocessed-train-test"

# Create the target directory up front: to_csv does not create missing parent
# directories and would otherwise fail on a fresh checkout.
os.makedirs(exportDirectory, exist_ok=True)

# Write both subsets with identical CSV settings: no index column, every
# non-numeric field quoted, and backslash as the escape character.
for exportName, exportFrame in (
    ("train.csv", trainFiltered),
    ("test.csv", testFiltered),
):
    exportFrame.to_csv(
        path.join(exportDirectory, exportName),
        index=False,
        escapechar="\\",
        quoting=csv.QUOTE_NONNUMERIC,
    )