# Preprocessing and train/test split based only on the toxic label

## Quick look at the shape of the data

In [163]:
import pandas as pd
# Raise pandas' display limits so wide frames and long text cells are shown in full.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)
# None disables truncation of cell contents. The former -1 sentinel has been
# deprecated since pandas 1.0 and is no longer accepted by pandas 2.x.
pd.set_option("display.max_colwidth", None)

In [164]:
# Read the raw training CSV into `data`; the split below is drawn from this frame.
data = pd.read_csv("./data/raw-data/train.csv")

In [165]:
# Read the raw contest test CSV into `contestTest`; it is exported unchanged later on.
contestTest = pd.read_csv("./data/raw-data/test.csv")

In [166]:
data.shape

(159571, 8)

In [167]:
contestTest.shape

(153164, 2)

In [168]:
data.describe()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
count,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0
mean,0.095844,0.009996,0.052948,0.002996,0.049364,0.008805
std,0.294379,0.099477,0.223931,0.05465,0.216627,0.09342
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0


## Split into train and test sets

In [169]:
from sklearn.model_selection import StratifiedShuffleSplit

# One stratified shuffle: 20% of the rows are held out for testing, and the
# fixed seed makes the partition repeatable across runs.
split = StratifiedShuffleSplit(
    n_splits=1,
    test_size=0.2,
    random_state=543553,
)

In [170]:
# Stratify on the `toxic` column only. The splitter yields (train, test) pairs
# of positional row indices; take the first pair it produces.
trainIndex, testIndex = next(split.split(data, data["toxic"]))

In [171]:
# Materialise both partitions by positional lookup of the sampled row indices.
train = data.iloc[trainIndex]
test = data.iloc[testIndex]

In [172]:
train.shape

(127656, 8)

In [173]:
test.shape

(31915, 8)

In [174]:
data.shape

(159571, 8)

In [175]:
contestTest.shape

(153164, 2)

## Export data

In [176]:
from os import path
import csv

def exportDataset(dataset, filename, exportDirectory):
    """Write ``dataset`` to ``exportDirectory/filename`` as a CSV file.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to serialise; its index is not written.
    filename : str
        Name of the CSV file to create inside ``exportDirectory``.
    exportDirectory : str
        Target folder. It is created, including missing parents, when it
        does not exist yet. An empty string means the current directory.

    Notes
    -----
    All non-numeric fields are quoted and a backslash is set as the escape
    character.
    """
    # Function-scope import so the block needs nothing beyond the file's
    # existing `from os import path` / `import csv`.
    from os import makedirs

    # Without this, to_csv fails on a fresh checkout where the output folder
    # has not been created yet. Skip for "" because makedirs("") raises.
    if exportDirectory:
        makedirs(exportDirectory, exist_ok=True)

    dataset.to_csv(
        path.join(exportDirectory, filename),
        index=False,
        escapechar="\\",
        quoting=csv.QUOTE_NONNUMERIC)

In [177]:
# Folder that receives every exported CSV.
exportDirectory = "./data/preprocessed-train-test"

# Pair each frame with the filename it is written to.
datasetFilenames = zip(
    (train, test, data, contestTest),
    ("train.csv", "test.csv", "all.csv", "contest-test.csv"),
)

# Export the four datasets one after another.
for dataset, filename in datasetFilenames:
    exportDataset(
        dataset=dataset,
        filename=filename,
        exportDirectory=exportDirectory,
    )