# Preprocessing and train/test split stratified only on the toxic label

## Quick look at the shape of the data

In [None]:
import pandas as pd
# Widen the notebook display so wide frames and long text are shown in full.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)
# None disables column-width truncation. The former -1 sentinel was
# deprecated in pandas 1.0 and is rejected by newer releases.
pd.set_option("display.max_colwidth", None)

In [None]:
# Load the raw train.csv; its `toxic` column drives the stratified split below.
data = pd.read_csv("./data/raw-data/train.csv")

In [None]:
# Load the raw test.csv; it is kept separate from the split made out of `data`.
contestTest = pd.read_csv("./data/raw-data/test.csv")

In [None]:
# (rows, columns) of the frame read from train.csv.
data.shape

In [None]:
# (rows, columns) of the frame read from test.csv.
contestTest.shape

In [None]:
# Summary statistics of `data` as reported by DataFrame.describe().
data.describe()

## Split into train and test sets

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# One stratified shuffle split holding out 20% of the rows; the fixed seed
# makes the partition reproducible across runs.
split = StratifiedShuffleSplit(
    n_splits=1,
    test_size=0.2,
    random_state=543553,
)

In [None]:
# The splitter yields exactly one (train, test) pair of positional index
# arrays because n_splits=1; take it straight from the generator.
trainIndex, testIndex = next(split.split(data, data["toxic"]))

In [None]:
# Select rows by position using the index arrays produced by the splitter.
train = data.iloc[trainIndex]
test = data.iloc[testIndex]

In [None]:
# (rows, columns) of the training portion of the split.
train.shape

In [None]:
# (rows, columns) of the held-out portion (test_size=0.2 of `data`).
test.shape

In [None]:
# Shape of the full frame again, for comparison with the train/test shapes.
data.shape

In [None]:
# Shape of the test.csv frame, shown again before export.
contestTest.shape

## Export data

In [None]:
from os import path
import csv

def exportDataset(dataset, filename, exportDirectory):
    """Write ``dataset`` to ``exportDirectory/filename`` as a CSV file.

    The index is omitted, every non-numeric field is quoted, and a backslash
    is used as the escape character.

    Args:
        dataset: Object exposing a pandas-style ``to_csv`` method.
        filename: Name of the CSV file to create inside ``exportDirectory``.
        exportDirectory: Target directory; created (with parents) if missing.
    """
    # Function-scope import keeps this block self-contained.
    from os import makedirs

    # to_csv does not create missing directories, so exporting into a fresh
    # checkout would otherwise fail. An empty string means the current
    # directory, which needs no creation (and makedirs("") would raise).
    if exportDirectory:
        makedirs(exportDirectory, exist_ok=True)

    dataset.to_csv(
        path.join(exportDirectory, filename),
        index=False,
        escapechar="\\",
        quoting=csv.QUOTE_NONNUMERIC)

In [None]:
# Destination folder shared by every exported CSV.
exportDirectory = "./data/preprocessed-train-test"

# Pair each frame with the file name it is written under.
datasetFilenames = zip(
    (train, test, data, contestTest),
    ("train.csv", "test.csv", "all.csv", "contest-test.csv"),
)

for dataset, filename in datasetFilenames:
    exportDataset(
        dataset=dataset,
        filename=filename,
        exportDirectory=exportDirectory,
    )