# Oversample data and features

## Import datasets and features

In [None]:
import pandas as pd
from os import path
from scipy.sparse import load_npz

# Locations of the preprocessed CSV splits and of the saved sparse feature matrices.
dataDirectory = "./data/preprocessed-train-test"
featuresDirectory = "./data/features/tf-idf"

# Read every CSV split into a DataFrame; the unpacking order follows the file list.
train, test, data, contestTest = [
    pd.read_csv(path.join(dataDirectory, csvName))
    for csvName in ("train.csv", "test.csv", "all.csv", "contest-test.csv")
]

# Load the sparse matrix stored for each split, in the same split order as above.
trainFeatures, testFeatures, dataFeatures, contestTestFeatures = [
    load_npz(path.join(featuresDirectory, npzName))
    for npzName in ("train.npz", "test.npz", "all.npz", "contest-test.npz")
]

## Oversampling function

In [None]:
from scipy.sparse import vstack

def oversample(dataset, features, label):
    """Replicate the positive rows of `dataset` and `features` for one label.

    Rows where ``dataset[label] == 1`` are appended ``multiples`` extra times,
    where ``multiples`` is the floor of (rows with label 0) / (rows with
    label 1). The same rows are appended to the sparse matrix so that both
    outputs keep the same row order.

    Parameters:
        dataset: pandas DataFrame containing the column `label`.
        features: scipy sparse matrix; row i is taken to correspond to the
            i-th row (by position) of `dataset`.
        label: name of the 0/1 column that drives the oversampling.

    Returns:
        A tuple ``(datasetOversampled, featuresOversampled)``: the original
        rows followed by the replicated positive rows.

    Notes:
        * Positive rows are located by position rather than by index label,
          so the function also works when `dataset` does not carry a default
          0..n-1 index (e.g. after filtering or shuffling).
        * If the label has no positive rows there is nothing to replicate and
          the data is returned without extra rows instead of failing with a
          ZeroDivisionError.
        * Since the original rows are kept, the positive count in the output
          is ``(multiples + 1)`` times the positive count in the input.
    """
    labelValues = dataset[label]
    positiveMask = (labelValues == 1).to_numpy()
    positiveCount = int(positiveMask.sum())
    negativeCount = int((labelValues == 0).sum())

    # Guard the division: a label without positives gets zero replications.
    multiples = negativeCount // positiveCount if positiveCount else 0

    datasetParts = [dataset]
    featuresParts = [features]
    if multiples > 0:
        # Positional row numbers of the positives; valid for both the
        # DataFrame (via iloc) and the sparse matrix, whatever the index is.
        positivePositions = positiveMask.nonzero()[0]
        datasetParts += multiples * [dataset.iloc[positivePositions]]
        featuresParts += multiples * [features[positivePositions, :]]

    datasetOversampled = pd.concat(datasetParts)
    featuresOversampled = vstack(featuresParts)

    return datasetOversampled, featuresOversampled

## Export oversampled datasets and features

In [None]:
import csv
from scipy.sparse import save_npz

def exportDataset(dataset, label, name):
    """Write `dataset` as CSV to ``oversampled-<label>-<name>.csv`` in `dataDirectory`.

    The index is not written, non-numeric fields are quoted, and a backslash
    is used as the escape character.
    """
    filename = "oversampled-{}-{}.csv".format(label, name)
    destination = path.join(dataDirectory, filename)
    dataset.to_csv(
        destination,
        index=False,
        escapechar="\\",
        quoting=csv.QUOTE_NONNUMERIC,
    )

def exportFeatures(features, label, name):
    """Save the sparse matrix `features` to ``oversampled-<label>-<name>.npz`` in `featuresDirectory`."""
    filename = "oversampled-{}-{}.npz".format(label, name)
    destination = path.join(featuresDirectory, filename)
    save_npz(destination, features)

In [None]:
# Label columns: every column of `train` from the third one onward.
labels = train.columns[2:].tolist()

# Oversample and export once per label: first the training split, then the
# full data set. Output files are tagged with the split name.
for sourceDataset, sourceFeatures, splitName in (
    (train, trainFeatures, "train"),
    (data, dataFeatures, "all"),
):
    for label in labels:
        datasetOversampled, featuresOversampled = oversample(
            sourceDataset, sourceFeatures, label
        )
        exportDataset(datasetOversampled, label, splitName)
        exportFeatures(featuresOversampled, label, splitName)