# Logistic regression on oversampled datasets

## Load datasets

In [None]:
import pandas as pd
from os import path
from scipy.sparse import load_npz

# Where the preprocessed CSV files and their tf-idf feature matrices live.
dataDirectory = "./data/preprocessed-train-test"
featuresDirectory = "./data/features/tf-idf"

# Target columns; each one has its own oversampled training set on disk.
labels = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# label -> sparse feature matrix read from "oversampled-<label>-all.npz".
features = dict(
    (label, load_npz(path.join(featuresDirectory, "oversampled-" + label + "-all.npz")))
    for label in labels
)

# label -> DataFrame read from "oversampled-<label>-all.csv".
datasets = dict(
    (label, pd.read_csv(path.join(dataDirectory, "oversampled-" + label + "-all.csv")))
    for label in labels
)

In [None]:
# Contest test split: the CSV rows and the sparse feature matrix stored under
# the same "contest-test" stem.
contestTest = pd.read_csv(path.join(dataDirectory, "contest-test.csv"))
contestTestFeatures = load_npz(path.join(featuresDirectory, "contest-test.npz"))

## Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression

def getLabelPredictions(trainFeature, trainDataset, testFeature, label):
    """Fit a logistic regression for one label and score the test features.

    The model is created with scikit-learn's default settings and trained on
    ``trainFeature`` against the ``label`` column of ``trainDataset``.

    Returns column 1 of ``predict_proba(testFeature)``, one value per test
    row. NOTE(review): this is presumably the positive-class probability,
    which assumes the label column holds exactly two classes with the
    positive one sorted last -- confirm against the training data.
    """
    classifier = LogisticRegression()
    targets = trainDataset[label]
    classifier.fit(trainFeature, targets)
    probabilities = classifier.predict_proba(testFeature)
    return probabilities[:, 1]

In [None]:
# Assemble the submission table: the contest "id" column followed by one
# predicted-probability column per label.
# DataFrame.from_items was deprecated in pandas 0.23 and removed in 1.0, so the
# (name, values) pairs are passed to the DataFrame constructor as a dict.
# `columns=` pins the column order explicitly instead of relying on dict
# ordering.
results = pd.DataFrame(
    dict(
        [("id", contestTest["id"])]
        + [(label, getLabelPredictions(features[label], datasets[label], contestTestFeatures, label))
           for label in labels]
    ),
    columns=["id"] + labels,
)

## Export results

In [None]:
# Preview the first rows of `results` as a sanity check before exporting.
results.head()

In [None]:
# Show the (rows, columns) dimensions of `results`.
results.shape

In [None]:
# Destination CSV for the predictions table.
exportFilename = "./submissions/oversampled-tf-idf.csv"

# index=False leaves the DataFrame's row index out of the written file.
results.to_csv(path_or_buf=exportFilename, index=False)