# Logistic regression on oversampled datasets

## Load datasets

In [10]:
import pandas as pd
from os import path
from scipy.sparse import load_npz

# Where the preprocessed CSV files and their matching feature matrices live
# (the feature directory name indicates TF-IDF features).
dataDirectory = "./data/preprocessed-train-test"
featuresDirectory = "./data/features/tf-idf"

# Target columns; one oversampled training set is loaded per label below.
labels = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# label -> sparse feature matrix read from "oversampled-<label>-all.npz".
features = dict(
    (name, load_npz(path.join(featuresDirectory, "oversampled-{}-all.npz".format(name))))
    for name in labels
)

# label -> DataFrame read from "oversampled-<label>-all.csv".
datasets = dict(
    (name, pd.read_csv(path.join(dataDirectory, "oversampled-{}-all.csv".format(name))))
    for name in labels
)

In [13]:
# Contest test set: the CSV supplies the "id" column used for the submission,
# the .npz file supplies the feature matrix the models are scored on.
contestTest = pd.read_csv(path.join(dataDirectory, "contest-test.csv"))
contestTestFeatures = load_npz(path.join(featuresDirectory, "contest-test.npz"))

## Logistic regression

In [9]:
from sklearn.linear_model import LogisticRegression

def getLabelPredictions(trainFeature, trainDataset, testFeature, label):
    """Fit a logistic regression for one label and score the test features.

    Parameters
    ----------
    trainFeature : feature matrix used to fit the model.
    trainDataset : mapping/DataFrame indexed by label name; ``trainDataset[label]``
        is used as the training target.
    testFeature : feature matrix to predict on.
    label : name of the target column to train against.

    Returns
    -------
    Column 1 of ``predict_proba(testFeature)`` for every test row.
    NOTE(review): this is the positive-class probability only if the target is
    binary 0/1 -- confirm against the training data.
    """
    estimator = LogisticRegression()
    fitted = estimator.fit(trainFeature, trainDataset[label])
    classProbabilities = fitted.predict_proba(testFeature)
    return classProbabilities[:, 1]

In [15]:
# Assemble the submission table: the contest ids followed by one probability
# column per label. Each label's model is trained on that label's own
# oversampled set and scored against the shared contest test features.
# A plain dict plus an explicit `columns=` list replaces
# `DataFrame.from_items`, which was deprecated in pandas 0.23 and removed in
# pandas 1.0; `columns=` pins the column order independently of dict ordering.
results = pd.DataFrame(
    dict(
        [("id", contestTest["id"])]
        + [(label, getLabelPredictions(features[label], datasets[label], contestTestFeatures, label))
           for label in labels]
    ),
    columns=["id"] + labels,
)

## Export results

In [16]:
# Preview the first five rows of the submission table.
results.head(n=5)

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.999985,0.908444,0.999989,0.787636,0.999315,0.969821
1,0000247867823ef7,0.007931,0.001914,0.002114,0.000236,0.009395,0.009243
2,00013b17ad220c46,0.096677,0.014633,0.04518,0.003769,0.061378,0.023241
3,00017563c3f7919a,0.005284,0.007179,0.004339,0.001455,0.006547,0.00107
4,00017695ad8997eb,0.071303,0.014613,0.027473,0.005728,0.020629,0.007045


In [17]:
# Sanity check of the submission table's dimensions: one row per entry of
# contestTest["id"], and one column for "id" plus one per label.
results.shape

(153164, 7)

In [18]:
# Destination of the submission file, relative to the working directory.
exportFilename = "./submissions/oversampled-tf-idf.csv"

# Write the predictions without the DataFrame index, so the CSV contains only
# the "id" column and the per-label probability columns.
results.to_csv(path_or_buf=exportFilename, index=False)