# Run logistic regression on a feature set

## Import data

In [None]:
import pandas as pd
from os import path
from scipy.sparse import load_npz

# Where the preprocessed CSV splits and their saved feature matrices live.
dataDirectory = "./data/preprocessed-train-test"
featuresDirectory = "./data/features/tf-idf"

# Read every CSV split into its own DataFrame.
train, test, data, contestTest = (
    pd.read_csv(path.join(dataDirectory, csvName))
    for csvName in ("train.csv", "test.csv", "all.csv", "contest-test.csv")
)

# Load the sparse feature matrix stored for each split, in the same order.
trainFeatures, testFeatures, dataFeatures, contestTestFeatures = (
    load_npz(path.join(featuresDirectory, npzName))
    for npzName in ("train.npz", "test.npz", "all.npz", "contest-test.npz")
)

## Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression

def trainModel(features, labels):
    """Fit a default-configured LogisticRegression on the given data.

    `features` and `labels` are handed to `fit` unchanged. The value
    returned by `fit` is returned as-is (scikit-learn documents this as
    the fitted estimator).
    """
    classifier = LogisticRegression()
    return classifier.fit(features, labels)

def getPredictions(model, features):
    """Return `(model.predict(features), model.predict_proba(features))`.

    `model` only needs to expose `predict` and `predict_proba`; both are
    called once on the same `features`, in that order.
    """
    predictedLabels = model.predict(features)
    predictedProbabilities = model.predict_proba(features)
    return predictedLabels, predictedProbabilities

In [None]:
# Fit the classifier on the training split with the "toxic" column as target.
model = trainModel(features=trainFeatures, labels=train["toxic"])

# Score the test split: predicted labels plus the predict_proba output.
testPredictions, testProbPredictions = getPredictions(
    model=model, features=testFeatures)

In [None]:
# Inspect the dimensions of the feature matrix loaded from "all.npz".
dataFeatures.shape

## Evaluate predictions

In [None]:
from evaluate_predictions import evaluatePredictions

In [None]:
# Evaluate the "toxic" model on the test split: the true labels first, then
# the predict() and predict_proba() outputs produced by getPredictions.
evaluatePredictions(test.toxic, testPredictions, testProbPredictions)

## Print metrics for train dataset

In [None]:
# Repeat the evaluation on the same split the model was fitted on.
trainPredictions, trainProbPredictions = getPredictions(
    model=model, features=trainFeatures)
evaluatePredictions(train["toxic"], trainPredictions, trainProbPredictions)

## Train on all datapoints and labels

In [None]:
def getLabelPredictions(trainFeatures, trainDataset, testFeatures, label):
    """Train a model for one label column and score `testFeatures` with it.

    The target is `trainDataset[label]`. The return value is column 1 of
    the model's `predict_proba` output for `testFeatures` (presumably the
    positive-class probability; confirm against the label encoding).
    """
    targets = trainDataset[label]
    classifier = trainModel(trainFeatures, targets)
    probabilities = classifier.predict_proba(testFeatures)
    return probabilities[:, 1]

In [None]:
# Label columns to predict; one model is trained per column.
labelColumns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# Build the submission table: the contest ids followed by one column of
# predicted probabilities per label, each from a model trained on the full
# dataset. DataFrame.from_items was removed in pandas 1.0, so the frame is
# built from a dict instead; `columns=` pins the column order explicitly
# rather than relying on dict ordering.
results = pd.DataFrame(
    {
        "id": contestTest["id"],
        **{
            label: getLabelPredictions(dataFeatures, data, contestTestFeatures, label)
            for label in labelColumns
        },
    },
    columns=["id"] + labelColumns,
)

In [None]:
# Preview the first rows of the submission table before it is exported.
results.head()

## Export submission

In [None]:
import os

exportFilename = "./submissions/good-old-tf-idf.csv"

# Make sure the target directory exists first; otherwise to_csv fails only
# after all the models have already been trained.
exportDirectory = os.path.dirname(exportFilename)
if exportDirectory:
    os.makedirs(exportDirectory, exist_ok=True)

# index=False leaves the DataFrame's row index out of the written file.
results.to_csv(exportFilename, index=False)