# Feature extraction: TF-IDF 

## Quick look at the shape of the data

In [None]:
import pandas as pd
from os import path

importDirectory = "./data/preprocessed-train-test/"

# Load the four CSV files, in this order, from the import directory.
train, test, data, contestTest = (
    pd.read_csv(path.join(importDirectory, csvName))
    for csvName in ("train.csv", "test.csv", "all.csv", "contest-test.csv"))

In [None]:
# Report the (rows, columns) shape of every loaded DataFrame.
dataShapes = (train.shape, test.shape, data.shape, contestTest.shape)
print("train: {}, test: {}, all: {}, contestTest: {}".format(*dataShapes))

### TF-IDF features

In [None]:
%%time
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit two independent vectorizers. `fit` returns the estimator itself, so
# calling it twice on a single instance would bind both names to the same
# object, refitted on `data`; the train/test features would then be built
# from a vocabulary and IDF weights learned on `data` instead of on `train`.
vectorizerTrain = TfidfVectorizer().fit(train.comment_text)
vectorizerData = TfidfVectorizer().fit(data.comment_text)

# Kept for any later cell that still refers to `vectorizer`; as before, it
# ends up being the vectorizer fitted on `data`.
vectorizer = vectorizerData

In [None]:
%%time
# `vectorizerTrain` produces the train/test feature matrices used for the
# evaluation; `vectorizerData` produces the matrices for `data` and
# `contestTest`.
trainFeatures, testFeatures = (
    vectorizerTrain.transform(frame.comment_text) for frame in (train, test))
dataFeatures, contestTestFeatures = (
    vectorizerData.transform(frame.comment_text) for frame in (data, contestTest))

## Evaluate features

In [None]:
%%time
from feature_evaluation_logistic_regression import evaluateFeaturesWithLogisticRegression

# Evaluate the TF-IDF features against the `toxic` label column of each split.
trainLabels, testLabels = train.toxic, test.toxic
evaluateFeaturesWithLogisticRegression(trainFeatures, testFeatures, trainLabels, testLabels)

## Export features

In [None]:
# Report the shape of every feature matrix before exporting it.
featureShapes = (
    trainFeatures.shape, testFeatures.shape, dataFeatures.shape, contestTestFeatures.shape)
print("train: {}, test: {}, all: {}, contestTest: {}".format(*featureShapes))

In [None]:
from scipy.sparse import save_npz

def exportFeatures(filename, features):
    """Save a sparse feature matrix to ``filename`` with ``save_npz``.

    When ``filename`` is a path, its parent directory is created first if it
    does not exist, so exporting into a fresh directory does not fail.

    Args:
        filename: Destination path (``str`` or ``os.PathLike``) or an open
            file-like object; passed unchanged to ``save_npz``.
        features: SciPy sparse matrix to store.
    """
    import os

    # Only path-like destinations have a directory to prepare; file-like
    # objects are handed straight to save_npz.
    if isinstance(filename, (str, os.PathLike)):
        directory = os.path.dirname(os.fspath(filename))
        if directory:  # a bare file name has no directory component
            os.makedirs(directory, exist_ok=True)
    save_npz(filename, features)

In [None]:
from os import path

exportDirectory = "./data/features/tf-idf"

# Pair every feature matrix with the file name it is exported under.
matricesToExport = [trainFeatures, testFeatures, dataFeatures, contestTestFeatures]
exportNames = ["train.npz", "test.npz", "all.npz", "contest-test.npz"]
featureFilenames = zip(matricesToExport, exportNames)

for features, filename in featureFilenames:
    destination = path.join(exportDirectory, filename)
    exportFeatures(destination, features)