# Feature extraction: TF-IDF 

## Quick look at the shape of the data

In [None]:
import pandas as pd
from os import path

# Folder that holds the preprocessed CSV files read below.
importDirectory = "./data/preprocessed-train-test/"

# Read train.csv, test.csv and all.csv (in that order) into DataFrames.
train, test, data = (
    pd.read_csv(path.join(importDirectory, filename))
    for filename in ("train.csv", "test.csv", "all.csv")
)

In [None]:
# Report the (rows, columns) shape of each loaded DataFrame.
print("train: {}, test: {}, all: {}".format(
    *(frame.shape for frame in (train, test, data))))

### TF-IDF features

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Settings shared by both vectorizers so they are configured identically.
vectorizerParams = dict(
    max_features=70000,
    norm="l2")

# `fit` returns the estimator itself, so fitting one instance twice would
# leave both names pointing at the same object, holding only the last fit.
# Each corpus therefore gets its own vectorizer: `vectorizerTrain` learns its
# vocabulary and IDF weights from the training comments only (no leakage from
# the other rows), while `vectorizerData` is fitted on the full data set.
vectorizerTrain = TfidfVectorizer(**vectorizerParams).fit(train.comment_text)
vectorizerData = TfidfVectorizer(**vectorizerParams).fit(data.comment_text)

# Backward compatibility: `vectorizer` previously ended up fitted on all data.
vectorizer = vectorizerData

In [None]:
# Train and test comments are both transformed with `vectorizerTrain`.
trainFeatures, testFeatures = (
    vectorizerTrain.transform(frame.comment_text)
    for frame in (train, test)
)

# The full data set is transformed with `vectorizerData`.
dataFeatures = vectorizerData.transform(data.comment_text)

In [None]:
# Report the shape of each feature matrix produced by the vectorizers.
print("train: {}, test: {}, all: {}".format(
    *(matrix.shape for matrix in (trainFeatures, testFeatures, dataFeatures))))

## Export features

In [None]:
from scipy.sparse import save_npz

def exportFeatures(filename, features):
    """Save a sparse feature matrix with ``scipy.sparse.save_npz``.

    When ``filename`` is a path, any missing parent directories are created
    first, so the export does not fail on a checkout where the output folder
    has not been created yet.

    Args:
        filename: Destination handed to ``save_npz``: a path (``str`` or
            path-like) or an already opened binary file object.
        features: SciPy sparse matrix to write.

    Returns:
        None.
    """
    import os

    # Only path destinations have a directory to prepare; file objects are
    # passed through to save_npz untouched.
    if isinstance(filename, (str, os.PathLike)):
        parent = os.path.dirname(os.fspath(filename))
        # A bare file name has an empty dirname, and makedirs("") would raise.
        if parent:
            os.makedirs(parent, exist_ok=True)

    save_npz(filename, features)

In [None]:
from os import path

# Folder the feature matrices are written to.
exportDirectory = "./data/features/tf-idf"

# Pair each feature matrix with the file name it is stored under.
featureFilenames = zip(
    (trainFeatures, testFeatures, dataFeatures),
    ("train.npz", "test.npz", "all.npz"),
)

for features, filename in featureFilenames:
    destination = path.join(exportDirectory, filename)
    exportFeatures(destination, features)