In [1]:
import pandas as pd
import pickle
from engarde.decorators import has_dtypes
from sklearn.pipeline import FeatureUnion
from capstone_project import preprocessor

In [2]:
# Ensure the loaded dataframe has the expected column dtypes; otherwise an AssertionError is raised
@has_dtypes(dict(question1=object, question2=object, is_duplicate=int))
def load_and_check_data(filename):
    """Read the pickled dataframe stored at ``filename`` and blank out its gaps.

    Every missing value in the frame is replaced by an empty string (two
    questions in the data have empty fields). The resulting dataframe is
    returned, so the dtype check attached to this function by its decorator
    is applied to the already filled frame.
    """
    return pd.read_pickle(filename).fillna("")

In [3]:
# Paths are relative to the capstone_project/notebook folder, which is python's
# current working directory here
file_directory = "../output/data/"

# Load both splits through the same dtype-checked loader: train first, then test
train_data, test_data = (
    load_and_check_data(file_directory + pickle_name)
    for pickle_name in ("train_data.pkl", "test_data.pkl")
)

In [4]:
# Only the first SAMPLE_SIZE rows of each split are kept for this run
# (presumably a quick trial of the pipeline -- TODO confirm before a full run).
SAMPLE_SIZE = 30
train_data = train_data.iloc[:SAMPLE_SIZE]  # .iloc makes the positional row slice explicit
test_data = test_data.iloc[:SAMPLE_SIZE]

# The two project-specific transformers that produce the features
tfidf = preprocessor.TfidfTransformer()
feature_engineering = preprocessor.FeatureTransformer()

# Target labels as plain numpy arrays
train_y = train_data["is_duplicate"].values
test_y = test_data["is_duplicate"].values

# FeatureUnion applies every transformer to the same input and concatenates
# their outputs into one feature matrix
feature_creator = FeatureUnion([('feature_engineering', feature_engineering), ('tfidf', tfidf)])

# Fit on the training rows only; kept as the last expression so that the
# fitted union is displayed as the cell output
feature_creator.fit(train_data, train_y)

FeatureUnion(n_jobs=1,
       transformer_list=[('feature_engineering', FeatureTransformer()), ('tfidf', TfidfTransformer())],
       transformer_weights=None)

In [5]:
# Run the fitted feature union over both splits: train first, then test
train_features, test_features = map(feature_creator.transform, (train_data, test_data))

In [6]:
def save_output(dataset, filename):
    """Pickle ``dataset`` into the file located at ``filename``.

    The file is opened in binary write mode, so an already existing file at
    that path is overwritten.
    """
    with open(filename, mode="wb") as pickle_file:
        pickle.Pickler(pickle_file).dump(dataset)

In [7]:
prefix = "first_test_"

# Each object is pickled to <file_directory><prefix><file name>, in the order listed
for dataset, pickle_name in (
    (train_features, "preprocessed_train_data.pkl"),
    (test_features, "preprocessed_test_data.pkl"),
    (train_y, "preprocessed_train_labels.pkl"),
    (test_y, "preprocessed_test_labels.pkl"),
):
    save_output(dataset, file_directory + prefix + pickle_name)