# <center> Feature engineering </center>

All preprocessing steps that do not implement a fit method are applied in this notebook.  All transformers that implement a fit function must be applied in a later step in order for cross validation to work correctly.

In [1]:
import gensim
import numpy as np
import pandas as pd
from time import time
from IPython.display import display
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from capstone_project import preprocessor as pre
from capstone_project import utility

Load tokenized datasets:

In [2]:
# Paths are relative to the capstone_project/notebook folder, which is the
# working directory of the Python kernel.
file_directory = "../output/data/"
prefix = "tokenized_"

# Load the train / validation / test splits, in that order.
train_data, val_data, test_data = (
    utility.load_pickle(file_directory, prefix + file_name)
    for file_name in ("train_data.pkl", "val_data.pkl", "test_data.pkl")
)

Check that correct dataframes have been loaded by displaying them:

In [3]:
# Show the first row of every split as a quick visual sanity check.
for split_frame in (train_data, val_data, test_data):
    display(split_frame.head(1))

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_tokens,q2_tokens
355802,355802,696825,696826,Which are the best songs of Enrique Iglesias?,Which is the best song of Enrique iglesias?,1,"[good, song, enrique, iglesias]","[good, song, enrique, iglesias]"


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_tokens,q2_tokens
304293,304293,597514,597515,Should i buy iPhone 7 or iPhone 6S?,Should I buy iPhone 6s or wait to buy iPhone 7?,1,"[buy, iphone, 7, iphone, 6s]","[buy, iphone, 6s, wait, buy, iphone, 7]"


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_tokens,q2_tokens
113079,113079,224193,224194,What does horny goat weed do?,What is horny goat weed?,1,"[horny, goat, weed]","[horny, goat, weed]"


Load pretrained word2vec model. The model can be downloaded at: https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?usp=sharing

In [4]:
# The pretrained vectors are stored in the binary word2vec format, hence
# binary=True.
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(
    "../data/GoogleNews-vectors-negative300.bin.gz",
    binary=True,
)

Next we will create the feature transformers and add them to a pipeline that we will use to create the new features. We will create two different types of features. The first type, the word features, is derived directly from the text data. The second type, the word2vec features, is derived from word vectors, which are created by the word2vec transformer.

In [5]:
# Instantiate the three project transformers used to build the features.
word_features, word2vec_transform, word2vec_features = (
    pre.FeatureTransformer(),        # features derived from the text itself
    pre.Word2vecTransformer(),       # creates the word vectors
    pre.VectorFeatureTransformer(),  # features derived from the word vectors
)

Create word2vec pipeline and the feature union:

In [6]:
# The word vectors have to exist before the vector-based features can be
# computed, so both steps are chained in a pipeline.
vec_steps = [
    ("vec_transformer", word2vec_transform),
    ("vec_features", word2vec_features),
]
word2vec_pipe = Pipeline(vec_steps)

# Combine the text-based features with the output of the word2vec pipeline.
union_members = [
    ('word_features', word_features),
    ('word2vec_pipeline', word2vec_pipe),
]
feature_creator = FeatureUnion(union_members)

# List the parameter names that can be passed to set_params.
print(feature_creator.get_params().keys())

['n_jobs', 'word2vec_pipeline', 'word2vec_pipeline__vec_transformer__model', 'transformer_weights', 'word2vec_pipeline__steps', 'word2vec_pipeline__vec_transformer', 'word2vec_pipeline__vec_features', 'transformer_list', 'word_features', 'word2vec_pipeline__vec_transformer__sum_up']


The pretrained word2vec model needs to be passed as parameter to the vector transformer:

In [7]:
# Hand the loaded word2vec model to the "vec_transformer" step that lives
# inside the "word2vec_pipeline" member of the feature union.
initial_params = dict(word2vec_pipeline__vec_transformer__model=word2vec_model)

# Left as the last expression so that the configured union is displayed.
feature_creator.set_params(**initial_params)

FeatureUnion(n_jobs=1,
       transformer_list=[('word_features', FeatureTransformer()), ('word2vec_pipeline', Pipeline(steps=[('vec_transformer', Word2vecTransformer(model=<gensim.models.keyedvectors.KeyedVectors object at 0x7f48a3758dd0>,
          sum_up=False)), ('vec_features', VectorFeatureTransformer())]))],
       transformer_weights=None)

Now we are ready to transform the text data into features. The execution produces some warnings because the transformation of some questions produces NaN values. These NaN values are filled within the transformer methods.

In [8]:
start = time()

# Fit exactly once, on the training split. Fit needs to be called as part of
# a pipeline even if it is just implemented as an empty method.
train_features = feature_creator.fit_transform(train_data)

# The held-out splits are only transformed, never fitted. With the current
# empty fit methods the result is the same as before, but this keeps the
# notebook free of data leakage should a transformer ever start learning
# something in fit.
val_features = feature_creator.transform(val_data)
test_features = feature_creator.transform(test_data)

# Report the elapsed wall-clock time in minutes.
print("Data transformation took {:.2f} minutes.".format((time() - start) / 60))

  dist = 1.0 - np.dot(u, v) / (norm(u) * norm(v))
  np.double(np.bitwise_or(u != 0, v != 0).sum()))
  return abs(u - v).sum() / abs(u + v).sum()


Data transformation took 10.28 minutes.


The Pipeline class in scikit-learn does not implement a get_feature_names method, so the names of the newly created features have to be extracted manually. It is important to make sure that the order of the names is correct: it has to match the order of the transformers in the feature union defined above.

In [9]:
# Look up the two transformers that actually produce feature columns.
union_params = feature_creator.get_params()
text_feature_step = union_params["word_features"]
vector_feature_step = union_params["word2vec_pipeline"].named_steps["vec_features"]

feature_names_1 = text_feature_step.get_feature_names()
feature_names_2 = vector_feature_step.get_feature_names()

# The names are joined in the same order in which the transformers were
# listed in the feature union: word features first, then word2vec features.
feature_names = list(feature_names_1)
feature_names.extend(feature_names_2)

Create dataframes that hold the newly created features:

In [10]:
# Wrap each transformed feature set in a DataFrame with named columns.
train_features, val_features, test_features = (
    pd.DataFrame(feature_values, columns=feature_names)
    for feature_values in (train_features, val_features, test_features)
)

Check that dataframes have been created correctly by printing them:

In [11]:
feature_frames = (train_features, val_features, test_features)

# Show the first row of every feature frame.
for feature_frame in feature_frames:
    display(feature_frame.head(1))

print("Rows in train: {} val: {} test: {}".format(*map(len, feature_frames)))

# None of the feature frames may contain missing values.
for feature_frame in feature_frames:
    assert not feature_frame.isnull().values.any()

Unnamed: 0,q1_length,q2_length,diff_length,q1_n_words,q2_n_words,q1_len_word_ratio,q2_len_word_ratio,word_share,word2vec_cosine_distance,word2vec_cityblock_distance,word2vec_jaccard_distance,word2vec_canberra_distance,word2vec_minkowski_distance,word2vec_euclidean_distance,word2vec_braycurtis_distance,word2vec_skew_q1,word2vec_skew_q2,word2vec_kurtosis_q1,word2vec_kurtosis_q2
0,45.0,43.0,2.0,4.0,4.0,11.25,10.75,0.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.062758,-0.062758,-0.018099,-0.018099


Unnamed: 0,q1_length,q2_length,diff_length,q1_n_words,q2_n_words,q1_len_word_ratio,q2_len_word_ratio,word_share,word2vec_cosine_distance,word2vec_cityblock_distance,word2vec_jaccard_distance,word2vec_canberra_distance,word2vec_minkowski_distance,word2vec_euclidean_distance,word2vec_braycurtis_distance,word2vec_skew_q1,word2vec_skew_q2,word2vec_kurtosis_q1,word2vec_kurtosis_q2
0,35.0,47.0,-12.0,5.0,7.0,7.0,6.714286,0.45,0.050746,4.433986,1.0,88.295622,0.143691,0.318579,0.161497,0.134025,0.111909,-0.218607,-0.207114


Unnamed: 0,q1_length,q2_length,diff_length,q1_n_words,q2_n_words,q1_len_word_ratio,q2_len_word_ratio,word_share,word2vec_cosine_distance,word2vec_cityblock_distance,word2vec_jaccard_distance,word2vec_canberra_distance,word2vec_minkowski_distance,word2vec_euclidean_distance,word2vec_braycurtis_distance,word2vec_skew_q1,word2vec_skew_q2,word2vec_kurtosis_q1,word2vec_kurtosis_q2
0,29.0,24.0,5.0,3.0,3.0,9.666667,8.0,0.583333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.065999,0.065999,-0.015661,-0.015661


Rows in train: 291132 val: 72783 test: 40436


Save features and labels for future use:

In [12]:
file_directory = "../output/data/"

# Persist every feature frame under its own file name.
feature_outputs = [
    (train_features, "train_features.pkl"),
    (val_features, "val_features.pkl"),
    (test_features, "test_features.pkl"),
]
for feature_frame, file_name in feature_outputs:
    utility.save_pickle(feature_frame, file_directory, file_name)

In [13]:
# The target column is taken from the tokenized datasets as plain arrays.
label_column = "is_duplicate"
train_y = train_data[label_column].values
val_y = val_data[label_column].values
test_y = test_data[label_column].values

# Persist the labels next to the features; file_directory is the output
# folder set in the previous cell.
label_outputs = [
    (train_y, "train_labels.pkl"),
    (val_y, "val_labels.pkl"),
    (test_y, "test_labels.pkl"),
]
for labels, file_name in label_outputs:
    utility.save_pickle(labels, file_directory, file_name)