Preprocessing

This notebook contains all preprocessing steps that do not implement a fit method.

In [1]:
import gensim
from IPython.display import display
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from capstone_project import preprocessor as pre


Load tokenized datasets:

In [2]:
# All relative paths below are resolved from the capstone_project/notebook
# folder, which is Python's current working directory for this notebook.
file_directory = "../output/data/"
prefix = ""

# Load the training data through the project's load_pickle helper, then keep
# the is_duplicate column as a plain array of labels.
train_file = prefix + "train_data.pkl"
train_data = pre.load_pickle(file_directory, train_file)
y = train_data["is_duplicate"].values

Check that the correct dataframe has been loaded, then keep only the first 100 rows:

In [3]:
# Show a single row to confirm that the expected columns are present.
display(train_data.head(1))

# Work on a small slice of the data; raise N_ROWS to process more rows.
N_ROWS = 100
train_data = train_data[:N_ROWS]
# The labels were extracted from the full frame in the loading cell. Re-derive
# them here so that y stays aligned row-for-row with the truncated train_data.
y = train_data["is_duplicate"].values

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_tokens,q2_tokens
355802,355802,696825,696826,Which are the best songs of Enrique Iglesias?,Which is the best song of Enrique iglesias?,1,"[best, song, enrique, iglesias]","[best, song, enrique, iglesias]"


Load the pretrained word2vec model (`GoogleNews-vectors-negative300.bin.gz`). **TODO:** add the download URL for the model.

In [4]:
# binary=True because the file is in the binary word2vec format.
word2vec_path = "../data/GoogleNews-vectors-negative300.bin.gz"
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(
    word2vec_path,
    binary=True,
)

In the following we create the transformers and add them to a pipeline that we will use to create the new features. This is more work upfront and might seem like overkill, since we could just apply the transformations to the dataframes directly, but I tend to play around with the feature engineering, and I find that this way my features are better organized, which saves time in the end.

In [5]:
# Instantiate the individual transformers; they are wired together into a
# Pipeline / FeatureUnion in the next cell.
word_features = pre.FeatureTransformer()
# Constructed without arguments on purpose: `model` and `sum_up` are supplied
# afterwards through feature_creator.set_params(...).
word2vec_transform = pre.Word2vecTransformer()
word2vec_features = pre.VectorFeatureTransformer()
# Scaler instance; it is not part of the pipeline assembled in the next cell.
mms = MinMaxScaler()

In [6]:
# Chain the word2vec vectorisation with the features computed on its vectors.
word2vec_pipe = Pipeline([
    ("vec_transformer", word2vec_transform),
    ("vec_features", word2vec_features),
])
# Combine the plain word features with the word2vec pipeline in one union.
feature_creator = FeatureUnion([
    ('word_features', word_features),
    ('word2vec_pipeline', word2vec_pipe),
])

# List the parameter names that can be passed to feature_creator.set_params().
# The parenthesised single-argument form prints the same text on Python 2 and
# is also valid syntax on Python 3.
print(feature_creator.get_params().keys())

['n_jobs', 'word2vec_pipeline', 'word2vec_pipeline__vec_transformer__model', 'transformer_weights', 'word2vec_pipeline__steps', 'word2vec_pipeline__vec_transformer', 'word2vec_pipeline__vec_features', 'transformer_list', 'word_features', 'word2vec_pipeline__vec_transformer__sum_up']


In [7]:
# Keys are parameter names as listed by feature_creator.get_params(); they
# address the sum_up flag and the model of the vec_transformer step inside
# the word2vec_pipeline branch.
initial_params = {
    "word2vec_pipeline__vec_transformer__model": word2vec_model,
    "word2vec_pipeline__vec_transformer__sum_up": True,
}
# Left as the last expression so the cell displays the configured union.
feature_creator.set_params(**initial_params)

FeatureUnion(n_jobs=1,
       transformer_list=[('word_features', FeatureTransformer()), ('word2vec_pipeline', Pipeline(steps=[('vec_transformer', Word2vecTransformer(model=<gensim.models.keyedvectors.KeyedVectors object at 0x7f8038a2c150>,
          sum_up=True)), ('vec_features', VectorFeatureTransformer())]))],
       transformer_weights=None)

In [8]:
# Fit every transformer in the union on train_data and collect their outputs
# into a single feature matrix (one row per row of train_data).
features = feature_creator.fit_transform(train_data)

In [13]:
# First feature row, followed by the type of the object fit_transform returned.
# format() is used instead of a multi-argument print statement so the cell
# prints the same text on Python 2 and also runs on Python 3.
print("{} {}".format(features[0], type(features)))
# Feature names of both branches: word features first, then word2vec features.
print("{} {}".format(word_features.get_feature_names(),
                     word2vec_features.get_feature_names()))

[ 45.          43.           2.           4.           4.          11.25
  10.75         0.75         0.           0.           0.           0.           0.
   0.           0.          -0.11522716  -0.11522716  -0.24668134
  -0.24668134] <type 'numpy.ndarray'>
['q1_length' 'q2_length' 'diff_length' 'q1_n_words' 'q2_n_words'
 'q1_len_word_ratio' 'q2_len_word_ratio' 'word_share'] ['word2vec_cosine_distance' 'word2vec_cityblock_distance'
 'word2vec_jaccard_distance' 'word2vec_canberra_distance'
 'word2vec_minkowski_distance' 'word2vec_euclidean_distance'
 'word2vec_braycurtis_distance' 'word2vec_skew_q1' 'word2vec_skew_q2'
 'word2vec_kurtosis_q1' 'word2vec_kurtosis_q2']
