In [2]:
import os
import pandas as pd
import pickle
import gensim
from IPython.display import display
from engarde.decorators import has_dtypes
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm_notebook
from capstone_project import preprocessor

# Set up tqdm's pandas integration so the `.progress_apply` calls in the cells below
# can be used; "Progress:" is passed as the description of the progress bars.
tqdm_notebook().pandas(desc="Progress:")




In [3]:
# Validate the layout of the loaded dataframe; an assertion error is thrown when it does not match
@has_dtypes({"question1": object, "question2": object, "is_duplicate": int})
def load_data(filename):
    """Read the pickled pandas dataframe stored at `filename` and return it.

    The `has_dtypes` decorator checks the returned frame against the expected layout:
    `question1` and `question2` as object columns and `is_duplicate` as an int column.

    NOTE(review): `pd.read_pickle` unpickles the file, so only point this at trusted files.
    """
    return pd.read_pickle(filename)

In [4]:
# Paths are relative to the capstone_project/notebook folder, which is the working directory
file_directory = "../output/data/"

# For a quick trial run, slice either frame afterwards (e.g. train_data[0:5])
train_data, test_data = (
    load_data(file_directory + pickle_name)
    for pickle_name in ("train_data.pkl", "test_data.pkl")
)

KeyboardInterrupt: (None, <function _new_Index at 0x7f7f76a16c80>, (<class 'pandas.indexes.base.Index'>, {'data': array(['id', 'qid1', 'qid2', 'is_duplicate'], dtype=object), 'name': None}))

In [None]:
# Missing values are replaced by "" first (the train set was noted to contain two questions
# with empty fields); afterwards both questions of every pair are tokenized.
for dataset in (train_data, test_data):
    dataset.fillna("", inplace=True)
    for question_col, tokens_col in (("question1", "q1_tokens"), ("question2", "q2_tokens")):
        dataset[tokens_col] = dataset[question_col].progress_apply(preprocessor.tokenize)

In [None]:
# Word vectors are read from a gzipped file in the binary word2vec format (binary=True)
word2vec_path = '../data/GoogleNews-vectors-negative300.bin.gz'
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

In [6]:

# For both frames and both questions: map the tokens to their word2vec vectors with
# preprocessor.question_to_vector, then combine them with preprocessor.sum_vectors and store
# the result in the q1_vecsum / q2_vecsum column.
#
# The per-word vectors are held in a temporary Series instead of a helper dataframe column,
# so no `DataFrame.drop` call with a positional `axis` argument (rejected by newer pandas
# releases) is needed and an interrupted run cannot leave a stray *_word2vec column behind.
for dataset in (train_data, test_data):
    for tokens_col, vecsum_col in (("q1_tokens", "q1_vecsum"), ("q2_tokens", "q2_vecsum")):
        word_vectors = dataset[tokens_col].progress_apply(
            lambda x: preprocessor.question_to_vector(x, model=word2vec_model)
        )
        dataset[vecsum_col] = word_vectors.progress_apply(preprocessor.sum_vectors)

# Release the last batch of per-word vectors; only the summed vectors are kept.
del word_vectors


























In [7]:
# Show the first row of each frame to inspect the added token and vector-sum columns
display(train_data.head(1), test_data.head(1))

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_tokens,q2_tokens,q1_vecsum,q2_vecsum
218104,218104,430027,430028,How do I build a self confidence?,How can I raise my self esteem?,1,"[build, self, confidence]","[raise, self, esteem]","[0.0182878, 0.0904482, 0.0239183, -0.00576562,...","[0.052301, 0.0498388, -0.0129904, 0.099338, -0..."


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_tokens,q2_tokens,q1_vecsum,q2_vecsum
113079,113079,224193,224194,What does horny goat weed do?,What is horny goat weed?,1,"[horny, goat, weed]","[horny, goat, weed]","[0.0250831, 0.0553706, -0.048425, 0.0710079, -...","[0.0250831, 0.0553706, -0.048425, 0.0710079, -..."


In [8]:
def save_as_pickle(dataset, output_dir, filename):
    """Pickle `dataset` to the file `filename` inside `output_dir`.

    Args:
        dataset: Any picklable object.
        output_dir: Target directory, with or without a trailing path separator.
            It is created (including missing parents) when it does not exist yet.
            An empty string means the current working directory.
        filename: Name of the pickle file to write inside `output_dir`.

    Raises:
        OSError: If the directory cannot be created or the file cannot be written.
    """
    # Only create the directory when one is named and missing; os.makedirs("") would raise.
    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir)
    # os.path.join adds the separator when `output_dir` lacks a trailing one; plain string
    # concatenation would silently write "<dir><filename>" next to the directory instead.
    with open(os.path.join(output_dir, filename), "wb") as handle:
        pickle.dump(dataset, handle)

In [9]:
# NOTE(review): "blaa" looks like a placeholder prefix — confirm the intended file names
prefix = "blaa"

# Write both preprocessed frames next to the input data
for dataset, pickle_name in (
    (train_data, "preprocessed_train_data.pkl"),
    (test_data, "preprocessed_test_data.pkl"),
):
    save_as_pickle(dataset, file_directory, prefix + pickle_name)

In [10]:
# Configure a 3-split stratified, shuffled K-fold splitter with a fixed seed and store it
# as kfolds.pkl in the models output folder
output_directory = "../output/models/"
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=12574)
save_as_pickle(skf, output_directory, "kfolds.pkl")