In [1]:
import os
import pandas as pd
import pickle
import gensim
from IPython.display import display
from engarde.decorators import has_dtypes
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score, RandomizedSearchCV
from tqdm import tqdm_notebook
from capstone_project import preprocessor

# Hook tqdm into pandas so the `.progress_apply` calls further down are available;
# `desc` is presumably the label shown next to each progress bar (confirm against the tqdm version in use).
tqdm_notebook().pandas(desc="Progress:")




In [2]:
# Schema guard: the decorator verifies the dtypes of the returned dataframe and
# raises an assertion error when the layout does not match.
@has_dtypes({"question1": object, "question2": object, "is_duplicate": int})
def load_data(filename):
    """Read a pickled pandas dataframe from disk.

    Args:
        filename: Path of the pickle file to load.

    Returns:
        The unpickled dataframe. Its `question1`, `question2` and
        `is_duplicate` columns are checked by the `has_dtypes` decorator.

    Note:
        Unpickling can execute arbitrary code, so only load trusted files.
    """
    return pd.read_pickle(filename)

In [3]:
# Paths are relative to the capstone_project/notebook folder, which is the
# current working directory of the python kernel.
file_directory = "../output/data/"

# Load both splits the same way. For a quick smoke test, slice each frame
# (e.g. `[0:1]`) right after loading.
train_data, test_data = (
    load_data(file_directory + pickle_name)
    for pickle_name in ("train_data.pkl", "test_data.pkl")
)

In [4]:
# Apply identical cleaning + tokenisation to the train split first, then the test split.
for frame in (train_data, test_data):
    # Two questions have empty fields; replace missing values with empty strings
    # so the tokenizer always receives text.
    frame.fillna("", inplace=True)
    for question_column, token_column in (("question1", "q1_tokens"),
                                          ("question2", "q2_tokens")):
        frame[token_column] = frame[question_column].progress_apply(preprocessor.tokenize)













In [5]:
# Load the word2vec vectors stored in binary format.
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(
    '../data/GoogleNews-vectors-negative300.bin.gz',
    binary=True,
)

# Convert every token list into its word2vec representation: train split first,
# then test split, question 1 before question 2.
for frame in (train_data, test_data):
    for token_column, vector_column in (("q1_tokens", "q1_word2vec"),
                                        ("q2_tokens", "q2_word2vec")):
        frame[vector_column] = frame[token_column].progress_apply(
            lambda tokens: preprocessor.question_to_vector(tokens, model=word2vec_model)
        )













In [6]:
# Sanity check: render the first row of each split, train first and then test.
display(train_data.head(1), test_data.head(1))

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_tokens,q2_tokens,q1_word2vec,q2_word2vec
218104,218104,430027,430028,How do I build a self confidence?,How can I raise my self esteem?,1,"[build, self, confidence]","[raise, self, esteem]","[[-0.143555, 0.216797, 0.0388184, 0.0898438, -...","[[-0.027832, -0.0922852, -0.134766, 0.237305, ..."


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_tokens,q2_tokens,q1_word2vec,q2_word2vec
113079,113079,224193,224194,What does horny goat weed do?,What is horny goat weed?,1,"[horny, goat, weed]","[horny, goat, weed]","[[0.0140991, -0.0334473, -0.111328, 0.304688, ...","[[0.0140991, -0.0334473, -0.111328, 0.304688, ..."


In [7]:
def save_as_pickle(dataset, output_dir, filename):
    """Pickle `dataset` to the file `filename` inside `output_dir`.

    The directory (including missing parents) is created when it does not
    exist yet. The target path is built with `os.path.join`, so `output_dir`
    works both with and without a trailing path separator.

    Args:
        dataset: Any picklable object.
        output_dir: Directory to write into. An empty string means the
            current working directory.
        filename: Name of the pickle file to create inside `output_dir`.

    Raises:
        OSError: If the directory cannot be created or the file cannot be
            opened for writing.
    """
    if output_dir:  # os.makedirs("") would fail; "" simply means "here"
        try:
            os.makedirs(output_dir)
        except OSError:
            # The directory already existing (possibly created concurrently
            # by another process) is fine; anything else is a real error.
            if not os.path.isdir(output_dir):
                raise
    with open(os.path.join(output_dir, filename), "wb") as handle:
        pickle.dump(dataset, handle)

In [8]:
# Tag prepended to the output file names of this preprocessing run.
prefix = "first_test_"

# Persist both preprocessed splits next to the input data, train first and then test.
for frame, pickle_name in ((train_data, "preprocessed_train_data.pkl"),
                           (test_data, "preprocessed_test_data.pkl")):
    save_as_pickle(frame, file_directory, prefix + pickle_name)

In [9]:
# Directory where model-related artefacts are written.
output_directory = "../output/models/"

# Three shuffled folds with a fixed seed, pickled so the same splitter
# configuration can be loaded again elsewhere.
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=12574)
save_as_pickle(skf, output_directory, "kfolds.pkl")