In [72]:
import pandas as pd
import pickle
from engarde.decorators import has_dtypes
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, KFold
from capstone_project import preprocessor

In [51]:
# Verify that the loaded dataframe has the expected column dtypes; otherwise an assertion error is raised.
@has_dtypes(dict(question1=object, question2=object, is_duplicate=int))
def load_and_check_data(filename):
    """Load a pickled pandas dataframe and replace missing values with "".

    The ``has_dtypes`` decorator applied to this function declares the dtypes
    expected for the ``question1``, ``question2`` and ``is_duplicate`` columns.

    Parameters
    ----------
    filename : str
        Path of the pickle file to read.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with every missing value replaced by an empty string.
    """
    # SECURITY: unpickling can execute arbitrary code; only load trusted files.
    df = pd.read_pickle(filename)
    # Two questions have empty fields. Fill without ``inplace`` so the loaded
    # frame is not mutated and the result comes from a single expression.
    return df.fillna("")

In [77]:
# The current working directory for python is the capstone_project/notebook folder
file_directory = "../output/data/"

# Number of leading rows kept for quick experiments; increase for a real run.
N_SAMPLES = 10

# Load the training frame and keep only the first N_SAMPLES rows (positional slice).
X = load_and_check_data(file_directory + "train_data.pkl").iloc[:N_SAMPLES]

In [78]:
# Feature extractors provided by the project's preprocessor module.
tfidf = preprocessor.TfidfTransformer()
feature_egineering = preprocessor.FeatureTransformer()

# Target vector taken from the is_duplicate column of the loaded frame.
y = X["is_duplicate"].values

# Combine both extractors into a single feature step.
feature_creator = FeatureUnion(
    transformer_list=[
        ("feature_engineering", feature_egineering),
        ("tfidf", tfidf),
    ]
)

# Logistic regression with default settings, fed by the combined features.
clf = LogisticRegression()
pipe = Pipeline(
    steps=[
        ("features", feature_creator),
        ("logistic", clf),
    ]
)

In [92]:
# Shuffled, stratified 5-fold splitter with a fixed seed.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=12574)

# Each iteration overwrites the split variables, so only the partitions of
# the last fold remain available after the loop finishes.
for train_index, test_index in skf.split(X, y):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train = X.iloc[train_index]
    X_test = X.iloc[test_index]
    y_train = y[train_index]
    y_test = y[test_index]

('TRAIN:', array([1, 2, 5, 7, 8]), 'TEST:', array([0, 3, 4, 6, 9]))
('TRAIN:', array([0, 3, 4, 6, 9]), 'TEST:', array([1, 2, 5, 7, 8]))


In [12]:
def save_output(dataset, filename):
    """Pickle ``dataset`` and write it to the file at ``filename``.

    The file is opened in binary write mode, so any existing content at
    that path is replaced.
    """
    with open(filename, "wb") as out_file:
        pickle.dump(dataset, out_file)

In [14]:
prefix = "first_test_"

# NOTE(review): no fit call on `pipe` is visible in this notebook, so this
# presumably stores the unfitted pipeline definition; confirm that is intended.
pipeline_path = file_directory + prefix + "logistic_pipeline.pkl"
save_output(pipe, pipeline_path)