# Train Models
<div style="color:red; font-size:14px;">!! Don't define functions here, import them from utils.py</div>

This notebook contains the code needed to train and store models to disk.

Remember: whenever you call a function that accepts a random state (e.g. a `random_state` argument), fix it to a constant number so that the results are reproducible.

## Imports

In [None]:
import pandas as pd
import sklearn
from sklearn import *
import os
import pickle

from utils import *

## Read data and split

In [None]:
# Resolve the user's home directory portably: os.environ['HOME'] raises
# KeyError on platforms where HOME is not set (e.g. a default Windows shell),
# whereas expanduser('~') falls back to the platform's own lookup.
home_dir = os.path.expanduser('~')
# Folder expected to contain the Quora question-pair CSV files.
path_folder_quora = os.path.join(home_dir, 'Datasets', 'QuoraQuestionPairs')

In [None]:
# Echo the resolved dataset folder so the reader can confirm where data is read from.
path_folder_quora

In [None]:
# Load the CSV files from the dataset folder.
train_df = pd.read_csv(os.path.join(path_folder_quora, 'quora_train_data.csv'))
test_df = pd.read_csv(os.path.join(path_folder_quora, 'quora_test_data.csv'))

# Hold out 5% of the labelled data as a test split, then 5% of the remainder
# as a validation split. random_state is fixed so the splits are reproducible.
A_df, te_df = sklearn.model_selection.train_test_split(
    train_df, test_size=0.05, random_state=123
)
tr_df, va_df = sklearn.model_selection.train_test_split(
    A_df, test_size=0.05, random_state=123
)

# Separate the target column from the feature columns for each split.
y_tr = tr_df['is_duplicate'].values
X_tr_df = tr_df.drop(['is_duplicate'], axis=1)

y_va = va_df['is_duplicate'].values
X_va_df = va_df.drop(['is_duplicate'], axis=1)

y_te = te_df['is_duplicate'].values
X_te_df = te_df.drop(['is_duplicate'], axis=1)

# Report the shape of every split. Each label names the variable actually
# printed (the validation and test label lines previously printed y_tr.shape).
print('X_tr_df.shape=', X_tr_df.shape)
print('y_tr.shape=', y_tr.shape)
print('X_va_df.shape=', X_va_df.shape)
print('y_va.shape=', y_va.shape)
print('X_te_df.shape=', X_te_df.shape)
print('y_te.shape=', y_te.shape)

## Explore data

In [None]:
# Preview the first five rows of the full training frame.
train_df.head(n=5)

## Simple Solution

In [None]:
# Gather both question columns of the training split into plain lists,
# then concatenate them into a single corpus (question1 entries first).
all_q1, all_q2 = (list(X_tr_df[column]) for column in ("question1", "question2"))
all_questions = [*all_q1, *all_q2]

# Corpus size: one entry per question1 plus one per question2.
len(all_questions)

In [None]:
# Pass the corpus through cast_list_as_strings and rebind the result.
# NOTE(review): the helper is presumably provided by `from utils import *` and
# presumably converts every entry (e.g. NaN questions) to str -- confirm in utils.py.
all_questions = cast_list_as_strings(all_questions)

In [None]:
# Fit a unigram CountVectorizer on the training-split questions only;
# fit() returns the estimator itself, so construction and fitting are chained.
count_vectorizer = sklearn.feature_extraction.text.CountVectorizer(
    ngram_range=(1, 1)
).fit(all_questions)

# Build features for every split with the same fitted vectorizer.
# NOTE(review): get_features_from_df is presumably defined in utils.py -- its
# exact feature layout is not visible from this notebook.
X_tr_q1q2 = get_features_from_df(X_tr_df, count_vectorizer)
X_va_q1q2 = get_features_from_df(X_va_df, count_vectorizer)
X_te_q1q2 = get_features_from_df(X_te_df, count_vectorizer)

# Show feature-matrix shapes next to the shapes of the frames they came from.
(
    X_tr_q1q2.shape,
    tr_df.shape,
    X_va_q1q2.shape,
    va_df.shape,
    te_df.shape,
    X_te_q1q2.shape,
)

In [None]:
# Fit a logistic-regression classifier on the training features.
# The random state is pinned so repeated runs give the same model.
lr_model = sklearn.linear_model.LogisticRegression(
    random_state=123,
    solver="liblinear",
)
lr_model.fit(X_tr_q1q2, y_tr)

### Save model

In [None]:
# Create the artifact directory (and its parent) if missing; exist_ok keeps
# this cell safe to re-run.
artifact_dir = os.path.join("model_artifacts", "simple_solution")
os.makedirs(artifact_dir, exist_ok=True)

# Always (re)write the artifacts. Previously the dumps only ran when the
# directory had just been created, so re-running after retraining silently
# left stale pickles on disk.
artifacts = {
    "lr_model": lr_model,
    "X_tr_q1q2": X_tr_q1q2,
    "y_tr": y_tr,
    "X_va_q1q2": X_va_q1q2,
    "y_va": y_va,
    "X_te_q1q2": X_te_q1q2,
    "y_te": y_te,
}

# Save the model plus the train, validation and test datasets, one pickle
# file per object, named after the variable it holds.
for artifact_name, artifact in artifacts.items():
    with open(os.path.join(artifact_dir, artifact_name + ".pkl"), "wb") as file:
        pickle.dump(artifact, file)