# Reproduce results

<div style="color:red; font-size:14px;">!! Don't define functions here; import them from utils.py.</div>

This notebook loads the trained models from disk and shows the results obtained with them.

## Imports

In [None]:
import os
import pickle

import numpy as np
import pandas as pd
import scipy.sparse

from utils import *

In [None]:
# Root folder of the Quora Question Pairs dataset, located under the user's home directory.
# os.path.expanduser('~') is used instead of os.environ['HOME'] because HOME is not
# guaranteed to be set (e.g. on Windows), in which case the lookup raises KeyError.
home_dir = os.path.expanduser('~')
# os.path.join picks the right separator for the current platform.
path_folder_quora = os.path.join(home_dir, 'Datasets', 'QuoraQuestionPairs')

In [None]:
# Read the Quora training pairs from the dataset folder.
quora_train_csv = os.path.join(path_folder_quora, 'quora_train_data.csv')
train_df = pd.read_csv(quora_train_csv)

# Give both question columns the same (qid, question) schema so they can be stacked.
qid1 = train_df[['qid1', 'question1']].rename(columns={'qid1': 'qid', 'question1': 'question'})
qid2 = train_df[['qid2', 'question2']].rename(columns={'qid2': 'qid', 'question2': 'question'})

# Build the qid -> question lookup table in a single chain:
# stack both sides, keep one row per qid, order by qid, and renumber the index.
qid_df = (
    pd.concat([qid1, qid2])
    .drop_duplicates(subset=['qid'])
    .sort_values(by=['qid'])
    .reset_index(drop=True)
)

## Simple Solution Evaluation

### Load files related to the model

In [None]:
# Load the saved model
# Restore the simple-solution artifacts that were pickled to disk.
# NOTE: unpickling can execute arbitrary code, so only load artifact files you trust.

# Saved model
with open('model_artifacts/simple_solution/lr_model.pkl', 'rb') as fh:
    lr_model = pickle.load(fh)

# Training split (feature matrix is converted to a SciPy CSR sparse matrix)
with open('model_artifacts/simple_solution/X_tr_q1q2.pkl', 'rb') as fh:
    X_train = scipy.sparse.csr_matrix(pickle.load(fh))
with open('model_artifacts/simple_solution/y_tr.pkl', 'rb') as fh:
    y_train = pickle.load(fh)

# Validation split
with open('model_artifacts/simple_solution/X_va_q1q2.pkl', 'rb') as fh:
    X_val = scipy.sparse.csr_matrix(pickle.load(fh))
with open('model_artifacts/simple_solution/y_va.pkl', 'rb') as fh:
    y_val = pickle.load(fh)

# Test split
with open('model_artifacts/simple_solution/X_te_q1q2.pkl', 'rb') as fh:
    X_test = scipy.sparse.csr_matrix(pickle.load(fh))
with open('model_artifacts/simple_solution/y_te.pkl', 'rb') as fh:
    y_test = pickle.load(fh)

#### Results

In [None]:
# Report the simple-solution model (`lr_model`) on the train, validation and test
# splits loaded in the previous cell, printing a banner before each split.
# `evaluate_model` is not defined in this notebook; presumably it is provided by
# `from utils import *` -- see utils.py for what it prints/returns.
print('========== TRAIN SET ==========')
evaluate_model(lr_model, X_train, y_train)
print('========== VALIDATION SET ==========')
evaluate_model(lr_model, X_val, y_val)
print('========== TEST SET ==========')
evaluate_model(lr_model, X_test, y_test)

## Improved Solution Evaluation

### Load files related to the model

In [None]:
# Load the saved model
# Restore the improved-solution artifacts that were pickled to disk.
# These assignments replace the X_*/y_* variables loaded for the simple solution.
# NOTE: unpickling can execute arbitrary code, so only load artifact files you trust.

# Saved model
with open('model_artifacts/improved_solution1/xgb_model.pkl', 'rb') as model_file:
    xgb_model = pickle.load(model_file)

# Training split
with open('model_artifacts/improved_solution1/X_tr_q1q2.pkl', 'rb') as x_file, \
        open('model_artifacts/improved_solution1/y_tr.pkl', 'rb') as y_file:
    X_train = pickle.load(x_file)
    y_train = pickle.load(y_file)

# Validation split
with open('model_artifacts/improved_solution1/X_va_q1q2.pkl', 'rb') as x_file, \
        open('model_artifacts/improved_solution1/y_va.pkl', 'rb') as y_file:
    X_val = pickle.load(x_file)
    y_val = pickle.load(y_file)

# Test split
with open('model_artifacts/improved_solution1/X_te_q1q2.pkl', 'rb') as x_file, \
        open('model_artifacts/improved_solution1/y_te.pkl', 'rb') as y_file:
    X_test = pickle.load(x_file)
    y_test = pickle.load(y_file)

#### Results

In [None]:
# Report the improved-solution model (`xgb_model`) on the train, validation and test
# splits loaded in the previous cell, printing a banner before each split.
# `evaluate_model` is not defined in this notebook; presumably it is provided by
# `from utils import *` -- see utils.py for what it prints/returns.
print('========== TRAIN SET ==========')
evaluate_model(xgb_model, X_train, y_train)
print('========== VALIDATION SET ==========')
evaluate_model(xgb_model, X_val, y_val)
print('========== TEST SET ==========')
evaluate_model(xgb_model, X_test, y_test)

#### See some mistakes

In [None]:
# Collect the test rows the improved model got wrong.
# `get_mistakes` is not defined in this notebook (presumably it comes from utils.py);
# its first result is used below as positional row indices into X_test.
incorrect_indices, predictions = get_mistakes(xgb_model, X_test, y_test)

# Show up to 15 *distinct* random mistakes.
# - replace=False prevents the same mistake from being printed more than once
#   (np.random.choice samples with replacement by default).
# - The sample size is capped at the number of mistakes, so the cell also works
#   when there are fewer than 15 mistakes (or none at all).
n_to_show = min(15, len(incorrect_indices))

# `i` is a positional index (it is used with X_test.iloc below), so the true label
# is read positionally as well. If y_test is a pandas Series with a non-default
# index, y_test[i] would be a label-based lookup and could return the wrong row.
y_test_values = np.asarray(y_test)

for i in np.random.choice(incorrect_indices, n_to_show, replace=False):
    qid1 = X_test.iloc[i]['qid1']
    qid2 = X_test.iloc[i]['qid2']
    print('Question 1: {}'.format(qid_df[qid_df['qid']==qid1].question.values))
    print('Question 2: {}'.format(qid_df[qid_df['qid']==qid2].question.values))
    print('Predicted: {}'.format(predictions[i]))
    print('Actual: {}'.format(y_test_values[i]))
    print('------------------------------------')