# Reproduce results

<div style="color:red; font-size:14px;">!! Don't define functions here, import them from utils.py</div>

This notebook loads the trained models from disk and shows the results obtained with them.

## Imports

In [None]:
import os
import pickle

import numpy as np
import pandas as pd

from utils import *

In [None]:
from scipy.sparse import csr_matrix

In [None]:
# Locate the Quora dataset folder under the user's home directory.
# HOME is preferred (as before); expanduser is only a fallback for
# environments where HOME is not defined.
home_dir = os.environ.get('HOME', os.path.expanduser('~'))
# os.path.join inserts the separator that plain string concatenation
# dropped (HOME normally has no trailing slash).
path_folder_quora = os.path.join(home_dir, 'Datasets', 'QuoraQuestionPairs')

In [None]:
# Read the raw Quora training pairs; the "See some mistakes" cells use this
# frame to print the original question text for a given qid.
quora_train_csv = os.path.join(path_folder_quora, 'quora_train_data.csv')
train_df = pd.read_csv(quora_train_csv)

## Simple Solution Evaluation

### Load files related to the model

In [None]:
# Load the fitted simple-solution model together with the train / validation /
# test splits and the question-id lookup table that were pickled with it.
# NOTE: pickle.load executes arbitrary code from the file; only load
# artifacts produced by this project.
with open('model_artifacts/simple_solution/lr_model.pkl', 'rb') as file:
    lr_model = pickle.load(file)

# The feature matrices are wrapped in csr_matrix after loading. Use the
# `csr_matrix` name imported at the top of the notebook: the bare `scipy`
# module name is never imported here, so `scipy.sparse.csr_matrix` would
# raise a NameError on a fresh kernel.
with open('model_artifacts/simple_solution/X_tr_q1q2.pkl', 'rb') as file:
    X_train = csr_matrix(pickle.load(file))
with open('model_artifacts/simple_solution/y_tr.pkl', 'rb') as file:
    y_train = pickle.load(file)
with open('model_artifacts/simple_solution/X_va_q1q2.pkl', 'rb') as file:
    X_val = csr_matrix(pickle.load(file))
with open('model_artifacts/simple_solution/y_va.pkl', 'rb') as file:
    y_val = pickle.load(file)
with open('model_artifacts/simple_solution/X_te_q1q2.pkl', 'rb') as file:
    X_test = csr_matrix(pickle.load(file))
with open('model_artifacts/simple_solution/y_te.pkl', 'rb') as file:
    y_test = pickle.load(file)
with open('model_artifacts/simple_solution/qid_df.pkl', 'rb') as file:
    qid_df = pickle.load(file)

### Results

In [None]:
# Print a banner and run evaluate_model (expected to come from utils.py via
# the star import) for the simple-solution model on each split, in order.
for split_header, split_X, split_y in (
    ('========== TRAIN SET ==========', X_train, y_train),
    ('========== VALIDATION SET ==========', X_val, y_val),
    ('========== TEST SET ==========', X_test, y_test),
):
    print(split_header)
    evaluate_model(lr_model, split_X, split_y)

## Improved Solution Evaluation

### Baseline model

#### Load files related to the model

In [None]:
# Unpickle the baseline XGBoost model, its train / validation / test splits
# and the question-id lookup table. The paths are listed in the same order
# as the names they are unpacked into below.
baseline_artifact_paths = (
    'model_artifacts/improved_solution_baseline/xgb_model.pkl',
    'model_artifacts/improved_solution_baseline/X_tr_q1q2.pkl',
    'model_artifacts/improved_solution_baseline/y_tr.pkl',
    'model_artifacts/improved_solution_baseline/X_va_q1q2.pkl',
    'model_artifacts/improved_solution_baseline/y_va.pkl',
    'model_artifacts/improved_solution_baseline/X_te_q1q2.pkl',
    'model_artifacts/improved_solution_baseline/y_te.pkl',
    'model_artifacts/qid_df.pkl',
)

loaded_baseline = []
for artifact_path in baseline_artifact_paths:
    with open(artifact_path, 'rb') as artifact_file:
        loaded_baseline.append(pickle.load(artifact_file))

(xgb_model,
 X_train, y_train,
 X_val, y_val,
 X_test, y_test,
 qid_df) = loaded_baseline
# Drop the temporary list so only the named variables keep the objects alive.
del loaded_baseline

#### Results

In [None]:
# Evaluate the baseline XGBoost model on every split. The identifier columns
# (qid1, qid2, id) are removed before the frame is handed to evaluate_model.
for split_header, split_X, split_y in (
    ('========== TRAIN SET ==========', X_train, y_train),
    ('========== VALIDATION SET ==========', X_val, y_val),
    ('========== TEST SET ==========', X_test, y_test),
):
    print(split_header)
    evaluate_model(xgb_model, split_X.drop(columns=['qid1','qid2','id']), split_y)

#### See some mistakes

In [None]:
# Inspect a random sample of validation examples the baseline XGBoost model
# got wrong. get_mistakes is expected to come from utils.py; its indices are
# used positionally below (X_val.iloc / predictions[i]).
incorrect_indices, predictions = get_mistakes(xgb_model, X_val.drop(['qid1','qid2','id'], axis = 1), y_val)

# Read the labels positionally too: if y_val is a pandas Series whose index
# was shuffled by the split, y_val[i] would be a label lookup and could show
# the wrong row (or raise KeyError).
y_val_positional = np.asarray(y_val)

# Show up to 15 distinct random mistakes. Sampling without replacement avoids
# printing the same example twice; capping the size avoids an error when
# fewer than 15 mistakes exist.
n_shown = min(15, len(incorrect_indices))
for i in np.random.choice(incorrect_indices, n_shown, replace=False):
    # Select the id columns by name rather than by position so the lookup
    # does not depend on the column order of the pickled frame.
    qid1 = X_val['qid1'].iloc[i]
    qid2 = X_val['qid2'].iloc[i]
    print('Original question 1: {}'.format(train_df[train_df['qid1']==qid1].question1.values[0]))
    print('Original question 2: {}'.format(train_df[train_df['qid2']==qid2].question2.values[0]))
    print('Question 1: {}'.format(qid_df[qid_df['qid']==qid1].question.values[0]))
    print('Question 2: {}'.format(qid_df[qid_df['qid']==qid2].question.values[0]))
    print('Predicted: {}'.format(predictions[i]))
    print('Actual: {}'.format(y_val_positional[i]))
    print('------------------------------------')

### Model with feature selection + XGBoost

#### Load files related to the model

In [None]:
# Unpickle the XGBoost model trained on the selected features, the matching
# train / validation / test splits and the question-id lookup table. The
# paths are listed in the same order as the names they are unpacked into.
topfeatures_artifact_paths = (
    'model_artifacts/improved_solution_topfeatures/xgb_model.pkl',
    'model_artifacts/improved_solution_topfeatures/X_tr_q1q2.pkl',
    'model_artifacts/improved_solution_topfeatures/y_tr.pkl',
    'model_artifacts/improved_solution_topfeatures/X_va_q1q2.pkl',
    'model_artifacts/improved_solution_topfeatures/y_va.pkl',
    'model_artifacts/improved_solution_topfeatures/X_te_q1q2.pkl',
    'model_artifacts/improved_solution_topfeatures/y_te.pkl',
    'model_artifacts/qid_df.pkl',
)

loaded_topfeatures = []
for artifact_path in topfeatures_artifact_paths:
    with open(artifact_path, 'rb') as artifact_file:
        loaded_topfeatures.append(pickle.load(artifact_file))

(xgb_model_improve,
 X_train, y_train,
 X_val, y_val,
 X_test, y_test,
 qid_df) = loaded_topfeatures
# Drop the temporary list so only the named variables keep the objects alive.
del loaded_topfeatures

#### Results

In [None]:
# Evaluate the feature-selected XGBoost model on every split. The identifier
# columns (qid1, qid2, id) are removed before calling evaluate_model.
for split_header, split_X, split_y in (
    ('========== TRAIN SET ==========', X_train, y_train),
    ('========== VALIDATION SET ==========', X_val, y_val),
    ('========== TEST SET ==========', X_test, y_test),
):
    print(split_header)
    evaluate_model(xgb_model_improve, split_X.drop(columns=['qid1','qid2','id']), split_y)

#### See some mistakes

In [None]:
# Inspect a random sample of validation examples the feature-selected XGBoost
# model got wrong. get_mistakes is expected to come from utils.py; its
# indices are used positionally below (X_val.iloc / predictions[i]).
incorrect_indices, predictions = get_mistakes(xgb_model_improve, X_val.drop(['qid1','qid2','id'], axis = 1), y_val)

# Read the labels positionally too: if y_val is a pandas Series whose index
# was shuffled by the split, y_val[i] would be a label lookup and could show
# the wrong row (or raise KeyError).
y_val_positional = np.asarray(y_val)

# Show up to 15 distinct random mistakes. Sampling without replacement avoids
# printing the same example twice; capping the size avoids an error when
# fewer than 15 mistakes exist.
n_shown = min(15, len(incorrect_indices))
for i in np.random.choice(incorrect_indices, n_shown, replace=False):
    # Select the id columns by name rather than by position so the lookup
    # does not depend on the column order of the pickled frame.
    qid1 = X_val['qid1'].iloc[i]
    qid2 = X_val['qid2'].iloc[i]
    print('Original question 1: {}'.format(train_df[train_df['qid1']==qid1].question1.values[0]))
    print('Original question 2: {}'.format(train_df[train_df['qid2']==qid2].question2.values[0]))
    print('Question 1: {}'.format(qid_df[qid_df['qid']==qid1].question.values[0]))
    print('Question 2: {}'.format(qid_df[qid_df['qid']==qid2].question.values[0]))
    print('Predicted: {}'.format(predictions[i]))
    print('Actual: {}'.format(y_val_positional[i]))
    print('------------------------------------')

### Model with feature selection + Random Forest

#### Load files related to the model

In [None]:
# Unpickle the random forest model. The feature splits loaded for the
# feature-selection section are reused, so only the model is read here.
rf_model_path = 'model_artifacts/improved_solution_topfeatures/rf_model.pkl'
with open(rf_model_path, 'rb') as model_file:
    rf_model = pickle.load(model_file)

#### Results

In [None]:
# Evaluate the random forest model on every split. The identifier columns
# (qid1, qid2, id) are removed before calling evaluate_model.
for split_header, split_X, split_y in (
    ('========== TRAIN SET ==========', X_train, y_train),
    ('========== VALIDATION SET ==========', X_val, y_val),
    ('========== TEST SET ==========', X_test, y_test),
):
    print(split_header)
    evaluate_model(rf_model, split_X.drop(columns=['qid1','qid2','id']), split_y)

#### See some mistakes

In [None]:
# Inspect a random sample of validation examples the random forest model got
# wrong. get_mistakes is expected to come from utils.py; its indices are used
# positionally below (X_val.iloc / predictions[i]).
incorrect_indices, predictions = get_mistakes(rf_model, X_val.drop(['qid1','qid2','id'], axis = 1), y_val)

# Read the labels positionally too: if y_val is a pandas Series whose index
# was shuffled by the split, y_val[i] would be a label lookup and could show
# the wrong row (or raise KeyError).
y_val_positional = np.asarray(y_val)

# Show up to 15 distinct random mistakes. Sampling without replacement avoids
# printing the same example twice; capping the size avoids an error when
# fewer than 15 mistakes exist.
n_shown = min(15, len(incorrect_indices))
for i in np.random.choice(incorrect_indices, n_shown, replace=False):
    # Select the id columns by name rather than by position so the lookup
    # does not depend on the column order of the pickled frame.
    qid1 = X_val['qid1'].iloc[i]
    qid2 = X_val['qid2'].iloc[i]
    print('Original question 1: {}'.format(train_df[train_df['qid1']==qid1].question1.values[0]))
    print('Original question 2: {}'.format(train_df[train_df['qid2']==qid2].question2.values[0]))
    print('Question 1: {}'.format(qid_df[qid_df['qid']==qid1].question.values[0]))
    print('Question 2: {}'.format(qid_df[qid_df['qid']==qid2].question.values[0]))
    print('Predicted: {}'.format(predictions[i]))
    print('Actual: {}'.format(y_val_positional[i]))
    print('------------------------------------')

### Model with feature selection + Histogram-Based Gradient Boosting

#### Load files related to the model

In [None]:
# Unpickle the histogram-based gradient boosting model. The feature splits
# loaded for the feature-selection section are reused, so only the model is
# read here.
hbgd_model_path = 'model_artifacts/improved_solution_topfeatures/hbgd_model.pkl'
with open(hbgd_model_path, 'rb') as model_file:
    hbgd_model = pickle.load(model_file)

#### Results

In [None]:
# Evaluate the histogram-based gradient boosting model on every split. The
# identifier columns (qid1, qid2, id) are removed before calling
# evaluate_model.
for split_header, split_X, split_y in (
    ('========== TRAIN SET ==========', X_train, y_train),
    ('========== VALIDATION SET ==========', X_val, y_val),
    ('========== TEST SET ==========', X_test, y_test),
):
    print(split_header)
    evaluate_model(hbgd_model, split_X.drop(columns=['qid1','qid2','id']), split_y)

#### See some mistakes

In [None]:
# Inspect a random sample of validation examples the histogram-based gradient
# boosting model got wrong. get_mistakes is expected to come from utils.py;
# its indices are used positionally below (X_val.iloc / predictions[i]).
incorrect_indices, predictions = get_mistakes(hbgd_model, X_val.drop(['qid1','qid2','id'], axis = 1), y_val)

# Read the labels positionally too: if y_val is a pandas Series whose index
# was shuffled by the split, y_val[i] would be a label lookup and could show
# the wrong row (or raise KeyError).
y_val_positional = np.asarray(y_val)

# Show up to 15 distinct random mistakes. Sampling without replacement avoids
# printing the same example twice; capping the size avoids an error when
# fewer than 15 mistakes exist.
n_shown = min(15, len(incorrect_indices))
for i in np.random.choice(incorrect_indices, n_shown, replace=False):
    # Select the id columns by name rather than by position so the lookup
    # does not depend on the column order of the pickled frame.
    qid1 = X_val['qid1'].iloc[i]
    qid2 = X_val['qid2'].iloc[i]
    print('Original question 1: {}'.format(train_df[train_df['qid1']==qid1].question1.values[0]))
    print('Original question 2: {}'.format(train_df[train_df['qid2']==qid2].question2.values[0]))
    print('Question 1: {}'.format(qid_df[qid_df['qid']==qid1].question.values[0]))
    print('Question 2: {}'.format(qid_df[qid_df['qid']==qid2].question.values[0]))
    print('Predicted: {}'.format(predictions[i]))
    print('Actual: {}'.format(y_val_positional[i]))
    print('------------------------------------')

### Ensemble method

Combination of XGBoost and HistGradientBoostingClassifier

#### Load the files related to the model

In [None]:
# Unpickle the ensemble model. The feature splits loaded for the
# feature-selection section are reused, so only the model is read here.
ensembling_model_path = 'model_artifacts/improved_solution_topfeatures/eclf1.pkl'
with open(ensembling_model_path, 'rb') as model_file:
    ensembling_model = pickle.load(model_file)

#### Results

In [None]:
# Evaluate the ensemble model on every split. The identifier columns
# (qid1, qid2, id) are removed before calling evaluate_model.
for split_header, split_X, split_y in (
    ('========== TRAIN SET ==========', X_train, y_train),
    ('========== VALIDATION SET ==========', X_val, y_val),
    ('========== TEST SET ==========', X_test, y_test),
):
    print(split_header)
    evaluate_model(ensembling_model, split_X.drop(columns=['qid1','qid2','id']), split_y)

#### See some mistakes

In [None]:
# Inspect a random sample of validation examples the ensemble model got
# wrong. get_mistakes is expected to come from utils.py; its indices are used
# positionally below (X_val.iloc / predictions[i]).
incorrect_indices, predictions = get_mistakes(ensembling_model, X_val.drop(['qid1','qid2','id'], axis = 1), y_val)

# Read the labels positionally too: if y_val is a pandas Series whose index
# was shuffled by the split, y_val[i] would be a label lookup and could show
# the wrong row (or raise KeyError).
y_val_positional = np.asarray(y_val)

# Show up to 15 distinct random mistakes. Sampling without replacement avoids
# printing the same example twice; capping the size avoids an error when
# fewer than 15 mistakes exist.
n_shown = min(15, len(incorrect_indices))
for i in np.random.choice(incorrect_indices, n_shown, replace=False):
    # Select the id columns by name rather than by position so the lookup
    # does not depend on the column order of the pickled frame.
    qid1 = X_val['qid1'].iloc[i]
    qid2 = X_val['qid2'].iloc[i]
    print('Original question 1: {}'.format(train_df[train_df['qid1']==qid1].question1.values[0]))
    print('Original question 2: {}'.format(train_df[train_df['qid2']==qid2].question2.values[0]))
    print('Question 1: {}'.format(qid_df[qid_df['qid']==qid1].question.values[0]))
    print('Question 2: {}'.format(qid_df[qid_df['qid']==qid2].question.values[0]))
    print('Predicted: {}'.format(predictions[i]))
    print('Actual: {}'.format(y_val_positional[i]))
    print('------------------------------------')