In [11]:
import os
import pickle
import numpy as np
import pandas as pd

import mlflow
import mlflow.sklearn
from mlflow import MlflowClient
from mlflow.models.signature import infer_signature

import matplotlib.pyplot as plt
import seaborn as sns

In [12]:
MLFLOW_DIR = "../mlruns"
MLFLOW_EXP_NAME = "quora-question-similarity"
mlflow.set_tracking_uri(MLFLOW_DIR)
mlflow.set_experiment(MLFLOW_EXP_NAME)
mlflow_client = MlflowClient()

In [28]:
runs = mlflow.search_runs(filter_string="metrics.best_cv_score < 1")
best_run_id = runs.loc[runs['metrics.test_auc_score'].idxmin()]['run_id']
model = mlflow.sklearn.load_model("runs:/" + best_run_id + "/model")

In [23]:
PREPROCESSED_DATA_FPATH = "../data/features.pkl"
MLFLOW_DIR = "../mlruns"
MLFLOW_EXP_NAME = "quora-question-similarity"
RANDOM_STATE = 12181006

In [3]:
mlflow.set_tracking_uri(MLFLOW_DIR)
mlflow.set_experiment(experiment_name=MLFLOW_EXP_NAME)

<Experiment: artifact_location='file:///c:/Users/aksha/Documents/Projects/NLP-Quora/notebooks/../mlruns/313535672057841037', creation_time=1686155617750, experiment_id='313535672057841037', last_update_time=1686155617750, lifecycle_stage='active', name='quora-question-similarity', tags={}>

## Load data

In [4]:
with open(PREPROCESSED_DATA_FPATH, "rb") as f:
    features_df = pickle.load(f)

In [5]:
features_df.sample(3)

Unnamed: 0,qid1,qid2,question1,question2,is_duplicate,q1_lemma,q1_lemma_len,q2_lemma,q2_lemma_len,q1_len,...,num_common_words_lemma,bow_euclidean_dist,bow_manhattan_dist,bow_cosine_dist,tfidf_euclidean_dist,tfidf_manhattan_dist,tfidf_cosine_dist,w2v_euclidean_dist,w2v_manhattan_dist,w2v_cosine_dist
306013,331721,444565,I'm just over 13 years old and 6 ft tall (184c...,Is 6ft 1 tall for a 13 year old girl?,0,year old ft tall cm normal,6,ft tall year old girl,5,14,...,4,1.732051,3.0,0.269703,0.693814,1.425547,0.240689,0.349829,0.349829,0.0
267651,262077,330035,How can I meet and make more friends as an int...,How can I make friends if I am an introvert?,1,meet make friend introvert,4,make friend introvert,3,11,...,3,1.0,1.0,0.133975,0.582448,0.843075,0.169623,0.045761,0.045761,0.0
119103,61911,61912,"If gay marriage is legal, then what's to stop ...","If gay marriage is legal, why isn't polygamy?",1,gay marriage legal stop people want reintroduc...,8,gay marriage legal polygamy,4,15,...,4,2.0,4.0,0.292893,0.75839,1.825271,0.287578,0.076995,0.076995,0.0


In [6]:
X = features_df.drop(["qid1", "qid2", "question1", "question2", "q1_lemma", "q2_lemma", "is_duplicate"], axis=1)
y = features_df["is_duplicate"]

## Train Test split

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)

In [9]:
print("Train size:", X_train.shape, y_train.shape)
print("Train size:", X_test.shape, y_test.shape)

Train size: (283000, 16) (283000,)
Train size: (121287, 16) (121287,)


## Modeling

In [20]:
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

In [13]:
def eval_metrics(actual, pred):

    accuracy = metrics.accuracy_score(actual, pred)
    precision = metrics.precision_score(actual, pred)
    recall = metrics.recall_score(actual, pred)
    f1_score = metrics.f1_score(actual, pred)
    auc_score = metrics.roc_auc_score(actual, pred)
    
    return accuracy, precision, recall, f1_score, auc_score

### Linear models

#### Logistic Regression

In [14]:
from sklearn.linear_model import LogisticRegressionCV

In [17]:
mlflow.sklearn.autolog()
with mlflow.start_run(run_name="logistic_reg"):
    lr = LogisticRegressionCV(cv=5, class_weight="balanced", max_iter=1000, random_state=RANDOM_STATE)
    lr.fit(X_train, y_train)
    
    y_test_pred = lr.predict(X_test) > 0.5
    (accuracy, precision, recall, f1_score, auc_score) = eval_metrics(y_test, y_test_pred)
    mlflow.log_metric("testAccuracy", accuracy)
    mlflow.log_metric("testPrecission", precision)
    mlflow.log_metric("testRecall", recall)
    mlflow.log_metric("testF1_Score", f1_score)
    mlflow.log_metric("testAUC_Score", auc_score)



### Decision Tree

In [21]:
from sklearn.tree import DecisionTreeClassifier

In [27]:
mlflow.sklearn.autolog()
with mlflow.start_run(run_name="decision_tree"):
    dt = DecisionTreeClassifier(class_weight="balanced", random_state=RANDOM_STATE)

    params = {
        criterion = ['gini', 'entropy'],
        max_depth = [2,4,6,8,10,12]
    }
    dt_grid_cv = GridSearchCV(pipe, parameters)
    dt_grid_cv.fit(X_train, y_train)
    
    y_test_pred = lr.predict(X_test) > 0.5
    (accuracy, precision, recall, f1_score, auc_score) = eval_metrics(y_test, y_test_pred)
    mlflow.log_metric("testAccuracy", accuracy)
    mlflow.log_metric("testPrecission", precision)
    mlflow.log_metric("testRecall", recall)
    mlflow.log_metric("testF1_Score", f1_score)
    mlflow.log_metric("testAUC_Score", auc_score)



Number of nodes in the last tree is: 113231 with ccp_alpha: 0.08096648553716229




Number of nodes in the last tree is: 113231 with ccp_alpha: 0.08096648553716229




Number of nodes in the last tree is: 113231 with ccp_alpha: 0.08096648553716229




Number of nodes in the last tree is: 113231 with ccp_alpha: 0.08096648553716229




Number of nodes in the last tree is: 113231 with ccp_alpha: 0.08096648553716229




Number of nodes in the last tree is: 113231 with ccp_alpha: 0.08096648553716229




KeyboardInterrupt: 

In [30]:
ccp_alphas

array([ 0.00000000e+00, -9.48676901e-20, -1.18584613e-20, ...,
        6.80379714e-03,  1.50463223e-02,  8.09664855e-02])

In [36]:
np.arange(0, 5, 10)

array([0])