In [None]:
# Start from a clean namespace: -s is a soft reset (history is kept), -f skips the confirmation prompt.
%reset -sf

# Competition Attempt

In [None]:
import os, collections, random, itertools

import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# for dirname, _, filenames in os.walk('/kaggle/input'): 
#     for filename in filenames: print(os.path.join(dirname, filename))

In [None]:
# Load the labelled training pairs (df) and the unlabelled test pairs (df_hidden).
df = pd.read_csv(
    "/kaggle/input/quora-question-pairs/train.csv.zip",
    dtype={'question1': str, 'question2': str},
)
df_hidden = pd.read_csv(
    "/kaggle/input/quora-question-pairs/test.csv",
    dtype={'question1': str, 'question2': str},
)

# Cast both question columns to str so that missing questions become the
# literal text "nan" instead of a float NaN.
df = df.astype({"question1": str, "question2": str})
df_hidden = df_hidden.astype({"question1": str, "question2": str})

# Shift the question ids down by one; they are used further down as row
# indices into the sentence-vector array (presumably 1-based in the CSV).
df["qid1"] -= 1
df["qid2"] -= 1

# One past the largest shifted question id.
maxidx = max(df["qid1"].max(), df["qid2"].max()) + 1

In [None]:
# Precomputed word-overlap features, one CSV per split.
df_tfidf = pd.read_csv("/kaggle/input/quora-question-pair-competition-tfidf/train_tfidf.csv")
df_hidden_tfidf = pd.read_csv("/kaggle/input/quora-question-pair-competition-tfidf/test_tfidf.csv")

# Columns are attached by index alignment, so each feature file is assumed to
# list its rows in the same order as the matching question file.
df = df.assign(
    word_match=df_tfidf["word_match"],
    tfidf_word_match=df_tfidf["tfidf_word_match"],
)
df_hidden = df_hidden.assign(
    word_match=df_hidden_tfidf["word_match"],
    tfidf_word_match=df_hidden_tfidf["tfidf_word_match"],
)

In [None]:
# Eyeball ten random training rows (unseeded, so the sample changes on every run).
df.sample(n=10)

In [None]:
## for internal test
# df = df[:1234]
# df_hidden = df_hidden[:5678]

### Compute Augmented Features

In [None]:
from fuzzywuzzy import fuzz

In [None]:
def process_fuzz(df):
    """Add four fuzzywuzzy similarity scores for every question pair.

    Each row's ``question1`` and ``question2`` are compared with
    ``fuzz.ratio``, ``fuzz.partial_ratio``, ``fuzz.token_sort_ratio`` and
    ``fuzz.token_set_ratio``. Every score is stored in a column named after
    the scorer, in that order.

    Args:
        df: DataFrame with string columns ``question1`` and ``question2``.

    Returns:
        The same ``df`` object; the four score columns are added in place.
    """
    # Column names deliberately equal the scorer function names on `fuzz`.
    scorer_names = ("ratio", "partial_ratio", "token_sort_ratio", "token_set_ratio")
    for name in scorer_names:
        scorer = getattr(fuzz, name)
        # zip() has no len(), so pass the row count explicitly; otherwise the
        # progress bar cannot show a percentage or an ETA.
        pairs = tqdm.tqdm(zip(df["question1"], df["question2"]), total=len(df), desc=name)
        df[name] = [scorer(question1, question2) for question1, question2 in pairs]
    return df

# Score the training pairs first, then the hidden test pairs.
df = process_fuzz(df=df)
df_hidden = process_fuzz(df=df_hidden)

In [None]:
def process_text(df):
    """Derive simple per-question text statistics and drop the raw text.

    For each of ``question1`` / ``question2`` three columns are added to
    ``df`` in place: character length, number of space characters, and number
    of ASCII uppercase letters (``q1_length``, ``q2_length``, ``q1_spaces``,
    ``q2_spaces``, ``q1_upper``, ``q2_upper`` -- in that order).

    Args:
        df: DataFrame with string columns ``question1`` and ``question2``.

    Returns:
        A new DataFrame without ``question1`` / ``question2``; the input
        frame keeps the raw text and also gains the six new columns.
    """
    questions = (("q1", "question1"), ("q2", "question2"))
    measures = (
        ("length", lambda text: text.str.len()),
        ("spaces", lambda text: text.str.count(" ")),  # space count, a rough proxy for word count
        ("upper", lambda text: text.str.count(r'[A-Z]')),
    )
    # Statistic-major order keeps the column layout q1_x, q2_x, q1_y, q2_y, ...
    for suffix, measure in measures:
        for prefix, source in questions:
            df[f"{prefix}_{suffix}"] = measure(df[source])

    return df.drop(columns=["question1", "question2"])

# From here on both frames hold numeric features only; the raw question text is dropped.
df = process_text(df=df)
df_hidden = process_text(df=df_hidden)

### Include Sentence Vectors

In [None]:
# Name of the sentence-embedding model; in this cell it is only used to build the first file name.
model_name = "bert-base-nli-stsb-mean-tokens"
# Indexed further down by the zero-based qid1/qid2 of the training frame --
# presumably one embedding row per training question; TODO confirm.
sentence_vectors = np.load(f"../input/quora-question-pairs-bert-sentence-vectors-hidden/sentence_vectors_{model_name}.npy")

# Consumed row by row, in parallel, for df_hidden further down -- presumably
# one embedding row per hidden test pair, in test.csv order; TODO confirm.
# NOTE(review): unlike the file above, these two names carry no model suffix
# (the f-prefix has no placeholder) -- confirm they come from the same model.
sentence_vectors_question1 = np.load(f"../input/quora-question-pairs-bert-sentence-vectors-hidden/sentence_vectors_question1.npy")
sentence_vectors_question2 = np.load(f"../input/quora-question-pairs-bert-sentence-vectors-hidden/sentence_vectors_question2.npy")

In [None]:
# %%time
# from sklearn.decomposition import PCA

# import pickle
# with open('../input/quora-question-pairs-bert-sentence-vectors-hidden/pca.pkl', 'rb') as pickle_file:
#     pca = pickle.load(pickle_file) 

# pca = PCA(n_components=2)
# sentence_vectors = pca.fit_transform(sentence_vectors)
# sentence_vectors_question1 = pca.transform(sentence_vectors_question1)
# sentence_vectors_question2 = pca.transform(sentence_vectors_question2)
# pca.explained_variance_ratio_

In [None]:
# q1_vec = [sentence_vectors[qid] for qid in df["qid1"]]
# q1_vec = np.transpose(np.array(q1_vec))
# for i,v in enumerate(q1_vec):
#     df[f"q1v{i}"] = v

# q2_vec = [sentence_vectors[qid] for qid in df["qid2"]]
# q2_vec = np.transpose(np.array(q2_vec))
# for i,v in enumerate(q2_vec):
#     df[f"q2v{i}"] = v
    
# del q1_vec, q2_vec, sentence_vectors

In [None]:
# q1_vec = sentence_vectors_question1
# q1_vec = np.transpose(np.array(q1_vec))
# for i,v in enumerate(q1_vec):
#     df_hidden[f"q1v{i}"] = v

# q2_vec = sentence_vectors_question2
# q2_vec = np.transpose(np.array(q2_vec))
# for i,v in enumerate(q2_vec):
#     df_hidden[f"q2v{i}"] = v
    
# del q1_vec, q2_vec
# df_hidden

In [None]:
from scipy.spatial import distance

# NOTE: despite the column name, `q_sim` holds scipy's cosine *distance*
# (1 - cosine similarity), so smaller values mean more similar questions.

# Training pairs: embeddings are looked up by the zero-based question ids.
q1_vecs = [sentence_vectors[question_id] for question_id in df["qid1"]]
q2_vecs = [sentence_vectors[question_id] for question_id in df["qid2"]]
q_sim = list(tqdm.tqdm(map(distance.cosine, q1_vecs, q2_vecs)))
df["q_sim"] = q_sim

# Hidden test pairs: the two arrays are walked in parallel, one row per pair.
q1_vecs = sentence_vectors_question1
q2_vecs = sentence_vectors_question2
q_sim = list(tqdm.tqdm(map(distance.cosine, q1_vecs, q2_vecs)))
df_hidden["q_sim"] = q_sim

### Training

In [None]:
# Label vector, then the feature matrix with identifiers and the label removed.
target_train = df["is_duplicate"]
df_train = df.drop(columns=["id", "qid1", "qid2", "is_duplicate"])

In [None]:
import lightgbm as lgb

# Boolean mask selecting the first 20% of rows (file order, no shuffling) as the validation split.
eval_set = np.arange(len(df_train)) < len(df_train) * 0.2
lgb_train = lgb.Dataset(df_train.loc[~eval_set], target_train.loc[~eval_set])
lgb_eval = lgb.Dataset(df_train.loc[eval_set], target_train.loc[eval_set], reference=lgb_train)
# Dataset over every labelled row; nothing else in this cell uses it.
lgb_all = lgb.Dataset(df_train, target_train)

In [None]:
# Binary classifier with library defaults; the commented-out keys are tuning
# knobs that are currently disabled.
params = dict(
    # boosting_type='gbdt',
    objective='binary',
    # A weight below 1 down-weights the positive (duplicate) class.
    # NOTE(review): 0.360 is a magic number -- presumably chosen to match the
    # test set's lower duplicate rate; confirm and document its derivation.
    scale_pos_weight=0.360,
    # metric={'auc'},
    # num_leaves=15,
    # learning_rate=0.05,
    # feature_fraction=0.9,
    # bagging_fraction=0.8,
    # bagging_freq=5,
    verbose=-1,
)

# Train on the 80% split, log every 200 rounds, and stop once the validation
# metric has not improved for 10 consecutive rounds.
# NOTE(review): `verbose_eval` and `early_stopping_rounds` were removed from
# lgb.train() in LightGBM 4.0 (replaced by the log_evaluation / early_stopping
# callbacks) -- confirm the installed version before re-running.
gbm = lgb.train(
    params=params,
    train_set=lgb_train,
    num_boost_round=10000,
    valid_sets=lgb_eval,
    verbose_eval=200,
    early_stopping_rounds=10,
)

In [None]:
# Gain-based importance per feature, in training-column order (first 20 rows, unsorted).
pd.DataFrame(
    {
        "feature": df_train.columns,
        "importance": gbm.feature_importance(importance_type="gain"),
    }
).head(20)

In [None]:
# Score the hidden pairs with the best early-stopped iteration; test_id is an identifier, not a feature.
pred_test = gbm.predict(
    df_hidden.drop(columns=["test_id"]),
    num_iteration=gbm.best_iteration,
)

In [None]:
# Distribution of the model's predictions on the hidden test pairs.
plt.hist(pred_test)
# Label the axes so the figure stands on its own, like the scatter plot further down.
plt.xlabel("predicted is_duplicate")
plt.ylabel("number of test pairs")
plt.show()

In [None]:
# Two-column submission: test_id plus the predicted value for is_duplicate.
df_submission = df_hidden[["test_id"]].assign(is_duplicate=pred_test)
df_submission.to_csv("submission.csv", index=False)
df_submission.shape

In [None]:
# The two word-overlap features for every hidden pair, coloured by the submitted prediction.
fig, ax = plt.subplots(figsize=(14, 4))
sc = ax.scatter(
    df_hidden["tfidf_word_match"],
    df_hidden["word_match"],
    alpha=1,
    c=df_submission["is_duplicate"],
)
fig.colorbar(sc, ax=ax)
ax.set_ylabel("word_match")
ax.set_xlabel("tfidf_word_match")
plt.show()

In [None]:
# Shell out to print the first lines of submission.csv as a final sanity check.
!head submission.csv