In [1]:
import pandas as pd
import numpy as np
import nltk
from sklearn import metrics
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
import time
from sklearn.linear_model import LogisticRegression
import os


In [18]:
# DATA LOADING FUNCTIONS

# split dataset
def split_dataset(full_data, train_ratio, validation_ratio, test_ratio, random_state=None):
    """
    Function that randomly splits the dataset into train, validation, and test

    The rows are shuffled and then cut at the cumulative train and
    train+validation fractions. The test set is whatever is left after the
    validation cut-off.

    Args:
        full_data: pandas DataFrame (anything supporting len() and .iloc).
        train_ratio: fraction of rows assigned to the train set.
        validation_ratio: fraction of rows assigned to the validation set.
        test_ratio: accepted for symmetry but not used in the computation;
            the test set always receives the remaining rows.
        random_state: optional integer seed. With None (the default) the
            global NumPy random state is used; with a seed the shuffle is
            reproducible across calls.

    Returns:
        Tuple (train_set, validation_set, test_set) of row subsets.
    """
    n_rows = len(full_data)
    if random_state is None:
        random_idx = np.random.permutation(n_rows)
    else:
        # A private RandomState keeps the split reproducible without
        # touching the global NumPy seed.
        random_idx = np.random.RandomState(random_state).permutation(n_rows)

    train_threshold = int(round(train_ratio*n_rows))
    validation_threshold = int(round((train_ratio+validation_ratio)*n_rows))

    train_set = full_data.iloc[random_idx[:train_threshold]]
    validation_set = full_data.iloc[random_idx[train_threshold:validation_threshold]]
    test_set = full_data.iloc[random_idx[validation_threshold:]]

    return train_set, validation_set, test_set


# load dataset
def load_datasets(load_dir = "../data/kaggle_competition/", prefix="clean_kaggle_", post_fix=""):
    """
    Function that reads the train, validation, and test CSV files

    Each file is looked up in load_dir under the name
    "<prefix><split><post_fix>.csv", where <split> is "train", "validation"
    or "test". The files are read with keep_default_na=False.

    Returns:
        Tuple (train_set, validation_set, test_set) of DataFrames.
    """
    def _read_split(split_name):
        # Build "<prefix><split><post_fix>.csv" and read it from load_dir.
        file_name = "{0}{1}{2}.csv".format(prefix, split_name, post_fix)
        return pd.read_csv(os.path.join(load_dir, file_name), keep_default_na=False)

    return _read_split("train"), _read_split("validation"), _read_split("test")

def xy_split(df, label_col="is_duplicate"):
    """
    Function that splits a data frame into X and y

    Args:
        df: DataFrame containing the feature columns and the label column.
        label_col: name of the label column.

    Returns:
        Tuple (X, y): X is a NumPy array of every column except label_col,
        y is the label column as a pandas Series.
    """
    # DataFrame.as_matrix() was removed in pandas 1.0; .values returns the
    # same ndarray and exists in old and new pandas alike.
    return df.drop(label_col, axis=1).values, df[label_col]


In [6]:
# DATA CLEANING FUNCTIONS
def clean_str(input_str):
    """
    Helper function that converts string to lower-case ASCII

    Missing values (None / NaN) and float inputs are mapped to the empty
    string. Non-ASCII characters are dropped rather than raising.

    Args:
        input_str: text (or byte string) to clean, or a missing value.

    Returns:
        The lower-cased text with all non-ASCII characters removed, or ""
        for missing / float input.
    """
    # trivial case
    # isinstance(..., float) replaces the np.float comparison: np.float was
    # only an alias of the builtin float and is gone from current NumPy.
    if isinstance(input_str, float) or pd.isnull(input_str):
        return ""
    # encoding
    if isinstance(input_str, bytes):
        # Byte strings (the plain `str` type on Python 2) are decoded directly.
        ascii_text = input_str.decode('ascii', 'ignore')
    else:
        # Text strings have no usable .decode; strip non-ASCII characters by
        # round-tripping through an ASCII encode that ignores errors.
        ascii_text = input_str.encode('ascii', 'ignore').decode('ascii')
    return ascii_text.lower()

def clean_dataset(full_dataset):
    """
    Function that cleans the full dataset

    Applies clean_str to the "question1" and "question2" columns and stores
    the results in new "clean_q1" / "clean_q2" columns. These columns are
    added to full_dataset itself, so the caller's frame is modified.

    Args:
        full_dataset: DataFrame with "question1" and "question2" columns and,
            optionally, an "is_duplicate" label column.

    Returns:
        DataFrame holding only "clean_q1", "clean_q2" and, when present in
        the input, "is_duplicate".
    """
    # The former second positional argument (1) was bound to Series.apply's
    # convert_dtype parameter, which newer pandas deprecates; the default
    # behaves the same, so it is simply omitted.
    full_dataset["clean_q1"] = full_dataset["question1"].apply(clean_str)
    full_dataset["clean_q2"] = full_dataset["question2"].apply(clean_str)
    col_need = ["clean_q1", "clean_q2"]
    if "is_duplicate" in full_dataset.columns:
        col_need += ["is_duplicate"]
    return full_dataset[col_need]

In [7]:
# FEATURE ENGINEERING FUNCTIONS
def word_overlap(row):
    """
    Function that calculates the percentage of word overlap

    The score is the number of distinct tokens shared by both questions
    divided by the average token count of the two questions.

    Args:
        row: mapping / DataFrame row with "clean_q1" and "clean_q2" strings.

    Returns:
        The overlap ratio as a float; 0.0 when neither question yields any
        token (previously this case raised ZeroDivisionError).
    """
    token_1 = nltk.word_tokenize(row["clean_q1"])
    token_2 = nltk.word_tokenize(row["clean_q2"])
    avg_length = float(len(token_1)+len(token_2))/2
    if avg_length == 0:
        # Both questions are empty (e.g. missing values cleaned to ""):
        # there is nothing to overlap.
        return 0.0
    same_token_num = len(set(token_1).intersection(token_2))
    return float(same_token_num)/avg_length


def feature_engineering(df, top_k_word=500, tokenizer=None):
    """
    Feature engineering function

    Builds length, first-word, bag-of-words cosine-similarity and word-overlap
    features from the "clean_q1" / "clean_q2" columns. The intermediate and
    feature columns are written onto df itself, so the caller's frame is
    modified; the returned frame is a copy without the helper columns.

    Args:
        df: DataFrame with string columns "clean_q1" and "clean_q2".
        top_k_word: vocabulary size used when a new CountVectorizer is built.
        tokenizer: optional CountVectorizer. If it is already fitted it is
            only used to transform (so validation/test data share the
            training vocabulary); if it is None or not yet fitted, it is
            fitted on the questions in df.

    Returns:
        Tuple (clean_feature_df, bag_of_word_tokenizer): the feature frame
        and the fitted vectorizer, to be passed back in for other splits.
    """
    # length
    df["len_1"] = df["clean_q1"].apply(lambda x: len(str(x)))
    # Fixed: len_2 used to be computed from clean_q1, making len_diff always 0.
    df["len_2"] = df["clean_q2"].apply(lambda x: len(str(x)))
    df["len_diff"] = np.abs(df["len_1"]-df["len_2"])
    print("length fueature loaded")

    # first words match
    df["first_word_q1"] = df["clean_q1"].apply(lambda q: q.split(" ")[0])
    df["first_word_q2"] = df["clean_q2"].apply(lambda q: q.split(" ")[0])
    df["first_word_match"] = (df["first_word_q1"] == df["first_word_q2"])
    print("first word feature loaded")

    # bag of words
    if tokenizer is None:
        bag_of_word_tokenizer = CountVectorizer(stop_words="english", max_features=top_k_word)
    else:
        bag_of_word_tokenizer = tokenizer
    if not hasattr(bag_of_word_tokenizer, "vocabulary_"):
        # Fit ONE vocabulary on both question columns. Calling fit_transform
        # separately per column gave q1 and q2 unrelated feature spaces, so
        # their cosine similarity was meaningless (mostly 0).
        bag_of_word_tokenizer.fit(pd.concat([df["clean_q1"], df["clean_q2"]]))
    # np.float was an alias of the builtin float and is gone from current
    # NumPy; np.float64 is the equivalent dtype.
    q1_matrix = bag_of_word_tokenizer.transform(df["clean_q1"]).astype(np.float64)
    q2_matrix = bag_of_word_tokenizer.transform(df["clean_q2"]).astype(np.float64)
    df["vec_q1"] = [q1_matrix[i] for i in range(len(df))]
    df["vec_q2"] = [q2_matrix[i] for i in range(len(df))]
    print("question vectorized")

    # similarity measure
    cosine_sim = [cosine_similarity(q1_matrix[i], q2_matrix[i])[0][0] for i in range(len(df))]
    df["cosine_sim"] = cosine_sim
    df["overlap_percent"] = df.apply(word_overlap, 1)
    print("similarity feature loaded")

    # filter columns
    ignore_columns = ["first_word_q1", "first_word_q2", "clean_q1", "clean_q2", "vec_q1", "vec_q2"]
    #full_feature_df = df
    clean_feature_df = df.drop(ignore_columns, axis=1)

    return clean_feature_df, bag_of_word_tokenizer
    


In [19]:
# DATA CREATION SCRIPTS
# Quora Dataset
#full_data = pd.read_csv("../data/questions.csv")
# Kaggle Dataset
# begin_time = time.time()
# kaggle_train = pd.read_csv("../data/kaggle_competition/origin/train.csv")
# kaggle_test = pd.read_csv("../data/kaggle_competition/origin/test.csv")
# print("data loaded, used {0} seconds".format(time.time()-begin_time))

# clean dataset
# begin_time = time.time()
# clean_train = clean_dataset(kaggle_train)
# clean_test = clean_dataset(kaggle_test)
#clean_train.to_csv("../data/kaggle_competition/clean_datasets/clean_train.csv", index=False)
#clean_test.to_csv("../data/kaggle_competition/clean_datasets/clean_test.csv", index=False)
#print("data cleaned, used {0} seconds".format(time.time()-begin_time))


# split and save dataset
#begin_time = time.time()
# since Kaggle has its own test set, test_ratio=0
# train_set, validation_set, _ = split_dataset(clean_train, 0.8, 0.2, 0)
# test_set = clean_test
# train_set.to_csv("../data/kaggle_competition/clean_kaggle_train.csv", index=False)
# validation_set.to_csv("../data/kaggle_competition/clean_kaggle_validation.csv", index=False)
# test_set.to_csv("../data/kaggle_competition/clean_kaggle_test.csv", index=False)
# load splitted dataset
#train_set, validation_set, test_set = load_datasets()
# print("data splitted, used {0} seconds".format(time.time()-begin_time))


# feature engineering
#begin_time = time.time()
#feature_train, tokenizer = feature_engineering(train_set)
#feature_validation, _ = feature_engineering(validation_set, tokenizer=tokenizer)
#feature_test, _ = feature_engineering(test_set, tokenizer=tokenizer)
#feature_train.to_csv("../data/kaggle_competition/feature_datasets/feature_train_v1.csv", index=False)
#feature_validation.to_csv("../data/kaggle_competition/feature_datasets/feature_validation_v1.csv", index=False)
#feature_test.to_csv("../data/kaggle_competition/feature_datasets/feature_test_v1.csv", index=False)
#print("data featurized, used {0} seconds".format(time.time()-begin_time))
# Load the pre-computed (v1) feature CSVs instead of recomputing them.
feature_train, feature_validation, feature_test= load_datasets(load_dir = "../data/kaggle_competition/feature_datasets", prefix="feature_", post_fix="_v1")
# load split dataset
# train_set, validation_set, test_set = load_datasets()

# split X, y
X_train, y_train = xy_split(feature_train)
X_validate, y_validate = xy_split(feature_validation)
# The test features are used without an xy_split, so the whole frame is X.
# DataFrame.as_matrix() was removed in pandas 1.0; .values is the
# equivalent that exists in every pandas version.
X_test=feature_test.values

In [17]:
# Preview the first rows of the loaded test feature frame.
feature_test.head()

Unnamed: 0,len_1,len_2,len_diff,first_word_match,cosine_sim,overlap_percent
0,57,57,0,False,0,0.296296
1,66,66,0,False,0,0.5
2,60,60,0,True,0,0.545455
3,27,27,0,False,0,0.222222
4,32,32,0,True,0,0.571429


In [21]:
# Sanity-check the shapes of the feature matrices and label vectors.
# print(...) with a single argument produces the same output on Python 2
# and Python 3, and matches the print(...) calls used elsewhere in the file.
print(X_train.shape)
print(y_train.shape)
print(X_validate.shape)
print(y_validate.shape)
print(X_test.shape)

(323432, 6)
(323432,)
(80858, 6)
(80858,)
(2345796, 6)


In [121]:
#mat1 = full_feature_df["vec_q1"].iloc[0]
#mat2 = full_feature_df["vec_q2"].iloc[0]

#cosine_similarity(mat1[5], mat2[5])
#a = time.time()
#cosine_similarity(full_feature_df["vec_q1"].iloc[10],full_feature_df["vec_q2"].iloc[10] )
#print time.time()-a
#len(full_feature_df)
#full_feature_df.to_csv("../data/full_feature_df.csv", index=False)
#np.sum(clean_feature_df["cosine_sim"]==0)/float(len(clean_feature_df))

0.65178520641719695

In [100]:
#mat1 = mat1.astype(np.float)
#mat2 = mat2.astype(np.float)
#cosine_similarity(mat1, mat2)

array([[ 0.]])

In [25]:
# PREDICTIVE MODEL FUNCTIONS

In [22]:
# MODEL ANALYTICS FUNCTIONS
def all_test_metrics(y_pred, y_test, metrics_list=["acc", "auc", "f1", "nll"]):
    """
    Compute the requested evaluation scores for a set of predictions.

    Args:
        y_pred: predicted scores; they are rounded to hard 0/1 labels for the
            "acc" and "f1" metrics and used as-is for "auc" and "nll".
        y_test: true labels.
        metrics_list: names of the metrics to compute, any of "acc"
            (accuracy), "auc" (area under the ROC curve), "f1" (F1 score of
            the positive class) and "nll" (log loss).

    Returns:
        Dict mapping each requested metric name to its value.
    """
    def _hard_labels():
        # Round the scores to the nearest integer class label.
        return np.round(y_pred).astype(np.int8)

    score_dict = {}

    # acc
    if "acc" in metrics_list:
        score_dict["acc"] = metrics.accuracy_score(y_test, _hard_labels(), normalize=True)

    # auc
    if "auc" in metrics_list:
        false_pos_rate, true_pos_rate, _ = metrics.roc_curve(y_test, y_pred, pos_label=1)
        score_dict["auc"] = metrics.auc(false_pos_rate, true_pos_rate)

    # f1-measure
    if "f1" in metrics_list:
        score_dict["f1"] = metrics.f1_score(y_test, _hard_labels(), labels=[0,1], pos_label=1)

    # nll
    if "nll" in metrics_list:
        score_dict["nll"] = metrics.log_loss(y_test, y_pred)

    return score_dict


def test_model(model, X_test, y_test, verbose=True, model_name="", y_pred_test=None, pred_lambda=None):
    """
    Score a model (or ready-made predictions) with all_test_metrics.

    Args:
        model: fitted model; only used when y_pred_test is None.
        X_test: features passed to the model when predictions are needed.
        y_test: true labels.
        verbose: when true, print model_name followed by ":" and the scores.
        model_name: label printed in verbose mode.
        y_pred_test: optional precomputed predictions; when given, model and
            X_test are not touched.
        pred_lambda: optional callable (model, X_test) -> predictions, used
            in place of model.predict.

    Returns:
        Tuple (y_pred_test, scores) with the predictions that were scored and
        the dict returned by all_test_metrics.
    """
    predictions = y_pred_test
    if predictions is None:
        # No predictions supplied: obtain them from the model.
        if pred_lambda is not None:
            predictions = pred_lambda(model, X_test)
        else:
            predictions = model.predict(X_test)

    scores = all_test_metrics(predictions, y_test)

    if verbose:
        print(model_name+":")
        print(scores)

    return predictions, scores
    
    

In [25]:
# MODEL ANALYTICS SCRIPT
# Sizes are taken from the arrays that are actually loaded above;
# validation_set / test_set are only created by commented-out code, so
# referencing them raised NameError on a fresh kernel.
n_valid = len(y_validate)
n_test = X_test.shape[0]

# NOTE: score_majority_class_valid is reassigned by every baseline below, so
# after this cell it holds the logistic-regression scores.

# baseline 1: majority class
y_pred_valid = [0 for i in range(n_valid)]
y_pred_test = [0 for i in range(n_test)]
_, score_majority_class_valid = test_model(None, None, y_validate, 
                                        verbose=True, model_name="Baseline 1 - Majority Class (Validation):",
                                        y_pred_test=y_pred_valid)

# baseline 2: simple word overlap
# Column 5 is the last of the six feature columns (overlap_percent in the
# feature_test preview); its value is used directly as the predicted score.
y_pred_valid = X_validate[:,5].astype(np.double)
y_pred_test = X_test[:,5].astype(np.double)
_, score_majority_class_valid = test_model(None, None, y_validate, 
                                        verbose=True, model_name="Baseline 2 - Simple Word Overlap (Validation):",
                                        y_pred_test=y_pred_valid)

# baseline 3: logistic regression
lr = LogisticRegression()
# Score with the positive-class probability rather than hard labels.
lr_lambda = lambda model, x: model.predict_proba(x)[:,1]
lr.fit(X_train, y_train)
lr_pred_valid, score_majority_class_valid = test_model(lr, X_validate, y_validate, verbose=True,
                                                       model_name="Baseline 3 - Simple Logistic Regression (Validation):",
                                                      pred_lambda=lr_lambda)


Baseline 1 - Majority Class (Validation)::
{'acc': 0.63063642434885847, 'f1': 0.0, 'auc': 0.5, 'nll': 12.757365947839453}
Baseline 2 - Simple Word Overlap (Validation)::
{'acc': 0.66991515990996564, 'f1': 0.61955127291387524, 'auc': 0.73065771989901296, 'nll': 0.66146289432170324}
Baseline 3 - Simple Logistic Regression (Validation)::
{'acc': 0.64707264587301194, 'f1': 0.43706231629613557, 'auc': 0.72917994329278035, 'nll': 0.57920871235006632}


In [27]:
#XG BOOST
import xgboost as xgb

# Set our parameters for xgboost
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'eta': 0.02,
    'max_depth': 4,
}

d_train = xgb.DMatrix(X_train, label=y_train)
d_valid = xgb.DMatrix(X_validate, label=y_validate)

# The validation set is listed last in the watchlist; xgboost uses the last
# entry of the evaluation list for early stopping.
watchlist = [(d_train, 'train'), (d_valid, 'valid')]

# Everything after dtrain is passed by keyword: newer xgboost releases
# deprecate passing num_boost_round / evals positionally.
bst = xgb.train(params, d_train, num_boost_round=400, evals=watchlist, early_stopping_rounds=50)

Will train until valid error hasn't decreased in 50 rounds.
[0]	train-logloss:0.687110	valid-logloss:0.688171
[1]	train-logloss:0.683750	valid-logloss:0.683543
[2]	train-logloss:0.679434	valid-logloss:0.678806
[3]	train-logloss:0.674892	valid-logloss:0.674296
[4]	train-logloss:0.670768	valid-logloss:0.670166
[5]	train-logloss:0.666668	valid-logloss:0.666087
[6]	train-logloss:0.662765	valid-logloss:0.662223
[7]	train-logloss:0.658423	valid-logloss:0.658370
[8]	train-logloss:0.655223	valid-logloss:0.654428
[9]	train-logloss:0.651682	valid-logloss:0.651112
[10]	train-logloss:0.648044	valid-logloss:0.647543
[11]	train-logloss:0.644490	valid-logloss:0.644493
[12]	train-logloss:0.641185	valid-logloss:0.641218
[13]	train-logloss:0.638148	valid-logloss:0.638113
[14]	train-logloss:0.635491	valid-logloss:0.635172
[15]	train-logloss:0.632429	valid-logloss:0.632367
[16]	train-logloss:0.629236	valid-logloss:0.629545
[17]	train-logloss:0.626260	valid-logloss:0.626780
[18]	train-logloss:0.623977	vali