In [None]:
import numpy as np
import pandas as pd
import difflib
import copy
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, roc_auc_score, confusion_matrix, recall_score
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import tensorflow as tf

import lightgbm as lgb

# Root folder of the competition data on Kaggle.
BASE_PATH = '/kaggle/input/foursquare-location-matching/'

# The labelled pairs and the pre-computed similarity features are stored in two
# separate CSV files; they are glued together column-wise (aligned on row index).
pairs = pd.concat(
    [
        pd.read_csv(f'{BASE_PATH}pairs.csv'),
        pd.read_csv('/kaggle/input/4square-sup/features.csv'),
    ],
    axis=1,
)

# Turn the match flag into a plain integer target.
pairs['match'] = pairs['match'].map(int)

# Columns used as model inputs.
feats = [
    'lat_diff',
    'lon_diff',
    'url_sim',
    'addr_sim',
    'name_sim',
    'cat_union',
    'zip_sim',
    'city_sim',
    'state_sim',
    'phone_sim',
]

# Feature matrix and target used by every later cell.
X = pairs[feats]
y = pairs['match']

## Running a threshold-bagging experiment

Given a dictionary of thresholds for each column:
1. Subset data
2. Train the models
3. Record the prediction of each model

In [None]:
# tdic = {
#     'url_sim': [0.1, 0.3, 0.61, 0.8, 0.91],
#     'cat_union': [0.10, 0.2, 0.33, 0.8],
#     'lon_diff': [0.000178, 0.000690, 0.003257],
#     'lat_diff': [0.000147, 0.0005, 0.001, 0.002, 10, 50],
#     'addr_sim': [0.15, 0.32, 0.55, 0.8, 0.9],
#     'zip_sim': [0.05, 0.1, 0.5, 0.95],
#     'name_sim': [0.2, 0.41, 0.66, 0.91, 0.95],
#     'phone_sim': [0.1, 0.5714, 0.6],
# }

In [None]:
# def train_models(subset, xgb_name, lgbm_name, Features=feats):
#     Xs, ys = subset[Features], subset['match']
            
#     X_train, X_val, y_train, y_val = train_test_split(Xs, ys, test_size=0.3, random_state=102)
            
#     xgbc = xgb.XGBClassifier(
#                 n_estimators=200,
#                 max_depth=8,
#                 max_leaves=20,
#                 learning_rate=1e-1,
#                 early_stopping_rounds=20,
#             ).fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)])
            
#     lgbmc = lgb.LGBMClassifier(
#                 num_leaves=35,
#                 max_depth=5,
#                 learning_rate=0.05,
#                 n_estimators=200,
#                 objective='binary',
#                 early_stopping_rounds=30,
#                 subsample_freq=1,
#             ).fit(X_train, y_train, eval_set=(X_val, y_val))
    
#     xgbc.save_model(f'{xgb_name}.json')
#     lgbmc.booster_.save_model(f'{lgbm_name}.txt')
    
#     return xgbc, lgbmc

# def gen_predict_data(Dic):
#     xgb_dict, lgbm_dict = {}, {}
    
#     for key, ts in Dic.items():
#         for tval in ts:
#             # Get the subset of the data based on the threshhold
#             subset = pairs[pairs[key] >= tval]
            
#             keyt = f'{key}>={tval}'
#             xgb_name, lgbm_name = f'xgb-{keyt}', f'lgbm-{keyt}'
            
#             xgb, lgbm = train_models(subset, xgb_name, lgbm_name)
            
#             xgb_dict[xgb_name] = xgb.predict(X)
#             lgbm_dict[lgbm_name] = lgbm.predict(X)
    
#     return pd.DataFrame(xgb_dict), pd.DataFrame(lgbm_dict)

# def eval_data(x, e):
    
#     xgb_name, lgbm_name = f'xgb-{e}', f'lgbm-{e}'
            
#     xgb, lgbm = train_models(pd.concat([x, y], axis=1), xgb_name, lgbm_name, Features=x.columns)
    
#     return xgb.predict(x), lgbm.predict(x)


# def plot_xgb(models: list):
    
#     figure, axis = plt.subplots(len(models), 3, figsize=(40, 20))
    
#     for i, model in enumerate(models):
#         xgb.plot_importance(model, ax=axis[i][0], show_values=False, title='weight', max_num_features=10)
#         xgb.plot_importance(model, importance_type='gain', ax=axis[i][1], show_values=False, title='gain', max_num_features=10)
#         xgb.plot_importance(model, importance_type='cover', ax=axis[i][2], show_values=False, title='cover', max_num_features=10)

#     plt.show()

# def gen_metrics(preds):
#     dic = {'F': [], 'Acc': [], 'Pers': [], 'Recall': []}
    
#     for pred in preds:
#         dic['F'].append(f1_score(pred, y))
#         dic['Acc'].append(accuracy_score(pred, y))
#         dic['Pers'].append(precision_score(pred, y))
#         dic['Recall'].append(recall_score(pred, y))
    
#     return pd.DataFrame(dic, [f'e{i}' for i in range(1,8)])

# def print_result(yt, yp):
#     print(f'f-score: {f1_score(yt, yp)}')
#     print(f'Accuracy: {accuracy_score(yt, yp)}')
#     print(f'Precision: {precision_score(yt, yp)}')
#     print(f'Recall Score: {recall_score(yt, yp)}')

## Evaluation

Seven feature sets are compared:
1. Just the features: This would be considered a benchmark where the other datasets are compared to see if their increase in accuracy is significant.
2. Features + xgb predictions
3. Features + lgbm predictions
4. xgb predictions
5. lgbm predictions
6. Features + lgbm predictions + xgb predictions
7. lgbm predictions + xgb predictions

In [None]:
# %%capture --no-display

# # Getting the threshold Predictions
# xgb_threshold_preds, lgbm_threshold_preds  = gen_predict_data(tdic)

# # Concatenations of predictions and features
# feat_xgb_preds = pd.concat([X, xgb_threshold_preds], axis=1)
# feat_lgbm_preds = pd.concat([X, lgbm_threshold_preds], axis=1)
# feat_xgb_lgbm_preds = pd.concat([X, xgb_threshold_preds, lgbm_threshold_preds], axis=1)
# xgb_lgbm_preds = pd.concat([xgb_threshold_preds, lgbm_threshold_preds], axis=1)

# # Evaluation Models
# # 1. Benchmark: Running a model with only the generated features
# e1_xgb, e1_lgbm = eval_data(X, 1)

# # 2. 
# e2_xgb, e2_lgbm = eval_data(feat_xgb_preds, 2)

# # 3. 
# e3_xgb, e3_lgbm = eval_data(feat_lgbm_preds, 3)

# # 4. 
# e4_xgb, e4_lgbm = eval_data(xgb_threshold_preds, 4)

# # 5.
# e5_xgb, e5_lgbm = eval_data(lgbm_threshold_preds, 5)

# # 6.
# e6_xgb, e6_lgbm = eval_data(feat_xgb_lgbm_preds, 6)

# # 7.
# e7_xgb, e7_lgbm = eval_data(xgb_lgbm_preds, 7)

In [None]:
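# # Writing out for future reference
# # NOTE: index='False' is a non-empty (truthy) string, not the boolean False, so the
# # row index is still written as the first CSV column -- presumably why the loading
# # cell drops the first column with .iloc[:, 1:].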

# xgb_threshold_preds.to_csv('xgb_threshold_preds.csv', index='False')
# lgbm_threshold_preds.to_csv('lgbm_threshold_preds.csv', index='False')

# feat_xgb_preds.to_csv('feat_xgb_preds.csv', index='False')
# feat_lgbm_preds.to_csv('feat_lgbm_preds.csv', index='False')
# feat_xgb_lgbm_preds.to_csv('feat_xgb_lgbm_preds.csv', index='False')
# xgb_lgbm_preds.to_csv('xgb_lgbm_preds.csv', index='False')

In [None]:
# Folder holding the predictions cached from the threshold-bagging experiment.
PRED_PATH = '/kaggle/input/4square-sup/predictions_exp1/'

# Read the three cached prediction tables in one pass; the leading column of
# every file is discarded with .iloc[:, 1:].
xgb_threshold_preds, lgbm_threshold_preds, xgb_lgbm_preds = (
    pd.read_csv(csv_path).iloc[:, 1:]
    for csv_path in (
        f'{PRED_PATH}xgb_threshold_preds.csv',
        f'{PRED_PATH}lgbm_threshold_preds.csv',
        f'{PRED_PATH}xgb_lgbm_preds.csv',
    )
)

# The feature+prediction tables are intentionally not loaded:
# feat_xgb_preds = pd.read_csv(f'{PRED_PATH}feat_xgb_preds.csv').iloc[:, 1:]
# feat_lgbm_preds = pd.read_csv(f'{PRED_PATH}feat_lgbm_preds.csv').iloc[:, 1:]
# feat_xgb_lgbm_preds = pd.read_csv(f'{PRED_PATH}feat_xgb_lgbm_preds.csv').iloc[:, 1:]

In [None]:
# Datasets fed to the neural-network ensembling step, in evaluation order:
# xgb-only predictions, lgbm-only predictions, then both combined.
preds_list = [xgb_threshold_preds, lgbm_threshold_preds, xgb_lgbm_preds]

## Sanity Check for prediction ensembling

In [None]:
# xgb_tp = xgb_threshold_preds.mean(axis=1).values
# lgbm_tp = lgbm_threshold_preds.mean(axis=1).values
# xgb_lgbm_tp = xgb_lgbm_preds.mean(axis=1).values

# thresh = 0.5

# xgb_tp[xgb_tp >= thresh] = 1
# xgb_tp[xgb_tp < thresh] = 0

# lgbm_tp[lgbm_tp >= thresh] = 1
# lgbm_tp[lgbm_tp < thresh] = 0

# xgb_lgbm_tp[xgb_lgbm_tp >= thresh] = 1
# xgb_lgbm_tp[xgb_lgbm_tp < thresh] = 0

In [None]:
# tf.keras.metrics.binary_crossentropy(xgb_tp, y.astype(float))

In [None]:
# tf.keras.metrics.binary_crossentropy(lgbm_tp, y.astype(float))

In [None]:
# tf.keras.metrics.binary_crossentropy(xgb_lgbm_tp, y.astype(float))

## Comparing the accuracy of the models

- lgbm vs xgb

In [None]:
# abc = xgb.Booster()
# abc.load_model("/kaggle/working/xgb-1.json")
# abc.predict(xgb.DMatrix(X))

In [None]:
# xgb_eval_metrics  = gen_metrics([e1_xgb, e2_xgb, e3_xgb, e4_xgb, e5_xgb, e6_xgb, e7_xgb])
# lgbm_eval_metrics = gen_metrics([e1_lgbm, e2_lgbm, e3_lgbm, e4_lgbm, e5_lgbm, e6_lgbm, e7_lgbm])

In [None]:
# xgb_eval_metrics

In [None]:
# lgbm_eval_metrics

## Ensembling using Deep Learning
- Only the prediction-only datasets (e4, e5, and e7) can be used here, because the raw features contain missing values.

In [None]:
def seq_model(ls, x, epochs=1, batch_size=32):
    """Fit a Keras ``Sequential`` model on ``x`` and predict every row of ``x``.

    The data is split 70/30 (fixed ``random_state=102``) into a training and a
    validation part; the target is the notebook-level ``y``. The model is
    compiled with Adam and binary cross-entropy.

    Args:
        ls: List of Keras layers that follow the input layer. The list is not
            modified; the layer objects themselves are used (and built) as-is.
        x: Feature table with one row per entry of ``y``.
        epochs: Number of training epochs. With the default of 1 the
            early-stopping callback (patience 5) can never trigger.
        batch_size: Mini-batch size passed to ``fit``.

    Returns:
        The output of ``model.predict(x)`` for the full table, i.e. including
        the rows the model was trained on.
    """
    X_train, X_val, y_train, y_val = train_test_split(x, y, test_size=0.3, random_state=102)

    # InputLayer needs a shape *tuple*; a bare int is not a valid shape.
    input_layer = tf.keras.layers.InputLayer(input_shape=(X_train.shape[1],))

    # Build a new list instead of ls.insert(...) so the caller's list is untouched.
    m = tf.keras.Sequential(layers=[input_layer, *ls])

    m.compile(optimizer='adam', loss='binary_crossentropy')

    es = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=5,
        restore_best_weights=True
    )

    m.fit(
        X_train, y_train,
        batch_size=batch_size,
        epochs=epochs,
        callbacks=[es],
        validation_data=(X_val, y_val),
        verbose=0,
    )

    return m.predict(x)

def eval_nn(ls, xs, threshold=0.6):
    """Train one network per dataset and score its thresholded predictions.

    For every table in ``xs`` a fresh deep copy of the layer template ``ls`` is
    trained via ``seq_model``; the predictions are binarised at ``threshold``
    (in place) and compared with the notebook-level target ``y``.

    Args:
        ls: Template list of Keras layers; it is deep-copied per dataset so the
            runs do not share layer objects.
        xs: Iterable of feature tables, each with one row per entry of ``y``.
        threshold: Predictions ``>= threshold`` become 1, all others 0.

    Returns:
        A tuple ``(metrics, last_pred)``:
        * ``metrics`` - DataFrame with one row per dataset and the columns
          ``F`` (F1), ``Acc`` (accuracy), ``Pers`` (precision) and ``Recall``.
        * ``last_pred`` - the thresholded prediction array of the LAST dataset
          only (``None`` when ``xs`` is empty). Later cells wrap this value in
          ``pd.DataFrame(...)``, so the single-array return is kept.
    """
    dic = {'F': [], 'Acc': [], 'Pers': [], 'Recall': []}
    pred = None  # stays None when xs is empty

    for x in xs:
        pred = seq_model(copy.deepcopy(ls), x)

        pred[pred >= threshold] = 1
        pred[pred < threshold] = 0

        # sklearn metrics take (y_true, y_pred); with the arguments reversed,
        # precision and recall end up in each other's column.
        y_hat = pred.ravel()
        dic['F'].append(f1_score(y, y_hat))
        dic['Acc'].append(accuracy_score(y, y_hat))
        dic['Pers'].append(precision_score(y, y_hat))
        dic['Recall'].append(recall_score(y, y_hat))

    return pd.DataFrame(dic), pred
        

# Layer templates for the ensembling networks; every one ends in a single
# sigmoid unit. The input layer is prepended later by seq_model.
# NOTE(review): the hidden Dense layers are given no activation argument, so
# they use the Keras default -- confirm that this is intended.

# l1: a single sigmoid unit.
l1 = [
    tf.keras.layers.Dense(1, activation='sigmoid'),
]

# l2: batch normalisation in front of the sigmoid unit.
l2 = [
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(1, activation='sigmoid'),
]

# l3: one hidden layer with 8 units.
l3 = [
    tf.keras.layers.Dense(8),
    tf.keras.layers.Dense(1, activation='sigmoid'),
]

# l4: one hidden layer with 16 units.
l4 = [
    tf.keras.layers.Dense(16),
    tf.keras.layers.Dense(1, activation='sigmoid'),
]

# l5: two hidden layers with 8 and 16 units.
l5 = [
    tf.keras.layers.Dense(8),
    tf.keras.layers.Dense(16),
    tf.keras.layers.Dense(1, activation='sigmoid'),
]

In [None]:
# Evaluate every architecture, in order l1..l5, on the same prediction datasets.
# Each call yields (metrics table, predictions returned by eval_nn).
(
    (l1_perf, l1_preds),
    (l2_perf, l2_preds),
    (l3_perf, l3_preds),
    (l4_perf, l4_preds),
    (l5_perf, l5_preds),
) = (eval_nn(arch, preds_list) for arch in (l1, l2, l3, l4, l5))

In [None]:
# Persist the metric tables of all five architectures.
for csv_name, perf_table in (
    ('l1_perf.csv', l1_perf),
    ('l2_perf.csv', l2_perf),
    ('l3_perf.csv', l3_perf),
    ('l4_perf.csv', l4_perf),
    ('l5_perf.csv', l5_perf),
):
    perf_table.to_csv(csv_name, index=False)

# Then persist the prediction arrays returned alongside them.
for csv_name, nn_pred in (
    ('l1_preds.csv', l1_preds),
    ('l2_preds.csv', l2_preds),
    ('l3_preds.csv', l3_preds),
    ('l4_preds.csv', l4_preds),
    ('l5_preds.csv', l5_preds),
):
    pd.DataFrame(nn_pred).to_csv(csv_name, index=False)

In [None]:
# Mean binary cross-entropy of each architecture's returned predictions against
# the true labels, printed in order l1..l5.
# * Keras expects (y_true, y_pred), so the labels go first.
# * Predictions are flattened to shape (n,): an (n, 1) array combined with an
#   (n,) label vector would broadcast to an n x n matrix.
# NOTE: eval_nn thresholds its predictions in place, so these values are
# computed on hard 0/1 labels rather than on probabilities.
bce_labels = y.astype(float).to_numpy()
print(
    *(
        tf.keras.metrics.binary_crossentropy(bce_labels, np.ravel(nn_pred)).numpy().mean()
        for nn_pred in (l1_preds, l2_preds, l3_preds, l4_preds, l5_preds)
    )
)

In [None]:
l1_perf

In [None]:
l2_perf

In [None]:
l3_perf

In [None]:
l4_perf

In [None]:
l5_perf